library(componentSkeleton) execute <- function(cf) { #inputs iterations <- as.integer(get.parameter(cf, 'iterations')) skip <- split.trim(get.parameter(cf, 'skipCol'), ',') classCol <- get.parameter(cf, 'classCol') seed <- as.integer(get.parameter(cf, 'seed')) libraries <- split.trim(get.parameter(cf, 'libraries'), ',') input <- CSV.read(get.input(cf, 'in')) #output array.out.dir <- get.output(cf, 'out') if (!file.exists(array.out.dir)) { dir.create(array.out.dir, recursive=TRUE) } array.out<-list() #remove unwanted columns if (!identical(skip,"")) { for (cols in skip) { input <- input[,-which(names(input)==cols),drop=FALSE] } } y <- factor(input[,classCol]) input <- input[,-which(names(input)==classCol),drop=FALSE] # Start selecting features if ('propOverlap' %in% libraries) { array.out[['propOverlap']] <- tryCatch({ sel_propOverlap(input,y,seed) }, error=function(e) { not_working('propOverlap') } ) } if ('rfUtilities' %in% libraries) { array.out[['rfUtilities']] <- tryCatch({ sel_rfUtilities(input,y,seed) }, error=function(e) { not_working('rfUtilities') } ) } if ('RRF' %in% libraries) { array.out[['RRF']] <- tryCatch({ sel_RRF(input,y,seed,iterations) }, error=function(e) { not_working('RRF') } ) } if ('Biocomb' %in% libraries) { tryCatch({ start_selection("Biocomb") }, error=function(e) { not_working('Biocomb') } ) Biocomb.methods<-c("auc","HUM","Chi-square","InformationGain","symmetrical.uncertainty","Relief", "FastFilter","CFS","CorrSF","Chi2-algorithm") for (i in Biocomb.methods) { set.seed(seed) i_safe <- gsub("-","_",i) array.out[[paste('Biocomb',i_safe,sep="_")]] <- tryCatch({ sel_Biocomb(input,y,i) }, error=function(e) { not_working('Biocomb') } ) } tryCatch({ end_selection("Biocomb") }, error=function(e) { not_working('Biocomb') } ) } ## if ('varSelRF' %in% libraries) { array.out[['varSelRF']] <- tryCatch({ sel_varSelRF(input,y,seed) }, error=function(e) { not_working('varSelRF') } ) } ## if ('AUCRF' %in% libraries) { if (length(unique(y))==2) { array.out[['AUCRF']] <- tryCatch({ sel_AUCRF(input,y,seed) }, error=function(e) { not_working('AUCRF') } ) } else { print("Skipping AUCRF, requires a two class problem") } } ## if ('VSURF' %in% libraries) { VSURF_selection <- tryCatch({ sel_VSURF(input,y,seed) }, error=function(e) { not_working('VSURF') } ) array.out[['VSURF_thresholding']] <- VSURF_selection[['VSURF_thresholding']] array.out[['VSURF_interpretation']] <- VSURF_selection[['VSURF_interpretation']] array.out[['VSURF_prediction']] <- VSURF_selection[['VSURF_prediction']] } ## if ('FeaLect' %in% libraries) { array.out[['FeaLect']] <- tryCatch({ sel_FeaLect(input,y,seed,iterations) }, error=function(e) { not_working('FeaLect') } ) } ## if ('CORElearn' %in% libraries) { tryCatch({ start_selection("CORElearn") }, error=function(e) { not_working('CORElearn') } ) for (i in c("ReliefFequalK","ReliefFexpRank","ReliefFbestK","Relief","InfGain","GainRatio","MDL","Gini","MyopicReliefF","Accuracy","ReliefFmerit","ReliefFdistance","ReliefFsqrDistance","DKM","ReliefFexpC","ReliefFavgC","ReliefFpe","ReliefFpa","ReliefFsmp","GainRatioCost","DKMcost","ReliefKukar","MDLsmp","ImpurityEuclid","ImpurityHellinger","UniformDKM","UniformGini","UniformInf","UniformAccuracy","EqualDKM","EqualGini","EqualInf","EqualHellinger","DistHellinger","DistAUC","DistAngle","DistEuclid")){ #,"RReliefFequalK" regression models: ,"ReliefFexpRank","RReliefFbestK","RReliefFwithMSE","MSEofMean","MSEofModel","MAEofModel","RReliefFdistance","RReliefFsqrDistance")) { set.seed(seed) array.out[[paste('CORElearn',i,sep="_")]] <- tryCatch({ sel_CORElearn(input,y,i) }, error=function(e) { not_working('CORElearn') } ) } tryCatch({ end_selection("CORElearn") }, error=function(e) { not_working('CORElearn') } ) } ## if ('Boruta' %in% libraries) { array.out[['Boruta']] <- tryCatch({ sel_Boruta(input,y,seed,iterations) }, error=function(e) { not_working('Boruta') } ) } ## if ('FSelector' %in% libraries) { tryCatch({ start_selection("FSelector") }, error=function(e) { not_working('FSelector') } ) suppressMessages(library('e1071', character.only=TRUE, quietly=TRUE)) for (i in c('bestFirst','forward','backward','hillClimbing')){#,'exhaustive')) { print(paste(" * method:",i)) array.out[[paste('FSelector',i,sep="_")]] <- tryCatch({ sel_SVM(input,y,seed,i) }, error=function(e) { not_working('FSelector') } ) } detach(paste("package",'e1071',sep=":"), unload=TRUE, character.only=TRUE) tryCatch({ end_selection("FSelector") }, error=function(e) { not_working('FSelector') } ) } ## Summary overlap ## Sensible selectors to combine ## VSURF_prediction,AUCRF,Boruta,propOverlap,rfUtilites,varSelRF, ## Biocomb_* sensible=c('RRF',"VSURF_prediction","AUCRF","Boruta","propOverlap","rfUtilites","varSelRF") for (n in names(array.out)[grep("Biocomb", names(array.out))]) { sensible<-append(sensible,n) } for (n in names(array.out)[grep("FSelector", names(array.out))]) { sensible<-append(sensible,n) } sum_features<-rep(0,ncol(input)) for (n in sensible) { sum_features<-sum_features+get_matches(input, array.out[[n]][,1]) } write.log(cf,paste("Accounting",length(sensible),"different algorithms in the summary")) sum_features<-as.matrix(cbind(colnames(input),sum_features), nc=2) sum_features<-cbind(sum_features[order(as.numeric(sum_features[,2]), decreasing=T),], 1:nrow(sum_features)) colnames(sum_features)<-c("Feature","Frequency","Index") array.out[['Ensemble']]<-sum_features # Write output array array.out.object <- Array.new() for(i in 1:length(array.out)) { key = names(array.out)[i] filename = paste(key, ".csv", sep="") CSV.write(paste(array.out.dir, "/", filename, sep=""), array.out[[i]]) array.out.object <- Array.add(array.out.object, key, filename) } Array.write(cf, array.out.object, 'out') return(0) } start_selection <- function (s) { print(paste("Selecting features with package:",s)) suppressMessages(library(s, character.only=TRUE, quietly=TRUE)) } end_selection <- function(s) { detach(paste("package",s,sep=":"), unload=TRUE, character.only=TRUE) } not_working <- function(s) { write(paste("Package",s,"did not load, or algorithm failed"),stderr()) return(NULL) } get_matches <- function(data,n) { return(as.numeric(!is.na(match(colnames(data),n)))) } sel_propOverlap <- function(data,y,seed) { start_selection("propOverlap") set.seed(seed) s<-Sel.Features(t(array(data)),y) selection<-data.frame(cbind( colnames(data)[s$Features[,1]], s$Features[,2] )) selection<-selection[order(selection[,2], decreasing=T),] colnames(selection)<-c('Feature','Importance') end_selection("propOverlap") return(selection) } sel_rfUtilities <- function(data,y,seed) { start_selection("rfUtilities") r <- rf.modelSel(data,y, plot.imp=F,seed=seed, r=c(0.01,0.25,0.5,0.75,0.99)) selection<-data.frame(cbind( r$selvars, r$importance )) selection<-selection[order(selection[,2], decreasing=T),] colnames(selection)<-c('Feature','Importance') rownames(selection)<-NULL end_selection("rfUtilities") return(selection) } sel_Biocomb <- function(data,y,method) { print(paste(" * method:",method)) b<-select.process(cbind(data,y), method=method) selection<-matrix(colnames(data)[b],nc=1) colnames(selection)<-c('Feature') return(selection) } sel_varSelRF <- function(data,y,seed) { start_selection("varSelRF") set.seed(seed) suppressWarnings(v<-varSelRF(data,y)) selection<-matrix(v$selected.vars,nc=1) colnames(selection)<-c('Feature') end_selection("varSelRF") return(selection) #~ # RFBoot takes a lot of time, is it really worth it? #~ # vb<-varSelRFBoot(data,y,usingCluster=F) #~ # vb$all.data.randomForest$importance } sel_AUCRF <- function(data,y,seed) { start_selection("AUCRF") set.seed(seed) # Requires factor of 0 or 1 d <- data.frame(X=data, Y=factor(match(y,sort(unique(y))) - 1)) a<-AUCRF(Y~.,data=d, pdel=0) selection<-data.frame(cbind(gsub("^X\\.","",a$Xopt), a$ranking[a$Xopt])) colnames(selection)<-c('Feature','Importance') rownames(selection)<-NULL end_selection("AUCRF") return(selection) } sel_VSURF <- function(data,y,seed) { start_selection("VSURF") print("...This will take time") set.seed(seed) VS<-VSURF(data,y) # Selections after Thresholding, Interpretation and Prediction phases selection<-list() selection[['VSURF_thresholding']] <- cbind( colnames(data)[VS[['varselect.thres']]], VS$imp.varselect.thres ) colnames(selection[['VSURF_thresholding']])<-c('Feature','Importance') selection[['VSURF_interpretation']]<-cbind( colnames(data)[VS[['varselect.interp']]] ) colnames(selection[['VSURF_interpretation']])<-c('Feature') selection[['VSURF_prediction']]<-cbind( colnames(data)[VS[['varselect.pred']]] ) colnames(selection[['VSURF_prediction']])<-c('Feature') end_selection("VSURF") return(selection) } sel_FeaLect <- function(data,y,seed,num.of.models) { start_selection("FeaLect") set.seed(seed) print(paste(" Selecting from",num.of.models,"models")) y<-match(y,sort(unique(y))) - 1 f <-FeaLect(F=as.matrix(data),L=y, maximum.features.num=10,total.num.of.models=num.of.models,talk=TRUE, plot.scores=F,return_linear.models=F) selection<-cbind( names(f$log.scores), f$log.scores ) selection<-selection[order(selection[,2], decreasing=T),] colnames(selection)<-c('Feature','Importance') rownames(selection)<-NULL end_selection("FeaLect") return(selection) } sel_CORElearn <- function(data,y,method) { print(paste(" * method:",method)) selection<-sort(attrEval("y",cbind(data,y),method), decreasing=T) selection<-cbind(names(selection), selection) colnames(selection)<-c('Feature','Importance') rownames(selection)<-NULL return(selection) } sel_Boruta <- function(data,y,seed,runs) { start_selection("Boruta") set.seed(seed) print("...This will take time") B<-Boruta(data,y,maxRuns=runs) selection<-data.frame(cbind(names(B$finalDecision),B$finalDecision)) colnames(selection)<-c('Feature','Importance') rownames(selection)<-NULL selection<-selection[selection[,2]!="3",] selection<-selection[order(selection[,2], decreasing=T),] selection[,2]<-as.numeric(selection[,2])/2 end_selection("Boruta") return(selection) } sel_RRF <- function (data,y,seed,iters) { start_selection("RRF") suppressMessages(library("randomForest", character.only=TRUE, quietly=TRUE)) set.seed(seed) subsets<-list() importances<-list() error.rates <- matrix(rep(0,iters), nc=1) for (i in 1:iters) { impRF <- RRF(y=y, x=data, importance=TRUE, flagReg=0)$importance[,"MeanDecreaseAccuracy"] imp=impRF/(max(impRF)) coefReg=0.9*0.8+0.1*imp # Weighted average? model <- RRF(y=y, x=data, importance=TRUE, mtry=ncol(data),coefReg=coefReg) important=model$importance important=important[,"MeanDecreaseGini"]>0 subsets[[i]] <- rownames(model$importance)[important] importances[[i]] <- model$importance[important,"MeanDecreaseGini"] model <- randomForest(y=y, x=data[,which(important),drop=FALSE], importance=TRUE) error.rates[i] <- mean(model$err.rate[,'OOB']) } best <- which.min(error.rates) selection<- data.frame(cbind(subsets[[best]], importances[[best]])) colnames(selection)<-c('Feature','Importance') rownames(selection)<-NULL selection<-selection[order(selection[,2], decreasing=T),] detach(paste("package","randomForest",sep=":"), unload=TRUE, character.only=TRUE) end_selection("RRF") return(selection) } sel_SVM <- function(data,y,seed,search.engine) { SVM.classifier <- function(subset){ value <- svm(data[,subset,drop=FALSE],y) ttable <- table(predict(value,data[,subset,drop=FALSE]),y) auc <- 0.5* ( ttable[2,2]/(ttable[2,2]+ttable[2,1]) + ttable[1,1]/(ttable[1,1]+ttable[1,2]) ) return(auc) } features<-colnames(data) perf <- data.frame(subset=character(0), perf=numeric(0),stringsAsFactors=FALSE) if(search.engine == 'bestFirst') subset <- best.first.search(attributes=features,eval.fun=SVM.classifier) if(search.engine == 'forward') subset <- forward.search(attributes=features,eval.fun=SVM.classifier) if(search.engine == 'backward') subset <- backward.search(attributes=features,eval.fun=SVM.classifier) if(search.engine == 'hillClimbing') subset <- hill.climbing.search(attributes=features,eval.fun=SVM.classifier) if(search.engine == 'exhaustive') subset <- exhaustive.search(attributes=features,eval.fun=SVM.classifier) index <- perf[,'subset'] %in% paste(subset,collapse=',') selection<- matrix(subset, nc=1) colnames(selection)<-c('Feature') rownames(selection)<-NULL return(selection) } main(execute)