✒️ BORUTA_Feature-selection

My Guthub repo include pipeline for running random forest-based algorithm on SRA fastq files using Boruta tool. The pipeline is written in an R programming language.

⚙️ Technologies & Tools

#!/usr/bin/Rscript

#Load packages
library(tidyverse)
library(Boruta)


set.seed(123)
setwd("/home/data")

# read in reads file
data <- read.delim("input.txt", sep='\t', header=TRUE, stringsAsFactors=FALSE)

# Filtering lowely express genes
group <- paste(c(rep("AD",200), rep("Ctrl",400)))
y <- DGEList(counts= data, group=group)
keep <- filterByExpr(y)
y <- y[keep,,keep.lib.sizes=FALSE] 
y <- calcNormFactors(y,method = "TMM") # TMM normalization method
reads_norm <- cpm(y, normalized.lib.sizes = T)

# log transform the reads to make the counts closer to a normal distribution
reads_norm <- log2(reads_norm+1)
datExpr=t(reads_norm)
dim(datExpr)


# Now we read in the physiological trait data
datTraits <- read.delim("meta_df.txt", sep='\t', header=TRUE, stringsAsFactors=FALSE)
datTraits <- dplyr::select(datTraits, c('Sample', 'Project_Numeric', 'Strategy_Numeric', 'Condition', 'Age', 'Sex'))
head(datTraits)
# Form a data frame analogous to expression data that will hold the clinical traits.
rownames(datTraits) = datTraits$Sample
# show that row names agree
table(rownames(datTraits)==rownames(datExpr)) #should return TRUE if datasets align correctly.

# select covariate columns for machine learning
var <- c('Sample', 'Condition', 'Age', 'Sex')
ml_covar <- datTraits[,names(datTraits) %in% var]

# combine genes and covariates into one object 
covar_genes <- cbind(datExpr, ml_covar)

# save full cohort with covariate columns for machine learning regression
saveRDS(covar_genes, "ml_regression_input_full_cohort.rds")

Input data format

	AD01	AD02	AD03	Ctrl01	Ctrl02	Ctrl03
ENST00000456328.2	7.85718	3.52895	2.15625	0	0	0.466484
ENST00000450305.2	1.2187	1.39157	3.47069	0.0013632	0.0355961	0.0544532

# balanced sub cohort for machine learning classification
set.seed(123)
mldata <- covar_genes

mldata <- mldata[complete.cases(mldata),] #Return a logical vector indicating which cases are complete, i.e., have no missing values.
mldata <- droplevels(mldata) #The function droplevels is used to drop unused levels from a factor or, more commonly, from factors in a data frame.
contrast_labels <- mldata$Condition

rm_cols <- var <- c('Sample', 'Condition', 'Age', 'Sex')
mldata <- mldata[,!(names(mldata) %in% rm_cols)]

saveRDS(mldata,"data_case_vs_ctrl_ml_class.rds")
saveRDS(contrast_labels,"labels_case_vs_ctrl_ml_class.rds")

# input data set up - the labels and data files are produced by the "prepare_for_m_learn" script
labels <- readRDS("labels_case_vs_ctrl_ml_class.rds")
data <- readRDS("data_case_vs_ctrl_ml_class.rds")

# Case VS Control
data_labels <- cbind(data, labels)
boruta_model=Boruta(labels ~ ., data=data_labels, doTrace = 2 , maxRuns = 20000)
print(boruta_model)
saveRDS(boruta_model,file="boruta_model.rds")
final_boruta_model=TentativeRoughFix(boruta_model)
Case_VS_Ctrl_model_sel_feat=getSelectedAttributes(final_boruta_model)
saveRDS(Case_VS_Ctrl_model_sel_feat,file="Case_VS_Ctrl_model_sel_feat.rds")
write.csv(Case_VS_Ctrl_model_sel_feat,file="Case_VS_Ctrl_model_selelected_feat.csv")

k <-lapply(1:ncol(boruta_model$ImpHistory),function(i)
  boruta_model$ImpHistory[is.finite(boruta_model$ImpHistory[,i]),i])
names(k) <- colnames(boruta_model$ImpHistory)
boruta_model_feature_rank <- sort(sapply(k,median))
write.csv(boruta_model_feature_rank,file="boruta_model_feature_rank.csv")

Onput data format

ensembl_transcript_id	Z.score
ENST00000624169.1	1.032175543
ENST00000534336.1	1.042689192

# Stack-plot for max-Zscores
setwd(path.expand("/home/STACK-plot") )
ENR <- read.delim("boruta_model_feature_rank.txt", header=TRUE, fill = TRUE)
ggplot(ENR, aes(x=hgnc_symbol, y=Z_Score, fill=Contrast)) + geom_bar(stat='identity') +
    coord_flip() +
    scale_fill_brewer(palette="Paired") +
    scale_x_discrete(limits = rev(levels(ENR))) +
theme_bw() + theme(panel.border = element_blank(), panel.grid.major = element_blank(),
panel.grid.minor = element_blank())
ggsave("AD-Ctrl.eps")

Name		Name	Last commit message	Last commit date
Latest commit History 28 Commits
README.md		README.md

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Repository files navigation

✒️ BORUTA_Feature-selection

⚙️ Technologies & Tools

Input data format

Onput data format

Z.score stack-plot

About

Releases

Packages

KosarHooshmand/BORUTA_Feature-selection

Folders and files

Latest commit

History

Repository files navigation

✒️ BORUTA_Feature-selection

⚙️ Technologies & Tools

Input data format

Onput data format

Z.score stack-plot

About

Topics

Resources

Stars

Watchers

Forks

Releases

Packages 0

Packages