-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathLDA_code.R
More file actions
95 lines (79 loc) · 4.24 KB
/
Copy pathLDA_code.R
File metadata and controls
95 lines (79 loc) · 4.24 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
library(tidyverse)
library(RColorBrewer)
na_to_median <- function(x){
x[is.na(x)] <- median(x, na.rm = TRUE)
return(x)
}
setwd( "C:/Users/pdhingra/OneDrive - Palatin Technologies, Inc/Documents/PROJECTS/DiabeticRetinopathy/proteomics/PAPER_PROTEOMICS/")
# 1. Data loading & preprocessing ---------------------------------------------------------
sample_info <- read_tsv('../../samples_order.txt')
protein_data <- read_tsv("../../protein_data_loess_norm.tsv")
protein_data<-protein_data[,-c(24,25,26,33,34,35,36)]
proteins <- protein_data %>%
select(-c(ProteinID,ProteinName), -starts_with('Pool')) %>%
as.matrix() %>%
t()
#colnames(proteins)<-proteins[1,]
#proteins<-proteins[-c(1),]
#dim(proteins)
proteins <- apply(proteins, 2, na_to_median) # replace missing values
colnames(proteins)<-protein_data$ProteinName
proteins <- proteins[, apply(proteins, 2, var) > 0 & apply(proteins, 2, function(y) !all(is.na(y)))] # remove empty proteins
# 2. PCA decomposition ----------------------------------------------------
res_pca <- prcomp(proteins, center = TRUE, scale = TRUE)
res_pca_var <- res_pca$sdev ^ 2 / sum(res_pca$sdev ^ 2) # variance explained
pca_data <- res_pca$x %>%
as.data.frame() %>%
mutate(sample_name = rownames(.)) %>%
merge(sample_info, by = "sample_name") %>%
mutate(col = as.factor(Treatment),
shape = as.factor(Treatment))
# Filter out components by 95% variance explained
res_pca_var_cum <- cumsum(res_pca_var)
bp <- barplot(res_pca_var_cum, ylim = c(0, 1.2))
abline(h = 0.95)
text(bp, res_pca_var_cum * 1.05, labels = seq_along(res_pca_var_cum), cex = .7)
which(cumsum(res_pca_var) > 0.95)[1]
#screeplot
p<-PCAtools::pca(t(proteins))
screeplot(p,vline = 'PC31',hline=95)
# LDA analysis
lda_model <- MASS::lda(pca_data[, paste("PC", 1:31, sep = "")], grouping = factor(pca_data$Treatment))
res_lda <- predict(lda_model, pca_data[, paste("PC", 1:31, sep = "")], type = 'raw')
ggplot(data = as.data.frame(res_lda$x[, 1:2]),
aes(x=LD1, y=LD2, colour= as.factor(pca_data$Treatment))) +
geom_point(size = 4) +
# theme_minimal() +
scale_color_manual(values = c("#02BA26","#AD07E3","light pink","magenta","#000000")) +
theme(axis.ticks.x = element_blank(),
panel.background = element_blank(),
axis.ticks.y = element_blank(),
axis.title = element_blank(),
axis.line = element_line(color="black"),
axis.text=element_text(size=16))+xlab("LD1")+ylab("LD2")
#obtain LDA coefficients for selected PC
lda_coeffs <- lda_model$scaling # LDA coefficients for each selected PC
#Transform LDA coefficients to gene space:
gene_importance <- abs(res_pca$rotation[, 1:37] %*% lda_coeffs)
#This matrix links genes to LDA components that explain class separation.
#Higher values indicate greater gene importance for classification.
#Sort genes by their importance and select the top ones:
top_genes <- names(sort(rowSums(gene_importance), decreasing = TRUE)[1:100]) # Top 50 genes
prot<-t(proteins)
sub_protein<-subset(prot,rownames(prot)%in%top_genes)
dim(sub_protein)
sub_protein<-subset(sub_protein,rownames(sub_protein)%in%prolist)
dim(sub_protein)
sample_info <- read.table('sample_description.txt',header=TRUE)
sub_protein<-sub_protein[,match(sample_info[order(sample_info$Treatment,sample_info$Response),]$sample_name,colnames(sub_protein))]
metaheatmap<-sample_info[match(colnames(sub_protein),sample_info$sample_name),]
#make heatmap
ann<-data.frame(metaheatmap$Treatment,metaheatmap$Response)
colnames(ann)<-c("Treatment","Response")
colours<-list('Treatment'=c('Healthy, naive'="blue","STZ, 0.05mg/kg PL9654"="purple",
"STZ, 0.1mg/kg PL9654"="#5D3A9B","STZ, 0.5mg/kg PL9654"="#CC79A7",
"STZ, 1.0mg/kg PL8177"="#E7B800","STZ, vehicle"="red"),
'Response'=c('STZ'='red','Healthy'='blue','NR'="grey",'R'="darkgreen"))
colAnn<-ComplexHeatmap::HeatmapAnnotation(df=ann,which='col',col=colours)
ComplexHeatmap::Heatmap(as.matrix(sub_protein),top_annotation = colAnn,cluster_columns = FALSE,
row_km=2,row_km_repeats = 100,column_names_gp = grid::gpar(fontsize = 8),row_names_gp = grid::gpar(fontsize = 8))