DR_paper_code/LDA_code.R at main · PDhingra2016/DR_paper_code · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
library(tidyverse)
library(RColorBrewer)

na_to_median <- function(x){
    x[is.na(x)] <- median(x, na.rm = TRUE)
    return(x)
}

setwd( "C:/Users/pdhingra/OneDrive - Palatin Technologies, Inc/Documents/PROJECTS/DiabeticRetinopathy/proteomics/PAPER_PROTEOMICS/")
# 1. Data loading & preprocessing ---------------------------------------------------------
sample_info <- read_tsv('../../samples_order.txt')
protein_data <- read_tsv("../../protein_data_loess_norm.tsv")
protein_data<-protein_data[,-c(24,25,26,33,34,35,36)]

proteins <- protein_data %>%
    select(-c(ProteinID,ProteinName), -starts_with('Pool')) %>%
    as.matrix() %>%
    t()
#colnames(proteins)<-proteins[1,]
#proteins<-proteins[-c(1),]
#dim(proteins)

proteins <- apply(proteins, 2, na_to_median) #  replace missing values
colnames(proteins)<-protein_data$ProteinName
proteins <- proteins[, apply(proteins, 2, var) > 0 & apply(proteins, 2, function(y) !all(is.na(y)))] #  remove empty proteins

# 2. PCA decomposition ----------------------------------------------------

res_pca <- prcomp(proteins, center = TRUE, scale = TRUE)
res_pca_var <- res_pca$sdev ^ 2 / sum(res_pca$sdev ^ 2) #  variance explained

pca_data <- res_pca$x %>%
    as.data.frame() %>%
    mutate(sample_name = rownames(.)) %>%
    merge(sample_info, by = "sample_name") %>%
    mutate(col = as.factor(Treatment),
           shape = as.factor(Treatment))

# Filter out components by 95% variance explained
res_pca_var_cum <- cumsum(res_pca_var)
bp <- barplot(res_pca_var_cum, ylim = c(0, 1.2))
abline(h = 0.95)
text(bp, res_pca_var_cum * 1.05, labels = seq_along(res_pca_var_cum), cex = .7)
which(cumsum(res_pca_var) > 0.95)[1]

#screeplot
p<-PCAtools::pca(t(proteins))
screeplot(p,vline = 'PC31',hline=95)


# LDA analysis
lda_model <- MASS::lda(pca_data[, paste("PC", 1:31, sep = "")], grouping = factor(pca_data$Treatment))
res_lda <- predict(lda_model, pca_data[, paste("PC", 1:31, sep = "")], type = 'raw')

ggplot(data = as.data.frame(res_lda$x[, 1:2]),
       aes(x=LD1, y=LD2, colour= as.factor(pca_data$Treatment))) +
    geom_point(size = 4) +
  #  theme_minimal() +
    scale_color_manual(values = c("#02BA26","#AD07E3","light pink","magenta","#000000")) +
    theme(axis.ticks.x = element_blank(),
          panel.background = element_blank(),
          axis.ticks.y = element_blank(),
          axis.title = element_blank(),
          axis.line = element_line(color="black"),
          axis.text=element_text(size=16))+xlab("LD1")+ylab("LD2")

   #obtain LDA coefficients for selected PC
lda_coeffs <- lda_model$scaling  # LDA coefficients for each selected PC
#Transform LDA coefficients to gene space:
gene_importance <- abs(res_pca$rotation[, 1:37] %*% lda_coeffs)

#This matrix links genes to LDA components that explain class separation.
#Higher values indicate greater gene importance for classification.
#Sort genes by their importance and select the top ones:
top_genes <- names(sort(rowSums(gene_importance), decreasing = TRUE)[1:100])  # Top 50 genes
prot<-t(proteins)
sub_protein<-subset(prot,rownames(prot)%in%top_genes)
dim(sub_protein)
sub_protein<-subset(sub_protein,rownames(sub_protein)%in%prolist)
dim(sub_protein)
sample_info <- read.table('sample_description.txt',header=TRUE)
sub_protein<-sub_protein[,match(sample_info[order(sample_info$Treatment,sample_info$Response),]$sample_name,colnames(sub_protein))]
metaheatmap<-sample_info[match(colnames(sub_protein),sample_info$sample_name),]

#make heatmap
ann<-data.frame(metaheatmap$Treatment,metaheatmap$Response)
colnames(ann)<-c("Treatment","Response")

colours<-list('Treatment'=c('Healthy, naive'="blue","STZ, 0.05mg/kg PL9654"="purple",
                            "STZ, 0.1mg/kg PL9654"="#5D3A9B","STZ, 0.5mg/kg PL9654"="#CC79A7",
                            "STZ, 1.0mg/kg PL8177"="#E7B800","STZ, vehicle"="red"),
              'Response'=c('STZ'='red','Healthy'='blue','NR'="grey",'R'="darkgreen"))
colAnn<-ComplexHeatmap::HeatmapAnnotation(df=ann,which='col',col=colours)
ComplexHeatmap::Heatmap(as.matrix(sub_protein),top_annotation = colAnn,cluster_columns = FALSE,
                        row_km=2,row_km_repeats = 100,column_names_gp = grid::gpar(fontsize = 8),row_names_gp = grid::gpar(fontsize = 8))