-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathutils.R
144 lines (124 loc) · 4.63 KB
/
utils.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
##### Funciones relacionadas con rutas metabólicas #####
# Función para leer las rutas metabólicas contenidas en un archivo .gmt y
# devolverlas en forma de lista, obtenida de fgsea versión 1.21.0
# https://github.com/ctlab/fgsea/blob/master/R/pathways.R
gmtPathways <- function(gmt.file) {
pathwayLines <- strsplit(readLines(gmt.file), "\t")
pathways <- lapply(pathwayLines, tail, -2)
names(pathways) <- sapply(pathwayLines, head, 1)
pathways
}
# Función casera para calcular el nº de rutas metabólicas en las que participa
# un gen
num_of_pathways <- function (gmtfile, overlapgenes){
# Computes the sample covariance between two vectors.
#
# Argumentos:
# gmtfile: Ruta (relativa o absoluta) al archivo .gmt con las rutas metabólicas
# overlapgenes: vector de tipo "character" con los nombres de los genes a buscar en el archivo .gmt.
# Deben ser genes presentes tanto en el objeto sce de interés como en el archivo gmtfile
#
# Output:
# Un dataframe de dimensiones overlapgenes X 1 que contiene el nº de rutas en las que participa
# cada gen
pathways <- gmtPathways(gmtfile)
pathway_names <- names(pathways)
filter_pathways <- list()
for (p in pathway_names){
genes <- pathways[[p]]
common_genes <- intersect(genes,overlapgenes)
if(length(common_genes>=5)){
filter_pathways[[p]] <- common_genes
}
}
all_genes <- unique(as.vector(unlist(filter_pathways)))
gene_times <- data.frame(num =rep(0,length(all_genes)),row.names = all_genes)
for(p in pathway_names){
for(g in filter_pathways[[p]]){
gene_times[g,"num"] = gene_times[g,"num"]+1
}
}
gene_times
}
# runGSEA_preRank.R
runGSEA_preRank<-function(preRank.matrix,gmt.file,outname){
# descending numerical order
# dump preRank into a tab-delimited txt file
write.table(x = preRank.matrix,
file = "prerank.rnk",
quote = F,
sep = "\t",
col.names = F,
row.names = T)
# call java gsea version
command <- paste('java -Xmx512m -cp ../gsea-3.0.jar xtools.gsea.GseaPreranked -gmx ', gmt.file, ' -norm meandiv -nperm 1000 -rnk prerank.rnk ',
' -scoring_scheme weighted -make_sets true -rnd_seed 123456 -set_max 500 -set_min 15 -zip_report false ',
' -out preRankResults -create_svgs true -gui false -rpt_label ',outname, sep='')
if(get_os() == "win"){
system(command, show.output.on.console = F)
} else {
system(command)
}
unlink(c('prerank.txt'))
}
##### Funciones de normalizado #####
# Función de normalizado de DESeq2 versión 1.35.0, obtenida de:
# https://github.com/mikelove/DESeq2/blob/master/R/core.R
estimateSizeFactorsForMatrix <- function(counts, locfunc = stats::median,
geoMeans, controlGenes,
type = c("ratio", "poscounts")) {
type <- match.arg(type, c("ratio","poscounts"))
if (missing(geoMeans)) {
incomingGeoMeans <- FALSE
if (type == "ratio") {
loggeomeans <- rowMeans(log(counts))
} else if (type == "poscounts") {
lc <- log(counts)
lc[!is.finite(lc)] <- 0
loggeomeans <- rowMeans(lc)
allZero <- rowSums(counts) == 0
loggeomeans[allZero] <- -Inf
}
} else {
incomingGeoMeans <- TRUE
if (length(geoMeans) != nrow(counts)) {
stop("geoMeans should be as long as the number of rows of counts")
}
loggeomeans <- log(geoMeans)
}
if (all(is.infinite(loggeomeans))) {
stop("every gene contains at least one zero, cannot compute log geometric means")
}
sf <- if (missing(controlGenes)) {
apply(counts, 2, function(cnts) {
exp(locfunc((log(cnts) - loggeomeans)[is.finite(loggeomeans) & cnts > 0]))
})
} else {
if ( !( is.numeric(controlGenes) | is.logical(controlGenes) ) ) {
stop("controlGenes should be either a numeric or logical vector")
}
loggeomeansSub <- loggeomeans[controlGenes]
apply(counts[controlGenes,,drop=FALSE], 2, function(cnts) {
exp(locfunc((log(cnts) - loggeomeansSub)[is.finite(loggeomeansSub) & cnts > 0]))
})
}
if (incomingGeoMeans) {
# stabilize size factors to have geometric mean of 1
sf <- sf/exp(mean(log(sf)))
}
sf
}
##### Funciones misceláneas #####
# Función de rappdirs versión 0.3.3 para detectar el sistema operativo empleado,
# obtenida de: https://github.com/r-lib/rappdirs/blob/master/R/utils.r#L1
get_os <- function() {
if (.Platform$OS.type == "windows") {
"win"
} else if (Sys.info()["sysname"] == "Darwin") {
"mac"
} else if (.Platform$OS.type == "unix") {
"unix"
} else {
stop("Unknown OS")
}
}