-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathget_ldsc_inputs.py
73 lines (63 loc) · 2.51 KB
/
get_ldsc_inputs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
#!/usr/bin/env python3
import os
import pickle
import sys
import numpy as np
import pandas as pd
def load_gene(data, cluster, gene, genes_dir):
plasma_path = os.path.join(genes_dir, gene, "combined", "plasma_i0.pickle")
try:
with open(plasma_path, "rb") as plasma_file:
plasma_data = pickle.load(plasma_file)
except (FileNotFoundError, pickle.UnpicklingError) as e:
# print(e) ####
return
plasma_clust = plasma_data.get(cluster)
# print(plasma_clust) ####
if cluster is None:
return
informative_snps = plasma_clust["informative_snps"]
cred = plasma_clust.get("causal_set_indep")
if cred is None:
return
# print(informative_snps) ####
for ind in informative_snps:
causal = cred[ind]
if causal == 0:
continue
contig = plasma_data["_gen"]["snp_pos"][ind][0]
# print(plasma_data["_gen"]["snp_pos"][ind]) ####
pos = int(plasma_data["_gen"]["snp_pos"][ind][1]) + 1
rsid = plasma_data["_gen"]["snp_ids"][ind]
ppa = plasma_clust["ppas_indep"][ind]
data.append([contig, pos, pos + 1, rsid, ppa, gene])
def write_bed(data, out_path):
with open(out_path, "w") as out_file:
out_file.writelines(f"{' '.join(map(str, i))}\n" for i in data)
def load_cluster(cluster, clusters_dir, genes_dir, out_dir, threshs):
cluster_path = os.path.join(clusters_dir, f"{cluster}.csv")
df = pd.read_csv(cluster_path, sep="\t")
# df = df.iloc[:100] ####
data = []
# print(df.columns) ####
cutoffs = {int(len(df) * i): i for i in threshs}
max_cutoff = max(cutoffs.keys())
for ind, gene in enumerate(df["Gene"]):
load_gene(data, cluster, gene, genes_dir)
if ind + 1 in cutoffs:
thr = cutoffs[ind + 1]
out_path = os.path.join(out_dir, f"{cluster}_{thr}.bed")
write_bed(data, out_path)
if ind >= max_cutoff:
break
def get_ldsc_inputs(clusters_dir, genes_dir, out_dir, threshs):
for i in os.listdir(clusters_dir):
cluster = i.split(".")[0]
load_cluster(cluster, clusters_dir, genes_dir, out_dir, threshs)
if __name__ == '__main__':
data_path_kellis = "/agusevlab/awang/sc_kellis"
genes_dir = os.path.join(data_path_kellis, "genes_429")
out_dir = os.path.join(data_path_kellis, "ldsc_429")
clusters_dir = "/agusevlab/awang/ase_finemap_results/sc_results/kellis_429/cell_type_spec"
threshs = [0.1, 0.2, 0.5]
get_ldsc_inputs(clusters_dir, genes_dir, out_dir, threshs)