-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscaffolds.py
executable file
·202 lines (178 loc) · 8.17 KB
/
scaffolds.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
#!/usr/bin/env python3
import argparse
import os
import subprocess
import sys
# compering methods
modes = [
'RINGS_WITH_LINKERS_1', 'RINGS_WITH_LINKERS_2', 'MURCKO_1', 'MURCKO_2', 'OPREA_1', 'OPREA_2', 'OPREA_3',
'SCHUFFENHAUER_1', 'SCHUFFENHAUER_2', 'SCHUFFENHAUER_3', 'SCHUFFENHAUER_4', 'SCHUFFENHAUER_5'
]
# shortcuts for compering methods
short_modes = [
'rwl1', 'rwl2', 'mur1', 'mur2', 'opr1', 'opr2', 'opr3', 'sch1', 'sch2', 'sch3', 'sch4', 'sch5'
]
def main():
parser = argparse.ArgumentParser(description='Searching for same scaffolds in Homo sapiens enzymes inhibitors')
parser.add_argument('target', type=str, help='enzyme target')
parser.add_argument('mode', type=str, help='inhibitors compering method', choices=modes)
parser.add_argument('-o', '--output', type=str, help='output file')
args = parser.parse_args()
# using chembl web client, download inhibitors
download_inhibitors(args.target)
# strip them using strip-it and gets scaffolds
strip(args.target)
# merge inhibitors with same scaffolds to dictionary
scaffolds = merge(args.target, args.mode)
# show results on stdout and write to file if demanded
show_results(scaffolds, args.output if 'output' in args else None)
def download_inhibitors(target: str):
# download inhibitors from chembl
from chembl_webresource_client.new_client import new_client
if not os.path.isfile(target):
print('[*] Downloading inhibitors from ChEMBL...')
chembl_target = new_client.target
# get receptor protein only from homo sapiens and get only it's chembl_id
targets = chembl_target.filter(target_synonym__icontains=target, organism='Homo sapiens').only(['target_chembl_id'])
# multiple targets are not supported
if len(targets) > 1:
print('[-] Found multiple targets')
sys.exit(1)
# if no targets found it's error too
elif len(targets) == 0:
print('[-] Not found any targets')
sys.exit(1)
# get one and only receptor chembl id
enzyme_target_id = targets[0]['target_chembl_id']
activities = new_client.activity
# get maximum 100 inhibitors
activities.query.limit = 100
# get only smiles and chembl ids
inhibitors = activities.filter(target_chembl_id=enzyme_target_id, standard_type='IC50').only(
['canonical_smiles', 'molecule_chembl_id'])
print('[+] Done!')
print('[*] Writing inhibitors to file...')
with open(target, 'w') as f:
for i in inhibitors:
f.write(i['canonical_smiles'] + ' ' + i['molecule_chembl_id'] + '\n')
print('[+] Done!')
def strip(target: str):
# strip-it on downloaded inhibitors
good_ligands = f'{target}_good_ligands'
wrong_ligands = f'{target}_wrong_ligands'
scaffolds_filename = f'{target}_scaffolds'
if not os.path.isfile(scaffolds_filename):
print(f'[*] Removing duplicates from file: {target}')
with open(target, 'r') as scaffs_file:
data = scaffs_file.readlines()
# remove duplicates
data = set([line.strip() for line in data])
data_dict = {line.split()[0]: line.split()[1] for line in data if len(line.split()) >= 2}
# sort dict by id
data_dict = {k: v for k, v in sorted(data_dict.items(), key=lambda item: item[1])}
# invert to id: smile and change id -> [<index>]_id to preserve original indexes
data_dict = {f'INDEX_{index}_{chem_id}': smile for index, (smile, chem_id) in enumerate(data_dict.items())}
# make list from dict with smile and chem_id separated with space
data_list = [f'{smile} {chem_id}' for chem_id, smile in data_dict.items()]
print(f'[*] Perfoming strip-it on inhibitors to file: {scaffolds_filename}')
while True:
with open(good_ligands, 'w') as scaffs_file:
scaffs_file.write('\n'.join(data_list))
ret = subprocess.call(
['strip-it', '--inputFormat', 'smiles', '--input', good_ligands, '--output',
scaffolds_filename])
# strip-it failure
if ret:
print('[*] strip-it error occurred, trying to fix database...')
# check if there's chance to fix database by removing ligand which causes error
with open(scaffolds_filename, 'r') as scaffs:
# read last good ligand
last_ligand_id = scaffs.readlines()[-1].split()[0]
ligand = f'{data_dict[last_ligand_id]} {last_ligand_id}'
# get next ligand which causes failure
wrong_ligand_index = data_list.index(ligand) + 1
wrong_ligand = data_list[wrong_ligand_index] if wrong_ligand_index < len(data_list) else None
if wrong_ligand:
print(f'[*] Successfully removed wrong ligand, it will be saved to {wrong_ligands}')
with open(wrong_ligands, 'a') as del_ligands:
del_ligands.write(f'{wrong_ligand}\n')
# safely delete, original indexes are preserved in chem_ids
del data_list[wrong_ligand_index]
# if no wrong ligand it means we finished fixing database
else:
break
# TODO: instead doing again whole scaffolding, start from last position
# for now, just delete file and start again
os.remove(scaffolds_filename)
else:
break
print('[+] Done!')
return scaffolds_filename
def merge(target: str, mode: str):
name_index = 0
molecule_index = 1
# allow use full name or just shortcut of compering method
if mode in modes:
method_index = modes.index(mode) + 2 # because id and molecule go first
elif mode in short_modes:
method_index = short_modes.index(mode) + 2
else:
# default mode
print(f'[*] Couldn\'t find {mode}, use {modes[0]} instead')
method_index = 0 + 2
# store scaffolds as dict which holds list of ligands info
scaffolds = {}
print('[*] Merging molecules with same scaffold...')
with open(target + '_scaffolds') as s:
# read line with header
s.readline()
i = 0
for line in s:
inhib = line.split()
name = inhib[name_index]
# get rid of _ligand suffix
if name.endswith('_ligand'):
name = name[:-len('_ligand')]
# get original index from name
if name.startswith('INDEX_'):
# remove INDEX_ prefix
name = name[len('INDEX_'):]
# split with _
name_parts = name.split('_')
# check if it's index, if yes remove it from name
if name_parts[0].isdigit():
i = int(name_parts[0])
name = '_'.join(name_parts[1:])
# molecule in smiles
molecule = inhib[molecule_index]
# scaffold in smiles
scaff = inhib[method_index]
# if new scaffold, add list to dict
if scaff not in scaffolds:
scaffolds[scaff] = []
# append to list in dict info about ligand with its global index
scaffolds[scaff].append(f'[{i}] {name}\t{molecule}')
i += 1
# sort dict by length of list
scaffolds = {k: scaffolds[k] for k in sorted(scaffolds, key=lambda k: len(scaffolds[k]))}
print('[+] Done!\n')
return scaffolds
def show_results(scaffolds, output: str = None):
result = []
for scaffold in scaffolds:
# first goes scaffold
result.append(scaffold + ':\n')
# then it's list of ligands with this scaffold
for name in scaffolds[scaffold]:
result.append('\t' + name + '\n')
result.append('\n')
if output:
print('[*] Writing results to file...')
with open(output, 'w') as o:
o.writelines(result)
print('[+] Done!\n')
print('Scaffold:\n\t<ID>\t<SMILE>\n')
print(''.join(result))
return ''.join(result)
if __name__ == '__main__':
main()