from utils import *
import time
import arxiv
import os, re
import io, sys
import numpy as np
import concurrent.futures
from pypdf import PdfReader
from datasets import load_dataset
from psutil._common import bytes2human
from datasets import load_dataset_builder
from semanticscholar import SemanticScholar
from sklearn.metrics.pairwise import linear_kernel
from sklearn.feature_extraction.text import TfidfVectorizer
import traceback

class HFDataSearch:
    def __init__(self, like_thr=3, dwn_thr=50) -> None:
        """
        Class for finding relevant Hugging Face datasets.
        :param like_thr: minimum number of likes a dataset must have to be kept
        :param dwn_thr: minimum number of downloads a dataset must have to be kept
        """
        self.dwn_thr = dwn_thr
        self.like_thr = like_thr
        self.ds = load_dataset("nkasmanoff/huggingface-datasets")["train"]
        # Initialize lists to collect filtered data
        filtered_indices = []
        filtered_descriptions = []
        filtered_likes = []
        filtered_downloads = []
        # Iterate over the dataset and filter based on criteria
        for idx, item in enumerate(self.ds):
            # Get likes and downloads, handling None values
            likes = int(item['likes']) if item['likes'] is not None else 0
            downloads = int(item['downloads']) if item['downloads'] is not None else 0
            # Check if likes and downloads meet the thresholds
            if likes >= self.like_thr and downloads >= self.dwn_thr:
                # Check if the description is a non-empty string
                description = item['description']
                if isinstance(description, str) and description.strip():
                    # Collect the data
                    filtered_indices.append(idx)
                    filtered_descriptions.append(description)
                    filtered_likes.append(likes)
                    filtered_downloads.append(downloads)
        # Check if any datasets meet all criteria
        if not filtered_indices:
            print("No datasets meet the specified criteria.")
            self.ds = []
            self.descriptions = []
            self.likes_norm = []
            self.downloads_norm = []
            self.description_vectors = None
            return  # Exit the constructor
        # Filter the datasets using the collected indices
        self.ds = self.ds.select(filtered_indices)
        # Update descriptions, likes, and downloads
        self.descriptions = filtered_descriptions
        self.likes = np.array(filtered_likes)
        self.downloads = np.array(filtered_downloads)
        # Normalize likes and downloads
        self.likes_norm = self._normalize(self.likes)
        self.downloads_norm = self._normalize(self.downloads)
        # Vectorize the descriptions
        self.vectorizer = TfidfVectorizer()
        self.description_vectors = self.vectorizer.fit_transform(self.descriptions)

    def _normalize(self, arr):
        min_val = arr.min()
        max_val = arr.max()
        if max_val - min_val == 0:
            return np.zeros_like(arr, dtype=float)
        return (arr - min_val) / (max_val - min_val)

    def retrieve_ds(self, query, N=10, sim_w=1.0, like_w=0.0, dwn_w=0.0):
        """
        Retrieves the top N datasets matching the query, weighted by likes and downloads.
        :param query: The search query string.
        :param N: The number of results to return.
        :param sim_w: Weight for cosine similarity.
        :param like_w: Weight for likes.
        :param dwn_w: Weight for downloads.
        :return: List of top N dataset items.
        """
        if not self.ds or self.description_vectors is None:
            print("No datasets available to search.")
            return []
        query_vector = self.vectorizer.transform([query])
        cosine_similarities = linear_kernel(query_vector, self.description_vectors).flatten()
        # Normalize cosine similarities
        cosine_similarities_norm = self._normalize(cosine_similarities)
        # Compute final scores
        final_scores = (
            sim_w * cosine_similarities_norm +
            like_w * self.likes_norm +
            dwn_w * self.downloads_norm
        )
        # Get top N indices
        top_indices = final_scores.argsort()[-N:][::-1]
        # Convert indices to Python ints
        top_indices = [int(i) for i in top_indices]
        top_datasets = [self.ds[i] for i in top_indices]
        # Check whether each dataset has test and train splits, and record split sizes
        has_test_set = list()
        has_train_set = list()
        ds_size_info = list()
        for i in top_indices:
            try:
                dbuilder = load_dataset_builder(self.ds[i]["id"], trust_remote_code=True).info
            except Exception as e:
                has_test_set.append(False)
                has_train_set.append(False)
                ds_size_info.append((None, None, None, None))
                continue
            if dbuilder.splits is None:
                has_test_set.append(False)
                has_train_set.append(False)
                ds_size_info.append((None, None, None, None))
                continue
            # Record which splits exist and their download/example sizes
            has_test, has_train = "test" in dbuilder.splits, "train" in dbuilder.splits
            has_test_set.append(has_test)
            has_train_set.append(has_train)
            test_dwn_size, test_elem_size = None, None
            train_dwn_size, train_elem_size = None, None
            if has_test:
                test_dwn_size = bytes2human(dbuilder.splits["test"].num_bytes)
                test_elem_size = dbuilder.splits["test"].num_examples
            if has_train:
                train_dwn_size = bytes2human(dbuilder.splits["train"].num_bytes)
                train_elem_size = dbuilder.splits["train"].num_examples
            ds_size_info.append((test_dwn_size, test_elem_size, train_dwn_size, train_elem_size))
        for _i in range(len(top_datasets)):
            top_datasets[_i]["has_test_set"] = has_test_set[_i]
            top_datasets[_i]["has_train_set"] = has_train_set[_i]
            top_datasets[_i]["test_download_size"] = ds_size_info[_i][0]
            top_datasets[_i]["test_element_size"] = ds_size_info[_i][1]
            top_datasets[_i]["train_download_size"] = ds_size_info[_i][2]
            top_datasets[_i]["train_element_size"] = ds_size_info[_i][3]
        return top_datasets

    def results_str(self, results):
        """
        Provide results as a list of human-readable strings.
        :param results: (list(dict)) list of results from search
        :return: (list(str)) list of results in human-readable format
        """
        result_strs = list()
        for result in results:
            res_str = f"Dataset ID: {result['id']}\n"
            res_str += f"Description: {result['description']}\n"
            res_str += f"Likes: {result['likes']}\n"
            res_str += f"Downloads: {result['downloads']}\n"
            res_str += f"Has Testing Set: {result['has_test_set']}\n"
            res_str += f"Has Training Set: {result['has_train_set']}\n"
            res_str += f"Test Download Size: {result['test_download_size']}\n"
            res_str += f"Test Dataset Size: {result['test_element_size']}\n"
            res_str += f"Train Download Size: {result['train_download_size']}\n"
            res_str += f"Train Dataset Size: {result['train_element_size']}\n"
            result_strs.append(res_str)
        return result_strs
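
# A minimal usage sketch for HFDataSearch (illustrative only; the query string and
# weights below are hypothetical examples, not values prescribed by this module):
#
#   hf_search = HFDataSearch(like_thr=3, dwn_thr=50)
#   results = hf_search.retrieve_ds("medical question answering", N=5,
#                                   sim_w=0.8, like_w=0.1, dwn_w=0.1)
#   for res in hf_search.results_str(results):
#       print(res)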

class SemanticScholarSearch:
    def __init__(self):
        self.sch_engine = SemanticScholar(retry=False)

    def find_papers_by_str(self, query, N=10):
        paper_sums = list()
        results = self.sch_engine.search_paper(query, limit=N, min_citation_count=3, open_access_pdf=True)
        for _i in range(len(results)):
            paper_sum = f'Title: {results[_i].title}\n'
            paper_sum += f'Abstract: {results[_i].abstract}\n'
            paper_sum += f'Citations: {results[_i].citationCount}\n'
            paper_sum += f'Release Date: year {results[_i].publicationDate.year}, month {results[_i].publicationDate.month}, day {results[_i].publicationDate.day}\n'
            paper_sum += f'Venue: {results[_i].venue}\n'
            paper_sum += f'Paper ID: {results[_i].externalIds["DOI"]}\n'
            paper_sums.append(paper_sum)
        return paper_sums

    def retrieve_full_paper_text(self, query):
        pass
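
# A minimal usage sketch for SemanticScholarSearch (illustrative only; the query
# string is a hypothetical example):
#
#   ss_search = SemanticScholarSearch()
#   for summary in ss_search.find_papers_by_str("retrieval augmented generation", N=5):
#       print(summary)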

class ArxivSearch:
    def __init__(self):
        # Construct the default API client.
        self.sch_engine = arxiv.Client()

    def _process_query(self, query: str) -> str:
        """Process query string to fit within MAX_QUERY_LENGTH while preserving as much information as possible"""
        MAX_QUERY_LENGTH = 300
        if len(query) <= MAX_QUERY_LENGTH:
            return query
        # Split into words
        words = query.split()
        processed_query = []
        current_length = 0
        # Add words while staying under the limit
        # Account for spaces between words
        for word in words:
            # +1 for the space that will be added between words
            if current_length + len(word) + 1 <= MAX_QUERY_LENGTH:
                processed_query.append(word)
                current_length += len(word) + 1
            else:
                break
        return ' '.join(processed_query)

    def find_papers_by_str(self, query, N=20):
        processed_query = self._process_query(query)
        max_retries = 3
        retry_count = 0
        while retry_count < max_retries:
            try:
                search = arxiv.Search(
                    query="abs:" + processed_query,
                    max_results=N,
                    sort_by=arxiv.SortCriterion.Relevance)

                paper_sums = list()
                # `results` is a generator; you can iterate over its elements one by one...
                for r in self.sch_engine.results(search):
                    paperid = r.pdf_url.split("/")[-1]
                    pubdate = str(r.published).split(" ")[0]
                    paper_sum = f"Title: {r.title}\n"
                    paper_sum += f"Summary: {r.summary}\n"
                    paper_sum += f"Publication Date: {pubdate}\n"
                    paper_sum += f"Categories: {' '.join(r.categories)}\n"
                    paper_sum += f"arXiv paper ID: {paperid}\n"
                    paper_sums.append(paper_sum)
                time.sleep(2.0)
                return "\n".join(paper_sums)
            except Exception as e:
                retry_count += 1
                if retry_count < max_retries:
                    # Back off with an increasing delay before retrying
                    time.sleep(2 * retry_count)
                    continue

        return None

    def retrieve_full_paper_text(self, query):
        pdf_text = str()
        paper = next(arxiv.Client().results(arxiv.Search(id_list=[query])))
        # Download the PDF to the PWD with a custom filename.
        paper.download_pdf(filename="downloaded-paper.pdf")
        # Create a PDF reader object
        reader = PdfReader('downloaded-paper.pdf')
        # Iterate over all the pages
        for page_number, page in enumerate(reader.pages, start=1):
            # Extract text from the page
            try:
                text = page.extract_text()
            except Exception as e:
                os.remove("downloaded-paper.pdf")
                time.sleep(2.0)
                return "EXTRACTION FAILED"
            # Append the page text, prefixed with a page marker
            pdf_text += f"--- Page {page_number} ---"
            pdf_text += text
            pdf_text += "\n"
        os.remove("downloaded-paper.pdf")
        time.sleep(2.0)
        return pdf_text
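
# A minimal usage sketch for ArxivSearch (illustrative only; the query string and
# arXiv ID below are example placeholders):
#
#   arxiv_search = ArxivSearch()
#   summaries = arxiv_search.find_papers_by_str("transformer interpretability", N=5)
#   print(summaries)
#   full_text = arxiv_search.retrieve_full_paper_text("1706.03762")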
"""
import multiprocessing
import sys
import io
import traceback
def execute_code(code_str, timeout=180):
if "load_dataset('pubmed" in code_str:
return "pubmed Download took way too long. Program terminated"
def run_code(queue):
# Redirect stdout to capture print outputs
output_capture = io.StringIO()
sys.stdout = output_capture
try:
exec_globals = {}
exec(code_str, exec_globals)
except Exception as e:
output_capture.write(f"[CODE EXECUTION ERROR]: {str(e)}\n")
traceback.print_exc(file=output_capture)
finally:
# Put the output in the queue
queue.put(output_capture.getvalue())
# Restore stdout
sys.stdout = sys.__stdout__
# Create a multiprocessing Queue to capture the output
queue = multiprocessing.Queue()
# Create a new Process
process = multiprocessing.Process(target=run_code, args=(queue,))
process.start()
# Wait for the process to finish or timeout
process.join(timeout)
if process.is_alive():
process.terminate()
process.join()
return f"[CODE EXECUTION ERROR]: Code execution exceeded the timeout limit of {timeout} seconds. You must reduce the time complexity of your code."
else:
# Retrieve the output from the queue
output = queue.get()
return output
"""

def execute_code(code_str, timeout=60, MAX_LEN=1000):
    # Prevent plotting errors by forcing the non-interactive Agg backend
    import matplotlib
    matplotlib.use('Agg')
    import matplotlib.pyplot as plt

    # Prevent execution of certain resource-intensive datasets
    if "load_dataset('pubmed" in code_str:
        return "[CODE EXECUTION ERROR] pubmed Download took way too long. Program terminated"
    if "exit(" in code_str:
        return "[CODE EXECUTION ERROR] The exit() command is not allowed; you must remove it."

    # Capture the output
    output_capture = io.StringIO()
    sys.stdout = output_capture

    # Create a global context for exec
    exec_globals = globals()

    def run_code():
        try:
            # Execute the code in the global namespace
            exec(code_str, exec_globals)
        except Exception as e:
            output_capture.write(f"[CODE EXECUTION ERROR]: {str(e)}\n")
            traceback.print_exc(file=output_capture)

    try:
        # Run the code in a separate thread with a timeout
        with concurrent.futures.ThreadPoolExecutor() as executor:
            future = executor.submit(run_code)
            future.result(timeout=timeout)
    except concurrent.futures.TimeoutError:
        return f"[CODE EXECUTION ERROR]: Code execution exceeded the timeout limit of {timeout} seconds. You must reduce the time complexity of your code."
    except Exception as e:
        return f"[CODE EXECUTION ERROR]: {str(e)}"
    finally:
        # Restore standard output
        sys.stdout = sys.__stdout__

    # Return the captured output, truncated to MAX_LEN characters
    return output_capture.getvalue()[:MAX_LEN]
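
# A minimal usage sketch for execute_code (illustrative only; the snippet below is a
# hypothetical piece of generated code being run with a timeout and truncated output):
#
#   snippet = "import numpy as np\nprint(np.arange(5).sum())"
#   print(execute_code(snippet, timeout=10, MAX_LEN=500))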