# scan.py
# Hackery to make sure we use the bson parser without the C extensions,
# since those crash on some of the input.
# https://jira.mongodb.org/browse/PYTHON-571
#a="/usr/lib/python2.7/site-packages/pymongo-2.6_-py2.7.egg/"
import sys
a="/usr/local/lib/python2.7/dist-packages/pymongo-2.6_-py2.7.egg"
sys.path.insert(0,a)
import pymongo
assert not pymongo.has_c(), "you must compile pymongo with --no-ext"
import bson
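# A stricter sketch of the same guard could also reject the C extension
# that ships with the bundled bson package itself:
#assert not bson.has_c(), "you must compile bson without the C extension"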
import datetime
import collections
import mmap, os, struct, pdb, json, time
schema = json.load(open('schema'))
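# The schema file maps each collection name to the fields that identify
# its documents (see the matching loop in scan below). A hypothetical
# example, not from the original repo:
#   {"users": ["_id", "email"], "posts": ["_id", "author", "body"]}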
def run(filename):
    fd = open(filename, "rb")  # binary mode: we scan raw bytes
    # We used mmap before, but converted to stdio while chasing down the
    # bson parser bug. It's probably safe to change it back; the slowdown
    # wasn't that bad, so I didn't bother.
    mm = 0  # mmap.mmap(fd.fileno(), 0, access=mmap.ACCESS_READ)
    # pdb.set_trace()
    scan(filename, fd, mm)
    fd.close()
    print ('done', dt(), filename)
t0 = time.time()
def dt(): return time.time() - t0
def runall():
    run(sys.argv[1])
def filesize(fd):
    fd.seek(0, 2)  # seek to end of file
    return fd.tell()
def scan(filename, fd, mm):
    global k, k1
    maxm = filesize(fd)
    def int4(loc):
        # read a little-endian int32 at byte offset loc
        #c = mm[loc:loc+4]
        fd.seek(loc)
        c = fd.read(4)
        n, = struct.unpack('<i', c)
        return n
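    # Per the BSON spec, a document is laid out as
    #   <int32 total-length> <elements> <0x00 terminator>
    # where the length counts itself and the terminator, so the smallest
    # valid document is the empty one, b'\x05\x00\x00\x00\x00', for which
    # int4 at its first byte returns 5.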
    def strn(i, n):
        # read n raw bytes starting at offset i
        #return mm[i:i+n]
        fd.seek(i)
        return fd.read(n)
    k = k1 = 0
    Last = collections.namedtuple('Last', ['i', 'n', 'b', 's', 'coll'])
    # sentinel: offsets far enough in the past that nothing overlaps them
    last = Last(-1000000, -1000000, '', '', 'none')
    # Brute force: try every byte offset as a candidate BSON document start.
    for i in xrange(maxm - 100):
        if i % 10000000 == 0: print ('hello', dt(), filename, i, k1)
        n = int4(i)
        if n > 10 and n < 100000:
            if i + n >= maxm: break
            k += 1
            s = strn(i, n)
            try:
                b = bson.BSON(s).decode()
            except Exception:
                continue
            # a document matches a collection if it contains all of that
            # collection's schema fields
            ms = list(coll for coll, fields in schema.iteritems()
                      if all(f in b for f in fields))
            assert (len(ms) < 2), (b, ms)
            if ms:
                coll = ms[0]
                k1 += 1
                # print (dt(), filename, k1, coll, k, i, n)
                if n > 10000:
                    print ('large', dt(), filename, coll, i, n)
                if i - last.i < last.n:
                    # this match starts inside the previous one; dump both
                    # to a separate overlap file for manual inspection
                    print ('overlap', dt(), filename, k1, coll, k, i, n,
                           last._replace(b='%d bytes' % len(last.b),
                                         s='%d bytes' % len(last.s)))
                    ofname = 'overlap-' + filename
                    dump(ofname, i, coll, b, s)
                    dump(ofname, last.i, last.coll, last.b, last.s)
                dump(filename, i, coll, b, s)
                # assert (i - last.i > n), (i, last.i, n)
                last = Last(i, n, b, s, coll)
    return k
# We use a custom json encoder to dump out some bson types that don't map
# directly to json types, as their string equivalents.
class doc_encoder_(json.JSONEncoder):
    def default(self, x):
        if isinstance(x, (datetime.datetime, bson.objectid.ObjectId)):
            return str(x)
        return json.JSONEncoder.default(self, x)
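# Minimal usage sketch for the encoder (the values are hypothetical):
#   json.dumps({'_id': bson.objectid.ObjectId(),
#               'ts': datetime.datetime.utcnow()}, cls=doc_encoder_)
# renders both the ObjectId and the datetime through str().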
exts = ['json', 'offsets2']
Fcached = collections.namedtuple('Fcached', exts)
file_cache = {}
def dump(origin, i, collection, b, s):
    # one lazily-opened output file pair per (input file, collection)
    filename = "%s-%s" % (origin, collection)
    if filename not in file_cache:
        file_cache[filename] = \
            Fcached(**dict((e, open('%s.%s' % (filename, e), 'w'))
                           for e in exts))
    # the indent=3 parameter makes human-readable but bulkier output.
    j = json.dumps(b, cls=doc_encoder_)  #, indent=3)
    fc = file_cache[filename]
    print >>fc.offsets2, fc.json.tell(), i, len(s), len(j)
    print >>fc.json, j
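# Each .offsets2 line records, for one recovered document: the byte offset
# of its line in the companion .json file, the offset i of the match in
# the input, the raw BSON length len(s), and the encoded JSON length len(j).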
if __name__ == '__main__':
    runall()
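# Usage sketch (inferred from runall and dump above; the input name is
# hypothetical):
#   python scan.py damaged.img
# writes recovered documents to damaged.img-<collection>.json with a
# matching .offsets2 index per collection, plus overlap-damaged.img-*
# files for overlapping matches.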