# normalize_enumerated_links.py
import pathlib
from urllib.parse import urlparse, urljoin, ParseResult

import pandas as pd
in_fname = 'pandas_output.pkl'
out_fname = 'unique_normalized_gov_links.pkl'

df = pd.read_pickle(in_fname)
print("starting out with %d links from %s" % (len(df), in_fname))
# if the link doesn't have a netloc value, use the netloc from the source
def normalize_links(row):
    # resolve the link against its source even if it is already an absolute
    # URL: running the result through urlparse splits any #fragment off of
    # the path, and rebuilding the link from the parsed parts makes
    # searching for duplicates a lot easier
    tmp_abs_link = urljoin(row['source'], row['link'])
    # parse the absolute URL to get the scheme, netloc, path, and path suffix
    parsed = urlparse(tmp_abs_link)
    row['scheme'] = parsed.scheme
    row['netloc'] = parsed.netloc
    row['path'] = parsed.path
    path = pathlib.Path(parsed.path)
    row['suffix'] = path.suffix
    # build a new ParseResult with no fragment data; fragments make finding
    # duplicates hard
    new_parsed = ParseResult(parsed.scheme, parsed.netloc, parsed.path,
                             parsed.params, parsed.query, '')
    # reassemble the normalized link from the new ParseResult
    row['link'] = new_parsed.geturl()
    return row
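
# A worked example of what normalize_links does (hypothetical values,
# purely for illustration):
#   row['source'] = 'https://example.gov/a/index.html'
#   row['link']   = 'b/page.html#sec2'
#   urljoin(...)  -> 'https://example.gov/a/b/page.html#sec2'
#   urlparse(...) puts 'sec2' into parsed.fragment, so rebuilding the
#   ParseResult with an empty fragment yields
#   'https://example.gov/a/b/page.html', and row['suffix'] ends up '.html'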
# apply the normalization function to all rows
print("going to normalize the links:")
print("... rel hrefs into abs hrefs, figure out the suffix, throw away fragments. (takes a while) ...")
normalized_links_df = df.apply(normalize_links, axis=1)
print("finished normalizing the links ...")
# keep only the rows that point to a .gov site; regex=False so the '.' is
# matched literally instead of as a regex wildcard
normalized_gov_df = normalized_links_df[normalized_links_df['netloc'].str.contains('.gov', case=False, regex=False)]
print("removed links that point to non-.gov sites ...")
# if we have multiple rows with the same link, keep only the first one
unique_normalized_gov_df = normalized_gov_df.drop_duplicates(subset=['link'], keep='first')
print("and if we have multiple rows with the same link value, preserve just the first one")
# write this dataframe out
unique_normalized_gov_df.to_pickle(out_fname)
print("finished. we now have %d unique gov links in %s" % (len(unique_normalized_gov_df),out_fname))