analyse.py
import argparse
import collections
import json

import SETTINGS
from lib import options, utils

def _get_arg_parser():
    parser = argparse.ArgumentParser()
    project_options = options.known_projects

    parser.add_argument(
        "project",
        nargs=1,
        type=str,
        choices=project_options,
        help=f'Project ID, must be one of: {project_options}'
    )
    parser.add_argument(
        "-d",
        "--dataset-ids",
        nargs=1,
        type=str,
        default=None,
        required=True,
        help='List of comma-separated dataset identifiers'
    )

    return parser

def parse_args():
    """Parse the command line into (project, dataset_ids)."""
    parser = _get_arg_parser()
    args = parser.parse_args()

    # `nargs=1` wraps each value in a list, so unpack the single element.
    project = args.project[0]
    ds_ids = args.dataset_ids[0].split(',')

    return project, ds_ids

def _lookup(item, *keys):
    """Follow `keys`, in order, into a nested dictionary and return the value."""
    for key in keys:
        item = item[key]

    return item

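# Example (illustrative only, not from the original module):
#     _lookup({'data': {'rank': 3}}, 'data', 'rank')  # -> 3
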
def analyse_characteristic(records, *keys):
    """Group records by the value found at `keys` and report the spread.

    Any group covering less than SETTINGS.CONCERN_THRESHOLD of all records
    is flagged as a candidate for fixing.
    """
    results = {}
    count = 0

    for ds_id, rec in records.items():
        result = _lookup(rec, *keys)
        results.setdefault(result, [])
        results[result].append(ds_id)
        count += 1

    print(f'\n[INFO] Testing: {keys} - found {len(results)} varieties')

    for key in sorted(results):
        ds_ids = results[key]
        print(f'\t{keys} == {key}: {len(ds_ids)}')

        count_ratio = len(ds_ids) / count
        if count_ratio < SETTINGS.CONCERN_THRESHOLD:
            print(f'\t[WARN] SUGGEST FIX OF {keys} ON:\n\t\t' + '\n\t\t'.join(ds_ids))

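# Worked example (illustrative only; a CONCERN_THRESHOLD of 0.5 is assumed
# here, not read from SETTINGS):
#
#     records = {
#         'ds1': {'data': {'rank': 3}},
#         'ds2': {'data': {'rank': 3}},
#         'ds3': {'data': {'rank': 4}},
#     }
#     analyse_characteristic(records, 'data', 'rank')
#
# groups ds1/ds2 under rank 3 (ratio 2/3) and ds3 under rank 4 (ratio 1/3);
# with a 0.5 threshold, only ds3 would be flagged with a [WARN] line.
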
def analyse_datasets(project, ds_ids):
    """Compare a set of dataset identifiers for consistency."""
    # NOTE: `project` is not used here; records are located from the
    # dataset IDs alone.
    records = load_records(ds_ids)

    analyse_characteristic(records, 'data', 'rank')
    analyse_characteristic(records, 'coordinates', 'time', 'calendar')

def load_records(ds_ids):
    """Load the analysis record (JSON) for each dataset ID, keyed by ID."""
    records = collections.OrderedDict()

    for ds_id in ds_ids:
        grouped_ds_id = utils.get_grouped_ds_id(ds_id)
        # `format(**vars())` fills the path template from the local names
        # defined above (`ds_id`, `grouped_ds_id`), so `grouped_ds_id` is
        # used even though it appears unreferenced.
        json_path = SETTINGS.JSON_OUTPUT_PATH.format(**vars())

        with open(json_path) as reader:
            records[ds_id] = json.load(reader)

    return records

def main():
    project, ds_ids = parse_args()
    analyse_datasets(project, ds_ids)


if __name__ == '__main__':
    main()
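
# Example invocation (the project ID and dataset IDs below are hypothetical;
# valid project IDs come from `lib.options.known_projects`):
#
#     python analyse.py cmip5 -d ds-id-1,ds-id-2,ds-id-3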