evaluation_program.py
import re
from parser_tools import parse_edb_dataframe, create_fact
import Predicate
import pandas as pd


# Evaluate the whole Datalog program: loop over the IDB rules, evaluate each one
# to a fixpoint, and return the derived facts.
def evaluate(edb_facts, idb_rules):
    evaluated_rules = {}
    # Step 1: Parse the EDB facts into dataframes (cf. README & parser_tools).
    dataframes = parse_edb_dataframe(edb_facts)
    # Loop over the IDB rules.
    for rule in idb_rules:
        idbName = rule.getIDBName()
        head = rule.paramsHead
        body = rule.body
        if idbName not in evaluated_rules:
            evaluated_rules[idbName] = {
                'head': head,
                'rows': []
            }
        # Step 2: Re-evaluate the IDB rule until its result no longer changes.
        ruleResult = evaluate_rule(head, body, dataframes)
        ruleCount = len(ruleResult)
        while ruleCount > 0:
            ruleCount = 0
            for row in ruleResult:
                # Save newly derived rows in dataframes so later passes can use them.
                if idbName not in dataframes:
                    dataframes[idbName] = pd.DataFrame(columns=row.keys())  # Add to facts
                if [*row.values()] not in dataframes[idbName].values.tolist():
                    ruleCount += 1
                    dataframes[idbName].loc[len(dataframes[idbName])] = row
                    evaluated_rules[idbName]['rows'].append(row)
            ruleResult = evaluate_rule(head, body, dataframes)
    print_evaluator(evaluated_rules)
    return evaluated_rules
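
# Example of the fixpoint above (naive evaluation): for a recursive rule such as
#     path(X, Y) :- edge(X, Z), path(Z, Y).
# (illustrative Datalog syntax; the concrete syntax accepted by parser_tools may
# differ), each call to evaluate_rule re-derives all path facts from the current
# dataframes. ruleCount counts the rows that are genuinely new, and the while
# loop stops once a full pass derives nothing new.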


# Step 2: Evaluate a single IDB rule once and return the derived rows.
def evaluate_rule(head, body, dataframes):
    result = []
    aggregateVariables = []
    postAggregateComparisonPredicates = []
    # Collect the aggregate variables of the body so that comparisons on them
    # can be deferred until after aggregation.
    for predicate in body:
        if type(predicate) is Predicate.AggregationPredicate:
            var = predicate.params[-1].strip()
            aggregateVariables.append(var)
    # Step 2.1: Evaluate all atomic predicates in the body.
    atomPredicateResults = []  # dataframes holding each atomic predicate's result
    for predicate in body:
        if type(predicate) is Predicate.AtomicPredicate:
            if predicate.getTitle() in dataframes:
                df = predicate.eval(dataframes[predicate.getTitle()], head)
                atomPredicateResults.append(df)
    if not atomPredicateResults:
        return result
    # Merge all atomic predicate results.
    df = merge_dataframes(atomPredicateResults)
    # Step 2.2: Evaluate the comparison predicates that do not involve aggregate variables.
    for predicate in body:
        bypass = False
        if type(predicate) is Predicate.ComparisonPredicate:
            for var in aggregateVariables:
                if var in predicate.params:
                    postAggregateComparisonPredicates.append(predicate)
                    bypass = True
                    break
            if not bypass:
                df = predicate.eval(df)
    # Step 2.3: Evaluate all aggregation predicates in the body.
    for predicate in body:
        if type(predicate) is Predicate.AggregationPredicate:
            df = predicate.eval(df, head)
    # Step 2.4: Evaluate the comparison predicates that do involve aggregate variables.
    for predicate in postAggregateComparisonPredicates:
        df = predicate.eval(df)
    # Clean up the dataframe: drop the #Y#-style columns generated by AtomicPredicate.eval().
    df = df.loc[:, ~df.columns.str.contains('#[a-zA-Z0-9]+#')]
    # Return each remaining row as a dictionary.
    for _, row in df.iterrows():
        result.append(row.to_dict())
    return result
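
# Example of the four steps above on a rule body such as
#     sales(Prod, Price), Price > 10, SUM(Price, Total), Total > 100
# (illustrative syntax only; the concrete predicate syntax is whatever the
# project's parser produces): step 2.1 joins the sales facts, step 2.2 applies
# Price > 10, step 2.3 computes the SUM aggregate into Total, and step 2.4
# applies the comparison Total > 100, which must wait until Total exists.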


def merge_dataframes(atomPredicateResults):
    if len(atomPredicateResults) == 0:
        return None
    # Cross-join the per-predicate dataframes, then keep only the rows where
    # columns that share the same token (variable name) agree.
    df = atomPredicateResults[0]
    for relDf in atomPredicateResults[1:]:
        df = pd.merge(df, relDf, how='cross')
    tokens = {col.split('_')[0] for col in df.columns}
    df = filterTable(df, tokens)
    return df


def filterTable(df, tokens):
    # A token is a variable name: every column whose name carries that token must
    # hold the same value for a row to be kept.
    # First, disambiguate columns whose name collides exactly with a bare token.
    currentColumns = df.columns.tolist()
    for i, col in enumerate(currentColumns):
        if currentColumns.count(col.split('_')[0]) > 1:
            df.columns.values[i] = col + "_" + str(i)
            currentColumns[i] = col + "_" + str(i)
    for token in tokens:
        columns = [c for c in df.columns if c.split('_')[0] == token]
        if len(columns) == 1:
            continue
        # Keep only the rows where all columns for this token hold the same value.
        df = df[df[columns].apply(lambda x: len(set(x)) == 1, axis=1)]
        if len(df) == 0:
            return df
        # Keep a single column for the token and rename it to the bare token name.
        df = df.drop(columns=columns[1:])
        df = df.rename(columns={columns[0]: token})
    return df


def print_evaluator(rules):
    print("\n----- Evaluation program ------\n")
    for rule in rules:
        head, rows = rules[rule]['head'], rules[rule]['rows']
        for row in rows:
            print(f"{create_fact(rule, head, row)}")
        print("\n")