-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmain.py
192 lines (166 loc) · 6.72 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
# Imports
from helper import *
import psycopg2
# from collections import defaultdict
# Root website
root = "http://fasecolda.colserauto.com/fasecolda.explorador/"
# Urls to download documents and files
urls = {"docs": r"Default.aspx?url=E:\WWWROOT\FASECOLDA\Fasecolda.Web\Archivos\Guias\Documentos",
"files": r"Default.aspx?url=E:\WWWROOT\FASECOLDA\Fasecolda.Web\Archivos\Guias\GuiaValores_NuevoFormato"}
# Path to folder structure mentioned on README file --modify according to location--
paths = {"docs": r"..\\data\\docs",
"files": r"..\\data\\files"}
# Function call to download and store raw static files
collect_static_files(root, urls, paths)
# Data required for month information. These months are mapped to a calendar one month behind
month_map = {'Enero':1,
'Febrero':2,
'Marzo':3,
'Abril':4,
'Mayo':5,
'Junio':6,
'Julio':7,
'Agosto':8,
'Septiembre':9,
'Octubre':10,
'Noviembre':11,
'Diciembre':12}
month_sold_map = {'Enero':12,
'Febrero':1,
'Marzo':2,
'Abril':3,
'Mayo':4,
'Junio':5,
'Julio':6,
'Agosto':7,
'Septiembre':8,
'Octubre':9,
'Noviembre':10,
'Diciembre':11}
# Set filters --given by FusePong--
# Include only following classes
classes = ['AUTOMOVIL','CAMIONETA', 'CAMIONETA PASAJ.','CAMPERO']
# Filter by service
id_service = 1
# Exclude following makes
include_manuf = ["SUZUKI","KIA","HYUNDAI","MITSUBISHI","BMW","HONDA","MAZDA",
"RENAULT","NISSAN","TOYOTA","FORD","CHEVROLET","MERCEDES BENZ","SUBARU","SSANGYONG",
"LAND ROVER","DODGE","JEEP","SKODA","AUDI","VOLKSWAGEN","VOLVO","CHRYSLER","CADILLAC",
"CITROEN","DAEWOO","DAIHATSU","FIAT","ISUZU","JAGUAR","LADA","MINI",
"MG","PEUGEOT","PORSCHE","CHANA","CHERY","JAC","DFSK/DFM/DFZL","JMC",
"FOTON","HUMMER","LIFAN","TMD","ROVER","MASERATI","FERRARI","CHANGAN"]
# Connect to database
DB_HOST= os.environ['DB_HOST']
DB_USER=os.environ['DB_USER']
DB_PASSWORD=os.environ['DB_PASSWORD']
DB_NAME=os.environ['MC_DB_NAME']
conn_str = "dbname='"+DB_NAME+\
"' user='"+DB_USER+\
"' host='"+DB_HOST+\
"' password='"+DB_PASSWORD+"'"
try:
conn = psycopg2.connect(conn_str)
except:
print("Unable to connect to the database")
# Read existing folders
folders_path = os.path.join(os.getcwd(), paths['files'])
folders = os.listdir(path= folders_path)
# Read folders on the DB
# Create cursor
temp_cursor = conn.cursor()
# Create sql string
sql_str = """select reference from guides;"""
# Send to database
temp_cursor.execute(sql_str)
existing_folders = [reg[0] for reg in temp_cursor.fetchall()]
# Get folders not included
req_folders = [fold for fold in folders if int(fold[0:3]) not in existing_folders]
# Iterate over folders
for folder in req_folders:
# Temporary path
temp_path = os.path.join(folders_path,folder)
# Extract temporary information from folder
folder_dic = {
'reference': int(folder.split(sep="_")[0]),
'month_guide': month_map.get(folder.split(sep="_")[1]),
'month_sold': month_sold_map.get(folder.split(sep="_")[1]),
'year_guide': int(folder.split(sep="_")[2])}
print("Started processing guide %(reference)s" % folder_dic)
# List files in folder
temp_files = os.listdir(temp_path)
# Read required columns of code file (see new_sql)
cols = [0,1,2,3,9,22,25]
names = ['novedad', 'make', 'clase', 'id_fasecolda', 'id_servicio', 'estado','um']
# Read codes csv
iter_codes = pd.read_table(os.path.join(temp_path,temp_files[0]),
header=0,sep="|", usecols=cols, iterator=True, chunksize=1000,
dtype={'id_fasecolda':str, 'um':bool, 'id_servicio': int}, names=names)
codes = pd.concat([chunk[(chunk.id_servicio == id_service) &
(chunk.make.isin(include_manuf)) &
(chunk.clase.isin(classes))] for chunk in iter_codes])
# Select relevant codes
codes = codes.drop(labels='id_servicio', axis=1)
# Select right model year and price for each id
file_path = os.path.join(temp_path,temp_files[1])
prices_df = select_models(folder_dic, codes, file_path)
# Send to DB
insertGuide(conn, folder_dic)
insertPriceVariations(conn, prices_df, folder_dic)
# Report
print("Guide %(reference)s for month %(month_guide)s of %(year_guide)s was included" % folder_dic)
# # Create table of models vs monthly prices
# cursor = conn.cursor()
# sql_start = """
# select distinct year_model,month_sold from monthly_prices
# inner join guides on monthly_prices.id_guide = guides.id_guide
# where guides.id_guide = (select min(id_guide) from guides);
# """
# cursor.execute(sql_start)
# start = cursor.fetchall()[0]
# sql_end = """
# select distinct year_model,month_sold from monthly_prices
# inner join guides on monthly_prices.id_guide = guides.id_guide
# where guides.id_guide = (select max(id_guide) from guides);
# """
# cursor.execute(sql_end)
# end = cursor.fetchall()[0]
#
# #Iterate over table and build pandas dataframe
# year = 2008
# months = [7,8,9,10,11,12,1,2,3,4,5,6]
# dict_regs = defaultdict(dict)
# flag = False
#
# #Iterate over months
# while True:
# for month in months:
# #Obtain specific registers from db
# temp_sql = """
# select id_fasecolda, price, monthly_prices.id_guide from monthly_prices
# inner join guides on monthly_prices.id_guide = guides.id_guide
# where year_model="""+str(year)+""" and month_sold="""+str(month)+""";"""
# cursor.execute(temp_sql)
# #Obtain data
# full_regs = cursor.fetchall()
# if full_regs != []:
# ids = [pair[0] for pair in full_regs]
# regs = [pair[1] for pair in full_regs]
# ref = np.unique([pair[2] for pair in full_regs])[0] if full_regs != [] else None
# # Rearrange into data frame
# full_regs_df = pd.DataFrame({'regs':regs}, index=ids)
# #Update nested dictionaries with values for year-month
# temp_key = str(ref)+ "-" +str(year) + "-" + str(month).zfill(2)
# for index, row in full_regs_df.iterrows():
# dict_regs[index][temp_key]= row.regs
# # Report
# print(temp_key+" updated")
# if month == end[1] and year == end[0]:
# flag = True
# break
# if flag==True:
# break
# #Update counter
# year += 1
#
# monthly_prices_df =pd.DataFrame.from_dict(dict_regs,orient='index')
# monthly_prices_df.sort_index(axis=1, inplace=True)