-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathGetting_Elo_Ratings.py
515 lines (454 loc) · 24.3 KB
/
Getting_Elo_Ratings.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
import math
import numpy as np
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options # Import Options from chrome module
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
from selenium.common import NoSuchElementException
import statistics
import pandas as pd
import club_name_changes
import time
from datetime import datetime
from scipy.stats import poisson
# Chrome is configured to run headless for the scraping steps below
options = Options()
options.add_argument("--headless")
# ChromeDriverManager().install() supplies the value handed to the Service wrapper
chromedriver_location = ChromeDriverManager().install()
service = Service(chromedriver_location)
# the shared WebDriver used by every scraping step in this script
driver = webdriver.Chrome(options=options, service=service)
def get_url_with_retries(driver, url, retries=3, delay=5):
    """Load ``url`` with ``driver``, retrying when the page load times out.

    Args:
        driver: Object exposing ``get(url)`` (the Selenium WebDriver in this script).
        url: Address to load.
        retries: Maximum number of load attempts.
        delay: Seconds to sleep between two attempts.

    Returns:
        True as soon as one ``driver.get`` call finishes without a
        TimeoutException, False when every attempt timed out. Exceptions
        other than TimeoutException propagate to the caller.
    """
    for attempt in range(1, retries + 1):
        try:
            driver.get(url)
        except TimeoutException:
            print(f"TimeoutException on attempt {attempt}. Retrying...")
            # sleeping only makes sense when another attempt follows
            if attempt < retries:
                time.sleep(delay)
        else:
            return True
    print("Failed to load the URL after multiple attempts")
    return False
# today's date, plus its 'YYYY-MM-DD' text form used in the ClubElo request
today = datetime.today().date()
formatted_today = today.strftime('%Y-%m-%d')
# current Elo ratings of European soccer clubs, read as CSV from the ClubElo API
url = f'http://api.clubelo.com/{formatted_today}'
elo_ratings_df = pd.read_csv(url)
# name-change dictionaries used to match clubs across the different datasets
elo_name_changes = club_name_changes.elo_name_changes
football_transfers_name_changes = club_name_changes.football_transfers_name_changes
fbref_name_changes = club_name_changes.fbref_name_changes
# applies the ClubElo name changes to the freshly downloaded ratings
elo_ratings_df['Club'] = elo_ratings_df['Club'].replace(elo_name_changes)
# previously stored matches, with duplicate and incomplete rows discarded
matches = pd.read_csv("Matches.csv")
matches.drop_duplicates(inplace=True)
matches.dropna(inplace=True)
matches['Date'] = pd.to_datetime(matches['Date'])
# competition -> last table row read during the previous scraping run
last_row_nums = pd.read_csv("Row_Update.csv")
last_row_dict = last_row_nums.set_index('Competition')['Last_Row'].to_dict()
# codes used in the fbref URLs for each domestic league
league_comp_codes = {
    'Premier-League': '9',
    'La-Liga': '12',
    'Serie-A': '11',
    'Ligue-1': '13',
    'Bundesliga': '20',
    'Primeira-Liga': '32',
    'Eredivisie': '23',
    'Championship': '10',
}
# leagues with only 18 teams
eighteen_team_leagues = ['Bundesliga', 'Primeira-Liga', 'Eredivisie']
# codes used in the fbref URLs for the European club competitions
euro_comp_codes = {
    'Champions-League': '8',
    'Europa-League': '19',
    'Europa-Conference-League': '882',
}
# parallel lists that become the columns of the scraped-matches Data Frame
dates, home_teams, away_teams, home_xgs, away_xgs, leagues = [], [], [], [], [], []
season_start_time = time.time()
# Scrapes expected-goals data for every European club competition
for comp, comp_code in euro_comp_codes.items():
    # row reached by the previous scraping run for this competition
    starting_row = last_row_dict[comp]
    previous_comp_matches = matches[matches['Competition'] == comp]
    # NOTE(review): only computed here; the row filter below does not consult it
    date_of_last_update = previous_comp_matches['Date'].max().date()
    # builds the url of the season's Scores-and-Fixtures page
    print("Getting URL")
    url = ('https://fbref.com/en/comps/' + comp_code + '/2024-2025/schedule/2024-2025-' + comp
           + '-Scores-and-Fixtures')
    # loads the page through the timeout-retry helper, as the domestic league loop does
    get_url_with_retries(driver, url)
    print("Got URL. Implicit Wait Begins")
    # implicitly_wait does not pause here: it lets later element lookups wait up to 10 seconds
    driver.implicitly_wait(10)
    print("Implicit Wait Finished")
    # body of the table holding the match information
    rows = driver.find_element(By.XPATH,
                               "//body[@class='fb']/div[@id='wrap']/div[@id='content']/div[@id='all_sched']"
                               "/div[@id='switcher_sched']/div[@id='div_sched_all']/table[@id='sched_all']"
                               "/tbody")
    print("starting", comp)
    # resumes 15 rows before the stored position to account for jumps in rows without match data,
    # clamped at 0 so that a negative data-row is never requested
    first_row = max(starting_row - 15, 0)
    # keeps the stored position if the range below turns out to be empty
    row_num = starting_row
    for row_num in range(first_row, 213):
        try:
            # the leading '.' restricts the search to the table body found above
            data = rows.find_element(By.XPATH, ".//tr[@data-row='" + str(row_num) + "']")
        except NoSuchElementException:
            # no such row: the table is shorter than the hard-coded upper bound
            break
        try:
            date_str = data.find_element(By.XPATH, ".//td[@data-stat='date']").text
        except NoSuchElementException:
            # rows without a date cell carry no match
            continue
        if date_str == '':
            continue
        date = pd.Timestamp(datetime.strptime(date_str, '%Y-%m-%d')).date()
        # stops at the first match scheduled after today
        if date > today:
            break
        try:
            home_team = data.find_element(By.XPATH, ".//td[@data-stat='home_team']/a").text
        except NoSuchElementException:
            # rows without a linked home team are skipped
            continue
        home_xg = data.find_element(By.XPATH, ".//td[@data-stat='home_xg']").text
        away_team = data.find_element(By.XPATH, ".//td[@data-stat='away_team']/a").text
        away_xg = data.find_element(By.XPATH, ".//td[@data-stat='away_xg']").text
        dates.append(date)
        home_teams.append(home_team)
        home_xgs.append(home_xg)
        away_teams.append(away_team)
        away_xgs.append(away_xg)
        leagues.append(comp)
        print(date, comp, home_team, "vs", away_team, "Match Expected Goals Added")
    # records the last row examined so the next run can resume from it
    last_row_dict[comp] = row_num
    end_time = time.time()
    print(comp, "Update Finished in", round((end_time - season_start_time) / 60, 2), "minutes")
    print("Current Time:", datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
# Scrapes expected-goals data for every domestic league
for league, comp_code in league_comp_codes.items():
    # row reached by the previous scraping run for this league
    starting_row = last_row_dict[league]
    previous_league_matches = matches[matches['Competition'] == league]
    # NOTE(review): only computed here; the row filter below does not consult it
    date_of_last_update = previous_league_matches['Date'].max().date()
    start_time = time.time()
    print("Getting URL")
    url = ('https://fbref.com/en/comps/' + comp_code + '/2024-2025/schedule/2024-2025-' + league
           + '-Scores-and-Fixtures')
    print(url)
    get_url_with_retries(driver, url)
    print("Got URL. Implicit Wait Begins")
    # implicitly_wait does not pause here: it lets later element lookups wait up to 10 seconds
    driver.implicitly_wait(10)
    print("Implicit Wait Finished")
    # the season-specific schedule table is tried first, then the combined 'sched_all' table
    season_table_xpath = ("//body[@class='fb']/div[@id='wrap']/div[@id='content']"
                          "/div[@id='all_sched']"
                          f"/div[@id='div_sched_2024-2025_{comp_code}_1']"
                          f"/table[@id='sched_2024-2025_{comp_code}_1']"
                          "/tbody")
    combined_table_xpath = ("//body[@class='fb']/div[@id='wrap']/div[@id='content']/div[@id='all_sched']"
                            "/div[@id='switcher_sched']/div[@id='div_sched_all']/table[@id='sched_all']"
                            "/tbody")
    try:
        rows = driver.find_element(By.XPATH, season_table_xpath)
    except NoSuchElementException:
        rows = driver.find_element(By.XPATH, combined_table_xpath)
    print("starting", league)
    # resumes 15 rows before the stored position to account for jumps in rows without match data,
    # clamped at 0 so that a negative data-row is never requested
    first_row = max(starting_row - 15, 0)
    # keeps the stored position if the range below turns out to be empty
    row_num = starting_row
    for row_num in range(first_row, 700):
        try:
            # the leading '.' restricts the search to the table body found above
            data = rows.find_element(By.XPATH, ".//tr[@data-row='" + str(row_num) + "']")
        except NoSuchElementException:
            # no such row: the table is shorter than the hard-coded upper bound
            break
        try:
            date_str = data.find_element(By.XPATH, ".//td[@data-stat='date']").text
        except NoSuchElementException:
            # rows without a date cell carry no match
            continue
        if date_str == '':
            continue
        date = pd.Timestamp(datetime.strptime(date_str, '%Y-%m-%d')).date()
        # stops at the first match scheduled after today
        if date > today:
            break
        try:
            home_team = data.find_element(By.XPATH, ".//td[@data-stat='home_team']/a").text
        except NoSuchElementException:
            # rows without a linked home team are skipped
            continue
        home_xg = data.find_element(By.XPATH, ".//td[@data-stat='home_xg']").text
        away_team = data.find_element(By.XPATH, ".//td[@data-stat='away_team']/a").text
        away_xg = data.find_element(By.XPATH, ".//td[@data-stat='away_xg']").text
        dates.append(date)
        home_teams.append(home_team)
        home_xgs.append(home_xg)
        away_teams.append(away_team)
        away_xgs.append(away_xg)
        leagues.append(league)
        print(date, league, home_team, "vs", away_team, "Match Expected Goals Added")
    # records the last row examined, never storing a value below 15
    last_row_dict[league] = max(row_num, 15)
    end_time = time.time()
    print(league, "Season Updated Expected Goals in", round((end_time - start_time) / 60, 2), "minutes")
    print("Current Time:", datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
# Stores the last examined row of every competition for the next scraping run
last_row_df = pd.DataFrame(list(last_row_dict.items()), columns=['Competition', 'Last_Row'])
last_row_df.to_csv("Row_Update.csv", index=False, header=True)
season_end_time = time.time()
print("Expected Goals Match Data updated in", round((season_end_time - season_start_time) / 60, 2), "minutes")
# creates a Data Frame of the newly scraped Expected Goals Data for all competitions
updated_matches = pd.DataFrame({"Competition": leagues, "Date": dates, "Home Team": home_teams, "Home XG": home_xgs,
                                "Away XG": away_xgs, "Away Team": away_teams})
matches = pd.concat([matches, updated_matches], ignore_index=True)
matches['Date'] = pd.to_datetime(matches['Date'])
# The scraped xG values are element text (strings). Converting both xG columns to numbers lets
# drop_duplicates recognise re-scraped rows and turns empty cells into NaN so dropna removes
# them; otherwise an empty string would reach the float() conversion in the rating loop.
for xg_column in ("Home XG", "Away XG"):
    matches[xg_column] = pd.to_numeric(matches[xg_column], errors='coerce')
matches.drop_duplicates(inplace=True)
matches.dropna(inplace=True)
matches.sort_values(by='Date', ascending=True, inplace=True)
# exports the combined matches to a CSV file
matches.to_csv('Matches.csv', index=False, header=True)
# The ClubElo ratings of 2017-08-01 are the starting values of the Expected Goals Elo Ratings
url = 'http://api.clubelo.com/2017-08-01'
xg_elo_ratings_df = pd.read_csv(url)
# renames the clubs, then appends the clubs listed in club_name_changes.xg_missing_club_df
xg_elo_ratings_df['Club'] = xg_elo_ratings_df['Club'].replace(elo_name_changes)
xg_missing_club_df = club_name_changes.xg_missing_club_df
xg_elo_ratings_df = pd.concat([xg_elo_ratings_df, xg_missing_club_df], ignore_index=True)
# applies the fbref name changes to both team columns of the 'matches' Data Frame
for team_column in ('Home Team', 'Away Team'):
    matches[team_column] = matches[team_column].replace(fbref_name_changes)
# unique club names present in the Expected Goals Elo Ratings
xg_elo_clubs = xg_elo_ratings_df['Club'].unique()
# teams that appear in 'matches' but have no row in the Expected Goals Elo Ratings
home_is_unknown = ~matches['Home Team'].isin(xg_elo_clubs)
away_is_unknown = ~matches['Away Team'].isin(xg_elo_clubs)
missing_home_teams = matches[home_is_unknown]['Home Team'].unique()
missing_away_teams = matches[away_is_unknown]['Away Team'].unique()
missing_teams = set(missing_home_teams) | set(missing_away_teams)
if missing_teams:
    print("Teams in Matches Data Frame but Missing from XG Elo Ratings:")
    print(missing_teams)
    print()
# every nation in the model starts with a Home Field Advantage of 50; the value is updated after each match
hfas = dict.fromkeys(('ENG', 'ESP', 'GER', 'ITA', 'FRA', 'NED', 'POR'), 50)
# dates on which a European Club Competition match was played at a neutral venue
neutral_date_strings = [
    '2018-05-26', '2019-06-01', '2020-08-12', '2020-08-13', '2020-08-14', '2020-08-15', '2020-08-18',
    '2020-08-19', '2020-08-23', '2021-05-29', '2022-05-28', '2023-06-10', '2018-05-16', '2019-05-29',
    '2020-08-05', '2020-08-06', '2020-08-10', '2020-08-11', '2020-08-16', '2020-08-17', '2020-08-21',
    '2021-05-26', '2022-05-28', '2023-05-31', '2022-05-25', '2023-06-07', '2024-06-01', '2024-05-29',
    '2024-05-22',
]
# the same dates as datetime objects, in the same order
euro_neutral_dates = [datetime.strptime(text, '%Y-%m-%d') for text in neutral_date_strings]
print("Reading Expected Goals Match Data")
start_time = time.time()
number_of_total_matches = len(matches)
# Replays every stored match and moves Elo points between the two clubs according to the
# expected goals each side produced.
# The distributions are identical for every match, so they are built once.
standard_normal = statistics.NormalDist()
goal_difference_normal = statistics.NormalDist(0, 1.3)
# progress is reported about every 5% of the matches; the floor of 1 keeps the modulo
# below from dividing by zero when there are fewer than 20 matches
progress_step = max(number_of_total_matches // 20, 1)
# 'position' counts rows in iteration order; 'idx' is the Data Frame label, which is not
# consecutive after the earlier sorting and row removal
for position, (idx, match) in enumerate(matches.iterrows(), start=1):
    # skips an abandoned match between Everton and Liverpool on December 7, 2024
    if idx == 23701 and match['Home Team'] == 'Everton':
        continue
    # gets key information regarding the match
    home_team = match['Home Team']
    away_team = match['Away Team']
    home_xg = float(match['Home XG'])
    away_xg = float(match['Away XG'])
    home_xg_elo_info = xg_elo_ratings_df[xg_elo_ratings_df["Club"] == home_team].iloc[0]
    away_xg_elo_info = xg_elo_ratings_df[xg_elo_ratings_df["Club"] == away_team].iloc[0]
    home_elo = home_xg_elo_info["Elo"]
    away_elo = away_xg_elo_info["Elo"]
    home_team_country = home_xg_elo_info["Country"]
    # adjusts for home field advantage in matches with non-neutral venues;
    # countries that are not tracked in hfas get a fixed 50 elo points
    if not (match['Competition'] in euro_comp_codes and match['Date'] in euro_neutral_dates):
        home_elo += hfas.get(home_team_country, 50)
    # calculates the home team's win expectancy based on both teams' elo ratings
    home_we = 1 / (10 ** ((away_elo - home_elo) / 400) + 1)
    # calculates the mean expected goal difference based on the home team's win expectancy
    home_mean_gd = goal_difference_normal.inv_cdf(home_we)
    # gets the pre-match win and loss probabilities for the home team
    z_loss_mark = (-0.5 - home_mean_gd) / 1.3
    z_win_mark = (0.5 - home_mean_gd) / 1.3
    home_pre_match_win_prob = 1 - standard_normal.cdf(z_win_mark)
    home_pre_match_loss_prob = standard_normal.cdf(z_loss_mark)
    # Poisson probabilities of 0 to 10 goals for each team, using its expected goals as the mean
    home_gps = [poisson.pmf(k=goal_count, mu=home_xg) for goal_count in range(11)]
    away_gps = [poisson.pmf(k=goal_count, mu=away_xg) for goal_count in range(11)]
    win_pts_exchange_den = 0
    loss_pts_exchange_den = 0
    # gets a dictionary of goal differences and the probabilities based on the expected goals
    gd_probabilities = {}
    for gd in range(-10, 11):
        z_lower = (gd - 0.5 - home_mean_gd) / 1.3
        z_upper = (gd + 0.5 - home_mean_gd) / 1.3
        # pre-match probability of this goal difference: normal mass between gd - 0.5 and gd + 0.5
        pre_match_gd_prob = standard_normal.cdf(z_upper) - standard_normal.cdf(z_lower)
        # accumulates the denominators used when calculating the elo points exchanged
        if gd < 0:
            loss_pts_exchange_den += math.sqrt(abs(gd)) * pre_match_gd_prob / home_pre_match_loss_prob
        elif gd > 0:
            win_pts_exchange_den += math.sqrt(gd) * pre_match_gd_prob / home_pre_match_win_prob
        # creates an entry for the goal difference in the dictionary
        gd_probabilities[gd] = 0
    # estimates the probability of each goal difference based on the expected goals statistic
    for home_gc, home_gp in enumerate(home_gps):
        for away_gc, away_gp in enumerate(away_gps):
            gd = home_gc - away_gc
            prob = home_gp * away_gp
            gd_probabilities[gd] += prob
    # calculates the change in elo rating based on the probabilities from the expected goal statistic
    # NOTE(review): sqrt(|gd|) is applied twice in the win and loss terms, so each term scales
    # with |gd| overall - confirm that this is intended
    change_in_elo = 0
    for gd, prob in gd_probabilities.items():
        if gd < 0:
            change_in_elo += (0 - home_we) * 20 * math.sqrt(abs(gd)) * prob / loss_pts_exchange_den \
                             * math.sqrt(abs(gd))
        elif gd == 0:
            change_in_elo += (0.5 - home_we) * 20 * prob
        else:
            change_in_elo += (1 - home_we) * 20 * math.sqrt(gd) * prob / win_pts_exchange_den * math.sqrt(gd)
    # moves the points from one club to the other in the ratings Data Frame
    xg_elo_ratings_df.loc[xg_elo_ratings_df['Club'] == home_team, 'Elo'] += change_in_elo
    xg_elo_ratings_df.loc[xg_elo_ratings_df['Club'] == away_team, 'Elo'] -= change_in_elo
    # adjusts the effect of home field advantage depending on the home team's result
    if home_team_country in hfas:
        hfas[home_team_country] += 0.075 * change_in_elo
    if position % progress_step == 0:
        print(position / number_of_total_matches * 100, "% Complete")
        current_time = time.time()
        seconds_since_start = current_time - start_time
        expected_total_seconds = seconds_since_start / (position / number_of_total_matches)
        seconds_remaining = expected_total_seconds - seconds_since_start
        print(round(seconds_remaining / 60, 2), "minutes remaining")
end_time = time.time()
print("Extracted Elo Ratings for Clubs Based on Expected Goals in", round((end_time - start_time) / 60, 2), "minutes")
# Leagues that have 0 teams in any UEFA Club Competitions or will not be modeled
leagues_of_non_interest = ['Serie B', '2. Bundesliga', 'Ligue 2', 'LaLiga 2 (Segunda División)',
                           'Keuken Kampioen Divisie', 'League One', 'Challenger Pro League', 'Serie C - Girone C',
                           'Challenge League', 'Segunda Liga', '1. Division', 'Regionalliga Ost', 'Serie C - Girone B',
                           '2 Liga']
# Affixes removed from club names to reduce matching issues; every entry of a group has the
# same length, which the slicing in _strip_club_affix relies on
_THREE_CHAR_SUFFIXES = (' FC', ' CF', ' FK', ' SK', ' NK', ' IF', ' SC', ' BK', ' BC', ' AC', ' KV', ' FF', ' GF',
                        ' VV', ' TC')
_FOUR_CHAR_SUFFIXES = (' AFC', ' HSC', ' CFC', ' GFS')
_THREE_CHAR_PREFIXES = ('FC ', 'AS ', 'UD ', 'IF ', 'US ', 'FK ', 'RS ', 'SS ', 'CA ', 'SV ', 'GD ',
                        'AC ', 'SC ', 'SL ', 'SK ', 'NK ', 'KV ', 'RC ', 'BK ')
_FOUR_CHAR_PREFIXES = ('AFC ', 'RSC ', 'GNK ', 'MTK ', 'IFK ', 'MSK ', 'ACF ', 'KAA ', 'SSC ', 'RKC ', 'PSC ',
                       'BSC ', 'AIK ', 'HNK ', 'OGC ', 'TSC ', 'DAC ', 'TSG ', 'PEC ', 'TSV ', 'NEC ', 'RCD ',
                       'KRC ', 'KAS ', 'KVC ', 'WSG ', 'OFI ')


def _strip_club_affix(club_name):
    """Return ``club_name`` without one listed suffix or prefix; at most one affix is removed."""
    if club_name.endswith(_THREE_CHAR_SUFFIXES):
        return club_name[:-3]
    if club_name.endswith(_FOUR_CHAR_SUFFIXES):
        return club_name[:-4]
    if club_name.startswith(_THREE_CHAR_PREFIXES):
        return club_name[3:]
    if club_name.startswith(_FOUR_CHAR_PREFIXES):
        return club_name[4:]
    return club_name


# columns of the Football Transfers Data Frame
clubs = []
leagues = []
ratings = []
start_time = time.time()
print()
# the ratings are not modified while scraping, so the lookup set is built once
known_xg_clubs = set(xg_elo_ratings_df['Club'])
# iterates over 12 pages of Data
for page_num in range(1, 13):
    driver.get('https://www.footballtransfers.com/en/teams/europe/' + str(page_num))
    # implicitly_wait does not pause here: it lets element lookups wait up to 10 seconds
    driver.implicitly_wait(10)
    # finds every <tr> element of the team overview table
    rows = driver.find_elements(By.XPATH, "//section[@id='template']"
                                          "/section[@id='layout']/main[@id='content-part']"
                                          "/section[@class='main-bar mainarticle-bar5']"
                                          "/div[@class='container relative']"
                                          "/div[@class='row']/div[@class='column-left']/div[@class='bg-white']"
                                          "/div[@class='main-article auto-placeholder-table']"
                                          "/table[@class='table table-striped table-hover ft-table team-overview-table mb-0']"
                                          "/tbody[@id='player-table-body']/tr")
    for row in rows:
        # extracts key data for each Club
        columns = row.find_elements(By.TAG_NAME, "td")
        league = columns[2].text
        # does not record data for Leagues that will not be used for the Model
        if league in leagues_of_non_interest:
            continue
        # the rating is the first whitespace-separated token of the first cell
        rating = float(columns[0].text.split()[0])
        club_name = _strip_club_affix(columns[1].text)
        club_name = football_transfers_name_changes.get(club_name, club_name)
        # To avoid a duplicate of a specific Romanian Club
        if club_name == 'Farul Constanța' and 'Farul Constanța' in clubs:
            continue
        # Ensures there are no Clubs with a Rating of 0
        if rating <= 0:
            continue
        if club_name not in known_xg_clubs:
            print(club_name, "In Football Transfers Data, but not in Elo Ratings Data Frame")
        # adds data to lists needed to create a Data Frame
        clubs.append(club_name)
        leagues.append(league)
        ratings.append(rating)
# the browser is not used again after this point
driver.quit()
print()
end_time = time.time()
print("Examined Club Transfer Values in", round((end_time - start_time) / 60, 2), "minutes")
# tabulates the club, league and rating values collected from Football Transfers
transfer_value_df = pd.DataFrame({'Club': clubs, 'League': leagues, 'Rating': ratings})
print()
# reports clubs from the listed countries that have an XG Elo rating but no Football Transfers entry
reported_countries = ['ENG', 'GER', 'ESP', 'FRA', 'ITA', 'POR', 'NED', 'BEL', 'UKR', 'CRO', 'AUT',
                      'SER', 'SUI', 'SCO', 'SVK', 'CZE']
for club, country in zip(xg_elo_ratings_df['Club'], xg_elo_ratings_df["Country"]):
    if country in reported_countries and club not in clubs:
        print(club, country, "is in XG El Ratings but not in Football Transfers")
print()
# keeps only the clubs that are present in both Data Frames
transfer_value_df = elo_ratings_df.merge(transfer_value_df, on='Club')
# z-score of every club's 'Rating' within the merged Data Frame
rating_column = transfer_value_df['Rating']
z_score = (rating_column - rating_column.mean()) / rating_column.std()
def estimate_elo(z_score, mean_elo, std_elo):
    """Map a z-score onto the Elo scale: ``mean_elo`` plus ``z_score`` standard deviations."""
    elo_offset = z_score * std_elo
    return mean_elo + elo_offset
# mean and standard deviation of the ClubElo 'Elo' column
elo_column = elo_ratings_df['Elo']
mean_elo, std_elo = elo_column.mean(), elo_column.std()
# converts every rating z-score into an Elo-scale value
transfer_value_df['Transfer_Elo'] = estimate_elo(z_score, mean_elo, std_elo)
# the Expected Goals rating is stored under its own column name so it can sit beside 'Elo'
xg_elo_ratings_df.rename(columns={"Elo": "XG Elo"}, inplace=True)
# left joins keep every ClubElo club, with missing values where a club has no match
xg_columns = xg_elo_ratings_df[["Club", "XG Elo"]]
merged_df = pd.merge(elo_ratings_df, xg_columns, on='Club', how='left')
transfer_columns = transfer_value_df[["Club", "Transfer_Elo"]]
grand_elo_df = pd.merge(merged_df, transfer_columns, on='Club', how='left')
def calculate_combined_elo_rating(row):
    """Blend the three Elo estimates of one club into a single rating.

    Args:
        row: Mapping-like object (a Data Frame row in this script) with the
            keys "Elo", "XG Elo" and "Transfer_Elo".

    Returns:
        0.5 * Elo + 0.25 * XG Elo + 0.25 * Transfer_Elo, where a missing
        XG Elo or Transfer_Elo is replaced by the original Elo.
    """
    original_elo = row["Elo"]
    xg_elo = row["XG Elo"]
    transfer_elo = row["Transfer_Elo"]
    # a club without an estimate falls back to its original Elo for that component
    if pd.isna(xg_elo):
        xg_elo = original_elo
    if pd.isna(transfer_elo):
        transfer_elo = original_elo
    original_weight = 0.5
    xg_weight = 0.25
    transfer_weight = 0.25
    return original_elo * original_weight + xg_elo * xg_weight + transfer_elo * transfer_weight
# combined rating of every club, computed row by row
grand_elo_df['Adjusted_Elo_Rating'] = grand_elo_df.apply(calculate_combined_elo_rating, axis=1)
# best club first; the fresh 0-based index then yields ranks starting at 1
grand_elo_df.sort_values(by='Adjusted_Elo_Rating', ascending=False, inplace=True)
grand_elo_df.reset_index(drop=True, inplace=True)
grand_elo_df['Rank'] = range(1, len(grand_elo_df) + 1)
# the "From" and "To" columns are not exported
grand_elo_df.drop(columns=["From", "To"], inplace=True)
grand_elo_df.to_csv("Elo Ratings for European Clubs.csv", index=False, header=True)
# after the export the Data Frame is keyed by club name
grand_elo_df.set_index("Club", inplace=True)