diff --git a/.gitignore b/.gitignore
index e081ab01..3c68387d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -12,3 +12,7 @@ src/data/debug/*.json
**/__pycache__/*
.DS_Store
+
+*my-venv/
+*new-venv
+node_modules
diff --git a/.prettierignore b/.prettierignore
index c6d72715..d648c41e 100644
--- a/.prettierignore
+++ b/.prettierignore
@@ -1,2 +1,9 @@
# Ignore built files
dist
+
+# Ignore data analysis HTML outputs, which are auto-generated
+src/data/analysis/output
+
+# Ignore static blog HTML and JSON files from data analysis
+static/blog/**/*.json
+static/blog/**/*.html
\ No newline at end of file
diff --git a/src/data/analysis/GHG_intensity_compliance_correlation.ipynb b/src/data/analysis/GHG_intensity_compliance_correlation.ipynb
new file mode 100644
index 00000000..8fbe1db4
--- /dev/null
+++ b/src/data/analysis/GHG_intensity_compliance_correlation.ipynb
@@ -0,0 +1,1275 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Compliance Analysis: \n",
+ "\n",
+ "## Correlation of GHG Intensity Last Year and Reporting Compliance This Year\n",
+ "\n",
+ "### Issue #114\n",
+ "Colton Lapp \n",
+ "\n",
+ "November 2024\n",
+ "\n",
+ "\n",
+ "### Description: \n",
+ "Does the GHG intensity the year prior help predict reporting non-compliance this year? \n",
+ "Does the trend of a GHG intensity (i.e. is a building increasing or decreasing its GHG intensity) help predict reporting non-compliance this year? \n",
+ "\n",
+ "I look at the raw data from the Chicago open data portal and use some basic group means and regression modeling. \n",
+ "\n",
+ "### Conclusion:\n",
+ "It doesn't seem like the level of GHG intensity or the trend of GHG intensity help predict compliance at all. \n",
+ "Building size DOES help predict compliance though. For every million additional square feet, the building is roughly 1.5% less likely to be NON compliant"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [
+ {
+ "ename": "",
+ "evalue": "",
+ "output_type": "error",
+ "traceback": [
+ "\u001b[1;31mRunning cells with 'my-venv (Python 3.9.18)' requires the ipykernel package.\n",
+ "\u001b[1;31mRun the following command to install 'ipykernel' into the Python environment. \n",
+ "\u001b[1;31mCommand: '\"/Users/coltonlapp/Dropbox/My Mac (Coltons-MacBook-Pro.local)/Desktop/Life/ChiHack/electrify-chicago/my-venv/bin/python\" -m pip install ipykernel -U --force-reinstall'"
+ ]
+ }
+ ],
+ "source": [
+ "import pandas as pd\n",
+ "import numpy as np\n",
+ "import plotly.subplots as sp\n",
+ "import plotly.graph_objects as go\n",
+ "import plotly.express as px\n",
+ "from plotly.offline import iplot\n",
+ "import plotly.io as pio\n",
+ "from plotly.subplots import make_subplots\n",
+ "import math\n",
+ "import statsmodels.api as sm \n",
+ "import os\n",
+ "from pathlib import Path\n",
+ "import json \n",
+ "\n",
+ "from IPython.display import Image\n",
+ "\n",
+ "from plotly.offline import init_notebook_mode\n",
+ "init_notebook_mode(connected=True)\n",
+ "\n",
+ "pd.set_option('display.max_columns', None)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Set pathing"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# get static dir for saving images\n",
+ "current_dir = Path.cwd()\n",
+ "project_root = current_dir\n",
+ "while True:\n",
+ " if os.path.basename(project_root) == 'electrify-chicago':\n",
+ " print(\"Success: Found 'electrify-chicago' as the base directory.\")\n",
+ " break\n",
+ " new_root = os.path.dirname(project_root)\n",
+ " if new_root == project_root: # Reached the filesystem root\n",
+ " raise FileNotFoundError(\"Error: 'electrify-chicago' directory not found in the path hierarchy.\")\n",
+ " project_root = new_root\n",
+ "static_blog_pth = os.path.join(project_root, 'static', 'blog', 'GHGIntensityPredictCompliance')\n",
+ "os.makedirs(static_blog_pth, exist_ok=True)\n",
+ "\n",
+ "expected_dir_name = \"analysis\"\n",
+ "fig_dir = os.path.join(current_dir, 'output', 'compliance_analysis')\n",
+ "\n",
+ "# Check if the current directory is the \"analysis\" folder\n",
+ "if current_dir.name != expected_dir_name:\n",
+ " raise AssertionError(f\"Expected working directory to be '{expected_dir_name}', but got '{current_dir.name}'.\\n\"\n",
+ " f\"Please ensure you are in the correct directory.\")\n",
+ "\n",
+ "print(f\"Current working directory is correctly set to '{current_dir}'.\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Notebook options and custom plotting function"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "reduce_memory = True # option to display some plotly graphs as static images to reduce memory, if possible\n",
+ "export_to_blog = True # if true, saves plots and regressions to blog static folder for website publishing"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "if export_to_blog:\n",
+ " dirs = [static_blog_pth, fig_dir]\n",
+ "else:\n",
+ " dirs = [fig_dir]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Function to display graphs\n",
+ "If you want to reduce the memory size of the notebook, set reduce_memory to True and this function will render some graphs as static PNG images instead of HTML"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def show_fig(fig, reduce_memory):\n",
+ " \"\"\"\n",
+ " Render a Plotly figure as a PNG image or an HTML visualization.\n",
+ "\n",
+ " Parameters:\n",
+ " - fig (plotly.graph_objs.Figure): The Plotly figure to render.\n",
+ " - reduce_memory (bool): If True, attempt to export the figure as a PNG \n",
+ " to save memory; defaults to HTML rendering on failure.\n",
+ "\n",
+ " Returns:\n",
+ " - tuple: The rendered figure (PNG or HTML) and the updated reduce_memory flag.\n",
+ " \"\"\"\n",
+ "\n",
+ " if reduce_memory:\n",
+ " try:\n",
+ " png_image = pio.to_image(fig, format='png')\n",
+ " return (png_image, reduce_memory)\n",
+ " \n",
+ " except RuntimeError as e:\n",
+ " print(\"Error exporting plotly to png, displaying html graph instead\\n\")\n",
+ " reduce_memory = False\n",
+ " \n",
+ " print(\"\"\"\n",
+ "Note: You may be recieving this error because of a unsolved bug in the Kaleido package (plotly dependency)\n",
+ "If you are using a virtual environment on a Mac with spaces in your directory pathname, you might get the error stating:\n",
+ " 'ValueError: Failed to start Kaleido subprocess. Error stream:'\n",
+ " \n",
+ "If this is the case, go to \n",
+ "'{Your venv name}/lib/python3.11/site-packages/kaleido/executable/kaleido'\n",
+ "\n",
+ "And add quotations to $DIR and \"$@\" like this:\n",
+ "\n",
+ " #!/bin/bash\n",
+ " DIR=\"$( cd \"$( dirname \"${BASH_SOURCE[0]}\" )\" >/dev/null 2>&1 && pwd )\"\n",
+ "\n",
+ " # Quote $DIR to handle spaces in the path\n",
+ " cd \"$DIR\"\n",
+ " ./bin/kaleido \"$@\"\n",
+ "\"\"\")\n",
+ "\n",
+ " if not reduce_memory:\n",
+ " return (iplot(fig), reduce_memory)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Read in data"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Construct the path to the CSV file (in the 'dist' folder one level above the current directory)\n",
+ "data_path = os.path.join( current_dir.parent, 'dist', 'benchmarking-all-years.csv')\n",
+ "df = pd.read_csv(data_path)\n",
+ "\n",
+ "# Create the \"reported\" column\n",
+ "df['Reported'] = df['GHGIntensity'].notna().astype(int)\n",
+ "\n",
+ "print(f\"There are {df['ID'].unique().shape[0]} unique building ids\")\n",
+ "\n",
+ "# Convert year to int\n",
+ "df['DataYear'] = df['DataYear'].astype(int)\n",
+ "df.head()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Read in Building Benchmark Data to get Building Names"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "names_path = os.path.join( current_dir.parent, 'dist', 'building-benchmarks.csv')\n",
+ "\n",
+ "building_names = pd.read_csv(names_path)[['ID', 'PropertyName' ]]\n",
+ "building_names.drop_duplicates(keep='first')\n",
+ "building_names.head()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Merge names to data"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df = pd.merge( df, building_names, how='left', on='ID')\n",
+ "df['PropertyName'] = df['PropertyName'].fillna(\"[Building Name Unavailable]\").replace(\"\", \"[Building Name Unavailable]\")\n",
+ "df.head()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Check that every building/year combo exists only once"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "group_counts = df.groupby(['ID', 'DataYear']).size()\n",
+ "\n",
+ "# Assert that the maximum count in any group is at most 1\n",
+ "assert group_counts.max() <= 1, \"There are buildings with more than one row in a given year!\""
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Analysis: Unique values for GHG Intensity\n",
+ "\n",
+ "Conclusion: strange that some values are highly represented while others are not. How are these calculated? "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Round GHG Intensity values to 1 decimal place\n",
+ "df['GHGIntensity'] = df['GHGIntensity'].round(1)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Plot Distribution of GHG Intensities\n",
+ "\n",
+ "Show histogram of how common GHG intensity values are, breaking out the outliers into a separate plot "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "\n",
+ "fig = sp.make_subplots(\n",
+ " rows=1,\n",
+ " cols=2,\n",
+ " column_widths=[0.8, 0.2],\n",
+ " horizontal_spacing=0.2,\n",
+ " subplot_titles = [\n",
+ " '', \n",
+ " 'GHG Intensity Outliers (GHG Intensity values > 50)'\n",
+ " ]\n",
+ ")\n",
+ "\n",
+ "fig.add_trace( go.Histogram(\n",
+ " x=df['GHGIntensity'],\n",
+ " xbins = dict(start=0, end=100, size=.5),\n",
+ " name='Histogram of Values',\n",
+ " hovertemplate = \" %{y} Buildings with GHG Intensity between %{x}\"\n",
+ " ),\n",
+ " row=1,col=1\n",
+ ")\n",
+ "\n",
+ "# Add a light red box to the first plot to show outliers\n",
+ "fig.add_shape(\n",
+ " type=\"rect\",\n",
+ " x0=50,\n",
+ " x1=100,\n",
+ " y0=0,\n",
+ " y1=1500,\n",
+ " fillcolor=\"rgba(255, 0, 0, 0.1)\", # Light red with transparency\n",
+ " line=dict(width=0),\n",
+ " row=1, col=1\n",
+ ")\n",
+ "\n",
+ "outlier_subset = df.dropna(subset=['GHGIntensity'])\n",
+ "outlier_subset = outlier_subset[outlier_subset['GHGIntensity']>50]\n",
+ "fig.add_trace(\n",
+ " go.Scatter(\n",
+ " x=[0] * len(outlier_subset['GHGIntensity']), # Make x an array of zeros with the correct length\n",
+ " y=outlier_subset['GHGIntensity'],\n",
+ " mode='markers',\n",
+ " marker=dict( color='blue', opacity=0.6),\n",
+ " customdata=df['DataYear'],\n",
+ " hovertext=outlier_subset['PropertyName'], # Add PropertyName to hovertext\n",
+ " hovertemplate=\"%{hovertext} GHG Intensity: %{y} in %{customdata}\",\n",
+ " name=''\n",
+ " ),\n",
+ " row=1, col=2\n",
+ ")\n",
+ "\n",
+ "# Add a light red background to the second subplot to show outliers\n",
+ "fig.add_shape(\n",
+ " type=\"rect\",\n",
+ " x0=-1,\n",
+ " x1=1,\n",
+ " y0=50,\n",
+ " y1=900,\n",
+ " fillcolor=\"rgba(255, 0, 0, 0.1)\", # Light red with transparency\n",
+ " line=dict(width=0),\n",
+ " layer=\"below\",\n",
+ " row=1, col=2\n",
+ ")\n",
+ "\n",
+ "\n",
+ "fig.update_xaxes(visible=False, row=1, col=2)\n",
+ "fig.update_xaxes(range=[0, 100], row=1,col=1)\n",
+ "fig.update_xaxes( title_text='' , row=1,col=2)\n",
+ "fig.update_yaxes( title_text='GHG Intensity', row=1,col=2)\n",
+ "\n",
+ "## Add an outline to the bars\n",
+ "fig.update_traces(marker=dict(line=dict(width=.1, color='black')))\n",
+ "\n",
+ "fig.add_annotation(\n",
+ " x=80,\n",
+ " y=300,\n",
+ " text=\"Some buildings had outlier GHG intensity levels (up to 800) →\",\n",
+ " showarrow=False, # No arrow for this annotation\n",
+ " font=dict(size=10), # Customize font size\n",
+ ")\n",
+ "\n",
+ "# Update layout for better display\n",
+ "fig.update_layout(\n",
+ " xaxis_title='GHG Intensity',\n",
+ " yaxis_title='Count',\n",
+ " showlegend=False,\n",
+ " title='Distribution of GHG Intensities',\n",
+ " height=400,\n",
+ " width=800\n",
+ ")\n",
+ "\n",
+ "# Show the plot\n",
+ "#pio.show(fig)\n",
+ "iplot(fig)\n",
+ "\n",
+ "for dir in [static_blog_pth, fig_dir]:\n",
+ " fig.write_html( os.path.join(dir,'distribution_of_GHG_intensity.html'), include_plotlyjs=\"cdn\" ) # 'cdn' reduces memory of file by not including all necessary JS"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Compliance type counts over time\n",
+ "\n",
+ "Count the number of reporting / non-reporting for every year, highlighting the impact of COVID"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Count reporting and non-reporting buildings per year\n",
+ "value_counts = df.groupby('DataYear')['Reported'].value_counts()\n",
+ "non_reporting_counts = value_counts.xs(0, level='Reported')\n",
+ "reporting_counts = value_counts.xs(1, level='Reported')\n",
+ "\n",
+ "# Create the figure\n",
+ "fig = go.Figure()\n",
+ "\n",
+ "# Add traces for each category\n",
+ "fig.add_trace(go.Scatter(x=reporting_counts.index, y=reporting_counts.values,\n",
+ " mode='lines+markers', name='Reported',\n",
+ " line=dict(width=4, color='rgba(0, 0,255, 0.7)'),\n",
+ " marker=dict(symbol='circle', size=10)))\n",
+ "\n",
+ "fig.add_trace(go.Scatter(x=non_reporting_counts.index, y=non_reporting_counts.values,\n",
+ " mode='lines+markers', name=\"Didn't Report\",\n",
+ " line=dict(width=4, color='rgba(255, 0, 0, 0.7)'),\n",
+ " marker=dict(symbol='circle', size=10)))\n",
+ "\n",
+ "fig.add_trace( go.Scatter( x=[2018.5, 2019.5, 2019.5, 2018.5, 2018.5], \n",
+ " y=[0, 0, np.max(reporting_counts)*1.2, np.max(reporting_counts)*1.2, 0 ], \n",
+ " fill='toself', mode='lines', name='Covid Data Disruption' )\n",
+ ")\n",
+ "\n",
+ "# Update layout\n",
+ "fig.update_layout(title=\"Count of Buildings That Did/Didn't Report Emissions by Year\",\n",
+ " xaxis_title='Year of Emissions (One year before data is reported)',\n",
+ " yaxis_title='Count of Buildings',\n",
+ " legend_title='Category')\n",
+ "\n",
+ "# Show the plot\n",
+ "#pio.show(fig)\n",
+ "iplot(fig)\n",
+ "\n",
+ "\n",
+ "for dir in [static_blog_pth, fig_dir]:\n",
+ " fig.write_html( os.path.join(dir,'reporting_counts_over_time.html'), include_plotlyjs=\"cdn\" )\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Analysis: Covid saw temporary drop in building reporting\n",
+ "Buildings report the prior year's emissions data the following spring. Thus, 2019's data was supposed to be reported in Spring 2020, but it seems like Covid disrupted this. As a result, many observations of non-reporting are because of Covid. \n",
+ "\n",
+ "For robustness, we will create a subset of the dataframe without the covid data later in this code"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Analysis: Correlation of \"GHG Intensity Last Year\" and \"Compliance This Year\"\n",
+ "\n",
+ "### First: calculate \"GHG Intensity Last Year\" and \"Change in GHG Intensity Last Year\"\n",
+ "\n",
+ "- GHG Intensity Last Year is just the GHG intensity the year prior\n",
+ " - i.e. if the Data Year is 2019, then 'GHG Intensity Last Year' would be equal to GHG Intensity in 2018\n",
+ "\n",
+ "- Change in GHG Intensity Last Year is equal to the difference between the GHG intensity from two years ago to one year ago\n",
+ " - i.e. if the Data Year is 2019, Change in GHG Intensity Last year is equal to the GHG intensity from 2018 minus the GHG Intensity from 2017"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Create the new column 'GHGIntensity_LastYear'\n",
+ "df['GHGIntensity_LastYear'] = df.apply(\n",
+ " lambda row: df.loc[\n",
+ " (df['DataYear'] == row['DataYear'] - 1) & \n",
+ " (df['Reported'] == 1) & \n",
+ " (df['ID'] == row['ID']), # Match by id\n",
+ " 'GHGIntensity'\n",
+ " ].iloc[0] if len(df.loc[\n",
+ " (df['DataYear'] == row['DataYear'] - 1) & \n",
+ " (df['Reported'] == 1) & \n",
+ " (df['ID'] == row['ID']), # Match by id\n",
+ " 'GHGIntensity'\n",
+ " ]) > 0 else None,\n",
+ " axis=1\n",
+ ")\n",
+ "\n",
+ "# Create the new column 'Chng_GHGIntensity_LastYear' (absolute change, not a percentage)\n",
+ "df['Chng_GHGIntensity_LastYear'] = df.apply(\n",
+ " lambda row: (\n",
+ " (\n",
+ " df.loc[\n",
+ " (df['DataYear'] == row['DataYear'] - 1) & \n",
+ " (df['Reported'] == 1) & \n",
+ " (df['ID'] == row['ID']), # Match by id\n",
+ " 'GHGIntensity'\n",
+ " ].iloc[0]\n",
+ "\n",
+ " -\n",
+ "\n",
+ " df.loc[\n",
+ " (df['DataYear'] == row['DataYear'] - 2) & \n",
+ " (df['Reported'] == 1) & \n",
+ " (df['ID'] == row['ID']), # Match by id\n",
+ " 'GHGIntensity'\n",
+ " ].iloc[0]\n",
+ " ) \n",
+ " \n",
+ " ) if len(df.loc[\n",
+ " (df['DataYear'] == row['DataYear'] - 1) & \n",
+ " (df['Reported'] == 1) & \n",
+ " (df['ID'] == row['ID']), # Match by id\n",
+ " 'GHGIntensity'\n",
+ " ]) == 1 and len(df.loc[\n",
+ " (df['DataYear'] == row['DataYear'] - 2) & \n",
+ " (df['Reported'] == 1) & \n",
+ " (df['ID'] == row['ID']), # Match by id\n",
+ " 'GHGIntensity'\n",
+ " ]) == 1 else None,\n",
+ " axis=1\n",
+ ")\n",
+ "\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Demonstrate newly calculated values"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df.sort_values( ['ID', 'DataYear']).head()[['DataYear', 'ID','GHGIntensity', 'GHGIntensity_LastYear', 'Chng_GHGIntensity_LastYear']]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Basic Regression Models to Determine Effects of GHG Intensity on Reporting"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def linear_prob_model(df, regressors, target):\n",
+ "\n",
+ " # Start with a copy of the DataFrame\n",
+ " df_cleaned = df.copy()\n",
+ "\n",
+ "\n",
+ " # Initialize a dictionary to track dropped rows for each regressor\n",
+ " dropped_rows = {}\n",
+ "\n",
+ " # Drop rows with missing values for each regressor and track the counts\n",
+ " nrow_init = df_cleaned.shape[0]\n",
+ " for regressor in regressors:\n",
+ " initial_rows = len(df_cleaned)\n",
+ " df_cleaned = df_cleaned.dropna(subset=[regressor])\n",
+ " dropped_rows[regressor] = initial_rows - len(df_cleaned)\n",
+ "\n",
+ " # Report the number of dropped rows for each regressor\n",
+ " for regressor, count in dropped_rows.items():\n",
+ " print(f\"Rows dropped due to missing values in '{regressor}': {count} ({round(100*(count/nrow_init), 1)}%)\")\n",
+ "\n",
+ " # Set up the linear probability model\n",
+ " X = df_cleaned[regressors]\n",
+ " X = sm.add_constant(X) # Add a constant for the intercept\n",
+ " y = df_cleaned[target]\n",
+ "\n",
+ " # Fit the regression model\n",
+ " model = sm.OLS(y, X).fit()\n",
+ "\n",
+ " # Display the summary of the model\n",
+ " print(model.summary())\n",
+ "\n",
+ " return model"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Define the binary target column and regressors\n",
+ "df['GrossFloorArea_Mil'] = df['GrossFloorArea']/1000000\n",
+ "df['non_reporting'] = (df['Reported']==0).astype(int)\n",
+ "\n",
+ "target_var = 'non_reporting'\n",
+ "regressors = [\n",
+ " 'GHGIntensity_LastYear',\n",
+ " 'Chng_GHGIntensity_LastYear',\n",
+ " 'GrossFloorArea_Mil'\n",
+ "]\n",
+ "\n",
+ "# Define variables\n",
+ "target_var = 'non_reporting'\n",
+ "regressors = [\n",
+ " 'GHGIntensity_LastYear',\n",
+ " 'Chng_GHGIntensity_LastYear',\n",
+ " 'GrossFloorArea_Mil'\n",
+ "]\n",
+ "\n",
+ "# Map pretty names for each variable\n",
+ "pretty_names = {\n",
+ " 'non_reporting': 'Non-Reporting',\n",
+ " 'GHGIntensity_LastYear': 'GHG Intensity (Last Year)',\n",
+ " 'Chng_GHGIntensity_LastYear': 'Change in GHG Intensity Last Two Years',\n",
+ " 'GrossFloorArea_Mil': 'Gross Floor Area (Millions)'\n",
+ "}\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Basic Regression"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "model = linear_prob_model(df, regressors, target_var)\n",
+ "\n",
+ "# Extract regression results\n",
+ "results = {\n",
+ " \"dependent_variable\": pretty_names[target_var], # Pretty name for dependent variable\n",
+ " \"number_of_observations\": int(model.nobs), # Number of observations\n",
+ " \"r_squared\": model.rsquared, # R-squared\n",
+ " \"adj_r_squared\": model.rsquared_adj, # Adjusted R-squared\n",
+ " \"coefficients\": { # Coefficients with pretty names\n",
+ " pretty_names.get(name, name): coef for name, coef in model.params.items()\n",
+ " },\n",
+ " \"p_values\": { # P-values with pretty names\n",
+ " pretty_names.get(name, name): pval for name, pval in model.pvalues.items()\n",
+ " },\n",
+ " \"confidence_intervals\": { # 95% confidence intervals with pretty names\n",
+ " pretty_names.get(name, name): list(ci) for name, ci in model.conf_int().iterrows()\n",
+ " },\n",
+ " \"covariance_type\": model.cov_type, # Covariance type\n",
+ "}\n",
+ "\n",
+ "# Save as JSON\n",
+ "for dir in dirs:\n",
+ " fpath = os.path.join(dir,'regression_results_w_covid.json')\n",
+ " with open(fpath, \"w\") as f:\n",
+ " json.dump(results, f, indent=4)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Analysis of Regression Results: No Effect Found\n",
+ "Using different regression combinations of \n",
+ "- GHG Intensity Last Year\n",
+ "- Change in GHG Intensity from 2 years ago to 1 year ago\n",
+ "- Square footage of building\n",
+ "\n",
+ "there generally is no effect of GHG Intensity last year or change in GHG intensity from two years ago to 1 year ago. \n",
+ "\n",
+ "Interestingly but unrelatedly, it seems that larger buildings are more likely to be compliant in their reporting. The effect size is small though"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Graph to Show Averages\n",
+ "\n",
+ "Show the mean and median value of GHG Intensity (or Change in GHG Intensity Last Year) by compliance status"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "\n",
+ "\n",
+ "def create_scatter_with_means(df, x, y, hover_data, value_mapping, base_fontsize=12, x_label=None, y_label=None, title=None):\n",
+ " \"\"\"\n",
+ " Create a series of side-by-side subplot scatter plots (one per unique x category),\n",
+ " each with a horizontal mean line. The subplots are arranged horizontally with minimal gap.\n",
+ " \n",
+ " Parameters:\n",
+ " - df: DataFrame containing the data.\n",
+ " - x: Column name for the x-axis category variable.\n",
+ " - y: Column name for the continuous y-axis variable.\n",
+ " - hover_data: List of columns to display on hover.\n",
+ " - value_mapping: Dictionary to map the values of the x column (often binary) to meaningful labels.\n",
+ " - base_fontsize: Base font size for scaling other font sizes.\n",
+ " - x_label: Label for the x-axis (overall).\n",
+ " - y_label: Label for the y-axis (overall).\n",
+ " - title: Main title of the entire figure.\n",
+ " \"\"\"\n",
+ " \n",
+ " # Copy and prepare the dataframe\n",
+ " df = df.copy()\n",
+ " df[x] = df[x].map(value_mapping)\n",
+ " \n",
+ " # Identify unique categories in the x variable\n",
+ " categories = df[x].unique()\n",
+ " categories = [cat for cat in categories if pd.notnull(cat)] # Ensure no NaN\n",
+ " \n",
+ " # Calculate mean and median per category\n",
+ " mean_values = df.groupby(x)[y].mean()\n",
+ " median_values = df.groupby(x)[y].median()\n",
+ " \n",
+ " # Create subplots: one row, as many columns as unique categories\n",
+ " fig = make_subplots(\n",
+ " rows=1, cols=len(categories),\n",
+ " horizontal_spacing=0.1, # Narrow gap between subplots\n",
+ " shared_yaxes=True, # Share the same y-axis\n",
+ " subplot_titles=categories\n",
+ " )\n",
+ "\n",
+ " colors = ['rgba(0, 0, 255, 1)', 'rgba(255, 0, 0, 1)'] \n",
+ " colors_t = ['rgba(0, 0, 255, 0.5)', 'rgba(255, 0, 0, 0.5)'] # Transparent versions of 'blue' and 'red'\n",
+ " \n",
+ " # For each category, add a scatter trace\n",
+ " for i, category in enumerate(categories, start=1):\n",
+ "\n",
+ " cat_data = df[df[x] == category]\n",
+ " \n",
+ " # Add a small horizontal jitter around x=1 for the scatter\n",
+ " jitter = np.random.uniform(0.999, 1.001, size=len(cat_data))\n",
+ " \n",
+ " # Create the scatter trace\n",
+ " fig.add_trace(\n",
+ " go.Scatter(\n",
+ " x=jitter,\n",
+ " y=cat_data[y],\n",
+ " mode='markers',\n",
+ " marker=dict(size=8, opacity=0.8, color=colors_t[i-1]),\n",
+ " hovertemplate=' '.join([f\"{col}: %{{customdata[{idx}]}}\" \n",
+ " for idx, col in enumerate(hover_data)]) if hover_data else None,\n",
+ " customdata=cat_data[hover_data] if hover_data else None,\n",
+ " name = f'{y_label} for \"{category}\"'\n",
+ " ),\n",
+ " row=1, col=i\n",
+ " )\n",
+ " \n",
+ " # Add mean line\n",
+ " mean_val = mean_values.loc[category]\n",
+ " fig.add_hline(\n",
+ " y=mean_val,\n",
+ " line_dash=\"dash\",\n",
+ " line_color=colors[i-1],\n",
+ " row=1, col=i,\n",
+ " name = f'Mean {y_label}',\n",
+ " line_width=3\n",
+ " )\n",
+ " \n",
+ " # Add annotation for mean and median\n",
+ " median_val = median_values.loc[category]\n",
+ " fig.add_annotation(\n",
+ " x=1.001, # slightly to the right of the main cluster\n",
+ " y=mean_val+20,\n",
+ " text=f\"Mean: {mean_val:.2f} Median: {median_val:.2f}\",\n",
+ " showarrow=False,\n",
+ " xanchor='left',\n",
+ " yanchor='bottom',\n",
+ " font=dict(size=base_fontsize * 1.2, color=colors[i-1]),\n",
+ " row=1, col=i\n",
+ " )\n",
+ " \n",
+ " # Update x-axis for this subplot (just show a single vertical line)\n",
+ " fig.update_xaxes(\n",
+ " showline=True,\n",
+ " linecolor='black',\n",
+ " zeroline=True,\n",
+ " showticklabels=False,\n",
+ " showgrid=False,\n",
+ " range=[0.99, 1.01], # tight range around jitter\n",
+ " tickfont=dict(size=base_fontsize * 1.2),\n",
+ " row=1, col=i\n",
+ " )\n",
+ "\n",
+ " # Update the y-axis (shared) styling\n",
+ " fig.update_yaxes(\n",
+ " showline=True, \n",
+ " linecolor='black',\n",
+ " showgrid=False,\n",
+ " title_text=y_label, \n",
+ " title_font=dict(size=base_fontsize * 1.4),\n",
+ " tickfont=dict(size=base_fontsize * 1.2),\n",
+ " row=1, col=1\n",
+ " )\n",
+ " \n",
+ " # Update the layout\n",
+ " fig.update_layout(\n",
+ " height=400,\n",
+ " width=400*len(categories), # adjust width based on number of categories\n",
+ " title=dict(\n",
+ " text=title or f'Scatterplot of {y} by {x}',\n",
+ " x=0.5,\n",
+ " font=dict(size=base_fontsize * 1.3)\n",
+ " ),\n",
+ " plot_bgcolor='white',\n",
+ " margin=dict(l=40, r=40, t=150, b=80)\n",
+ " )\n",
+ "\n",
+ " return fig"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Comparing Change in GHG Intensity"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "value_mapping = {1: \"Didn't Report\", 0: \"Reported\"}\n",
+ "fig = create_scatter_with_means(\n",
+ " df=df.dropna(subset=['Chng_GHGIntensity_LastYear']),\n",
+ " x='non_reporting',\n",
+ " y='Chng_GHGIntensity_LastYear',\n",
+ " #hover_data=['PropertyName', 'DataYear', 'non_reporting', 'GHGIntensity_LastYear', 'Chng_GHGIntensity_LastYear', 'GrossFloorArea_Mil'],\n",
+ " hover_data=['PropertyName', 'DataYear'], # Reduced columns to make file size smaller\n",
+ " value_mapping=value_mapping,\n",
+ " base_fontsize=12,\n",
+ " x_label='Reported This Year?',\n",
+ " y_label='Change In GHG Intensity',\n",
+ " title='Change In GHG Intensity (In Last 2 Years) By Reporting Compliance This Year'\n",
+ ")\n",
+ "\n",
+ "\n",
+ "iplot(fig)\n",
+ "for dir in dirs:\n",
+ " fig.write_html( os.path.join(dir,'change_GHG_trend_by_compliance.html'), include_plotlyjs=\"cdn\" )\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Same graph, but look at GHG Intensity instead of Change in GHG Intensity"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "\n",
+ "value_mapping = {1: \"Didn't Report\", 0: \"Reported\"}\n",
+ "fig = create_scatter_with_means(\n",
+ " df=df.dropna(subset=['GHGIntensity_LastYear']),\n",
+ " x='non_reporting',\n",
+ " y='GHGIntensity_LastYear',\n",
+ " #hover_data=['PropertyName', 'DataYear', 'non_reporting', 'GHGIntensity_LastYear', 'Chng_GHGIntensity_LastYear', 'GrossFloorArea_Mil'],\n",
+ " hover_data=['PropertyName', 'DataYear'], # Reduced columns to make file size smaller\n",
+ " value_mapping=value_mapping,\n",
+ " base_fontsize=12,\n",
+ " x_label='Reported This Year?',\n",
+ " y_label='GHG Intensity',\n",
+ " title='GHG Intensity Last Year By Reporting Compliance This Year'\n",
+ ")\n",
+ "\n",
+ "\n",
+ "iplot(fig)\n",
+ "for dir in dirs:\n",
+ " fig.write_html( os.path.join(dir,'GHG_last_year_by_compliance.html'), include_plotlyjs=\"cdn\" )\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "---\n",
+ "\n",
+ "## Robustness: Dropping Covid\n",
+ "\n",
+ "Do the results change at all if we drop year=2019? (the year of the covid data anomaly)\n",
+ "\n",
+ "### Regression results:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "n_obs_original = df.shape[0]\n",
+ "df_no_covid = df.copy()\n",
+ "df_no_covid = df_no_covid[df_no_covid['DataYear']!=2019]\n",
+ "\n",
+ "\n",
+ "print(f\"Dropped {n_obs_original - len(df_no_covid)} observations out of {n_obs_original} ({round(100*(n_obs_original - len(df_no_covid))/n_obs_original, 2)}%) by dropping Covid year (2019)\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Count reporting and non-reporting buildings per year\n",
+ "value_counts = df_no_covid.groupby('DataYear')['Reported'].value_counts()\n",
+ "non_reporting_counts = value_counts.xs(0, level='Reported')\n",
+ "reporting_counts = value_counts.xs(1, level='Reported')\n",
+ "\n",
+ "non_reporting_counts[2019] = 0\n",
+ "reporting_counts[2019] = 0\n",
+ "\n",
+ "non_reporting_counts.index = non_reporting_counts.index.astype(int)\n",
+ "reporting_counts.index = reporting_counts.index.astype(int)\n",
+ "\n",
+ "non_reporting_counts = non_reporting_counts.sort_index()\n",
+ "reporting_counts = reporting_counts.sort_index()\n",
+ "\n",
+ "# Create the figure\n",
+ "fig = go.Figure()\n",
+ "\n",
+ "# Add traces for each category\n",
+ "fig.add_trace(go.Scatter(x=reporting_counts.index, y=reporting_counts.values,\n",
+ " mode='lines+markers', name='Reported',\n",
+ " line=dict(width=4, color='rgba(0, 0,255, 0.7)'),\n",
+ " marker=dict(symbol='circle', size=10)))\n",
+ "\n",
+ "fig.add_trace(go.Scatter(x=non_reporting_counts.index, y=non_reporting_counts.values,\n",
+ " mode='lines+markers', name=\"Didn't Report\",\n",
+ " line=dict(width=4, color='rgba(255, 0, 0, 0.7)'),\n",
+ " marker=dict(symbol='circle', size=10)))\n",
+ "\n",
+ "fig.add_trace( go.Scatter( x=[2018.5, 2019.5, 2019.5, 2018.5, 2018.5], \n",
+ " y=[0, 0, np.max(reporting_counts)*1.2, np.max(reporting_counts)*1.2, 0 ], \n",
+ " fill='toself', mode='lines', name='Covid Data Disruption' )\n",
+ ")\n",
+ "\n",
+ "# Update layout\n",
+ "fig.update_layout(title=\"No Covid: Count of Buildings That Did/Didn't Report Emissions by Year\",\n",
+ " xaxis_title='Year of Emissions (One year before data is reported)',\n",
+ " yaxis_title='Count of Buildings',\n",
+ " legend_title='Category')\n",
+ "\n",
+ "\n",
+ "\n",
+ "out, reduce_memory = show_fig(fig, reduce_memory)\n",
+ "\n",
+ "if reduce_memory:\n",
+ " with open(os.path.join(fig_dir,'reporting_counts_over_time_NO_COVID.png'), \"wb\") as f:\n",
+ " f.write(out)\n",
+ " out = Image(out)\n",
+ "out\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "linear_prob_model(df_no_covid, regressors, target_var)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Results: It still seems like there isn't really a relationship, although GHG intensity seems to be associated with a tiny increase in non-reporting. The magnitude is very small and the likelihood that this is due to chance is high. "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "value_mapping = {1: \"Didn't Report\", 0: \"Reported\"}\n",
+ "fig = create_scatter_with_means(\n",
+ " df=df_no_covid,\n",
+ " x='non_reporting',\n",
+ " y='Chng_GHGIntensity_LastYear',\n",
+ " hover_data=['PropertyName', 'DataYear', 'non_reporting', 'GHGIntensity_LastYear', 'Chng_GHGIntensity_LastYear', 'GrossFloorArea_Mil'],\n",
+ " value_mapping=value_mapping,\n",
+ " base_fontsize=12,\n",
+ " x_label='Reported This Year?',\n",
+ " y_label='Change In GHG Intensity',\n",
+ " title='No Covid: Change In GHG Intensity (In Last 2 Years) By Reporting Compliance This Year'\n",
+ ")\n",
+ "\n",
+ "\n",
+ "out, reduce_memory = show_fig(fig, reduce_memory)\n",
+ "\n",
+ "\n",
+ "if reduce_memory:\n",
+ " with open(os.path.join(fig_dir,'change_GHG_trend_by_compliance_NO_COVID.png'), \"wb\") as f:\n",
+ " f.write(out)\n",
+ " out = Image(out)\n",
+ "out"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "\n",
+ "\n",
+ "value_mapping = {1: \"Didn't Report\", 0: \"Reported\"}\n",
+ "fig = create_scatter_with_means(\n",
+ " df=df_no_covid,\n",
+ " x='non_reporting',\n",
+ " y='GHGIntensity_LastYear',\n",
+ " hover_data=['PropertyName', 'DataYear', 'non_reporting', 'GHGIntensity_LastYear', 'Chng_GHGIntensity_LastYear', 'GrossFloorArea_Mil'],\n",
+ " value_mapping=value_mapping,\n",
+ " base_fontsize=12,\n",
+ " x_label='Reported This Year?',\n",
+ " y_label='GHG Intensity',\n",
+ " title='No Covid: GHG Intensity Last Year By Reporting Compliance This Year'\n",
+ ")\n",
+ "\n",
+ "out, reduce_memory = show_fig(fig, reduce_memory)\n",
+ "out \n",
+ "\n",
+ "if reduce_memory:\n",
+ " with open(os.path.join(fig_dir,'GHG_last_year_by_compliance_NO_COVID.png'), \"wb\") as f:\n",
+ " f.write(out)\n",
+ " out = Image(out)\n",
+ "out"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Drop outliers "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "n_obs_original = df.shape[0]\n",
+ "\n",
+ "# Identify buildings with any row having GHGIntensity > 100\n",
+ "buildings_with_outliers = df.loc[df['GHGIntensity'] > 100, 'ID'].unique()\n",
+ "\n",
+ "# Drop all rows for those buildings\n",
+ "df_no_outliers = df[~df['ID'].isin(buildings_with_outliers)]\n",
+ "\n",
+ "print(f\"Dropped {n_obs_original - len(df_no_outliers)} observations out of {n_obs_original} ({round(100*(n_obs_original - len(df_no_outliers))/n_obs_original, 2)}%) by dropping all rows for buildings with any GHGIntensity over 100\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "linear_prob_model(df_no_outliers, regressors, target_var)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "value_mapping = {1: \"Didn't Report\", 0: \"Reported\"}\n",
+ "fig = create_scatter_with_means(\n",
+ " df=df_no_outliers,\n",
+ " x='non_reporting',\n",
+ " y='Chng_GHGIntensity_LastYear',\n",
+ " hover_data=['PropertyName', 'DataYear', 'non_reporting', 'GHGIntensity_LastYear', 'Chng_GHGIntensity_LastYear', 'GrossFloorArea_Mil'],\n",
+ " value_mapping=value_mapping,\n",
+ " base_fontsize=12,\n",
+ " x_label='Reported This Year?',\n",
+ " y_label='Change In GHG Intensity',\n",
+ " title='No Outliers: Change In GHG Intensity (In Last 2 Years) By Reporting Compliance This Year'\n",
+ ")\n",
+ "\n",
+ "out, reduce_memory = show_fig(fig, reduce_memory)\n",
+ "out \n",
+ "\n",
+ "if reduce_memory:\n",
+ " with open(os.path.join(fig_dir,'change_GHG_trend_by_compliance_NO_OUTLIERS.png'), \"wb\") as f:\n",
+ " f.write(out)\n",
+ " out = Image(out)\n",
+ "out\n",
+ " \n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Plot last year's GHG intensity by reporting status, excluding outlier buildings\n",
+ "\n",
+ "value_mapping = {1: \"Didn't Report\", 0: \"Reported\"}\n",
+ "fig = create_scatter_with_means(\n",
+ " df=df_no_outliers,\n",
+ " x='non_reporting',\n",
+ " y='GHGIntensity_LastYear',\n",
+ " hover_data=['PropertyName', 'DataYear', 'non_reporting', 'GHGIntensity_LastYear', 'Chng_GHGIntensity_LastYear', 'GrossFloorArea_Mil'],\n",
+ " value_mapping=value_mapping,\n",
+ " base_fontsize=12,\n",
+ " x_label='Reported This Year?',\n",
+ " y_label='GHG Intensity',\n",
+ " title='No Outliers: GHG Intensity Last Year By Reporting Compliance This Year'\n",
+ ")\n",
+ "\n",
+ "out, reduce_memory = show_fig(fig, reduce_memory)\n",
+ "out \n",
+ "\n",
+ " \n",
+ "if reduce_memory:\n",
+ " with open(os.path.join(fig_dir,'GHG_last_year_by_compliance_NO_OUTLIERS.png'), \"wb\") as f:\n",
+ " f.write(out)\n",
+ " out = Image(out)\n",
+ "out"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Results: Dropping outliers\n",
+ "\n",
+ "Again, no significant result. The only thing we see from dropping the outliers is that having a higher GHG intensity now \n",
+ "seems to be associated WITH reporting, which is the opposite of our hypothesis (reporting buildings have higher mean and median GHG intensity). Again,\n",
+ "the magnitude is very small so we can likely ignore this. "
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Some extra graphs comparing the two variables: Change in GHG Intensity and GHG Intensity last year"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 30,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "\n",
+ "\n",
+ "def customizable_scatterplot(df, x_var, y_var, color_var, x_limits=None, y_limits=None, title=None):\n",
+ " \"\"\"\n",
+ " Create a customizable scatterplot.\n",
+ "\n",
+ " Parameters:\n",
+ " - df: DataFrame containing the data.\n",
+ " - x_var: Column name for the x-axis.\n",
+ " - y_var: Column name for the y-axis.\n",
+ " - color_var: Column name for the binary variable to set discrete coloring.\n",
+ " - x_limits: Tuple (min, max) to set x-axis limits.\n",
+ " - y_limits: Tuple (min, max) to set y-axis limits.\n",
+ " - title: Title of the plot (default: None).\n",
+ " \"\"\"\n",
+ " # Ensure the color_var is treated as a categorical variable for discrete coloring\n",
+ " df[color_var] = df[color_var].astype(str)\n",
+ "\n",
+ " # Create scatterplot\n",
+ " fig = px.scatter(\n",
+ " df,\n",
+ " x=x_var,\n",
+ " y=y_var,\n",
+ " color=color_var,\n",
+ " color_discrete_sequence=[\"blue\", \"orange\"], # Discrete colors for binary 0 and 1\n",
+ " opacity=0.2, # Add transparency here\n",
+ " labels={x_var: x_var, y_var: y_var, color_var: color_var},\n",
+ " title=title or f'Scatterplot of {y_var} vs {x_var}'\n",
+ " )\n",
+ " \n",
+ " # Set axis limits if provided\n",
+ " if x_limits:\n",
+ " fig.update_xaxes(range=x_limits)\n",
+ " if y_limits:\n",
+ " fig.update_yaxes(range=y_limits)\n",
+ " \n",
+ " # Update layout for better aesthetics\n",
+ " fig.update_layout(\n",
+ " height=600,\n",
+ " width=800,\n",
+ " legend_title=color_var\n",
+ " )\n",
+ " \n",
+ "\n",
+ " return fig"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "fig = customizable_scatterplot(df, 'GHGIntensity_LastYear', 'Chng_GHGIntensity_LastYear', 'non_reporting' , \n",
+ " x_limits =[0,40], y_limits=[-10,10], title=\"Scatterplot of 'Change in GHG Intensity' vs 'GHG Intensity Last Year'\")\n",
+ "\n",
+ "out, reduce_memory = show_fig(fig, reduce_memory)\n",
+ "\n",
+ "if reduce_memory:\n",
+ " with open(os.path.join(fig_dir,'scatterplot_of_GHG_last_year_by_GHG_trend.png'), \"wb\") as f:\n",
+ " f.write(out)\n",
+ " out = Image(out)\n",
+ "out"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "my-venv",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.9.18"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/src/data/analysis/output/compliance_analysis/GHG_last_year_by_compliance.html b/src/data/analysis/output/compliance_analysis/GHG_last_year_by_compliance.html
new file mode 100644
index 00000000..5e9da90f
--- /dev/null
+++ b/src/data/analysis/output/compliance_analysis/GHG_last_year_by_compliance.html
@@ -0,0 +1,7 @@
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/src/data/analysis/output/compliance_analysis/GHG_last_year_by_compliance_NO_COVID.png b/src/data/analysis/output/compliance_analysis/GHG_last_year_by_compliance_NO_COVID.png
new file mode 100644
index 00000000..1850d98c
Binary files /dev/null and b/src/data/analysis/output/compliance_analysis/GHG_last_year_by_compliance_NO_COVID.png differ
diff --git a/src/data/analysis/output/compliance_analysis/GHG_last_year_by_compliance_NO_OUTLIERS.png b/src/data/analysis/output/compliance_analysis/GHG_last_year_by_compliance_NO_OUTLIERS.png
new file mode 100644
index 00000000..208281fe
Binary files /dev/null and b/src/data/analysis/output/compliance_analysis/GHG_last_year_by_compliance_NO_OUTLIERS.png differ
diff --git a/src/data/analysis/output/compliance_analysis/change_GHG_trend_by_compliance.html b/src/data/analysis/output/compliance_analysis/change_GHG_trend_by_compliance.html
new file mode 100644
index 00000000..62d87c79
--- /dev/null
+++ b/src/data/analysis/output/compliance_analysis/change_GHG_trend_by_compliance.html
@@ -0,0 +1,7 @@
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/src/data/analysis/output/compliance_analysis/change_GHG_trend_by_compliance_NO_COVID.png b/src/data/analysis/output/compliance_analysis/change_GHG_trend_by_compliance_NO_COVID.png
new file mode 100644
index 00000000..7b5bed80
Binary files /dev/null and b/src/data/analysis/output/compliance_analysis/change_GHG_trend_by_compliance_NO_COVID.png differ
diff --git a/src/data/analysis/output/compliance_analysis/change_GHG_trend_by_compliance_NO_OUTLIERS.png b/src/data/analysis/output/compliance_analysis/change_GHG_trend_by_compliance_NO_OUTLIERS.png
new file mode 100644
index 00000000..2a809266
Binary files /dev/null and b/src/data/analysis/output/compliance_analysis/change_GHG_trend_by_compliance_NO_OUTLIERS.png differ
diff --git a/src/data/analysis/output/compliance_analysis/distribution_of_GHG_intensity.html b/src/data/analysis/output/compliance_analysis/distribution_of_GHG_intensity.html
new file mode 100644
index 00000000..10cb4dee
--- /dev/null
+++ b/src/data/analysis/output/compliance_analysis/distribution_of_GHG_intensity.html
@@ -0,0 +1,7 @@
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/src/data/analysis/output/compliance_analysis/regression_results_w_covid.json b/src/data/analysis/output/compliance_analysis/regression_results_w_covid.json
new file mode 100644
index 00000000..2951a75b
--- /dev/null
+++ b/src/data/analysis/output/compliance_analysis/regression_results_w_covid.json
@@ -0,0 +1,37 @@
+{
+ "dependent_variable": "Non-Reporting",
+ "number_of_observations": 10588,
+ "r_squared": 0.0006814442215383743,
+ "adj_r_squared": 0.00039819066264434877,
+ "coefficients": {
+ "const": 0.12735628464907744,
+ "GHG Intensity (Last Year)": -6.42240400826807e-08,
+ "Change in GHG Intensity Last Two Years": 0.00020244263665042202,
+ "Gross Floor Area (Millions)": -0.017037109384991228
+ },
+ "p_values": {
+ "const": 8.632077178330187e-174,
+ "GHG Intensity (Last Year)": 0.9997894807085226,
+ "Change in GHG Intensity Last Two Years": 0.4398536725259733,
+ "Gross Floor Area (Millions)": 0.010176545692003403
+ },
+ "confidence_intervals": {
+ "const": [
+ 0.11863904983380956,
+ 0.13607351946434532
+ ],
+ "GHG Intensity (Last Year)": [
+ -0.0004771904393046634,
+ 0.000477061991224498
+ ],
+ "Change in GHG Intensity Last Two Years": [
+ -0.0003112678381680631,
+ 0.0007161531114689071
+ ],
+ "Gross Floor Area (Millions)": [
+ -0.030030457208608786,
+ -0.004043761561373668
+ ]
+ },
+ "covariance_type": "nonrobust"
+}
\ No newline at end of file
diff --git a/src/data/analysis/output/compliance_analysis/reporting_counts_over_time.html b/src/data/analysis/output/compliance_analysis/reporting_counts_over_time.html
new file mode 100644
index 00000000..d06ed83c
--- /dev/null
+++ b/src/data/analysis/output/compliance_analysis/reporting_counts_over_time.html
@@ -0,0 +1,7 @@
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/src/data/analysis/output/compliance_analysis/reporting_counts_over_time_NO_COVID.png b/src/data/analysis/output/compliance_analysis/reporting_counts_over_time_NO_COVID.png
new file mode 100644
index 00000000..3a32c2d5
Binary files /dev/null and b/src/data/analysis/output/compliance_analysis/reporting_counts_over_time_NO_COVID.png differ
diff --git a/src/data/analysis/output/compliance_analysis/scatterplot_of_GHG_last_year_by_GHG_trend.png b/src/data/analysis/output/compliance_analysis/scatterplot_of_GHG_last_year_by_GHG_trend.png
new file mode 100644
index 00000000..1ebd24d5
Binary files /dev/null and b/src/data/analysis/output/compliance_analysis/scatterplot_of_GHG_last_year_by_GHG_trend.png differ
diff --git a/src/data/requirements.txt b/src/data/requirements.txt
index 99bd7570..7c0162dc 100644
--- a/src/data/requirements.txt
+++ b/src/data/requirements.txt
@@ -2,3 +2,57 @@ python-slugify==4.0.1
pandas==2.1.2
numpy
pytest==7.4.4
+
+# Packages for Jupyter notebook data analysis, creating figures, running regressions
+appnope==0.1.4
+asttokens==3.0.0
+attrs==24.3.0
+comm==0.2.2
+debugpy==1.8.12
+decorator==5.1.1
+executing==2.1.0
+fastjsonschema==2.21.1
+iniconfig==2.0.0
+ipykernel==6.29.5
+ipython>=7.0.0,<8.0.0 # Fixed to work with python 3.9
+jedi==0.19.2
+jsonschema==4.23.0
+jsonschema-specifications==2024.10.1
+jupyter_client==8.6.3
+jupyter_core==5.7.2
+kaleido==0.1.0
+matplotlib-inline==0.1.7
+nbformat==5.10.4
+nest-asyncio==1.6.0
+packaging==24.2
+parso==0.8.4
+patsy==1.0.1
+pexpect==4.9.0
+platformdirs==4.3.6
+plotly==5.3.1
+pluggy==1.5.0
+prompt_toolkit==3.0.50
+psutil==6.1.1
+ptyprocess==0.7.0
+pure_eval==0.2.3
+Pygments==2.19.1
+python-dateutil==2.9.0.post0
+pytz==2024.2
+pyzmq==26.2.0
+referencing==0.36.1
+rpds-py==0.22.3
+scipy==1.15.1
+six==1.17.0
+stack-data==0.6.3
+statsmodels==0.14.4
+tenacity==9.0.0
+text-unidecode==1.3
+tornado==6.4.2
+traitlets==5.14.3
+typing_extensions==4.12.2
+tzdata==2025.1
+wcwidth==0.2.13
+notebook==7.3.2
+jupyterlab==4.3.4
+jupyterlab_pygments==0.3.0
+jupyterlab_server==2.27.3
diff --git a/src/pages/About.vue b/src/pages/About.vue
index 0e311904..7abeec21 100644
--- a/src/pages/About.vue
+++ b/src/pages/About.vue
@@ -24,10 +24,14 @@ export default class About extends Vue {
About Us
- Electrify Chicago is an independent open-source project looking to shed
- light onto one of the biggest sources of Chicago's CO2
+ Electrify Chicago is an independent open-source project based out of
+ Chi Hack Night
+ looking to shed light onto one of the biggest sources of Chicago's
+ CO2
emissions - buildings. By providing more information about some of the
- city's largest and most polluting buildings, we hope t encourage these
+ city's largest and most polluting buildings, we hope to encourage these
buildings to electrify, particularly by mobilizing people related to the
building - whether that be students and faculty for a college building
or employees and patients at a hospital.
diff --git a/src/pages/Blog.vue b/src/pages/Blog.vue
index 94b67e8c..70d2c1a4 100644
--- a/src/pages/Blog.vue
+++ b/src/pages/Blog.vue
@@ -23,7 +23,30 @@ export default class About extends Vue {
Electrify Chicago Blog
-
+
+
+
+
+ Do High Emitting Buildings Report Emissions Less Often?
+
+
+
+
+ Published
+
+
+
+ Qualitatively, some readers have reported that buildings who report
+ high levels of emissions seem to stop reporting. This blog post
+ looks at the data to determine if there is a pattern in the data
+ that links high emitting buildings to non-reporting. We find that
+ despite anecdotal observations, there seems to be no pattern for
+ buildings that have higher GHG intensities in the prior year or
+ upwardly trending GHG intensities to stop reporting the following
+ year.
+
+
+
@@ -31,6 +54,9 @@ export default class About extends Vue {
from Building Benchmarking Ordinance
+
+ Published
+
The City of Chicago didn't fully enforce the benchmarking ordinance
@@ -46,9 +72,17 @@ export default class About extends Vue {
diff --git a/src/pages/blog/MillionsInMissedFines.vue b/src/pages/blog/MillionsInMissedFines.vue
index cb12dec2..ebfd2918 100644
--- a/src/pages/blog/MillionsInMissedFines.vue
+++ b/src/pages/blog/MillionsInMissedFines.vue
@@ -26,18 +26,22 @@ export default class MillionsInMissedFine extends Vue {
-
+ Back to Blog
City Of Chicago Failed to Collect $30 Million In Potential Fines from
Building Benchmarking Ordinance
-
+
Not Enforcing The Benchmarking Ordinance's Fines From 2018 - 2022
Reduced Accountability & Transparency
+
+ Published
+
+
Electrify Chicago has analyzed the last five years of city benchmark
data, and found 3,325 instances of building owners not submitting data.
@@ -188,7 +192,7 @@ export default class MillionsInMissedFine extends Vue {
Contact the lead developer on this site, Viktor Köves, by emailing
contact@viktorkoves.com