Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add raw price threshold for sales val #142

Merged
merged 14 commits into from
Jan 6, 2025
23 changes: 19 additions & 4 deletions glue/flagging_script_glue/flagging.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ def go(
iso_forest_cols: list,
dev_bounds: tuple,
condos: bool,
raw_price_threshold: int,
):
"""
This function runs all of our other functions in the correct sequence.
Expand Down Expand Up @@ -50,7 +51,9 @@ def go(
print("string_processing() done")
df = iso_forest(df, groups, iso_forest_cols)
print("iso_forest() done")
df = outlier_taxonomy(df, dev_bounds, groups, condos=condos)
df = outlier_taxonomy(
df, dev_bounds, groups, condos=condos, raw_price_threshold=raw_price_threshold
)
print("outlier_taxonomy() done\nfinished")

return df
Expand All @@ -69,7 +72,13 @@ def create_group_string(groups: tuple, sep: str) -> str:
return sep.join(groups)


def outlier_taxonomy(df: pd.DataFrame, permut: tuple, groups: tuple, condos: bool):
def outlier_taxonomy(
df: pd.DataFrame,
permut: tuple,
groups: tuple,
condos: bool,
raw_price_threshold: int,
):
"""
Creates columns having to do with our chosen outlier taxonomy.
Ex: Family sale, Home flip sale, Non-person sale, High price (raw and or sqft), etc.
Expand All @@ -84,7 +93,7 @@ def outlier_taxonomy(df: pd.DataFrame, permut: tuple, groups: tuple, condos: boo

df = check_days(df, SHORT_TERM_OWNER_THRESHOLD)
df = pricing_info(df, permut, groups, condos=condos)
df = outlier_type(df, condos=condos)
df = outlier_type(df, condos=condos, raw_price_threshold=raw_price_threshold)

return df

Expand Down Expand Up @@ -740,7 +749,9 @@ def z_normalize_groupby(s: pd.Series):
return zscore(s, nan_policy="omit")


def outlier_type(df: pd.DataFrame, condos: bool) -> pd.DataFrame:
def outlier_type(
df: pd.DataFrame, condos: bool, raw_price_threshold: int
) -> pd.DataFrame:
"""
This function creates indicator columns for each distinct outlier type between price
and characteristic outliers. These columns are prefixed with 'sv_ind_'.
Expand Down Expand Up @@ -807,6 +818,10 @@ def outlier_type(df: pd.DataFrame, condos: bool) -> pd.DataFrame:
"sv_ind_price_low_price_sqft",
]

# Implement raw threshold, unlog price
price_conditions.append((10 ** df["meta_sale_price"]) > raw_price_threshold)
price_labels.append("sv_ind_raw_price_threshold")

combined_conditions = price_conditions + char_conditions
combined_labels = price_labels + char_labels

Expand Down
26 changes: 19 additions & 7 deletions glue/sales_val_flagging.py
Original file line number Diff line number Diff line change
Expand Up @@ -144,7 +144,8 @@ def classify_outliers(df, stat_groups: list, min_threshold):
2. Implement our group threshold requirement. In the statistical flagging process, if
the group a sale belongs to is below N=30 then we want to manually set these flags to
non-outlier status, even if they were flagged in the mansueto script. This requirement
is bypasses for ptax outliers - we don't care about group threshold in this case.
is bypassed for ptax outliers and raw price threshold outliers - we don't care about
group threshold in this case.

Inputs:
df: The data right after we perform the flagging script (go()), when the exploded
Expand Down Expand Up @@ -178,6 +179,7 @@ def classify_outliers(df, stat_groups: list, min_threshold):
"sv_ind_ptax_flag_w_high_price_sqft": "High price per square foot",
"sv_ind_price_low_price_sqft": "Low price per square foot",
"sv_ind_ptax_flag_w_low_price_sqft": "Low price per square foot",
"sv_ind_raw_price_threshold": "Raw price threshold",
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Assigning this reason with priority right after the other price flags that are generated with the statistical flagging groups.

Order of assignment between the three outlier reasons will be

  • Price outlier
  • Raw price outlier (this cap)
  • Ptax
  • Other char reasons

"sv_ind_ptax_flag": "PTAX-203 Exclusion",
"sv_ind_char_short_term_owner": "Short-term owner",
"sv_ind_char_family_sale": "Family Sale",
Expand All @@ -199,6 +201,11 @@ def classify_outliers(df, stat_groups: list, min_threshold):

Note: This doesn't apply for sales that also have a ptax outlier status.
In this case, we still assign the price outlier status.

We also don't apply this threshold to sv_ind_raw_price_threshold,
since this is designed to be a safeguard that catches very high price
sales that may have slipped through the cracks due to the group
threshold requirement.
"""
group_thresh_price_fix = [
"sv_ind_price_high_price",
Expand Down Expand Up @@ -237,12 +244,14 @@ def fill_outlier_reasons(row):
# Drop the _merge column
df = df.drop(columns=["_merge"])

# Assign outlier status
# Assign outlier status. These are the outlier types
# that classify a sale as an outlier
values_to_check = {
"High price",
"Low price",
"High price per square foot",
"Low price per square foot",
"Raw price threshold",
}

df["sv_is_outlier"] = np.where(
Expand Down Expand Up @@ -471,8 +480,9 @@ def get_parameter_df(
ptax_sd,
rolling_window,
time_frame,
short_term_thresh,
min_group_thresh,
short_term_threshold,
min_group_threshold,
raw_price_threshold,
run_id,
):
"""
Expand All @@ -488,8 +498,9 @@ def get_parameter_df(
ptax_sd: list of standard deviations used for ptax flagging
rolling_window: how many months used in rolling window methodology
date_floor: parameter specification that limits earliest flagging write
short_term_thresh: short-term threshold for Mansueto's flagging model
short_term_threshold: short-term threshold for Mansueto's flagging model
min_group_threshold: minimum group size threshold needed to flag as outlier
raw_price_threshold: raw price threshold at which we unconditionally classify sales as outliers
run_id: unique run_id to flagging program run
Outputs:
df_parameters: parameters table associated with flagging run
Expand All @@ -512,8 +523,9 @@ def get_parameter_df(
"ptax_sd": [ptax_sd],
"rolling_window": [rolling_window],
"time_frame": [time_frame],
"short_term_owner_threshold": [short_term_thresh],
"min_group_thresh": [min_group_thresh],
"short_term_owner_threshold": [short_term_threshold],
"min_group_thresh": [min_group_threshold],
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This min_group_thresh should eventually be migrated to min_group_threshold to maintain naming style.

"raw_price_threshold": [raw_price_threshold],
}

df_parameters = pd.DataFrame(parameter_dict_to_df)
Expand Down
4 changes: 3 additions & 1 deletion manual_flagging/flagging.py
Original file line number Diff line number Diff line change
Expand Up @@ -300,6 +300,7 @@ def create_bins_and_labels(input_list):
iso_forest_cols=df_info["iso_forest_cols"],
dev_bounds=tuple(inputs["dev_bounds"]),
condos=df_info["condos_boolean"],
raw_price_threshold=inputs["raw_price_threshold"],
)

# Add the edited or unedited dataframe to the new dictionary
Expand Down Expand Up @@ -400,8 +401,9 @@ def create_bins_and_labels(input_list):
ptax_sd=inputs["ptax_sd"],
rolling_window=inputs["rolling_window_months"],
time_frame=inputs["time_frame"],
short_term_thresh=flg_model.SHORT_TERM_OWNER_THRESHOLD,
short_term_threshold=flg_model.SHORT_TERM_OWNER_THRESHOLD,
min_group_thresh=inputs["min_groups_threshold"],
raw_price_threshold=inputs["raw_price_threshold"],
run_id=run_id,
)

Expand Down
4 changes: 4 additions & 0 deletions manual_flagging/yaml/inputs.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -163,3 +163,7 @@ ptax_sd: [1, 1]

# Flags are only applied if there are at least this many sales in the group
min_groups_threshold: 30

# This is the raw price threshold that is used to set sales to outlier status
# regardless of group size
raw_price_threshold: 15_000_000
Loading