import numpy as np
import pandas as pd
def fix_dataframe_outliers(demand):
    """Make a data frame of demand with outliers replaced with values interpolated
    from the non-outlier edge points using :py:func:`slope_interpolate`.

    :param pandas.DataFrame demand: demand data frame with UTC timestamp as indices
        and BA name as column name.
    :return: (*pandas.DataFrame*) -- data frame with anomalous demand values replaced
        by interpolated values.
    """
    demand_fix_outliers = pd.DataFrame(index=demand.index)
    # Clean each balancing authority's demand column independently.
    for ba in demand.columns.to_list():
        # slope_interpolate expects a one-column data frame.
        outlier_output = slope_interpolate(pd.DataFrame(demand[ba]))
        demand_fix_outliers[ba] = outlier_output[ba]
    return demand_fix_outliers
def slope_interpolate(ba_df, zscore_threshold=5):
    """Look for demand outliers by applying a z-score threshold to the demand slope.
    Loop through all the outliers detected, determine the non-outlier edge points and
    then interpolate a line joining these 2 edge points. The line value at the
    timestamp of the outlier event is used to replace the anomalous value.

    :param pandas.DataFrame ba_df: demand data frame with UTC timestamp as indices
        and BA name as column name.
    :param int/float zscore_threshold: number of standard deviations the hourly
        demand change must exceed to be flagged an outlier. Default to 5.
    :return: (*pandas.DataFrame*) -- data frame indexed with anomalous demand values
        replaced by interpolated values.

    .. note::
        It is implicitly assumed that:

        1. demand is correlated with temperature, and temperature rise is limited by
           heat capacity which is finite and generally uniform across region; hence,
           temperature dependent derivative spikes are unphysical.
        2. there is indeed nothing anomalous that happened to electrical usage in the
           relevant time range, so using a line to estimate the correct value is
           reasonable.

    .. todo::
        If there are more than a few hours (say > 4) of anomalous behavior, linear
        interpolation may give a bad estimate. Non-linear interpolation methods
        should be considered, and other information may be needed to interpolate
        properly, for example, the temperature data or other relevant profiles.
    """
    df = ba_df.copy()
    ba_name = df.columns[0]
    # Hourly change in demand; unphysical spikes in this derivative mark outliers.
    df["delta"] = df[ba_name].diff()
    delta_mu = df["delta"].mean()
    delta_sigma = df["delta"].std()
    df["delta_zscore"] = np.abs((df["delta"] - delta_mu) / delta_sigma)
    # Find the outliers
    outlier_index_list = df.loc[df["delta_zscore"] > zscore_threshold].index
    hour_save = -1
    for i in outlier_index_list:
        hour_index = df.index.get_loc(i)
        if hour_save == -1:
            # First outlier seen: open a new run [hour_save, next_save).
            hour_save = hour_index
            next_save = hour_index + 1
            continue
        if hour_index == next_save:
            # Consecutive outlier: extend the current run.
            next_save = hour_index + 1
            continue
        # Check for zeros: consecutive zeros, which don't have delta_zscore
        # exceed threshold, will get extrapolated to the next non-zero value.
        # This is fine for, say up to 5 hours; will not be appropriate
        # otherwise since it may not capture the periodic patterns.
        # Print a warning
        if df.iloc[hour_index - 1][ba_name] == 0:
            next_save = hour_index + 1
            continue
        if next_save - hour_save > 4:
            print("Too many zeros near ", i, "! Review data!")
        # Close out the previous run, then start a new one at this outlier.
        _interpolate_run(df, ba_name, hour_save, next_save)
        hour_save = hour_index
        next_save = hour_index + 1
    if hour_save != -1:
        # Fill the final, still-open run of outliers.
        _interpolate_run(df, ba_name, hour_save, next_save)
    return df


def _interpolate_run(df, ba_name, hour_save, next_save):
    """Replace an anomalous run of values with points on the line joining the
    non-outlier edge points at positions ``hour_save - 1`` and ``next_save - 1``.

    :param pandas.DataFrame df: data frame holding the demand column; modified
        in place.
    :param str ba_name: name of the demand column.
    :param int hour_save: integer position of the first anomalous hour.
    :param int next_save: integer position one past the last anomalous hour.
    """
    num = next_save - hour_save
    start = df.iloc[hour_save - 1][ba_name]
    dee = (df.iloc[next_save - 1][ba_name] - start) / num
    # Single-step positional write: chained df.iloc[j][ba_name] = ... can assign
    # to a temporary row copy and silently leave df unchanged.
    col = df.columns.get_loc(ba_name)
    for j in range(hour_save - 1, next_save):
        save_me = df.iloc[j][ba_name]
        df.iloc[j, col] = start + (j - hour_save + 1) * dee
        print(j, save_me, df.iloc[j, col])
def replace_with_shifted_demand(demand, start, end):
    """Replace missing data within overall demand data frame with averages of nearby
    shifted demand.

    :param pandas.DataFrame demand: data frame with hourly demand where the columns are
        BA regions.
    :param pandas.Timestamp/numpy.datetime64/datetime.datetime start: start of period
        of interest.
    :param pandas.Timestamp/numpy.datetime64/datetime.datetime end: end of period
        of interest.
    :return: (*pandas.DataFrame*) -- data frame with missing demand data filled in.
    """
    # Candidate replacement columns: the same demand shifted by +/- 1 day,
    # +/- 2 days, and +/- 1 week.
    shift_names = [
        "look_back1day",
        "look_forward1day",
        "look_back2day",
        "look_forward2day",
        "look_back1week",
        "look_forward1week",
    ]
    shift_days = [1, -1, 2, -2, 7, -7]
    candidates = [demand] + [demand.shift(d, freq="D") for d in shift_days]
    stacked = pd.concat(candidates, axis=1)
    # Keep only the dates we care about, and record each row's weekday.
    stacked = stacked.loc[start:end]
    stacked["dayofweek"] = stacked.index.dayofweek
    # Dicts of weekdays. 0 = Monday, 1 = Tuesday, etc.
    # First pass: shift the data by only one day if possible.
    # Never fill Mon-Fri with the weekend days and vice versa.
    one_day_map = {
        0: ["look_forward1day"],
        1: ["look_forward1day", "look_back1day"],
        2: ["look_forward1day", "look_back1day"],
        3: ["look_forward1day", "look_back1day"],
        4: ["look_back1day"],
        5: ["look_forward1day"],
        6: ["look_back1day"],
    }
    # Second pass: still missing, so look two days out
    # (weekends jump straight to one week).
    two_day_map = {
        0: ["look_forward2day"],
        1: ["look_forward2day"],
        2: ["look_back2day", "look_forward2day"],
        3: ["look_back2day"],
        4: ["look_back2day"],
        5: ["look_back1week", "look_forward1week"],
        6: ["look_back1week", "look_forward1week"],
    }
    # Last resort: the same weekday exactly one week before / after.
    one_week_map = {
        day: ["look_back1week", "look_forward1week"] for day in range(7)
    }
    # Fill each BA column, getting progressively more aggressive if necessary.
    filled_demand = pd.DataFrame(index=demand.index)
    for ba_name in demand.columns:
        per_ba = stacked.loc[:, [ba_name, "dayofweek"]]
        per_ba.columns = [ba_name] + shift_names + ["dayofweek"]
        for mapping in (one_day_map, two_day_map):
            per_ba[ba_name] = fill_ba_demand(per_ba, ba_name, mapping)
        filled_demand[ba_name] = fill_ba_demand(per_ba, ba_name, one_week_map)
    return filled_demand
def fill_ba_demand(df_ba, ba_name, day_map):
    """Replace missing data in BA demand and returns result. The demand column of
    ``df_ba`` is updated in place.

    :param pandas.DataFrame df_ba: data frame for BA demand, shifted demand, and day
        of the week
    :param str ba_name: name of the BA in data frame.
    :param dict day_map: mapping for replacing missing demand data with shifted demand.
    :return: (*pandas.Series*) -- series of BA demand filled in
    """
    for weekday in range(7):
        # Average the shifted columns allowed for this weekday (NaNs are skipped).
        fill_values = df_ba[day_map[weekday]].mean(axis=1)
        # Only overwrite entries that are missing on this weekday.
        needs_fill = (df_ba["dayofweek"] == weekday) & df_ba[ba_name].isna()
        df_ba.loc[needs_fill, ba_name] = fill_values
    return df_ba[ba_name]