# Source code for prereise.gather.winddata.impute

import numpy as np
import pandas as pd
from tqdm import tqdm

from prereise.gather.winddata.power_curves import (
    get_power,
    get_state_power_curves,
    get_turbine_power_curves,
)


def _check_curve(curve):
    """Validate the name of the requested power curve.

    :param str curve: power curve name to validate.
    :raises ValueError: if ``curve`` is not one of the allowed names.
    """
    supported = ("state", "IEC class 2")
    if curve in supported:
        return
    raise ValueError("curve not in allowed: " + ", ".join(supported))


def _find_to_impute(data):
    """Locate the rows whose ``U`` value is missing.

    :param pandas.DataFrame data: data frame with a ``U`` column.
    :return: (*pandas.Index*) -- index labels of the rows where ``U`` is
        missing, or None (after printing a notice) when no row is missing.
    """
    missing_index = data.loc[data["U"].isna()].index
    if len(missing_index) > 0:
        return missing_index
    print("No missing data")
    return None


def _select_similar(data, dates, j):
    """Select the rows comparable to the entry located at ``dates[j]``.

    A row is kept when its timestamp in ``dates`` has the same year, month
    and hour as ``dates[j]`` and its ``Pout`` value is not missing.

    :param pandas.DataFrame data: data frame with a ``Pout`` column.
    :param pandas.DatetimeIndex dates: timestamps, one per row of ``data``.
    :param j: key used to look up the reference timestamp in ``dates``.
    :return: (*pandas.DataFrame*) -- the matching rows of ``data``.
    """
    reference = dates[j]
    same_period = (
        (dates.year == reference.year)
        & (dates.month == reference.month)
        & (dates.hour == reference.hour)
    )
    has_power = pd.notna(data.Pout)
    return data[same_period & has_power]


def simple(data, wind_farm, inplace=True, curve="state"):
    """Impute missing data using a simple procedure. For each missing entry,
    the extrema of the U and V components of the wind speed are found among
    the non missing entries that have the same location, same month and same
    hour. Then, a U and a V value are randomly generated within the
    respective derived ranges, and the power output is recomputed from them.

    :param pandas.DataFrame data: data frame as returned by
        :py:func:`prereise.gather.winddata.rap.rap.retrieve_data`.
    :param pandas.DataFrame wind_farm: data frame of wind farms.
    :param bool inplace: should the imputation be done in place.
    :param str curve: 'state' to use the state average, otherwise named curve.
    :return: (*None/pandas.DataFrame*) -- if ``inplace`` is False, data frame
        with missing entries imputed; otherwise None.
    :raises ValueError: if ``curve`` is not an allowed curve name.
    """

    _check_curve(curve)
    data_impute = data if inplace else data.copy()
    to_impute = _find_to_impute(data)
    if to_impute is None:
        # Nothing to impute; still hand back the copy when one was requested.
        return None if inplace else data_impute

    # Information on wind turbines & state average turbine curves
    tpc = get_turbine_power_curves()
    spc = get_state_power_curves()

    # Timestamp of all entries in data frame
    dates = pd.DatetimeIndex(data.index.values)

    n_target = len(wind_farm)
    select = None
    for i, j in tqdm(enumerate(to_impute), total=len(to_impute)):
        # The pool of similar entries is refreshed once every len(wind_farm)
        # missing entries.
        # NOTE(review): this presumably relies on missing entries coming in
        # consecutive groups covering every wind farm for a given hour, and
        # on wind_farm being non-empty -- confirm against the caller.
        if i % n_target == 0:
            select = _select_similar(data, dates, j)

        k = data.loc[j].plant_id
        select_plant = select[select.plant_id == k]

        min_u, max_u = select_plant["U"].min(), select_plant["U"].max()
        min_v, max_v = select_plant["V"].min(), select_plant["V"].max()
        # Draw U first, then V, to keep the random stream order stable.
        u = min_u + (max_u - min_u) * np.random.random()
        v = min_v + (max_v - min_v) * np.random.random()
        data_impute.at[j, "U"] = u
        data_impute.at[j, "V"] = v

        # Use the freshly drawn values: when inplace is False, ``data`` still
        # holds NaN for this entry and would yield a NaN wind speed.
        wspd = np.sqrt(u ** 2 + v ** 2)
        # NOTE(review): ``curve`` is validated above but the turbine curve
        # passed here is fixed -- confirm whether 'state' should map to a
        # per-plant state curve.
        normalized_power = get_power(tpc, spc, wspd, "IEC class 2")
        data_impute.at[j, "Pout"] = normalized_power

    if not inplace:
        return data_impute


def gaussian(data, wind_farm, inplace=True, curve="state"):
    """Impute missing data using gaussian distributions of U & V. For each
    missing entry, sample U & V based on mean and covariance of non-missing
    entries that have the same location, same month, and same hour, then
    recompute the power output from the sampled wind speed.

    :param pandas.DataFrame data: data frame as returned by
        :py:func:`prereise.gather.winddata.rap.rap.retrieve_data`.
    :param pandas.DataFrame wind_farm: data frame of wind farms.
    :param bool inplace: should the imputation be done in place.
    :param str curve: 'state' to use the state average, otherwise named curve.
    :return: (*None/pandas.DataFrame*) -- if ``inplace`` is False, data frame
        with missing entries imputed; otherwise None.
    :raises ValueError: if ``curve`` is not an allowed curve name.
    """

    _check_curve(curve)
    data_impute = data if inplace else data.copy()
    to_impute = _find_to_impute(data)
    if to_impute is None:
        # Nothing to impute; still hand back the copy when one was requested.
        return None if inplace else data_impute

    # Information on wind turbines & state average turbine curves
    tpc = get_turbine_power_curves()
    spc = get_state_power_curves()

    # Timestamp of all entries in data frame
    dates = pd.DatetimeIndex(data.index.values)

    n_target = len(wind_farm)
    select = None
    for i, hour in tqdm(enumerate(to_impute), total=len(to_impute)):
        # The pool of similar entries is refreshed once every len(wind_farm)
        # missing entries.
        # NOTE(review): this presumably relies on missing entries coming in
        # consecutive groups covering every wind farm for a given hour, and
        # on wind_farm being non-empty -- confirm against the caller.
        if i % n_target == 0:
            select = _select_similar(data, dates, hour)

        plant_id = data.loc[hour].plant_id
        select_plant = select[select.plant_id == plant_id]

        # Rows are the two variables (U, V); columns are the observations.
        uv_data = np.array([select_plant["U"].to_numpy(), select_plant["V"].to_numpy()])
        cov = np.cov(uv_data)
        mean = np.mean(uv_data, axis=1)
        sample = np.random.multivariate_normal(mean=mean, cov=cov, size=1)
        u, v = sample[0][0], sample[0][1]
        data_impute.at[hour, "U"] = u
        data_impute.at[hour, "V"] = v

        # Use the freshly drawn values: when inplace is False, ``data`` still
        # holds NaN for this entry and would yield a NaN wind speed.
        wspd = np.sqrt(u ** 2 + v ** 2)
        # NOTE(review): ``curve`` is validated above but the turbine curve
        # passed here is fixed -- confirm whether 'state' should map to a
        # per-plant state curve.
        normalized_power = get_power(tpc, spc, wspd, "IEC class 2")
        data_impute.at[hour, "Pout"] = normalized_power

    if not inplace:
        return data_impute


def linear(data, inplace=True):
    """Given a 2D array, linearly interpolate any missing values column-wise.

    :param numpy.array/pandas.DataFrame data: data to interpolate.
    :param bool inplace: whether to modify the data inplace or return a modified copy.
    :return: (*None/numpy.array/pandas.DataFrame*) -- if ``inplace`` is False, a
        copy of ``data`` (same type as the input) with missing entries imputed;
        otherwise None.
    """
    data_impute = data if inplace else data.copy()
    # Slice assignment writes the interpolated values into the existing
    # object, so the caller's array/data frame is updated when inplace is True.
    data_impute[:] = pd.DataFrame(data_impute).interpolate()
    if not inplace:
        return data_impute
# Source code for prereise.gather.winddata.impute

# Navigation

# Related Topics

# Useful Links

# Code