Using Quantile Ranks to Identify Potential Outliers

This example demonstrates the use of quantile regression forest (QRF) quantile ranks to identify potential outliers in a dataset. In this scenario, we train a QRF model on a toy dataset and use quantile ranks to highlight values that deviate significantly from the expected range. Potential outliers are defined as points whose quantile rank falls outside the specified threshold interval around the median.

import math

import altair as alt
import numpy as np
import pandas as pd
from sklearn.utils.validation import check_random_state

from quantile_forest import RandomForestQuantileRegressor

# Dataset configuration: number of samples and the [low, high] range of x.
n_samples = 5000
bounds = [0, 10]

# Seeded generator handed to the forest so that the example is reproducible.
random_state = np.random.RandomState(0)


def make_toy_dataset(n_samples, bounds, random_state=None):
    """Generate a noisy one-dimensional regression dataset.

    The inputs are `n_samples` evenly spaced points spanning `bounds`. The
    target is ``x * cos(x)`` plus zero-mean Gaussian noise whose standard
    deviation is ``x / e``, so the noise widens as ``x`` grows.

    Parameters
    ----------
    n_samples : int
        Number of points to generate.
    bounds : sequence
        Unpacked into the leading positional arguments of ``np.linspace``
        (start and stop of the input range).
    random_state : None, int or RandomState
        Seed or generator, normalized through ``check_random_state``.

    Returns
    -------
    X : ndarray of shape (n_samples, 1)
        Inputs as a single-feature column.
    y : ndarray of shape (n_samples,)
        Noisy targets.
    """
    rng = check_random_state(random_state)

    grid = np.linspace(*bounds, num=n_samples)

    # Heteroscedastic noise: the per-sample scale is proportional to x.
    noise = rng.normal(scale=grid / math.e)
    signal = grid * np.cos(grid)

    features = grid.reshape(-1, 1)
    targets = signal + noise
    return features, targets


# Generate the synthetic (X, y) pairs used for both fitting and scoring.
X, y = make_toy_dataset(n_samples, bounds, random_state=0)

# Fit the quantile regression forest.
# NOTE(review): max_samples_leaf=None presumably retains every training sample
# in each leaf -- confirm against the quantile_forest documentation.
qrf = RandomForestQuantileRegressor(
    random_state=random_state,
    min_samples_leaf=50,
    max_samples_leaf=None,
)
qrf.fit(X, y)

# Median (0.5 quantile) prediction for every training sample.
y_pred = qrf.predict(X, quantiles=0.5)

# Quantile rank of each observed target; each value lies in the range [0, 1].
y_rank = qrf.quantile_ranks(X, y)

# Collect everything the chart needs into one flat table.
df = pd.DataFrame(
    {
        "x": X.ravel(),
        "y": y,
        "y_pred": y_pred,
        "y_rank": y_rank,
    }
)


def plot_pred_and_ranks(df):
    """Plot median predictions and flag samples by quantile rank.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to plot. The chart reads the columns ``x``, ``y``, ``y_pred``
        and ``y_rank``.

    Returns
    -------
    chart : altair chart
        Layered chart made of a legend-only layer, a scatter of the samples
        colored by outlier status, and the median prediction line. A slider
        controls the rank interval used to flag outliers.
    """
    # Slider for varying the interval that defines the upper and lower quantile rank thresholds.
    slider = alt.binding_range(name="Rank Interval Threshold: ", min=0, max=1, step=0.01)
    interval_val = alt.param(name="interval", value=0.05, bind=slider)

    # Legend-bound selection on the outlier category; points whose category is
    # not selected are drawn in light gray (see the color condition below).
    click = alt.selection_point(bind="legend", fields=["outlier"], on="click")

    base = alt.Chart(df)

    # For desired legend labels: a line layer with a constant opacity scale
    # whose only purpose is to add a "Prediction" legend entry named "Median".
    dummy_legend = (
        base.mark_line(opacity=1)
        .encode(opacity=alt.Opacity("model:N", scale=alt.Scale(range=[1, 1]), title="Prediction"))
        .transform_calculate(model="'Median'")
    )

    circle = (
        base.add_params(interval_val, click)
        .transform_calculate(
            # A sample is an outlier when its rank is below `interval / 2` or
            # above `1 - interval / 2`, i.e. farther than `0.5 - interval / 2`
            # from the median rank of 0.5.
            outlier="abs(datum.y_rank - 0.5) > (0.5 - interval / 2) ? 'Yes' : 'No'",
            threshold_low="0 + interval / 2",
            threshold_upp="1 - interval / 2",
        )
        .mark_circle(opacity=0.5, size=25)
        .encode(
            x=alt.X("x:Q"),
            y=alt.Y("y:Q"),
            color=alt.condition(
                click,
                alt.Color(
                    "outlier:N",
                    scale=alt.Scale(domain=["Yes", "No"], range=["red", "#f2a619"]),
                    title="Outlier",
                ),
                alt.value("lightgray"),
            ),
            # Each field is listed once; the quantile rank entry was previously duplicated.
            tooltip=[
                alt.Tooltip("x:Q", format=",.3f", title="X"),
                alt.Tooltip("y:Q", format=",.3f", title="Y"),
                alt.Tooltip("y_pred:Q", format=",.3f", title="Predicted Y"),
                alt.Tooltip("y_rank:Q", format=".3f", title="Quantile Rank"),
                alt.Tooltip("threshold_low:Q", format=".3f", title="Lower Threshold"),
                alt.Tooltip("threshold_upp:Q", format=".3f", title="Upper Threshold"),
                alt.Tooltip("outlier:N", title="Outlier"),
            ],
        )
    )

    # Median prediction line; this layer also supplies the axis titles.
    line_pred = base.mark_line(color="#006aff", size=4).encode(
        x=alt.X("x:Q", axis=alt.Axis(title="X")),
        y=alt.Y("y_pred:Q", axis=alt.Axis(title="Y")),
        tooltip=[
            alt.Tooltip("x:Q", format=",.3f", title="X"),
            alt.Tooltip("y:Q", format=",.3f", title="Y"),
            alt.Tooltip("y_pred:Q", format=",.3f", title="Predicted Y"),
        ],
    )

    chart = (dummy_legend + circle + line_pred).properties(
        title="QRF Predictions with Quantile Rank Thresholding on Toy Dataset",
        height=400,
        width=650,
    )

    return chart


# Build the interactive chart from the table of predictions and ranks.
chart = plot_pred_and_ranks(df)
# NOTE(review): bare expression, presumably so a notebook or docs renderer
# displays the chart -- confirm before removing.
chart