Estimation and Prediction

This notebook walks you through building nowcasting models for GDP growth. The goal is for you to understand how each model works mechanically, how it is fit on training data, how predictions are generated, and what parameters you can tune.

The workflow is the same for every model:

  1. Split data into a training sample and a test sample
  2. Fit the model on the training data
  3. Predict on the test data
  4. Evaluate performance using RMSE

The helper functions used in the full nowcasting pipeline (fit_* and predict_*) are shown in the Helper Function Reference section below. Your task is to replicate the OLS exercise for the remaining models, using those functions as a blueprint.

OLS

The nowcasting pipeline uses a set of helper functions (one to fit each model and one to generate its predictions). Understanding these functions lets you swap models in and out, and adapt the pipeline to different variables, environments, or questions.

For OLS, the fit function wraps scikit-learn's LinearRegression:

ols_fit, ols_vars = fit_ols(yt, xt)

Internally, fit_ols does two things:

model = LinearRegression()
model.fit(xtrain, ytrain)

It returns the fitted model object and the list of variables used during training. The prediction function then applies model.predict(X) to new (test) data and extracts the scalar forecast for each period.
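
In simplified form (dropping the pipeline bookkeeping arguments; the names below are illustrative, and the full definitions appear in the Helper Function Reference section), the pair of helpers amounts to:

def fit_ols_simple(ytrain, xtrain):
    model = LinearRegression()
    return model.fit(xtrain, ytrain), xtrain.columns   # fitted model + list of training variables

fitted, train_vars = fit_ols_simple(ytrain, xtrain)
# prediction: align the test columns to the training variables and take the scalar forecast
nowcast = fitted.predict(xtest[train_vars])[0]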

The exercises below strip away the pipeline bookkeeping so you can focus on the model mechanics directly. A simplified dataset is constructed first, and then you will fit each model by hand.

Data

The cell below loads the full dataset (processed in earlier sessions), applies CPI deflation, seasonal adjustment, and converts all series to growth rates. It then filters out variables with long publication lags and removes highly correlated series. The result is a clean panel ready for modelling.

Run this cell as-is. You do not need to modify it.

In [1]:
# =========================================================
# User settings (edit here as necessary)
# =========================================================

import pandas as pd
from datetime import datetime

# ---------------------------------------------------------
# 0) DATA/METADATA FILE NAMES
# ---------------------------------------------------------
data_file = "data.csv"
meta_file = "meta.csv"

# ---------------------------------------------------------
# 1) DEFINE TRAIN / TEST WINDOW
# ---------------------------------------------------------
# These dates define the sample splits used for model training and evaluation.
# - Training sample:   [train_start_date, test_start_date)
# - Test sample:       [test_start_date, test_end_date]
train_start_date = "2015-03-01"
test_start_date  = "2023-06-01"

# Convert to Timestamp for safer comparisons later
train_start_date = pd.to_datetime(train_start_date)
test_start_date  = pd.to_datetime(test_start_date)

# ---------------------------------------------------------
# 2) DEFINE OUT-OF-SAMPLE FORECAST TARGET (LAST DATE / VINTAGE)
# ---------------------------------------------------------
# This is the date for which you want to produce the final nowcast/forecast.
# Use the first day of the month (or the appropriate GDP reference month).
desired_date = pd.to_datetime("2026-03-01")

# ---------------------------------------------------------
# 3) DEFINE OUTPUT FILENAMES (timestamped)
# ---------------------------------------------------------
# Timestamp output files so each run creates a unique set of results.
today = datetime.today()
date_str = today.strftime("%Y%m%d")

forecast_file     = f"{date_str} 3_nwcst Tab fcst.xlsx"   # model forecasts / nowcasts
performance_file  = f"{date_str} 3_nwcst Tab perf.xlsx"   # model performance metrics
outofsample_file  = f"{date_str} 3_nwcst Tab oos.xlsx"    # full OOS prediction panel

# (Optional) If you want to resume / compare against a previous run,
# set last_performance to the filename of the prior performance workbook.
# Example: last_performance = "perf_20251207.xlsx"
last_performance = None

# ---------------------------------------------------------
# 4) MODEL SETTINGS
# ---------------------------------------------------------
# Target GDP series ID
target_variable = "RGDP0000"

# Prediction horizon (vintages) to evaluate:
# - negative values: backcasts (e.g., -1 = one period back)
# - 0: nowcast (current period)
# - positive values: forecasts (e.g., +1 = one period ahead)
lags = list(range(-2, 3))

# ---------------------------------------------------------
# 5) TARGET MODELS
# ---------------------------------------------------------
# Parameter: iterations in GBT, DT, RF
_iter_ = 10
# Parameter: LSTM n_models and train_episodes
_iter_lstm = 10
_epoch_lstm = 50

# =========================================================
# =========================================================
# =========================================================
# Nowcasting pipeline: imports, paths, data loading, and preprocessing
# =========================================================
# This section:
# 1) Imports all required libraries for the nowcasting toolkit (ML + time-series + LSTM)
# 2) Defines local folder paths for raw inputs, processed data, and outputs
# 3) Loads the full dataset and metadata
# 4) Deflates nominal variables using CPI (base year = 2019 average)
# 5) Seasonally adjusts series by removing the seasonal component from an additive decomposition
# 6) Converts all series to growth rates (percent change)
# 7) Cleans the final modeling dataset and prints library versions for reproducibility
# =========================================================

import os, json, time, warnings, re
from copy import deepcopy

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm

# -----------------------------
# Models / ML utilities
# -----------------------------
from sklearn import linear_model, tree
from sklearn.linear_model import (
    LinearRegression, RidgeCV, ElasticNet, Lasso, Lars
)
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

# Optional time-series tools (if used downstream)
import pmdarima as pm

# Deep learning (LSTM nowcasting)
import torch
from nowcast_lstm.LSTM import LSTM

# Statsmodels seasonal adjustment
from statsmodels.tsa.seasonal import seasonal_decompose

# Reduce noisy warnings from some estimators / decomposition
warnings.filterwarnings("ignore", category=UserWarning)

# =========================================================
# Paths
# =========================================================
# Define local project directories (relative to current working directory)
d = os.getcwd() + "/"
PATH_RAW = os.path.join(d, "..", "raw")   # raw inputs (downloads / intermediate CSVs)
PATH_DATA = os.path.join(d, "..", "data")    # processed modeling tables
# Output folder for model results (forecasts, performance tables)
PATH_OUTPUT = os.path.join(d, "..", "output/", f"{date_str}")
# In case it is run several times in a day:
# PATH_OUTPUT = os.path.join(d, "..", "output/", f"{date_str}_2")
# Create folder if it does not exist
os.makedirs(PATH_OUTPUT, exist_ok=True)

print("# =========================================================")
print("Key Directories:")
print(f"Raw inputs: {PATH_RAW}")
print(f"Modeling data: {PATH_DATA}")
print(f"Model outputs: {PATH_OUTPUT}")
print("# =========================================================")

# =========================================================
# Load data and metadata
# =========================================================
# Main modeling dataset (indexed by date)
data = pd.read_csv(
    os.path.join(PATH_DATA, data_file ),
    parse_dates=["date"],
    index_col="date"
)

test_end_date = data[target_variable].dropna().index.max()

# Metadata describing each variable (frequency, nominal/real flags, etc.)
metadata = pd.read_csv(os.path.join(PATH_DATA, meta_file))

# Key column names and CPI series ID used for deflation
meta_col = "ticket"     # variable identifier column in metadata
cpi = "RCPI0000"        # CPI series used to deflate nominal variables

# =========================================================
# Deflate nominal variables using CPI (base year: 2019 average, STATINJA 2019=100)
# =========================================================
# Many series are nominal; deflating them puts everything in real terms.
# We compute a CPI base as the average CPI level in 2019 (STATINJA 2019=100 → base = 100).
base = data[cpi][data.index.year == 2019].mean()   # base: average CPI level in 2019 (STATINJA 2019=100)

# Identify variables flagged as nominal in metadata
nominal = metadata.loc[metadata["nominal"] == 1][meta_col].values

# Deflate: Real_t = Nominal_t / CPI_t * CPI_base
for column in data.columns:
    if column in nominal:
        data[column] = data[column] / data[cpi] * base

# Drop CPI
data.drop(columns=[cpi], inplace=True)
# Drop GDP components
data = data.drop(columns=[col for col in data.columns if 'RGDP' in col and col != 'RGDP0000'])

# =========================================================
# Seasonal adjustment (trend extraction)
# =========================================================
# We apply additive seasonal decomposition and subtract the estimated seasonal component.

# Period depends on data frequency:
#   quarterly: 4, otherwise: 12 (monthly)
quarterly  = metadata.loc[metadata["freq"].str.lower().str.contains("q")][meta_col].values
monthly    = metadata.loc[metadata["freq"].str.lower().str.contains("m")][meta_col].values
already_sa = metadata.loc[metadata["sa"]==1][meta_col].values

# Drop columns with fewer than 24 non-NaN observations
data = data.loc[:, data.notna().sum() >= 24]

for column in data.columns:
    if column in already_sa:
        continue
    series = data[column].dropna()
    if column in quarterly:
        decomp = seasonal_decompose(series, model='additive', extrapolate_trend='freq', period=4)
        data[column] = data[column] - decomp.seasonal
    else :
        decomp = seasonal_decompose(series, model='additive', extrapolate_trend='freq', period=12)
        data[column] = data[column] - decomp.seasonal

# =========================================================
# Convert levels to growth rates (percent change)
# =========================================================
# Quarterly series: quarter-over-quarter growth rate
# All other series: default percent change at their observed frequency
for column in data.columns:
    if column in quarterly:
        # Get non-NaN values with their original index
        non_nan = data[column].dropna()
        
        # Calculate percent change on non-NaN values only
        pct_change_values = non_nan.pct_change(periods=1, fill_method=None) * 100
        
        # Create a new series filled with NaN, then update with calculated values
        result = pd.Series(np.nan, index=data.index)
        result.loc[pct_change_values.index] = pct_change_values
        
        data[column] = result
    else:
        # Store original NaN mask
        original_was_nan = data[column].isna()
        
        # For monthly/weekly/daily, calculate normally
        pct_change_values = data[column].pct_change(periods=1, fill_method=None) * 100
        
        # Set to NaN where previous value was NaN OR current original value was NaN
        prev_was_nan = data[column].shift(1).isna()
        pct_change_values[prev_was_nan | original_was_nan] = np.nan
        
        data[column] = pct_change_values

data.replace([np.inf, -np.inf], np.nan, inplace=True)

# =========================================================
# Drop non-quarterly columns with more than 75% missing values (within the training window)
# =========================================================
for column in [ col for col in data.columns if col not in quarterly ] :
    missing_pct = data[data.index<=test_start_date][column].isna().sum() / len(data[data.index<=test_start_date][column])
    if missing_pct > 0.75:
        data.drop(column, axis=1, inplace=True)

# =========================================================
# Final cleaning
# =========================================================
# Replace infinities created by pct_change with NaN
data.replace([np.inf, -np.inf], np.nan, inplace=True)

# Drop CPI from predictor set (used only for deflation)
#data.drop(cpi, axis=1, inplace=True)

# Keep data from 2015 onward and remove fully-missing rows
data = data[~(data.index < train_start_date)].reset_index(drop=False)
data = data.set_index("date").dropna(how="all", axis=0)

metadata["months_lag"] = (
    pd.to_numeric(metadata["months_lag"], errors="coerce")
      .fillna(1)
      .astype(int)
)

# Drop series whose last observation is before 2024 (no longer updated)
for col in data.columns :
    last = data[col].dropna().index.max().year
    if last < 2024 :
        data.drop(col, axis=1, inplace=True)


'''
# drop low-hierarchy series
# Pattern: ends with 000x OR has x00x in last 4 digits
pattern = r'.*(\d00\d|000\d)$'
hierarchy = [col for col in data.columns if re.match(pattern, col)]
data = data[hierarchy]
'''

# Drop highly correlated series that proxy a similar underlying DGP
def deduplicate_correlated(df, threshold=0.85, target=target_variable,
                           metadata=metadata, meta_col=meta_col):
    """Drop one variable from each highly-correlated pair.
    Priority: (1) keep aggregate tickets (numeric suffix % 100 == 0);
              (2) keep the variable with more non-null observations.
    """
    import re

    def _is_agg(ticket):
        nums = re.findall(r'\d+', str(ticket))
        return bool(nums) and int(nums[-1]) % 100 == 0

    agg_set = set(metadata.loc[metadata[meta_col].apply(_is_agg), meta_col])

    X   = df.drop(columns=[target])
    obs = X.notna().sum()
    corr  = X.corr().abs()
    upper = corr.where(np.triu(np.ones(corr.shape), k=1).astype(bool))

    to_drop = set()
    for col in upper.columns:
        if col in to_drop:
            continue
        peers = upper.index[upper[col] > threshold].tolist()
        for peer in peers:
            if peer in to_drop:
                continue
            col_agg  = col  in agg_set
            peer_agg = peer in agg_set
            if col_agg and not peer_agg:
                to_drop.add(peer)
            elif peer_agg and not col_agg:
                to_drop.add(col)
                break
            elif obs[col] >= obs[peer]:
                to_drop.add(peer)
            else:
                to_drop.add(col)
                break

    kept = [c for c in df.columns if c not in to_drop]
    print(f"Removed {len(df.columns) - len(kept)} correlated variables "
          f"(|r| > {threshold}). {len(kept)} remain.")
    return df[kept]

data = deduplicate_correlated(data, threshold=0.98)

# Drop high-publication lag series
tickets_to_keep = metadata[metadata.months_lag <= 5]['ticket'].tolist()
# Keep only these columns (that exist in data)
data = data[[ col for col in data.columns if col in tickets_to_keep or "RGDP" in col ]]

data.to_csv(os.path.join(PATH_OUTPUT, "data_processed.csv"), index=True)
metadata = metadata[metadata.ticket.isin(data.columns)]
metadata.to_csv(os.path.join(PATH_OUTPUT, "meta_processed.csv"), index=True)
# =========================================================
# Reproducibility: print core library versions
# =========================================================
print("================== Library Versions =================")
import sklearn, numpy, pandas
print("scikit-learn:", sklearn.__version__)
print("numpy:", numpy.__version__)
print("pandas:", pandas.__version__)


# Data dimensions (observations and variables)
print("Data dimensions:", data.shape)
print(f"Training dates: {train_start_date.strftime('%Y-%m-%d')} to {(test_start_date - pd.DateOffset(days=1)).strftime('%Y-%m-%d')}")
T1 = data[data.index < test_start_date].shape[0]
T2 = data[(data.index >= test_start_date) & (data.index <= test_end_date)].shape[0]
print(f"Training periods: {T1} ({T1/(T1+T2) :.2%})")
print(f"Testing dates: {test_start_date.strftime('%Y-%m-%d')} to {test_end_date.strftime('%Y-%m-%d')}")
print(f"Testing periods: {T2} ({T2/(T1+T2) :.2%})")
# =========================================================
Key Directories:
Raw inputs: c:\Users\guerr\Dropbox\02_Work\Consulting\2026_NWCST_CAR_Research-Workshop\JAM Workshop\workshop_code/..\raw
Modeling data: c:\Users\guerr\Dropbox\02_Work\Consulting\2026_NWCST_CAR_Research-Workshop\JAM Workshop\workshop_code/..\data
Model outputs: c:\Users\guerr\Dropbox\02_Work\Consulting\2026_NWCST_CAR_Research-Workshop\JAM Workshop\workshop_code/..\output/20260512
# =========================================================
Removed 173 correlated variables (|r| > 0.98). 687 remain.
================== Library Versions =================
scikit-learn: 1.8.0
numpy: 2.4.1
pandas: 2.3.3
Data dimensions: (134, 245)
Training dates: 2015-03-01 to 2023-05-31
Training periods: 99 (76.15%)
Testing dates: 2023-06-01 to 2025-12-01
Testing periods: 31 (23.85%)

Variable Selection

To keep the exercise tractable, we restrict the dataset to six predictors with clear economic intuition for Jamaican GDP: a Google Trends indicator, nighttime lights, Jamaica stock market volume, exports to the US, oil prices, and bank loans. Monthly series are aggregated to a quarterly frequency to match the GDP release schedule.

Feel free to experiment by swapping in other variables from the full dataset.

In [2]:
data = data[['RGDP0000', 'UGTR0000', "UNTL0000", 'REXC0002', 'XEMP0004', 'XOIL0000', 'MLOA0000']]

agg_map = metadata.set_index('ticket')['agg_method'].to_dict()
agg_funcs = {
    col: (agg_map[col] if agg_map[col] in ['mean', 'last', 'sum'] else 'mean')
    for col in data.columns if col in agg_map
}
data = data.resample('QE').agg(agg_funcs)
data = data[data.index>"2015-01-01"]
data.dropna(how='any', axis=0, inplace=True)
data.tail(5)
Out[2]:
RGDP0000 UGTR0000 UNTL0000 REXC0002 XEMP0004 XOIL0000 MLOA0000
date
2024-12-31 1.691583 -0.928915 0.374498 85.541713 5.940967 1.173205 1.089155
2025-03-31 1.086629 -0.558565 2.875776 12.413470 -6.006168 -3.481808 0.724042
2025-06-30 0.255034 -1.378025 1.072993 -7.816600 -3.490598 0.055859 1.003215
2025-09-30 0.350465 6.175978 1.142558 29.273677 11.943489 -0.089192 0.966247
2025-12-31 -7.272064 -3.359553 -17.157254 -24.146770 8.959772 -1.213730 1.277365

With this simplified dataset we now demonstrate the estimation workflow. The same steps (split, fit, predict, evaluate) will apply to (nearly) every other model you estimate below.

In [3]:
train = data[data.index < test_start_date ]
test = data[data.index >= test_start_date ]

X_train = train.drop('RGDP0000', axis=1)
y_train = train['RGDP0000']

X_test = test.drop('RGDP0000', axis=1)
y_obs = test['RGDP0000']

OLS Estimation

In [4]:
from sklearn.linear_model import LinearRegression

model = LinearRegression()
model.fit(X_train, y_train) 
Out[4]:
LinearRegression()

Once the model is fit, generate out-of-sample predictions on the test set:

In [5]:
pred = model.predict(X_test)
pred
Out[5]:
array([ 0.09364087, -1.73781166,  1.22820084, -0.67165895, -0.23315402,
        0.62003632, -0.42694809,  0.66078934, -0.46287158,  3.4027889 ,
       -1.8475537 ])

Compare predicted values against the observed GDP growth rates to see where the model performs well and where it struggles:

In [6]:
Table = pd.DataFrame({'Observed': y_obs, 'OLS predicted': pred}, index=y_obs.index)
# How far off, in relative terms, the predicted value is from the observed value
Table['Percent Error'] = (Table['OLS predicted']/Table['Observed']) - 1
Table
Out[6]:
Observed OLS predicted Percent Error
date
2023-06-30 -0.066758 0.093641 -2.402688
2023-09-30 1.988200 -1.737812 -1.874063
2023-12-31 -0.778219 1.228201 -2.578220
2024-03-31 -0.054704 -0.671659 11.278039
2024-06-30 -0.997453 -0.233154 -0.766251
2024-09-30 -1.055556 0.620036 -1.587403
2024-12-31 1.691583 -0.426948 -1.252396
2025-03-31 1.086629 0.660789 -0.391891
2025-06-30 0.255034 -0.462872 -2.814944
2025-09-30 0.350465 3.402789 8.709343
2025-12-31 -7.272064 -1.847554 -0.745938

Compute the Root Mean Squared Error (RMSE) as a single summary of forecast accuracy. Lower RMSE indicates better out-of-sample performance. Use this value to compare models later.

In [7]:
rmse_ols = np.sqrt(np.mean((np.array(Table['Observed']) - np.array(Table['OLS predicted'])) ** 2))
rmse_ols
Out[7]:
np.float64(2.4426866060397976)
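
Equivalently, the RMSE can be computed with scikit-learn's mean_squared_error (already imported in the setup cell); this is just a cross-check of the manual calculation above:

from sklearn.metrics import mean_squared_error
np.sqrt(mean_squared_error(Table['Observed'], Table['OLS predicted']))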

Machine Learning Models

Now it's your turn. Estimate each model below using the training data and evaluate it on the test data. Use the functions in the Helper Function Reference section as a guide to how each model is constructed internally.

For each model below:

  1. Fit the model on X_train / y_train
  2. Predict on X_test
  3. Build a comparison table of observed vs. predicted values (as done above for OLS)
  4. Compute RMSE and note how it compares to OLS

All models follow the same basic pattern:

model = SomeModel(param1=..., param2=...)
model.fit(X_train, y_train)
pred = model.predict(X_test)
rmse = np.sqrt(np.mean((y_obs - pred) ** 2))

Hints:

  • All sklearn estimators share the .fit() / .predict() interface — only the class name and parameters change.
  • For tree-based models (DT, RF, GBT), the helper functions average predictions across multiple independently fit models. You can simplify this to a single model for the exercise.
  • For LSTM, you will need to use the nowcast_lstm package. Look at fit_lstm and predict_lstm carefully — the data format is different from sklearn.

Helper Function Reference

The code block below shows all the fit_* and predict_* functions used in the full nowcasting pipeline. Read through them carefully: they show exactly how each model is instantiated, what parameters it accepts, and how predictions are extracted. Use them as your blueprint for the exercises that follow.

In [ ]:
# =============================================================
# OLS  —  Ordinary Least Squares
# =============================================================
def fit_ols(ytrain, xtrain, target_variable=target_variable):
    model = LinearRegression()
    return model.fit(xtrain, ytrain), xtrain.columns

def predict_ols(model, X, train_vars, date, pred_dict, l, target_variable=target_variable):
    pred = model.predict(X[train_vars])[0]
    pred_dict[l].append(pred)
    return pred_dict


# =============================================================
# OLS-R  —  Ridge Regression  (L2 penalty, cross-validated α)
# =============================================================
def fit_olsr(ytrain, xtrain, target_variable=target_variable,
             alphas=[0.0001, 0.001, 0.01, 0.1, 1, 10, 20]):
    model = RidgeCV(alphas=alphas)
    return model.fit(xtrain, ytrain), xtrain.columns

def predict_olsr(model, X, train_vars, date, pred_dict, l, target_variable=target_variable):
    pred = model.predict(X[train_vars])[0]
    pred_dict[l].append(pred)
    return pred_dict


# =============================================================
# LASSO  —  L1-penalized Regression  (automatic variable selection)
# =============================================================
def fit_lasso(ytrain, xtrain, target_variable=target_variable, alpha=1e-5):
    model = Lasso(alpha=alpha)
    return model.fit(xtrain, ytrain), xtrain.columns

def predict_lasso(model, X, train_vars, date, pred_dict, l, target_variable=target_variable):
    pred = model.predict(X[train_vars])[0]
    pred_dict[l].append(pred)
    return pred_dict


# =============================================================
# ENET  —  Elastic Net  (L1 + L2 combined penalty)
# =============================================================
def fit_enet(ytrain, xtrain, target_variable=target_variable,
             params={'alpha': 1e-5, 'l1_ratio': 0.25}):
    model = ElasticNet(alpha=params['alpha'], l1_ratio=params['l1_ratio'])
    return model.fit(xtrain, ytrain), xtrain.columns

def predict_enet(model, X, train_vars, date, pred_dict, l, target_variable=target_variable):
    pred = model.predict(X[train_vars])[0]
    pred_dict[l].append(pred)
    return pred_dict


# =============================================================
# DT  —  Decision Tree  (ensemble of ModelN independent trees)
# =============================================================
def fit_dt(ytrain, xtrain, target_variable=target_variable, ModelN=200):
    models = []
    for _ in range(ModelN):
        model = DecisionTreeRegressor(
            criterion='absolute_error',
            min_samples_split=6,
            min_samples_leaf=2,
        )
        model.fit(xtrain, ytrain)
        models.append(model)
    return models, xtrain.columns

def predict_dt(model, X, train_vars, date, pred_dict, l, target_variable=target_variable):
    preds = [m.predict(X[train_vars])[0] for m in model]
    pred_dict[l].append(np.nanmean(preds))
    return pred_dict


# =============================================================
# RF  —  Random Forest  (ensemble of ModelN independent forests)
# =============================================================
def fit_rf(ytrain, xtrain, target_variable=target_variable,
           ModelN=200, n_estimators=130):
    models = []
    for _ in range(ModelN):
        model = RandomForestRegressor(
            n_estimators=n_estimators,
            criterion='absolute_error',
            max_features=18,
            min_samples_split=4,
            min_samples_leaf=2,
        )
        model.fit(xtrain, ytrain)
        models.append(model)
    return models, xtrain.columns

def predict_rf(model, X, train_vars, date, pred_dict, l, target_variable=target_variable):
    preds = [m.predict(X[train_vars])[0] for m in model]
    pred_dict[l].append(np.nanmean(preds))
    return pred_dict


# =============================================================
# GBT  —  Gradient Boosting Tree  (sequential ensemble)
# =============================================================
def fit_gbt(ytrain, xtrain, target_variable=target_variable,
            ModelN=200, n_estimators=100, learning=0.15):
    models = []
    for _ in range(ModelN):
        model = GradientBoostingRegressor(
            n_estimators=n_estimators,
            learning_rate=learning,
            loss='absolute_error',
            min_samples_split=6,
            min_samples_leaf=3,
        )
        model.fit(xtrain, ytrain)
        models.append(model)
    return models, xtrain.columns

def predict_gbt(model, X, train_vars, date, pred_dict, l, target_variable=target_variable):
    preds = [m.predict(X[train_vars])[0] for m in model]
    pred_dict[l].append(np.nanmean(preds))
    return pred_dict


# =============================================================
# LSTM  —  Long Short-Term Memory  (recurrent neural network)
# =============================================================
def fit_lstm(ttrain, gdp_lags=0, target_variable=target_variable,
             params={
                 "n_timesteps": 12,
                 "fill_na_func": np.nanmean,
                 "fill_ragged_edges_func": np.nanmean,
                 "n_models": 100,
                 "train_episodes": 100,
                 "batch_size": 50,
                 "decay": 0.98,
                 "n_hidden": 10,
                 "n_layers": 1,
                 "dropout": 0.0,
                 "criterion": torch.nn.MSELoss(),
                 "optimizer": torch.optim.Adam,
                 "optimizer_parameters": {"lr": 1e-2, "weight_decay": 0.0},
             }):
    train = lagged_target(ttrain, gdp_lags)
    model = LSTM(data=train, target_variable=target_variable, **params)
    model.train(quiet=True)
    return model, train.drop(["date", target_variable], axis=1).columns

def predict_lstm(model, X, train_vars, date, pred_dict, l,
                 gdp_lags=0, target_variable=target_variable):
    X = lagged_target(X, gdp_lags)
    pred = model.predict(X).loc[lambda x: x.date == date, "predictions"].values[0]
    pred_dict[l].append(pred)
    return pred_dict

LASSO

LASSO (Least Absolute Shrinkage and Selection Operator) adds an L1 penalty to the OLS objective. Unlike Ridge, the L1 penalty tends to push small coefficients to exactly zero, effectively performing automatic variable selection. This is useful when only a subset of the many available predictors are truly informative.

Key parameter:

Parameter Default Effect
alpha 1e-5 Strength of the L1 penalty. Higher alpha → more coefficients shrunk to zero → sparser, simpler model. Lower alpha → fewer variables dropped, approaches plain OLS.

Setting alpha too high will zero out too many predictors and underfit; too low and regularization has no practical effect. Try values across several orders of magnitude (e.g., 1e-4, 1e-3, 0.01, 0.1) to see how the model changes.

To implement: use Lasso(alpha=...), then .fit() and .predict(). Inspect .coef_ to see which coefficients were zeroed out.

In [8]:
from sklearn.linear_model import Lasso

model_lasso = Lasso(alpha=1e-5)
model_lasso.fit(X_train, y_train)

pred_lasso = model_lasso.predict(X_test)

Table_lasso = pd.DataFrame({'Observed': y_obs, 'LASSO predicted': pred_lasso}, index=y_obs.index)
Table_lasso['Percent Error'] = (Table_lasso['LASSO predicted'] / Table_lasso['Observed']) - 1

rmse_lasso = np.sqrt(np.mean((Table_lasso['Observed'] - Table_lasso['LASSO predicted']) ** 2))
print(f"LASSO RMSE : {rmse_lasso:.4f}")

# Inspect which coefficients LASSO set to zero (variable selection)
coef_lasso = pd.Series(model_lasso.coef_, index=X_train.columns)
print(f"Coefficients set to zero: {(coef_lasso == 0).sum()} / {len(coef_lasso)}")

Table_lasso
LASSO RMSE : 2.4427
Coefficients set to zero: 0 / 6
Out[8]:
Observed LASSO predicted Percent Error
date
2023-06-30 -0.066758 0.093648 -2.402792
2023-09-30 1.988200 -1.737806 -1.874060
2023-12-31 -0.778219 1.228207 -2.578228
2024-03-31 -0.054704 -0.671645 11.277789
2024-06-30 -0.997453 -0.233140 -0.766264
2024-09-30 -1.055556 0.620043 -1.587409
2024-12-31 1.691583 -0.426943 -1.252392
2025-03-31 1.086629 0.660796 -0.391884
2025-06-30 0.255034 -0.462865 -2.814916
2025-09-30 0.350465 3.402779 8.709314
2025-12-31 -7.272064 -1.847534 -0.745941
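
To follow up on the alpha suggestion above, a minimal sketch of a sweep over a few illustrative penalty values (reusing the same X_train/X_test split) shows how the RMSE and the number of zeroed coefficients respond:

for a in [1e-4, 1e-3, 1e-2, 1e-1]:
    m = Lasso(alpha=a).fit(X_train, y_train)
    rmse_a = np.sqrt(np.mean((y_obs.values - m.predict(X_test)) ** 2))
    print(f"alpha={a:<6g}  RMSE={rmse_a:.4f}  zeroed coefficients={(m.coef_ == 0).sum()}/{len(m.coef_)}")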

OLS Ridge (Ridge Regression)

Ridge regression adds an L2 penalty to the OLS objective, which shrinks all coefficients toward zero without setting any of them exactly to zero. It retains all predictors but imposes a cost on large coefficients, trading off some bias for lower variance. It is particularly effective when many predictors each contribute small but non-zero effects (as is typical in macro nowcasting with many correlated indicators).

The helper function uses RidgeCV, which selects the optimal penalty strength automatically via cross-validation over a candidate grid.

Key parameter:

Parameter Default Effect
alphas [0.0001, 0.001, 0.01, 0.1, 1, 10, 20] Grid of penalty strengths to search over. Larger candidates impose stronger shrinkage; smaller candidates approach plain OLS. A wider or denser grid gives a more thorough search at the cost of computation.

To implement: use RidgeCV(alphas=[...]), then .fit() and .predict() as before. The fitted model's .alpha_ attribute shows which penalty was chosen by cross-validation.

In [9]:
from sklearn.linear_model import RidgeCV

model_ridge = RidgeCV(alphas=[0.0001, 0.001, 0.01, 0.1, 1, 10, 20])
model_ridge.fit(X_train, y_train)

pred_ridge = model_ridge.predict(X_test)

Table_ridge = pd.DataFrame({'Observed': y_obs, 'Ridge predicted': pred_ridge}, index=y_obs.index)
Table_ridge['Percent Error'] = (Table_ridge['Ridge predicted'] / Table_ridge['Observed']) - 1

rmse_ridge = np.sqrt(np.mean((Table_ridge['Observed'] - Table_ridge['Ridge predicted']) ** 2))
print(f"Ridge RMSE    : {rmse_ridge:.4f}")
print(f"Selected alpha: {model_ridge.alpha_}")  # penalty chosen by cross-validation

Table_ridge
Ridge RMSE    : 2.4538
Selected alpha: 20.0
Out[9]:
Observed Ridge predicted Percent Error
date
2023-06-30 -0.066758 0.046555 -1.697362
2023-09-30 1.988200 -1.517953 -1.763481
2023-12-31 -0.778219 1.305987 -2.678175
2024-03-31 -0.054704 -0.455791 7.331936
2024-06-30 -0.997453 -0.036049 -0.963859
2024-09-30 -1.055556 0.737153 -1.698355
2024-12-31 1.691583 -0.296833 -1.175476
2025-03-31 1.086629 0.766231 -0.294855
2025-06-30 0.255034 -0.312660 -2.225956
2025-09-30 0.350465 3.036095 7.663038
2025-12-31 -7.272064 -1.467452 -0.798207
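
As noted above, a denser candidate grid gives RidgeCV a more thorough search. A quick sketch using a log-spaced grid (the range and number of points are illustrative):

model_ridge_dense = RidgeCV(alphas=np.logspace(-4, 2, 25))
model_ridge_dense.fit(X_train, y_train)
print(f"Selected alpha: {model_ridge_dense.alpha_:.4g}")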

Elastic Net (ENET)

Elastic Net combines L1 (LASSO) and L2 (Ridge) penalties into a single regularization term. The L1 component performs variable selection; the L2 component stabilizes estimation when predictors are highly correlated with each other (Ridge tends to keep correlated variables together rather than arbitrarily dropping one). Elastic Net is a flexible middle ground: it can be sparse like LASSO while remaining stable like Ridge.

Key parameters:

Parameter Default Effect
alpha 1e-5 Overall regularization strength. Higher → more shrinkage toward zero for all coefficients.
l1_ratio 0.25 Mix between L1 and L2 penalties. l1_ratio=1 → pure LASSO (sparse); l1_ratio=0 → pure Ridge (all variables retained). Values between blend both.

Increasing l1_ratio produces sparser models; decreasing it produces smoother, more stable estimates when predictors are collinear. A good starting point is to vary l1_ratio across [0.1, 0.25, 0.5, 0.75, 0.9] to understand how the penalty mix affects which variables survive.

To implement: use ElasticNet(alpha=..., l1_ratio=...), then .fit() and .predict().

In [10]:
from sklearn.linear_model import ElasticNet

model_enet = ElasticNet(alpha=1e-5, l1_ratio=0.25)
model_enet.fit(X_train, y_train)

pred_enet = model_enet.predict(X_test)

Table_enet = pd.DataFrame({'Observed': y_obs, 'ENET predicted': pred_enet}, index=y_obs.index)
Table_enet['Percent Error'] = (Table_enet['ENET predicted'] / Table_enet['Observed']) - 1

rmse_enet = np.sqrt(np.mean((Table_enet['Observed'] - Table_enet['ENET predicted']) ** 2))
print(f"ENET RMSE : {rmse_enet:.4f}")

# l1_ratio=0.25 means 25% LASSO + 75% Ridge penalty mix
coef_enet = pd.Series(model_enet.coef_, index=X_train.columns)
print(f"Coefficients set to zero: {(coef_enet == 0).sum()} / {len(coef_enet)}")

Table_enet
ENET RMSE : 2.4427
Coefficients set to zero: 0 / 6
Out[10]:
Observed ENET predicted Percent Error
date
2023-06-30 -0.066758 0.093644 -2.402729
2023-09-30 1.988200 -1.737808 -1.874061
2023-12-31 -0.778219 1.228204 -2.578225
2024-03-31 -0.054704 -0.671651 11.277895
2024-06-30 -0.997453 -0.233147 -0.766258
2024-09-30 -1.055556 0.620041 -1.587407
2024-12-31 1.691583 -0.426945 -1.252394
2025-03-31 1.086629 0.660794 -0.391887
2025-06-30 0.255034 -0.462867 -2.814927
2025-09-30 0.350465 3.402782 8.709324
2025-12-31 -7.272064 -1.847543 -0.745940
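
To explore the penalty mix suggested above, a small sketch sweeping illustrative l1_ratio values on the same split:

for r in [0.1, 0.25, 0.5, 0.75, 0.9]:
    m = ElasticNet(alpha=1e-5, l1_ratio=r).fit(X_train, y_train)
    rmse_r = np.sqrt(np.mean((y_obs.values - m.predict(X_test)) ** 2))
    print(f"l1_ratio={r:<5}  RMSE={rmse_r:.4f}  zeroed coefficients={(m.coef_ == 0).sum()}/{len(m.coef_)}")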

Decision Tree (DT)

A decision tree recursively partitions the feature space into rectangular regions and predicts the mean outcome within each region. It is non-linear and can capture interactions between predictors without any explicit specification. However, a single tree tends to overfit (it memorizes the training data rather than learning generalizable patterns). The helper function addresses this by averaging predictions across ModelN independently trained trees (a simple ensemble).

Key parameters:

Parameter Default Effect
ModelN 200 Number of independently fit trees to average. Higher → more stable predictions; lower → faster but noisier.
min_samples_split 6 Minimum number of observations required to split an internal node. Higher → shallower trees, less overfitting.
min_samples_leaf 2 Minimum number of observations required in a leaf node. Higher → simpler trees, more regularization.
criterion "absolute_error" Loss function used to evaluate candidate splits. "absolute_error" (MAE-based) is more robust to outliers than "squared_error" (MSE-based).

Increasing min_samples_split and min_samples_leaf prevents trees from fitting to noise. For this exercise, you can start with a single tree (ModelN=1) and then try averaging a few.

To implement: use DecisionTreeRegressor(criterion=..., min_samples_split=..., min_samples_leaf=...), then .fit() and .predict().

In [11]:
from sklearn.tree import DecisionTreeRegressor

model_dt = DecisionTreeRegressor(
    criterion='absolute_error',
    min_samples_split=6,
    min_samples_leaf=2,
)
model_dt.fit(X_train, y_train)

pred_dt = model_dt.predict(X_test)

Table_dt = pd.DataFrame({'Observed': y_obs, 'DT predicted': pred_dt}, index=y_obs.index)
Table_dt['Percent Error'] = (Table_dt['DT predicted'] / Table_dt['Observed']) - 1

rmse_dt = np.sqrt(np.mean((Table_dt['Observed'] - Table_dt['DT predicted']) ** 2))
print(f"DT RMSE : {rmse_dt:.4f}")

Table_dt
DT RMSE : 2.9732
Out[11]:
Observed DT predicted Percent Error
date
2023-06-30 -0.066758 1.917136 -29.717618
2023-09-30 1.988200 0.268370 -0.865019
2023-12-31 -0.778219 1.917136 -3.463491
2024-03-31 -0.054704 1.917136 -36.045561
2024-06-30 -0.997453 0.481795 -1.483026
2024-09-30 -1.055556 1.917136 -2.816233
2024-12-31 1.691583 1.917136 0.133338
2025-03-31 1.086629 1.917136 0.764296
2025-06-30 0.255034 0.376716 0.477125
2025-09-30 0.350465 1.917136 4.470256
2025-12-31 -7.272064 0.787645 -1.108311
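
A single tree can be noisy. A minimal sketch of the averaging idea used by fit_dt/predict_dt, here with only 10 trees rather than the pipeline's ModelN=200:

preds_dt = []
for _ in range(10):
    m = DecisionTreeRegressor(criterion='absolute_error', min_samples_split=6, min_samples_leaf=2)
    m.fit(X_train, y_train)
    preds_dt.append(m.predict(X_test))
pred_dt_avg = np.mean(preds_dt, axis=0)   # average the 10 prediction vectors
print(f"DT (10-tree average) RMSE : {np.sqrt(np.mean((y_obs.values - pred_dt_avg) ** 2)):.4f}")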

Random Forest (RF)

Random Forest builds an ensemble of decision trees where each tree is trained on a bootstrap sample of the data and considers only a random subset of features at each split. This double randomization de-correlates the individual trees, making the ensemble far more robust to overfitting than any single tree. Predictions are averaged across all trees in the forest. The helper function goes one step further by averaging across ModelN independently trained forests.

Key parameters:

Parameter Default Effect
ModelN 200 Number of independently fit forests to average. More → more stable predictions, slower.
n_estimators 130 Number of trees per forest. More trees → lower variance within each forest; beyond ~200 the gain is marginal.
max_features 18 Number of features randomly considered at each split. Lower → more diversity across trees (less correlated predictions), at the cost of potentially weaker individual trees.
min_samples_split 4 Minimum observations to split a node. Higher → shallower, more regularized trees.
min_samples_leaf 2 Minimum observations in a leaf. Higher → simpler trees.

The most impactful parameters are n_estimators (diminishing returns above ~100) and max_features (the primary source of tree diversity). For the exercise, a single RandomForestRegressor with moderate n_estimators is sufficient.

To implement: use RandomForestRegressor(n_estimators=..., max_features=..., ...), then .fit() and .predict().

In [14]:
from sklearn.ensemble import RandomForestRegressor

model_rf = RandomForestRegressor(
    n_estimators=100,
    criterion='absolute_error',
    min_samples_split=4,
    min_samples_leaf=2,
)
model_rf.fit(X_train, y_train)

pred_rf = model_rf.predict(X_test)

Table_rf = pd.DataFrame({'Observed': y_obs, 'RF predicted': pred_rf}, index=y_obs.index)
Table_rf['Percent Error'] = (Table_rf['RF predicted'] / Table_rf['Observed']) - 1

rmse_rf = np.sqrt(np.mean((Table_rf['Observed'] - Table_rf['RF predicted']) ** 2))
print(f"RF RMSE : {rmse_rf:.4f}")

# Feature importances: how much each predictor contributed to splits
importances_rf = pd.Series(model_rf.feature_importances_, index=X_train.columns).sort_values(ascending=False)
print(f"\nFeature importances:\n{importances_rf.round(3)}")

Table_rf
RF RMSE : 2.5753

Feature importances:
UGTR0000    0.225
XOIL0000    0.195
XEMP0004    0.183
MLOA0000    0.179
REXC0002    0.111
UNTL0000    0.106
dtype: float64
Out[14]:
Observed RF predicted Percent Error
date
2023-06-30 -0.066758 0.509454 -8.631331
2023-09-30 1.988200 0.412047 -0.792754
2023-12-31 -0.778219 1.345986 -2.729572
2024-03-31 -0.054704 -0.395030 6.221216
2024-06-30 -0.997453 0.430901 -1.432002
2024-09-30 -1.055556 -0.382904 -0.637249
2024-12-31 1.691583 1.159510 -0.314541
2025-03-31 1.086629 1.340008 0.233179
2025-06-30 0.255034 0.524117 1.055091
2025-09-30 0.350465 2.285074 5.520113
2025-12-31 -7.272064 0.399425 -1.054926
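
To see the effect of split-level feature sampling discussed above, a small sketch sweeping illustrative max_features values (the simplified dataset has only six predictors; criterion is left at its default here for speed):

for k in [2, 4, 6]:
    m = RandomForestRegressor(n_estimators=100, max_features=k,
                              min_samples_split=4, min_samples_leaf=2)
    m.fit(X_train, y_train)
    rmse_k = np.sqrt(np.mean((y_obs.values - m.predict(X_test)) ** 2))
    print(f"max_features={k}  RMSE={rmse_k:.4f}")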

Gradient Boosting Tree (GBT)

Gradient Boosting builds an ensemble of trees sequentially: each new tree is fit to the residual errors of the current ensemble, so the model corrects its own mistakes stage by stage. Unlike Random Forest (which averages independent trees), GBT fits trees in an additive, stage-wise manner and tends to achieve better accuracy (but it is also prone to overfitting if not properly regularized). The helper function averages predictions across ModelN independently trained boosted ensembles.

Key parameters:

Parameter Default Effect
ModelN 200 Number of independently fit GBT ensembles to average. More → more stable, slower.
n_estimators 100 Number of boosting stages (sequential trees). More stages → lower training error but higher risk of overfitting.
learning_rate 0.15 Shrinkage applied to each tree's contribution. Lower → each tree contributes less, requiring more stages; acts as regularization. Typical values: 0.01–0.2.
min_samples_split 6 Minimum observations to split a node. Higher → shallower trees, less overfitting.
min_samples_leaf 3 Minimum observations in a leaf. Higher → simpler trees.

There is a classic trade-off between learning_rate and n_estimators: lower learning rates require more boosting stages to achieve good fit, but they generalize better. A good rule of thumb is to lower the learning rate and increase n_estimators proportionally.

To implement: use GradientBoostingRegressor(n_estimators=..., learning_rate=..., ...), then .fit() and .predict().

In [15]:
from sklearn.ensemble import GradientBoostingRegressor

model_gbt = GradientBoostingRegressor(
    n_estimators=100,
    learning_rate=0.15,
    loss='absolute_error',
    min_samples_split=6,
    min_samples_leaf=3,
)
model_gbt.fit(X_train, y_train)

pred_gbt = model_gbt.predict(X_test)

Table_gbt = pd.DataFrame({'Observed': y_obs, 'GBT predicted': pred_gbt}, index=y_obs.index)
Table_gbt['Percent Error'] = (Table_gbt['GBT predicted'] / Table_gbt['Observed']) - 1

rmse_gbt = np.sqrt(np.mean((Table_gbt['Observed'] - Table_gbt['GBT predicted']) ** 2))
print(f"GBT RMSE : {rmse_gbt:.4f}")

# Feature importances: accumulated gain from splits on each predictor
importances_gbt = pd.Series(model_gbt.feature_importances_, index=X_train.columns).sort_values(ascending=False)
print(f"\nFeature importances:\n{importances_gbt.round(3)}")

Table_gbt
GBT RMSE : 2.4270

Feature importances:
XOIL0000    0.241
MLOA0000    0.236
UGTR0000    0.210
XEMP0004    0.159
UNTL0000    0.108
REXC0002    0.046
dtype: float64
Out[15]:
Observed GBT predicted Percent Error
date
2023-06-30 -0.066758 1.254383 -19.789948
2023-09-30 1.988200 1.140788 -0.426221
2023-12-31 -0.778219 0.971018 -2.247744
2024-03-31 -0.054704 1.280807 -24.413364
2024-06-30 -0.997453 0.080192 -1.080396
2024-09-30 -1.055556 1.270263 -2.203406
2024-12-31 1.691583 0.452253 -0.732645
2025-03-31 1.086629 1.297433 0.193999
2025-06-30 0.255034 0.488184 0.914195
2025-09-30 0.350465 0.752408 1.146883
2025-12-31 -7.272064 -0.263602 -0.963751
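
A quick way to see the learning_rate / n_estimators trade-off described above is to try a few illustrative pairs where lowering the rate is offset by more boosting stages:

for lr, n in [(0.15, 100), (0.05, 300), (0.015, 1000)]:
    m = GradientBoostingRegressor(n_estimators=n, learning_rate=lr, loss='absolute_error',
                                  min_samples_split=6, min_samples_leaf=3)
    m.fit(X_train, y_train)
    rmse_p = np.sqrt(np.mean((y_obs.values - m.predict(X_test)) ** 2))
    print(f"learning_rate={lr:<5}  n_estimators={n:<4}  RMSE={rmse_p:.4f}")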

Long Short-Term Memory (LSTM)

LSTM is a type of recurrent neural network designed to capture temporal dependencies across sequential data. Unlike the regression models above, the LSTM processes a window of past observations at each time step and learns which historical patterns are predictive of GDP growth. The nowcast_lstm package handles the ragged-edge structure of real-time data, where different indicators are released at different times within a quarter.

Key parameters:

Parameter Default Effect
n_models 100 Number of LSTM models trained and averaged (ensemble). More → more stable predictions, much slower training.
train_episodes 100 Training epochs per model. More → better convergence; too many can overfit on small samples.
n_timesteps 12 Length of the historical window (periods) fed to the LSTM at each step. Longer → captures more temporal dynamics; requires more data to estimate reliably.
n_hidden 10 Number of hidden units in the LSTM cell. More → higher model capacity, higher overfitting risk on small samples.
n_layers 1 Number of stacked LSTM layers. Deeper → can learn more complex dynamics; harder to train with short time series.
learning_rate (lr) 1e-2 Step size for the Adam optimizer. Too high → unstable training; too low → slow convergence.
decay 0.98 Learning rate decay per epoch. Values near 1 → very slow decay; smaller values → faster decay.
dropout 0.0 Fraction of hidden units randomly dropped during training. Useful regularization when n_hidden is large.
batch_size 50 Samples per gradient update. Smaller → noisier but often better-generalizing gradients.

For small macro datasets like this one, keep n_hidden and n_layers small to avoid overfitting. The most impactful parameters to experiment with are n_models, train_episodes, and n_timesteps.

Note: the LSTM expects data in a specific format with a date column. Study fit_lstm and predict_lstm in the helper reference above before implementing.

In [13]:
# ── 1. Prepare data ───────────────────────────────────────────────────────────
# LSTM expects 'date' as a column, not an index.
train_lstm = train.reset_index()          # columns: date, RGDP0000, ...
test_lstm  = test.reset_index()
# The LSTM needs the full history (train + test), not just the test subset:
# each prediction uses the previous n_timesteps observations.
full_lstm  = pd.concat([train_lstm, test_lstm]).reset_index(drop=True)

# ── 2. Parameters ─────────────────────────────────────────────────────────────
# n_timesteps=4 → 1 year of quarterly history per sequence (appropriate for
# short macro panels; use 8–12 with longer monthly datasets).
# n_models/train_episodes are kept small here for exercise speed.
# In production (full pipeline) use n_models=50–100, train_episodes=100+.
lstm_params = {
    "n_timesteps"            : 4,
    "fill_na_func"           : np.nanmean,
    "fill_ragged_edges_func" : np.nanmean,
    "n_models"               : 5,
    "train_episodes"         : 50,
    "batch_size"             : 30,
    "decay"                  : 0.98,
    "n_hidden"               : 8,
    "n_layers"               : 1,
    "dropout"                : 0.0,
    "criterion"              : torch.nn.MSELoss(),
    "optimizer"              : torch.optim.Adam,
    "optimizer_parameters"   : {"lr": 1e-2, "weight_decay": 0.0},
}

# ── 3. Fit ────────────────────────────────────────────────────────────────────
lstm_model = LSTM(data=train_lstm, target_variable=target_variable, **lstm_params)
lstm_model.train(quiet=False)

# ── 4. Predict ────────────────────────────────────────────────────────────────
# Pass the full dataset so the model can generate predictions for every date.
# Then filter to the test window.
preds_df   = lstm_model.predict(full_lstm)
test_preds = (preds_df[preds_df["date"].isin(test_lstm["date"])]
              .set_index("date")["predictions"]
              .values)

# ── 5. Evaluate ───────────────────────────────────────────────────────────────
Table_lstm = pd.DataFrame(
    {"Observed": y_obs, "LSTM predicted": test_preds},
    index=y_obs.index,
)
Table_lstm["Percent Error"] = (Table_lstm["LSTM predicted"] / Table_lstm["Observed"]) - 1

rmse_lstm = np.sqrt(np.mean((y_obs.values - test_preds) ** 2))
print(f"LSTM RMSE : {rmse_lstm:.4f}")
Table_lstm
Training model 1
step :  0 loss :  14.040483474731445
step :  1 loss :  13.783120155334473
...
step :  49 loss :  7.150623798370361
Training model 2
...
Training model 3
...
Training model 4
...
Training model 5
...
step :  49 loss :  7.295129776000977
LSTM RMSE : 2.6384
OLS  RMSE : 2.4427  (baseline)
Out[13]:
Observed LSTM predicted Percent Error
date
2023-06-30 -0.066758 0.697591 -11.449530
2023-09-30 1.988200 1.455597 -0.267882
2023-12-31 -0.778219 0.840349 -2.079836
2024-03-31 -0.054704 1.516490 -28.721686
2024-06-30 -0.997453 0.535358 -1.536726
2024-09-30 -1.055556 0.714315 -1.676719
2024-12-31 1.691583 1.502319 -0.111886
2025-03-31 1.086629 1.165029 0.072150
2025-06-30 0.255034 0.598481 1.346674
2025-09-30 0.350465 0.695358 0.984098
2025-12-31 -7.272064 0.781064 -1.107406
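
As a final step, the RMSEs computed above can be collected into a single ranking (a sketch, assuming the rmse_* variables from the cells above, including rmse_ols from the OLS cell, are still in memory):

rmse_summary = pd.Series({
    "OLS": rmse_ols, "LASSO": rmse_lasso, "Ridge": rmse_ridge, "ENET": rmse_enet,
    "DT": rmse_dt, "RF": rmse_rf, "GBT": rmse_gbt, "LSTM": rmse_lstm,
}, name="RMSE").sort_values()
print(rmse_summary.round(4))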