Variable Selection and First Nowcast¶

This notebook steps up in complexity.

This activity will use Python to perform variable selection in a dataset and estimate a simple Nowcasting model based on OLS. We will:

  • Load the full dataset
  • Estimate LARS and compare it with OLS
  • Nowcast GDP with OLS and several information sets

Make sure the data folder is in the same directory as this notebook.

This notebook has parts of the code you need to do the tasks above. Follow the notes to complete or write the pieces of code you need to finish.

Load Libraries¶

First, we import the libraries and check that they load correctly. If any import fails, install the missing package via conda or pip.

You may need to install some of the following:

conda install scikit-learn --yes

conda install matplotlib --yes

conda install statsmodels --yes

In [1]:
import pandas as pd                          # pandas: the main library for tabular data (DataFrames)
import numpy as np                           # numpy: fast math and array operations
import matplotlib.pyplot as plt              # pyplot: plotting library (like ggplot in R)

from sklearn.linear_model import LinearRegression   # OLS regression from scikit-learn
from statsmodels.tsa.seasonal import seasonal_decompose  # decompose a series into trend + seasonal + residual

from sklearn.decomposition import PCA               # Principal Component Analysis
from sklearn.preprocessing import StandardScaler    # standardize features to mean=0, std=1
from sklearn.metrics import mean_squared_error      # computes MSE between predictions and actuals
from sklearn.model_selection import train_test_split  # utility to split data into train/test sets

from copy import deepcopy                    # deepcopy creates a fully independent copy of an object
from sklearn.linear_model import Lars        # LARS: Least Angle Regression for variable selection

scaler = StandardScaler()                    # create a scaler object (will be used later to standardize data)

target_variable = "RGDP0000"                # string: the column name for Jamaica real GDP (our Y variable)

Load the data¶

We are first going to define the extent of our training/test samples.

In [2]:
# ---------------------------------------------------------
# 1) DEFINE TRAIN / TEST WINDOW
# ---------------------------------------------------------
# These dates define the sample splits used for model training and evaluation.
# - Training sample:   [train_start_date, test_start_date)
# - Test sample:       [test_start_date, test_end_date]
train_start_date = "2015-03-01"   # strings in ISO format: "YYYY-MM-DD"
test_start_date  = "2023-06-01"

# pd.to_datetime() converts a string into a pandas Timestamp (a proper date object)
train_start_date = pd.to_datetime(train_start_date)
test_start_date  = pd.to_datetime(test_start_date)

# ---------------------------------------------------------
# 2) DEFINE OUT-OF-SAMPLE FORECAST TARGET (LAST DATE / VINTAGE)
# ---------------------------------------------------------
# This is the date for which you want to produce the final nowcast/forecast.
# Use the first day of the month (or the appropriate GDP reference month).
desired_date = pd.to_datetime("2026-03-01")

Read the data and clean it (seasonality and stationarity).

In [3]:
# pd.read_csv() loads a CSV file into a DataFrame; parse_dates=True converts date strings automatically
# index_col='date' sets the 'date' column as the row index instead of a plain column
data = pd.read_csv('../data/data.csv', parse_dates=True, index_col='date')

# .dropna() removes rows where GDP is NaN; .index.max() returns the latest date remaining
test_end_date = data[target_variable].dropna().index.max()

data.head(2)   # .head(n) shows the first n rows — useful for a quick sanity check
Out[3]:
RGDP0000 UMBL0000 MLIA1001 MLIA1002 MLIA1003 MLIA1004 MLIA1000 MLIA0003 MLIA0002 MLIA0004 ... UNTL1001 UNTL1002 UNTL1003 UNTL1004 UNTL1005 UNTL1006 UNTL1007 UNTL1008 UNTL0000 UGTR0000
date
2015-02-01 NaN 54304.080413 31890.278 135559.643 249451.351 116008.373 532909.645 52036.644 1501.182 14318.718 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 32.586207
2015-03-01 376071.0 55526.300906 29976.701 127010.461 245889.630 112828.497 515705.289 56535.456 588.502 13734.467 ... 19.581647 23.877658 20.174278 13.7625 8.753688 30.49385 9.522143 30.088176 1.674579 32.607143

2 rows × 1004 columns

COMPLEX SEASONALITY AND GROWTH RATES

Our dataset has mixed frequencies, and this time we will start accounting for them.

We are going to apply seasonality filters based on the data frequency. To do so, load meta.csv, a file that describes all variables in the data.

In [4]:
metadata = pd.read_csv('../data/meta.csv')   # load the metadata CSV as a separate DataFrame
# The column with variable names
meta_col = 'ticket'   # 'ticket' is the column in metadata that holds the variable name/code

metadata   # displaying a variable on the last line of a cell prints it in the notebook output
Out[4]:
series ticket description description 2 freq first_date last_date No Obs months_lag nominal sa unstructured type agg_method file_name
0 gdp RGDP0000 Gross Domestic Product, Constant Prices and Se... NaN QE 3/1/2015 12/1/2025 44 5 0.0 1.0 0.0 R last boj_gdp.csv
1 blk_ntl UMBL0000 Black Marble Night Time Lights (Monthly) NaN MS 1/1/2012 12/1/2025 168 5 0.0 0.0 1.0 U sum blk_ntl.csv
2 bank_liabilities_cheques_in_course_of_payment MLIA0001 Cheques in clearing not yet settled NaN MS 1/1/1991 11/1/2025 419 6 NaN 0.0 0.0 M mean boj_bank_liabilities.csv
3 bank_liabilities_deposits_central_gvt MLIA1001 Central government deposits held by banks NaN MS 1/1/1991 11/1/2025 419 6 NaN 0.0 0.0 M mean boj_bank_liabilities.csv
4 bank_liabilities_deposits_demand MLIA1002 Deposits withdrawable on demand NaN MS 1/1/1991 11/1/2025 419 6 NaN 0.0 0.0 M mean boj_bank_liabilities.csv
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1010 exp_uk XEMP0003 Exports UK NaN MS 1/1/2018 12/1/2025 96 5 0.0 0.0 0.0 X sum uk_trade.csv
1011 imp_uk XIMP0003 Imports UK NaN MS 1/1/2018 12/1/2025 96 5 0.0 0.0 0.0 X sum uk_trade.csv
1012 USA_exp_JM XEMP0004 Exports USA NaN MS 1/1/1985 2/1/2026 494 3 0.0 0.0 0.0 X sum uscensus_trade.csv
1013 USA_imp_JM XIMP0004 Imports USA NaN MS 1/1/1985 2/1/2026 494 3 0.0 0.0 0.0 X sum uscensus_trade.csv
1014 gtrends_mean UGTR0000 Google Trends Mean NaN MS 1/1/2004 4/1/2026 268 1 0.0 0.0 1.0 U mean gtrends.csv

1015 rows × 15 columns

The metadata describes which variables are real vs nominal. Use this information to deflate.
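
Concretely, the transformation applied below is

$$\text{Real}_t \;=\; \frac{\text{Nominal}_t}{\text{CPI}_t}\times \overline{\text{CPI}}_{2019},$$

where $\overline{\text{CPI}}_{2019}$ is the average CPI level over 2019 (STATINJA base, 2019 = 100).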

In [5]:
cpi = 'RCPI0000'   # string variable holding the CPI column name
# =========================================================
# Deflate nominal variables using CPI (base year: 2019 average, STATINJA 2019=100)
# =========================================================
# Many series are nominal; deflating them puts everything in real terms.
# We compute a CPI base as the average CPI level in 2019 (STATINJA 2019=100 → base = 100).
base = data[cpi][data.index.year == 2019].mean()   # base: average CPI level over 2019 (STATINJA 2019=100)
# data.index.year == 2019 is a boolean mask; it selects only rows where the year is 2019

# metadata.loc[condition][column].values extracts matching column values as a numpy array
nominal = metadata.loc[metadata["nominal"] == 1][meta_col].values

# Deflate: Real_t = Nominal_t / CPI_t * CPI_base
for column in data.columns:       # loop over every column name in the DataFrame
    if column in nominal:         # 'in' checks whether a value exists in a list/array
        data[column] = data[column] / data[cpi] * base    # divide by CPI, rescale to the base

# .drop() removes columns (axis=1) or rows (axis=0); inplace=True modifies the DataFrame directly
data.drop(columns=[cpi], inplace=True)
# Drop GDP components, keeping only the aggregate target
data = data.drop(columns=[col for col in data.columns if 'RGDP' in col and col != 'RGDP0000'])
# list comprehension: builds a list of column names where 'RGDP' appears but keeping the target

Here we address seasonality with different filters depending on whether data is monthly, weekly, daily, or quarterly.
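
Each non-adjusted series is decomposed additively,

$$x_t = T_t + S_t + \varepsilon_t, \qquad x_t^{SA} = x_t - \hat{S}_t,$$

and we keep the seasonally adjusted series $x_t^{SA}$ (the period is 4 for quarterly data and 12 otherwise).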

In [6]:
# .str accessor lets you use string methods on a Series; .str.lower() converts to lowercase
quarterly  = metadata.loc[metadata["freq"].str.lower().str.contains("q")][meta_col].values
monthly    = metadata.loc[metadata["freq"].str.lower().str.contains("m")][meta_col].values
already_sa = metadata.loc[metadata["sa"]==1][meta_col].values   # sa=1 means already seasonally adjusted

# Keep only columns with at least 24 non-NaN values; .notna().sum() counts non-missing entries per column
data = data.loc[:, data.notna().sum() >= 24]

for column in data.columns:
    if column in already_sa:
        continue   # 'continue' skips the rest of the loop body and moves to the next column
    series = data[column].dropna()   # remove NaN before decomposing (seasonal_decompose requires it)
    if column in quarterly:
        # period=4: quarterly series has 4 seasons per year
        decomp = seasonal_decompose(series, model='additive', extrapolate_trend='freq', period=4)
        data[column] = data[column] - decomp.seasonal   # subtract seasonal component to remove seasonality
    else :
        # period=12: monthly series has 12 seasons per year
        decomp = seasonal_decompose(series, model='additive', extrapolate_trend='freq', period=12)
        data[column] = data[column] - decomp.seasonal

Now calculate growth rates:
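
The growth rate computed below is the one-period percent change,

$$g_t = 100 \times \frac{x_t - x_{t-1}}{x_{t-1}},$$

set to NaN whenever $x_t$ or $x_{t-1}$ is missing (fill_method=None prevents filling across gaps).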

In [7]:
# =========================================================
# Convert levels to growth rates (percent change)
# =========================================================
# Quarterly series: quarter-over-quarter growth rate
# All other series: default percent change at their observed frequency
for column in data.columns:
    if column in quarterly:
        # Get non-NaN values with their original index
        non_nan = data[column].dropna()
        
        # .pct_change(periods=1) computes (x_t - x_{t-1}) / x_{t-1}; * 100 gives percentage
        pct_change_values = non_nan.pct_change(periods=1, fill_method=None) * 100
        
        # Create a full-length Series of NaN, then fill in only the computed positions
        result = pd.Series(np.nan, index=data.index)
        result.loc[pct_change_values.index] = pct_change_values   # .loc with an index selects by label
        
        data[column] = result
    else:
        # Store original NaN mask
        original_was_nan = data[column].isna()   # boolean Series: True where value is missing
        
        # For monthly/weekly/daily, calculate normally
        pct_change_values = data[column].pct_change(periods=1, fill_method=None) * 100
        
        # Mask out cases where either current or previous value was originally NaN
        prev_was_nan = data[column].shift(1).isna()   # .shift(1) shifts values down by 1 row
        pct_change_values[prev_was_nan | original_was_nan] = np.nan   # | is the bitwise OR for booleans

        data[column] = pct_change_values

# np.inf arises from pct_change when dividing by zero; replace with NaN to keep data clean
data.replace([np.inf, -np.inf], np.nan, inplace=True)

# =========================================================
# Drop non-quarterly columns with more than 75% missing
# values in the training period
# =========================================================
for column in [ col for col in data.columns if col not in quarterly ] :
    # compute fraction of missing values in the training period only
    missing_pct = data[data.index<=test_start_date][column].isna().sum() / len(data[data.index<=test_start_date][column])
    if missing_pct > 0.75:
        data.drop(column, axis=1, inplace=True)

# =========================================================
# Final cleaning
# =========================================================
# Replace infinities created by pct_change with NaN
data.replace([np.inf, -np.inf], np.nan, inplace=True)

# Drop CPI from predictor set (used only for deflation)
#data.drop(cpi, axis=1, inplace=True)

# ~ negates a boolean Series; data[condition] filters rows where condition is True
data = data[~(data.index < train_start_date)].reset_index(drop=False)
# .reset_index(drop=False) moves the date index back to a regular column
data = data.set_index("date").dropna(how="all", axis=0)   # drop rows that are entirely NaN

# pd.to_numeric(..., errors='coerce') converts to number; non-numeric values become NaN
metadata["months_lag"] = (
    pd.to_numeric(metadata["months_lag"], errors="coerce")
      .fillna(1)      # replace any NaN lags with 1 (safe default)
      .astype(int)    # .astype(int) casts the Series to integer dtype
)

# Drop GDP components again (a no-op if they were already removed above)
data = data.drop(columns=[col for col in data.columns if 'RGDP' in col and col != 'RGDP0000'])
for col in data.columns :
    last = data[col].dropna().index.max().year   # find the most recent year with data for this column
    if last < 2024 :
        data.drop(col, axis=1, inplace=True)   # remove columns that stopped updating before 2024

Remove highly correlated predictors before LARS. Sub-components of the same category (e.g. BOP financial-account sub-accounts) are nearly collinear — LARS picks several without adding information. For each correlated pair the function keeps whichever variable is an aggregate (ticket number ends in 00); otherwise it keeps the one with more non-null observations.
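
As a quick, illustrative sketch of the aggregate rule (is_agg here mirrors the _is_agg helper defined inside the function below):

import re

def is_agg(ticket):
    # an aggregate ticket's trailing number ends in 00 (e.g., MLIA1000)
    nums = re.findall(r'\d+', str(ticket))
    return bool(nums) and int(nums[-1]) % 100 == 0

print(is_agg("MLIA1000"), is_agg("MLIA1002"))   # True False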

In [8]:
def deduplicate_correlated(df, threshold=0.85, target=target_variable,
                           metadata=metadata, meta_col=meta_col):
    """Drop one variable from each highly-correlated pair.
    Priority: (1) keep aggregate tickets (numeric suffix % 100 == 0);
              (2) keep the variable with more non-null observations.
    """
    import re

    def _is_agg(ticket):
        nums = re.findall(r'\d+', str(ticket))
        return bool(nums) and int(nums[-1]) % 100 == 0

    agg_set = set(metadata.loc[metadata[meta_col].apply(_is_agg), meta_col])

    X   = df.drop(columns=[target])
    obs = X.notna().sum()
    corr  = X.corr().abs()
    upper = corr.where(np.triu(np.ones(corr.shape), k=1).astype(bool))

    to_drop = set()
    for col in upper.columns:
        if col in to_drop:
            continue
        peers = upper.index[upper[col] > threshold].tolist()
        for peer in peers:
            if peer in to_drop:
                continue
            col_agg  = col  in agg_set
            peer_agg = peer in agg_set
            if col_agg and not peer_agg:
                to_drop.add(peer)
            elif peer_agg and not col_agg:
                to_drop.add(col)
                break
            elif obs[col] >= obs[peer]:
                to_drop.add(peer)
            else:
                to_drop.add(col)
                break

    kept = [c for c in df.columns if c not in to_drop]
    print(f"Removed {len(df.columns) - len(kept)} correlated variables "
          f"(|r| > {threshold}). {len(kept)} remain.")
    return df[kept]

data = deduplicate_correlated(data, threshold=0.98)

# Drop high-publication lag series
tickets_to_keep = metadata[metadata.months_lag <= 5]['ticket'].tolist()

# Keep only these columns (that exist in data)
data = data[[ col for col in data.columns if col in tickets_to_keep or "RGDP" in col ]]
Removed 173 correlated variables (|r| > 0.98). 687 remain.

Glimpse at the data

In [9]:
fig, axes = plt.subplots(4, 1, figsize=(13, 13), sharex=False)

# GDP (quarterly) — bar chart
ax = axes[0]
gdp = data['RGDP0000'].dropna()
ax.bar(gdp.index, gdp, width=60, color='#003865', alpha=0.8, label='GDP')
ax.axhline(0, color='black', linewidth=0.5, linestyle='--')
ax.set_title('Real GDP Growth (QoQ %)', fontsize=11, weight='bold')
ax.set_ylabel('% change')
ax.legend(frameon=False)
ax.grid(axis='y', linestyle=':', alpha=0.5)

# Payroll (MPAY2000)
ax = axes[1]
pay = data['MPAY2000'].dropna()
ax.plot(pay.index, pay, color='#009CA7', linewidth=1.8, label='Payroll (MPAY2000)')
ax.axhline(0, color='black', linewidth=0.5, linestyle='--')
ax.set_title('Payroll Growth (MoM %)', fontsize=11, weight='bold')
ax.set_ylabel('% change')
ax.legend(frameon=False)
ax.grid(axis='y', linestyle=':', alpha=0.5)

# Google Trends (UGTR0000)
ax = axes[2]
gtr = data['UGTR0000'].dropna()
ax.plot(gtr.index, gtr, color='#F5A623', linewidth=1.8, label='Google Trends (UGTR0000)')
ax.axhline(0, color='black', linewidth=0.5, linestyle='--')
ax.set_title('Google Trends Growth (MoM %)', fontsize=11, weight='bold')
ax.set_ylabel('% change')
ax.legend(frameon=False)
ax.grid(axis='y', linestyle=':', alpha=0.5)

# Nighttime Lights (UNTL0000)
ax = axes[3]
ntl = data['UNTL0000'].dropna()
ax.plot(ntl.index, ntl, color='#8B5CF6', linewidth=1.8, label='Nighttime Lights (UNTL0000)')
ax.axhline(0, color='black', linewidth=0.5, linestyle='--')
ax.set_title('Nighttime Lights Growth (MoM %)', fontsize=11, weight='bold')
ax.set_ylabel('% change')
ax.legend(frameon=False)
ax.grid(axis='y', linestyle=':', alpha=0.5)

fig.suptitle('Data Glimpse — Selected Indicators', fontsize=13, weight='bold')
plt.tight_layout()
plt.show()
In [10]:
[ col for col in data.columns if "UNTL" in col ]
Out[10]:
['UNTL3001',
 'UNTL3002',
 'UNTL3003',
 'UNTL3004',
 'UNTL3005',
 'UNTL3006',
 'UNTL3007',
 'UNTL3008',
 'UNTL3000',
 'UNTL1001',
 'UNTL1002',
 'UNTL1003',
 'UNTL1004',
 'UNTL1005',
 'UNTL1006',
 'UNTL1007',
 'UNTL1008',
 'UNTL0000']
In [11]:
# =========================================================
# Reproducibility: print core library versions
# =========================================================
print("================== Library Versions =================")
import sklearn, numpy, pandas
print("scikit-learn:", sklearn.__version__)
print("numpy:", numpy.__version__)
print("pandas:", pandas.__version__)


# Data dimensions (observations and variables)
print("Data dimensions:", data.shape)
print(f"Training dates: {train_start_date.strftime('%Y-%m-%d')} to {(test_start_date - pd.DateOffset(days=1)).strftime('%Y-%m-%d')}")
T1 = data[data.index < test_start_date].shape[0]
T2 = data[(data.index >= test_start_date) & (data.index <= test_end_date)].shape[0]
print(f"Training periods: {T1} ({T1/(T1+T2) :.2%})")
print(f"Testing dates: {test_start_date.strftime('%Y-%m-%d')} to {test_end_date.strftime('%Y-%m-%d')}")
print(f"Testing periods: {T2} ({T2/(T1+T2) :.2%})")
================== Library Versions =================
scikit-learn: 1.8.0
numpy: 2.4.1
pandas: 2.3.3
Data dimensions: (134, 245)
Training dates: 2015-03-01 to 2023-05-31
Training periods: 99 (76.15%)
Testing dates: 2023-06-01 to 2025-12-01
Testing periods: 31 (23.85%)

Variable Selection: LARS¶

Start by loading the helper functions.
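
For reference, estimate_factors below implements the Bai and Ng (2002) information criteria. Writing $\hat{\sigma}^2(r)$ for the mean squared residual of an $r$-factor PCA fit to the standardized $T \times N$ panel, the IC1 criterion is

$$IC_1(r) \;=\; \ln\hat{\sigma}^2(r) \;+\; r\,\frac{N+T}{NT}\,\ln\!\left(\frac{NT}{N+T}\right),$$

and the function returns the $r$ that minimizes it (IC2 and IC3 differ only in the penalty term).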

In [12]:
def estimate_factors(X, r_max=10, criterion='IC1'):
    '''
    Select the number of PCA factors using the Bai and Ng (2002)
    information criteria.
    inputs:
    - X: DataFrame of predictors
    - r_max: max number of factors (should be less than T or N)
    - criterion: 'IC1', 'IC2', or 'IC3' (IC3 is the most conservative)
    '''
    
    T, N = X.shape
    r_max = min(r_max, T - 1)  # enforce theoretical max
    IC_vals = []
    X_scaled = scaler.fit_transform(X)   # standardize once and reuse for every r

    for r in range(r_max + 1):
        if r == 0:
            sigma2_r = np.mean(X_scaled ** 2)   # no factors: the residual is the data itself
        else:
            pca = PCA(n_components=r)
            F_hat = pca.fit_transform(X_scaled)
            Lambda_hat = X_scaled.T @ F_hat / T
            X_hat = F_hat @ Lambda_hat.T
            # X_scaled is a numpy array, so no .values is needed here
            sigma2_r = np.mean((X_scaled - X_hat) ** 2)
    
        if criterion == 'IC1':
            penalty = r * (N + T) / (N * T) * np.log(N * T / (N + T))
        elif criterion == 'IC2':
            penalty = r * (N + T) / (N * T) * np.log(min(N, T))
        elif criterion == 'IC3':
            penalty = r * np.log(min(N, T)) / min(N, T)
        else:
            raise ValueError("Invalid criterion")
    
        IC_vals.append(np.log(sigma2_r) + penalty)

    return np.argmin(IC_vals)
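
# Usage sketch (an assumption, not called in the original flow): once the
# quarterly panel pre_selected exists, estimate_factors could pick the number
# of factors instead of fixing it by hand, e.g.:
#   r_hat = estimate_factors(pre_selected.drop(columns=[target_variable]).dropna(), criterion='IC3')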



def lars_variable_ranking(df, target, max_vars=None, test=None, factors=5):
    """
    Stepwise LARS variable selection: rank variables by their entry into the model.
    - df: DataFrame with predictors and target (train period only)
    - target: name of the target variable
    - max_vars: number of variables to select; if None, continue until all are ranked
    - test: DataFrame with test period rows (same columns as df)
    Returns:
    - selected_vars: ordered list of variable names by entry into the LARS model
    - rmse: out-of-sample RMSE on the test period (or None)

    Chinn, M. D., Meunier, B., & Stumpner, S. (2023). Nowcasting world trade with machine learning: a three-step approach (No. w31419). National Bureau of Economic Research.
    """
    df0 = deepcopy(df)
    
    selected_order = []
    count = 0
    max_vars = max_vars or (df0.shape[1] - 1)

    while count < max_vars:
        X = df0.drop(columns=[target])
        y = df0[target]

        # Stop if no variables remain
        if X.shape[1] == 0:
            break

        # Fit LARS model (drop NaN rows for fitting)
        mask = y.notna()
        model = Lars(n_nonzero_coefs=X.shape[1], jitter=1e-10, random_state=0)
        model.fit(X[mask], y[mask])

        # coef_path_ has shape (n_features, n_steps)
        coef_path = np.array(model.coef_path_)
        # Count how many times each variable is zero
        zero_counts = (coef_path == 0).sum(axis=1)

        # Summary table
        out = pd.DataFrame({'Variable': X.columns, 'ZeroCount': zero_counts})
        out = out.sort_values('ZeroCount')
        
        # Take variables that are uniquely at this ZeroCount level
        grouped = out.groupby('ZeroCount')
        selected = grouped.filter(lambda g: len(g) == 1)

        # If nothing uniquely enters, dump all remaining vars
        if selected.empty:
            remaining = X.columns.tolist()
            selected = pd.DataFrame({'Variable': remaining, 'ZeroCount': [count]*len(remaining)})
            selected_order.extend(remaining)
            print("Warning: Using fallback procedure for LARS")
            break
        else:
            selected_vars = selected['Variable'].tolist()
            selected_order.extend(selected_vars)
            df0 = df0.drop(columns=selected_vars)
            count += len(selected_vars)

    top_vars = selected_order[:max_vars]

    if test is None:
        return top_vars, None

    # --- Evaluation: mirror the benchmark OLS setup exactly ---
    df_train = df[  [target] + top_vars]
    df_test  = test[[target] + top_vars]

    X_train_raw = df_train.drop(target, axis=1)
    X_test_raw  = df_test.drop(target, axis=1)
    y_train_all = df_train[target]
    y_test_all  = df_test[target]

    # Fit scaler on train only, transform both (no data leakage)
    X_train_scaled = pd.DataFrame(
        scaler.fit_transform(X_train_raw), index=X_train_raw.index
    ).dropna(axis=1)
    X_test_scaled = pd.DataFrame(
        scaler.transform(X_test_raw), index=X_test_raw.index
    )[X_train_scaled.columns]

    # Fit PCA on train only, transform both
    pca_local = PCA(n_components=factors)
    X_train_pca = pd.DataFrame(
        pca_local.fit_transform(X_train_scaled), index=X_train_raw.index,
        columns=[f"PC_{i}" for i in range(factors)]
    )
    X_test_pca = pd.DataFrame(
        pca_local.transform(X_test_scaled), index=X_test_raw.index,
        columns=[f"PC_{i}" for i in range(factors)]
    )

    # Drop NaN GDP rows from training (same as benchmark obs_mask)
    obs_mask = y_train_all.notna()
    X_train_pca = X_train_pca[obs_mask]
    y_train_fit = y_train_all[obs_mask]

    # Drop NaN GDP rows from test when computing RMSE
    obs_mask_test = y_test_all.notna()
    X_test_eval  = X_test_pca[obs_mask_test]
    y_test_eval  = y_test_all[obs_mask_test]

    final_model = LinearRegression()
    final_model.fit(X_train_pca, y_train_fit)
    y_pred = final_model.predict(X_test_eval)

    rmse = float(np.sqrt(np.mean((y_test_eval.values - y_pred) ** 2)))
    return top_vars, rmse

Define how many factors we will use in the exercise

In [13]:
import warnings
warnings.filterwarnings('ignore')   # suppress non-critical warnings to keep output clean
factors = 1   # number of PCA factors used throughout the notebook

After the functions are prepared, we proceed by reshaping our dataset.

The data has mixed frequencies. We will aggregate it to quarterly frequency using the average, the sum, or the end-of-period value. The metadata defines which type of aggregation to use for each variable.
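
As a toy illustration of per-column aggregation (hypothetical data, not from our dataset):

toy = pd.DataFrame(
    {"flow": [1.0, 2.0, 3.0], "stock": [10.0, 11.0, 12.0]},
    index=pd.date_range("2024-01-01", periods=3, freq="MS"),
)
print(toy.resample("QS").agg({"flow": "sum", "stock": "last"}))
# the flow variable is summed over the quarter; the stock keeps its end-of-quarter value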

In [14]:
# .set_index('ticket') makes ticket the index so we can look up agg_method by variable name
# .to_dict() converts the resulting Series into a plain Python dictionary {ticket: agg_method}
agg_map = metadata.set_index('ticket')['agg_method'].to_dict()

# Dictionary comprehension: build {col: aggregation_function} for every column in data
# If agg_method is not 'mean', 'last', or 'sum', fall back to 'mean'
agg_funcs = {
    col: (agg_map[col] if agg_map[col] in ['mean', 'last', 'sum'] else 'mean')
    for col in data.columns if col in agg_map
}
# .resample('QS') groups observations by quarter-start; .agg() applies a different function per column
pre_selected = data.resample('QS').agg(agg_funcs)
pre_selected.tail(5)   # .tail(5) shows the last 5 rows
Out[14]:
RGDP0000 UMBL0000 MLOA2001 MLOA2002 MLOA2003 MLOA2000 MLOA3001 MLOA3002 MLOA3003 MLOA3000 ... UNTL1001 UNTL1002 UNTL1003 UNTL1004 UNTL1005 UNTL1006 UNTL1007 UNTL1008 UNTL0000 UGTR0000
date
2025-04-01 0.255034 10.958810 1.427355 -0.604903 28.386646 2.695625 0.502526 -2.540292 -0.018099 -0.092262 ... -0.210377 -0.526405 -1.221423 -12.475316 -0.741264 7.785676 -2.211186 3.070306 1.072993 -1.378025
2025-07-01 0.350465 -5.192822 1.005118 -0.509148 11.906890 0.608908 -4.520931 0.331945 -5.514979 -4.150932 ... -4.115468 7.009391 3.511097 10.053891 4.574950 -8.349466 9.846684 1.524475 1.142558 6.175978
2025-10-01 -7.272064 3.983996 1.501917 1.480881 22.450232 3.304328 6.309028 -7.929882 4.494010 4.039204 ... 4.477629 -4.466167 -4.825762 1.842673 -9.213132 2.627329 -21.439797 -1.472492 -17.157254 -3.359553
2026-01-01 NaN 0.000000 5.054587 -0.452474 -27.389896 -0.399219 2.278535 14.489188 -0.607195 2.857437 ... 24.784444 -4.342795 -6.158915 24.504711 2.986346 -5.176758 52.870129 -12.147083 12.148945 2.680622
2026-04-01 NaN 0.000000 NaN NaN NaN NaN NaN NaN NaN NaN ... -4.945805 -0.274397 0.917710 -8.877931 -0.894484 4.393968 3.180475 2.290434 -5.862798 -6.699033

5 rows × 245 columns

Some additional data management:

  1. Make sure the index is a date.
  2. Drop columns with more than 75% missing values
  3. Fill missing values with the mean
In [15]:
pre_selected.index = pd.to_datetime(pre_selected.index)   # ensure the index is a proper DatetimeIndex
pre_selected = pre_selected.dropna(how="all", axis=1)     # drop columns that are entirely NaN
# .isnull().mean() gives the fraction of missing values per column; keep columns with <= 75% missing
pre_selected = pre_selected.loc[:, pre_selected.isnull().mean() <= 0.75]

# Mean-fill predictors only — leave target_variable as NaN so we can
# identify quarters without observed GDP and drop them from training.
# List comprehension: all columns except the GDP target
predictors = [c for c in pre_selected.columns if c != target_variable]
# .fillna(mean) replaces NaN with the column mean; applied only to predictor columns
pre_selected[predictors] = pre_selected[predictors].fillna(
    pre_selected[predictors].mean(numeric_only=True)
)

pre_selected.tail(5)
Out[15]:
RGDP0000 UMBL0000 MLOA2001 MLOA2002 MLOA2003 MLOA2000 MLOA3001 MLOA3002 MLOA3003 MLOA3000 ... UNTL1001 UNTL1002 UNTL1003 UNTL1004 UNTL1005 UNTL1006 UNTL1007 UNTL1008 UNTL0000 UGTR0000
date
2025-04-01 0.255034 10.958810 1.427355 -0.604903 28.386646 2.695625 0.502526 -2.540292 -0.018099 -0.092262 ... -0.210377 -0.526405 -1.221423 -12.475316 -0.741264 7.785676 -2.211186 3.070306 1.072993 -1.378025
2025-07-01 0.350465 -5.192822 1.005118 -0.509148 11.906890 0.608908 -4.520931 0.331945 -5.514979 -4.150932 ... -4.115468 7.009391 3.511097 10.053891 4.574950 -8.349466 9.846684 1.524475 1.142558 6.175978
2025-10-01 -7.272064 3.983996 1.501917 1.480881 22.450232 3.304328 6.309028 -7.929882 4.494010 4.039204 ... 4.477629 -4.466167 -4.825762 1.842673 -9.213132 2.627329 -21.439797 -1.472492 -17.157254 -3.359553
2026-01-01 NaN 0.000000 5.054587 -0.452474 -27.389896 -0.399219 2.278535 14.489188 -0.607195 2.857437 ... 24.784444 -4.342795 -6.158915 24.504711 2.986346 -5.176758 52.870129 -12.147083 12.148945 2.680622
2026-04-01 NaN 0.000000 1.034374 1.228224 3.119266 0.905187 1.631157 1.404333 -0.412707 1.460727 ... -4.945805 -0.274397 0.917710 -8.877931 -0.894484 4.393968 3.180475 2.290434 -5.862798 -6.699033

5 rows × 245 columns

Another cleaning step for LARS: the data cannot include periods beyond the last observed GDP.

So we find the latest date for which GDP is available and drop observations beyond that date.

In [16]:
last_gdp = data[target_variable].dropna()   # Series containing only the non-NaN GDP observations
last_gdp = last_gdp.index.max()             # .index.max() gives the latest date that has observed GDP
# Boolean indexing: keep only rows where the date is <= last observed GDP date
pre_selected = pre_selected[pre_selected.index <= last_gdp]
# dropna(subset=[...]) drops rows where the specified column is NaN (removes unobserved GDP quarters)
pre_selected.dropna(subset=[target_variable], inplace=True)
print(f"First date: {pre_selected.index.min()}")   # f-string: embeds variable values inside the string
print(f"Last date: {pre_selected.index.max()}")
First date: 2015-04-01 00:00:00
Last date: 2025-10-01 00:00:00

Benchmark¶

This approach uses OLS as a benchmark. The benchmark compresses all variables in the dataset into PCA factors (the number set by the factors variable above) to predict GDP growth, and we estimate its RMSE.

Now estimate the factors and OLS.

In [17]:
# Split raw data first
X_raw = pre_selected.drop(target_variable, axis=1)   # all predictor columns (drop GDP)
y_all = pre_selected[[target_variable]]               # double brackets [[ ]] keep it as a DataFrame (not a Series)

# Boolean indexing splits X and y into train and test portions by date
X_train_raw = X_raw[X_raw.index <  pd.to_datetime(test_start_date)]
X_test_raw  = X_raw[X_raw.index >= pd.to_datetime(test_start_date)]
y_train     = y_all[y_all.index <  pd.to_datetime(test_start_date)]
y_test      = y_all[y_all.index >= pd.to_datetime(test_start_date)]

# Drop quarters where GDP was not observed (NaN target = no real observation)
obs_mask    = y_train[target_variable].notna()   # boolean mask: True where GDP is available
y_train     = y_train[obs_mask]
X_train_raw = X_train_raw[obs_mask]

# scaler.fit_transform() learns mean & std from train, then standardizes; applied to train only
X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train_raw), index=X_train_raw.index)
# scaler.transform() applies the SAME scaling learned from train — never refit on test data!
X_test_scaled  = pd.DataFrame(scaler.transform(X_test_raw),      index=X_test_raw.index)

# pca.fit_transform() learns principal components from train, then projects; applied to train only
pca = PCA(n_components=factors)
X_train = pd.DataFrame(pca.fit_transform(X_train_scaled), index=X_train_raw.index)
X_test  = pd.DataFrame(pca.transform(X_test_scaled),      index=X_test_raw.index)
# Rename columns to PC_0, PC_1, ... using a list comprehension
X_train.columns = [f"PC_{i}" for i in range(factors)]
X_test.columns  = [f"PC_{i}" for i in range(factors)]

benchmark_model = LinearRegression()        # instantiate the OLS model object
benchmark_model.fit(X_train, y_train)       # .fit() estimates coefficients using training data
print(f"Training on {len(y_train)} observed GDP quarters")
Training on 33 observed GDP quarters

Recover the predicted values on the test period and compute the benchmark RMSE
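
Both the naive baseline and the benchmark are scored with the root mean squared error over the $T_2$ test quarters:

$$\text{RMSE} \;=\; \sqrt{\frac{1}{T_2}\sum_{t=1}^{T_2}\bigl(y_t - \hat{y}_t\bigr)^2}$$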

In [18]:
# .predict() returns a 2-D array; .ravel() flattens it to a 1-D array
y_pred = benchmark_model.predict(X_test).ravel()
y_true = y_test.values.ravel()   # .values converts a DataFrame/Series to a plain numpy array

# Naive baseline: always predict the training-period mean
naive_pred = float(y_train[target_variable].mean())   # float() converts a numpy scalar to a Python float
naive_rmse = float(np.sqrt(np.mean((y_true - naive_pred) ** 2)))   # RMSE formula: sqrt(mean of squared errors)

benchmark = float(np.sqrt(np.mean((y_true - y_pred) ** 2)))  # RMSE for OLS benchmark

print(f"OLS benchmark RMSE : {benchmark:.4f}")   # :.4f in f-string formats the number to 4 decimal places
OLS benchmark RMSE : 2.7344

LARS Estimation¶

LARS is estimated with a maximum number of variables. How many variables should we allow in the model? We don't know...

For now, let's use 10.

In [19]:
# Split the quarterly data into train and test using boolean indexing on the date index
pre_train = pre_selected[pre_selected.index <  test_start_date]
pre_test  = pre_selected[pre_selected.index >= test_start_date]

# lars_variable_ranking returns (ordered list of selected variable names, out-of-sample RMSE)
selection, rmse = lars_variable_ranking(pre_train, target=target_variable, max_vars=10, test=pre_test, factors=factors)

# Build a dictionary (key: value pairs) to store this specification's results
b = {
    "size" : 10 ,
    "selected_vars" : selection ,   # list of variable names selected by LARS
    "rmse" : rmse
}

# pd.DataFrame([b]) converts a list of one dictionary into a one-row DataFrame
spec_list = pd.DataFrame([b])

spec_list
Out[19]:
size selected_vars rmse
0 10 [MNPL0009, XIMP0002, MXRT0101, MNPL0014, XEMP0... 2.68715

We find that, when LARS has to select 10 variables, it selects the following:

In [20]:
# .idxmin() returns the index label of the row with the smallest 'rmse' value
top_vars = spec_list.loc[spec_list['rmse'].idxmin(), 'selected_vars']

# .isin(top_vars) filters the metadata DataFrame to rows whose ticket is in the selected list
metadata[metadata['ticket'].isin(top_vars)]
Out[20]:
series ticket description description 2 freq first_date last_date No Obs months_lag nominal sa unstructured type agg_method file_name
63 bank_loans_government_services_local_government MLOAB002 Commercial Bank Loans (by sector): Government ... NaN MS 4/1/1986 2/1/2026 479 3 NaN 0.0 0.0 M mean boj_bank_loans_bysector.csv
64 bank_loans_government_services_other_public_en... MLOAB004 Commercial Bank Loans (by sector): Government ... NaN MS 4/1/1986 2/1/2026 479 3 NaN 0.0 0.0 M mean boj_bank_loans_bysector.csv
89 bank_nonperf_loans_electricity_gas_and_water MNPL0004 Non-Performing Loans (by customer): Electricit... NaN MS 3/1/2017 2/1/2026 108 3 NaN 0.0 0.0 M mean boj_bank_nonperf.csv
95 bank_nonperf_loans_nonbank_financial_institutions MNPL0009 Non-Performing Loans (by customer): Non-Bank F... NaN MS 3/1/2017 2/1/2026 108 3 NaN 0.0 0.0 M mean boj_bank_nonperf.csv
100 bank_nonperf_loans_tourism MNPL0013 Non-Performing Loans (by customer): Tourism NaN MS 3/1/2017 2/1/2026 108 3 NaN 0.0 0.0 M mean boj_bank_nonperf.csv
101 bank_nonperf_loans_transport_storage_and_commu... MNPL0014 Non-Performing Loans (by customer): Transport,... NaN MS 3/1/2017 2/1/2026 108 3 NaN 0.0 0.0 M mean boj_bank_nonperf.csv
677 EDNA_USD_XDC_RATE MXRT0101 Nominal Exchange Rate (USD per XDC) NaN MS 1/1/2010 3/1/2026 195 2 0.0 0.0 0.0 M mean boj_forex.csv
959 exp_eu XEMP0002 Exports EU NaN MS 1/1/1988 1/1/2026 457 4 0.0 0.0 0.0 X sum eu_trade.csv
960 imp_eu XIMP0002 Imports EU NaN MS 1/1/1988 1/1/2026 457 4 0.0 0.0 0.0 X sum eu_trade.csv
1009 jse_volume REXC0002 Jamaica Exchange Volume NaN MS 4/1/2021 4/1/2026 61 1 0.0 0.0 0.0 R sum jamstockex_jse.csv

The million-dollar question is: what is the right dimension? Should we use 10 variables? 15? 100?

We are going to estimate LARS over a range instead of a fixed number of variables.

Compute the ratio between the RMSE of LARS and the benchmark for different numbers of variables selected by LARS.

Then compare the best variable selections by choosing those that outperform the naive/kitchen-sink OLS.

In [21]:
d = []   # empty list; will collect one dictionary per model size
for size in range(10,50) :   # range(10, 50) generates integers 10, 11, ..., 49
    selection, rmse = lars_variable_ranking(pre_train, target=target_variable, max_vars=size+1, test = pre_test, factors = factors)
    b = {
        "size" : size+1 ,
        "selected_vars" : selection ,
        "rmse" : rmse
    }
    d.append(b)   # .append() adds one item to the end of the list
d = pd.DataFrame(d)   # convert the list of dicts into a DataFrame (each dict becomes a row)
d.set_index('size', inplace=True)   # use model size as the row index for easy plotting

d['ratio'] = d['rmse']/benchmark   # add a new column: ratio of LARS RMSE to the OLS benchmark RMSE

We can visualize this:

In [22]:
# Threshold of 1: below it, the LARS-selected models outperform the no-selection OLS
threshold = 1
d['ratio'].plot(color='black')                        # pandas .plot() wraps matplotlib for quick line charts
plt.axhline(y=threshold, color='black', linestyle=':', linewidth=1)  # draw a horizontal reference line at y=1
plt.ylabel("Ratio of RMSE")
plt.xlabel("Number of variables")
plt.legend().set_visible(False)
plt.show()   # plt.show() renders the figure; without it, the plot may not display outside notebooks

print(f"OLS benchmark RMSE : {benchmark:.4f}")
print(f"LARS min RMSE : {d['rmse'].min():.4f}")   # .min() returns the smallest value in the column
OLS benchmark RMSE : 2.7344
LARS min RMSE : 2.5824

We will be interested in Nowcasting using specifications that include the variables selected by LARS that outperform OLS.

However, for our example, we will use only one of those specifications.

Which one? The one with the best RMSE. We are going to create a list of specifications (spec_list) with all selected specifications. In our case, only one.

In [23]:
# Filter rows where rmse equals the minimum rmse — keeps only the best-performing specification
spec_list = d[d['rmse']==d['rmse'].min()]
spec_list
Out[23]:
selected_vars rmse ratio
size
18 [MNPL0009, XIMP0002, MXRT0101, MNPL0014, XEMP0... 2.582414 0.944401

Basic Nowcasting¶

Nowcasting uses a variety of functions. These functions:

  • Generate lagged data according to the information set
  • Flatten the data into a wide format where additional columns represent periods of the quarter
  • Fill missing values with the mean
In [24]:
# lags is a list of integers representing information-set vintages
# negative = data available that many months BEFORE the GDP reference quarter
# positive = data available that many months AFTER the GDP reference quarter
lags = list(range(-2, 3))   # range(-2, 3) gives -2, -1, 0, 1, 2; list() converts it to a list

# helper function, generate lagged datasets for testing on vintages
def gen_lagged_data(metadata, data, last_date, lag):
    # only go up to the last date
    lagged_data = data.loc[data.date <= last_date, :].reset_index(drop=True)
    for col in lagged_data.columns[1:]:   # [1:] slices the column list, skipping the first column (date)
        pub_lag = metadata.loc[metadata.ticket == col, "months_lag"].values[0] # publication lag of this particular variable
        # At information-set vintage `lag`, NaN-out the most recent (pub_lag - lag)
        # rows: those observations have not yet been published at this vintage
        lagged_data.loc[(len(lagged_data) - pub_lag + lag):, col] = np.nan

    return lagged_data

# helper function, flatten a dataset for methods that don't do timeseries, extra columns for each lag
def flatten_data(data, target_variable, n_lags):
    # keep only rows where GDP is observed (non-NaN)
    flattened_data = data.loc[~pd.isna(data[target_variable]), :]
    orig_index = flattened_data.index   # save the original row positions
    for i in range(1, n_lags):   # loop i = 1, 2, ..., n_lags-1
        lagged_indices = orig_index - i          # shift index back by i rows
        lagged_indices = lagged_indices[lagged_indices >= 0]   # drop any negative indices
        tmp = data.loc[lagged_indices, :].copy()   # .copy() avoids modifying the original DataFrame
        tmp['date'] = tmp['date'] + pd.DateOffset(months=i)   # DateOffset shifts dates forward by i months
        tmp = tmp.drop([target_variable], axis=1)
        # rename columns to indicate they are lagged (e.g., "VAR_1" = value 1 period ago)
        tmp.columns = [j + "_" + str(i) if j != "date" else j for j in tmp.columns]
        flattened_data = flattened_data.merge(tmp, how="left", on="date")   # left join on date column

    return flattened_data


def lagged_target(data, total_lags, metadata=metadata, target_variable=target_variable) :
    if total_lags == 0 :
        return data   # early return: if no lags requested, hand back the data unchanged
    else :
        for l in range(1,total_lags+1) :
            data[f"{target_variable}_l{l}"] = data[target_variable]   # copy GDP column, will shift it
            pub_lag_q = max(1, metadata[metadata['ticket']==target_variable]['months_lag'].values[0] // 3)
            # // is integer (floor) division; converts monthly publication lag to quarterly
            data.loc[data.index[-pub_lag_q:], f"{target_variable}_l{l}"] = np.nan
            data[f"{target_variable}_l{l}"] = data[f"{target_variable}_l{l}"].ffill()   # forward-fill NaN
            data[f"{target_variable}_l{l}"] = data[f"{target_variable}_l{l}"].shift(l)  # shift down by l rows
            data[f"{target_variable}_l{l}"] = data[f"{target_variable}_l{l}"].fillna(data[f"{target_variable}_l{l}"].mean())
        return data
    
# helper function fill missings in a dataset with the mean from the training set
def mean_fill_dataset(training, test):
    mean_dict = {}   # empty dict; will map column name -> training mean
    for col in training.columns[1:]:
        mean_dict[col] = np.nanmean(training[col])   # np.nanmean ignores NaN when computing the mean
    filled = test.copy()
    for col in training.columns[1:]:
        # replace NaN in test with the training mean (avoids data leakage from test set)
        filled.loc[pd.isna(filled[col]), col] = mean_dict[col]
        
    return filled


def fit_ols(
    ttrain ,
    gdp_lags = 0,
    flat_lags = 3 ,
    target_variable = target_variable,
    ) :
    transformed_train = flatten_data(ttrain, target_variable, flat_lags )
    # keep only quarter-end months (March=3, June=6, Sep=9, Dec=12) — GDP is quarterly
    # .dt.month extracts the month number from a datetime column
    transformed_train = transformed_train.loc[transformed_train.date.dt.month.astype(int).isin([3,6,9,12]),:].fillna(transformed_train.mean(numeric_only=True)).dropna(axis=1).reset_index(drop=True)
    transformed_train = lagged_target(transformed_train, gdp_lags)
    
    model = LinearRegression()
    x = transformed_train.drop(["date", target_variable], axis=1)
    y = transformed_train[target_variable]
    return model.fit(x, y) , x.columns   # returns both the fitted model AND the column names used


def predict_ols(
    model ,
    X ,
    train_vars ,
    date ,
    pred_dict ,
    l,
    flat_lags = 3 ,
    gdp_lags = 0 ,
    target_variable = target_variable,
    ) :
    # Apply model in test
    Xi = flatten_data(X, target_variable, flat_lags )
    Xi = lagged_target(Xi, gdp_lags)
        
    # Select only the single row matching the target date
    Xi = Xi.loc[Xi['date'] == date, :].drop(["date", target_variable], axis=1)
    Xi = Xi[train_vars]   # reorder/subset columns to match what the model was trained on
    
    pred = model.predict(Xi)[0]   # [0] extracts the scalar from the 1-element array returned by predict
    pred_dict[l].append(pred)     # append prediction to the list for this lag (l) in the dictionary
    return pred_dict


def perform_tab( pred_values , model_name , specification , values , lags=lags) :
    # table of RMSE by vintage
    table = pd.DataFrame(columns=[ "Vintage", "RMSE", "estimator", "spec" ])
    for lag in lags:
        tmp = pd.DataFrame({
            "Vintage" : lag,
            # np.array() converts lists to arrays so element-wise subtraction works correctly
            "RMSE" : np.sqrt(np.mean((np.array(values) - np.array(pred_values[lag])) ** 2)) ,
            "estimator": model_name ,
            "spec" : specification ,
        }, index=[0])   # index=[0] required when all values are scalars (not lists)
        table = pd.concat([table, tmp]).reset_index(drop=True)   # pd.concat stacks DataFrames vertically
    table.round(4)
    table.set_index('Vintage' , inplace=True)
    return table

def forecast_table( pred_values , model_name , specification, values  , dates ) :
    # plot of predictions vs actuals
    result = pd.DataFrame(
        {
            "observed": actuals ,
            "two_back": pred_values[-2] ,   # dictionary key -2: info set 2 months before GDP release
            "one_back": pred_values[-1] ,
            "zero_back": pred_values[0] ,   # key 0: contemporaneous information set
            "one_ahead": pred_values[1] ,
            "two_ahead": pred_values[2] ,
            "estimator": model_name ,
            "spec": specification ,
        }
    )
    result.index = pd.to_datetime(dates)   # assign proper date index to the result table
    return result

We are going to start the Nowcasting exercise with the original (cleaned) dataset in mixed frequencies (not quarterly).

(The reason is that LARS cannot handle mixed frequencies and flattening, while the nowcasting methodology can.)

However, we limit the variable list to those selected by LARS, plus GDP (the target variable).

In [25]:
# .values[0] extracts the first (and only) element from the numpy array stored in the cell
XVars = spec_list['selected_vars'].values[0]
print(XVars)         # print the list of selected variable names
print(len(XVars))    # len() returns the number of items in a list
['MNPL0009', 'XIMP0002', 'MXRT0101', 'MNPL0014', 'XEMP0002', 'REXC0002', 'MNPL0004', 'MNPL0013', 'MLOAB002', 'MLOAB004', 'MLOA9803', 'XIMP0003', 'MNPL0012', 'UGTR0039', 'MLOA9305', 'MLOA9306', 'MRES0302', 'MNPL0008']
18
In [26]:
# Keep only LARS-selected predictors plus the GDP target; list comprehension filters data.columns
df = data[[ col for col in data.columns if col in XVars or col == target_variable ]]
print(f"Data shape: {df.shape}")   # .shape returns (number of rows, number of columns)
df.tail(2)
Data shape: (134, 19)
Out[26]:
RGDP0000 MLOA9306 MLOA9305 MLOAB002 MLOAB004 MLOA9803 MNPL0004 MNPL0008 MNPL0009 MNPL0012 MNPL0013 MNPL0014 MXRT0101 MRES0302 XEMP0002 XIMP0002 UGTR0039 REXC0002 XIMP0003
date
2026-03-01 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN -24.561172 -200.0 NaN NaN -11.774792 -91.115475 NaN
2026-04-01 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 34.731320 205.063026 NaN

Generate Test dataset

The test dataset is tricky. It is not a real test dataset. Sorry. Unfortunate name choice.

The test dataset includes all the train data plus the test data. This was done for the PCA construction.

We will separate it into an actual training and test dataset later.

In [27]:
df = df.reset_index(drop=False)   # move the date index back to a plain column called 'date'
# .dropna(axis=1, thresh=23): drop any column that has fewer than 23 non-NaN values
df.dropna(axis=1, thresh=23, inplace=True)
# Filter rows within the full sample window (train + test)
test = df.loc[(df.date >= train_start_date) & (df.date <= test_end_date), :].reset_index(drop=True)
# DATES AND ACTUAL VALUES
# pd.date_range generates evenly spaced quarterly dates; freq="3MS" = every 3 month-starts
# .strftime("%Y-%m-%d") formats each Timestamp as a string; .tolist() converts to a plain Python list
dates = ( pd.date_range(test_start_date, test_end_date, freq="3MS").strftime("%Y-%m-%d").tolist() )
# .isin(dates) checks which rows in test fall on a test-period GDP date; .values extracts as array
actuals = list(test.loc[test.date.astype(str).isin(dates), target_variable].values)

The list 'dates' includes all (actual) test dates. For each date in the test set, we are going to generate a nowcast. The nowcast depends on which month we are in, because each month has a different information set.

This means the models are re-trained on a rolling basis over time, benefiting from additional information as it gets published.
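
Before walking through a single date and information set step by step, here is a hedged sketch of how the helper functions could be assembled into the full rolling loop. This is one plausible assembly under stated assumptions, not the notebook's final code; param_pca, flats, and gdp_lags take the values used in the cells below, and a fresh local scaler/PCA is fitted per vintage.

param_pca, flats, gdp_lags = 1, 3, 2   # settings introduced in the cells below

pred_dict = {l: [] for l in lags}      # one list of predictions per information set
for date in dates:
    for l in lags:
        # 1) rebuild the data as it looked at this vintage (unpublished -> NaN)
        vintage = gen_lagged_data(metadata, test, pd.to_datetime(date), l)
        # 2) the training slice ends one quarter before the nowcast target
        train_end = pd.to_datetime(date) - pd.DateOffset(months=3)
        train_v = vintage.loc[vintage.date <= train_end, :]
        filled_tr  = mean_fill_dataset(train_v, train_v)    # fill with training means
        filled_all = mean_fill_dataset(train_v, vintage)    # same means for test rows
        # 3) fit the scaler and PCA on the training slice only
        sc = StandardScaler().fit(filled_tr.set_index('date').drop(target_variable, axis=1))
        pca_l = PCA(n_components=param_pca).fit(
            sc.transform(filled_tr.set_index('date').drop(target_variable, axis=1)))

        def to_pcs(df_):
            # project predictors on the training PCs, keeping date + GDP alongside
            pcs = pca_l.transform(sc.transform(
                df_.set_index('date').drop(target_variable, axis=1)))
            pcs = pd.DataFrame(pcs, columns=[f"PC_{i}" for i in range(param_pca)])
            return pd.concat([df_[['date', target_variable]].reset_index(drop=True), pcs], axis=1)

        # 4) flatten, add GDP lags, fit OLS on train; predict the target quarter
        model_l, train_vars = fit_ols(to_pcs(filled_tr), gdp_lags=gdp_lags, flat_lags=flats)
        pred_dict = predict_ols(model_l, to_pcs(filled_all), train_vars,
                                pd.to_datetime(date), pred_dict, l,
                                flat_lags=flats, gdp_lags=gdp_lags)

# perform_tab(pred_dict, "OLS", spec, actuals) would then tabulate RMSE by vintage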

In [28]:
dates
Out[28]:
['2023-06-01',
 '2023-09-01',
 '2023-12-01',
 '2024-03-01',
 '2024-06-01',
 '2024-09-01',
 '2024-12-01',
 '2025-03-01',
 '2025-06-01',
 '2025-09-01',
 '2025-12-01']

Estimation with one information set and one period¶

In [29]:
# param_pca is the number of factors included.
param_pca = 1
# " ".join(list) concatenates all items in XVars into a single space-separated string
spec = " ".join(XVars)

date = dates[0]   # take the first test date (dates is a list of strings)
date
Out[29]:
'2023-06-01'

The training set rolls over time, depending on the test period we are nowcasting. Therefore, it is defined as a function of the test period.

In [30]:
# define train dataset
# pd.to_datetime(date) converts the string date to a Timestamp so we can do arithmetic
# DateOffset(months=3) shifts the date back by one quarter — training ends before the test quarter
train = test.loc[test.date <= str(pd.to_datetime(date) - pd.tseries.offsets.DateOffset(months=3))[:10],:]
# str(...)[:10] slices the first 10 characters of the Timestamp string to get "YYYY-MM-DD"
train
Out[30]:
date RGDP0000 MLOA9306 MLOA9305 MLOAB002 MLOAB004 MLOA9803 MNPL0004 MNPL0008 MNPL0009 MNPL0012 MNPL0013 MNPL0014 MXRT0101 MRES0302 XEMP0002 XIMP0002 UGTR0039 REXC0002 XIMP0003
0 2015-03-01 NaN 70.581649 -156.014000 206.382511 -13.394708 -103.833087 NaN NaN NaN NaN NaN NaN -24.530991 7.942238 82.842399 114.706300 29.627432 NaN NaN
1 2015-04-01 NaN -22.066866 -54.147714 2.246387 -3.972572 260.282773 NaN NaN NaN NaN NaN NaN -24.176115 -0.395257 -27.938881 10.384304 -41.261853 NaN NaN
2 2015-05-01 NaN -60.563864 -820.679555 29.434202 6.992200 281.927439 NaN NaN NaN NaN NaN NaN -8.874198 -0.732601 -38.716723 -58.806569 -10.064627 NaN NaN
3 2015-06-01 -1.270239 49.866245 5.172589 2.981190 11.281302 -53.577912 NaN NaN NaN NaN NaN NaN -9.746181 -0.738007 14.083589 59.983465 34.924762 NaN NaN
4 2015-07-01 NaN 80.752667 -64.199888 6.484053 8.233454 121.231796 NaN NaN NaN NaN NaN NaN -10.791980 -0.743494 168.512788 174.104441 12.253482 NaN NaN
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
92 2022-11-01 NaN -2.756959 0.773653 203.430472 22.600409 -1.381557 8719.181019 -20.456409 -79.291752 -85.620472 435.159765 186.828672 -0.009201 66.666667 -66.102954 -62.385073 10.429894 -10.274989 -26.804241
93 2022-12-01 -0.033343 -2.721185 -0.430474 -9.428629 2143.637362 -2.890795 -24.760891 30.523767 -47.280363 39.098252 332.407920 -20.122008 -0.009203 40.000000 -2.106303 -116.986859 -14.283783 34.411901 57.384308
94 2023-01-01 NaN -0.199178 5.378630 -33.059522 -0.070748 -8.624457 -77.636481 3.806025 -756.630124 -84.928800 -91.044652 6.338960 0.002543 28.571429 33.715417 -179.599756 32.776704 -23.902725 -2.798848
95 2023-02-01 NaN 2.176954 -0.609010 -37.443600 0.697953 0.180432 27.048129 29.897415 -121.956153 -335.225110 -354.265823 14.709651 297.499819 12.037037 -63.534486 2520.003138 -29.238343 -35.172703 -2.047682
96 2023-03-01 0.629504 6.988356 -9.475858 -54.198410 0.127445 6.372435 182.112309 -33.659641 -150.439002 -329.455899 -330.377548 -1.809008 -24.553601 -200.000000 190.837484 25.934506 6.163746 -41.767104 -31.546363

97 rows × 20 columns

We fill in the missing values.

In [31]:
transformed_train = mean_fill_dataset(train, train)
transformed_train
Out[31]:
date RGDP0000 MLOA9306 MLOA9305 MLOAB002 MLOAB004 MLOA9803 MNPL0004 MNPL0008 MNPL0009 MNPL0012 MNPL0013 MNPL0014 MXRT0101 MRES0302 XEMP0002 XIMP0002 UGTR0039 REXC0002 XIMP0003
0 2015-03-01 0.477115 70.581649 -156.014000 206.382511 -13.394708 -103.833087 154.875868 41.943409 8153.039153 155.480071 53.699068 2074.538028 -24.530991 7.942238 82.842399 114.706300 29.627432 -102.305110 5.943257
1 2015-04-01 0.477115 -22.066866 -54.147714 2.246387 -3.972572 260.282773 154.875868 41.943409 8153.039153 155.480071 53.699068 2074.538028 -24.176115 -0.395257 -27.938881 10.384304 -41.261853 -102.305110 5.943257
2 2015-05-01 0.477115 -60.563864 -820.679555 29.434202 6.992200 281.927439 154.875868 41.943409 8153.039153 155.480071 53.699068 2074.538028 -8.874198 -0.732601 -38.716723 -58.806569 -10.064627 -102.305110 5.943257
3 2015-06-01 -1.270239 49.866245 5.172589 2.981190 11.281302 -53.577912 154.875868 41.943409 8153.039153 155.480071 53.699068 2074.538028 -9.746181 -0.738007 14.083589 59.983465 34.924762 -102.305110 5.943257
4 2015-07-01 0.477115 80.752667 -64.199888 6.484053 8.233454 121.231796 154.875868 41.943409 8153.039153 155.480071 53.699068 2074.538028 -10.791980 -0.743494 168.512788 174.104441 12.253482 -102.305110 5.943257
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
92 2022-11-01 0.477115 -2.756959 0.773653 203.430472 22.600409 -1.381557 8719.181019 -20.456409 -79.291752 -85.620472 435.159765 186.828672 -0.009201 66.666667 -66.102954 -62.385073 10.429894 -10.274989 -26.804241
93 2022-12-01 -0.033343 -2.721185 -0.430474 -9.428629 2143.637362 -2.890795 -24.760891 30.523767 -47.280363 39.098252 332.407920 -20.122008 -0.009203 40.000000 -2.106303 -116.986859 -14.283783 34.411901 57.384308
94 2023-01-01 0.477115 -0.199178 5.378630 -33.059522 -0.070748 -8.624457 -77.636481 3.806025 -756.630124 -84.928800 -91.044652 6.338960 0.002543 28.571429 33.715417 -179.599756 32.776704 -23.902725 -2.798848
95 2023-02-01 0.477115 2.176954 -0.609010 -37.443600 0.697953 0.180432 27.048129 29.897415 -121.956153 -335.225110 -354.265823 14.709651 297.499819 12.037037 -63.534486 2520.003138 -29.238343 -35.172703 -2.047682
96 2023-03-01 0.629504 6.988356 -9.475858 -54.198410 0.127445 6.372435 182.112309 -33.659641 -150.439002 -329.455899 -330.377548 -1.809008 -24.553601 -200.000000 190.837484 25.934506 6.163746 -41.767104 -31.546363

97 rows × 20 columns

Reduce dimensions with a PCA.

In [32]:
# Standardize predictors: .set_index('date').drop(target_variable) removes date & GDP columns
# scaler.fit_transform() returns a numpy array; wrap in pd.DataFrame() to restore column names
X = pd.DataFrame(scaler.fit_transform(transformed_train.set_index('date').drop(target_variable, axis=1)))
X.columns = transformed_train.set_index('date').drop(target_variable, axis=1).columns   # restore column names
X = X.dropna(axis=1)   # drop any all-NaN columns after scaling
pca = PCA(n_components=param_pca)   # create a new PCA object with the chosen number of components

X = pca.fit_transform(X)   # learn components from this training slice and project X onto them
X = pd.DataFrame(X)
# .rename(columns=lambda x: ...) applies a function to every column name; lambda is an anonymous function
X.rename(columns=lambda x: f'PC_{x}', inplace=True)
# pd.concat with axis=1 joins DataFrames side-by-side (column-wise)
transformed_train = pd.concat( [ transformed_train[['date', target_variable]] , X ] ,axis=1)
transformed_train
Out[32]:
date RGDP0000 PC_0
0 2015-03-01 0.477115 0.274108
1 2015-04-01 0.477115 0.112061
2 2015-05-01 0.477115 -0.250956
3 2015-06-01 -1.270239 0.086945
4 2015-07-01 0.477115 0.603317
... ... ... ...
92 2022-11-01 0.477115 0.825841
93 2022-12-01 -0.033343 0.074104
94 2023-01-01 0.477115 0.056413
95 2023-02-01 0.477115 -0.239374
96 2023-03-01 0.629504 -0.265951

97 rows × 3 columns

The data has mixed frequencies: GDP is quarterly (though its missing months now carry the mean instead of NaN) while the other variables are monthly.

Data flattening adds the months of a quarter as additional columns.

In [33]:
flats = 3   # number of within-quarter months to include as extra columns (months 1, 2, 3 of each quarter)
# flatten_data adds lagged columns so each quarterly row carries all monthly info for that quarter
transformed_train = flatten_data(transformed_train, target_variable, flats )
transformed_train
Out[33]:
date RGDP0000 PC_0 PC_0_1 PC_0_2
0 2015-03-01 0.477115 0.274108 NaN NaN
1 2015-04-01 0.477115 0.112061 0.274108 NaN
2 2015-05-01 0.477115 -0.250956 0.112061 0.274108
3 2015-06-01 -1.270239 0.086945 -0.250956 0.112061
4 2015-07-01 0.477115 0.603317 0.086945 -0.250956
... ... ... ... ... ...
92 2022-11-01 0.477115 0.825841 1.335292 -2.102984
93 2022-12-01 -0.033343 0.074104 0.825841 1.335292
94 2023-01-01 0.477115 0.056413 0.074104 0.825841
95 2023-02-01 0.477115 -0.239374 0.056413 0.074104
96 2023-03-01 0.629504 -0.265951 -0.239374 0.056413

97 rows × 5 columns

Now keep only the quarter-end months.

In [34]:
# Keep only quarter-end months (Mar, Jun, Sep, Dec); .dt.month already returns
# integers. Fill remaining NaN with column means, then drop all-NaN columns.
quarter_ends = transformed_train.date.dt.month.isin([3, 6, 9, 12])
transformed_train = (
    transformed_train.loc[quarter_ends, :]
    .fillna(transformed_train.mean())
    .dropna(axis=1)
    .reset_index(drop=True)
)
transformed_train.tail(5)
Out[34]:
date RGDP0000 PC_0 PC_0_1 PC_0_2
28 2022-03-01 0.339268 -0.939623 -1.368314 0.135421
29 2022-06-01 1.719328 -0.228787 0.406830 -0.429631
30 2022-09-01 1.917136 -2.102984 -0.837849 -1.079423
31 2022-12-01 -0.033343 0.074104 0.825841 1.335292
32 2023-03-01 0.629504 -0.265951 -0.239374 0.056413

In some instances we may want to add autoregressive terms to the model (the missing values created by lagging are filled with the mean).
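
lagged_target is another course utility; here is a hedged sketch of the idea, inferred from the output below (signature assumed):

# Hedged sketch of a lagged_target-style helper: append AR lags of GDP and
# fill the NaN created by shifting with the column mean.
def lagged_target_sketch(df, n_lags, target=target_variable):
    out = df.copy()
    for l in range(1, n_lags + 1):
        col = f"{target}_l{l}"
        out[col] = out[target].shift(l)                 # e.g. RGDP0000_l1
        out[col] = out[col].fillna(out[col].mean())     # mean-fill initial NaN
    return out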

In [35]:
gdp_lags = 2   # include 2 autoregressive lags of GDP as additional predictors
# lagged_target() appends columns like RGDP0000_l1, RGDP0000_l2 to the DataFrame
transformed_train = lagged_target(transformed_train, gdp_lags)
transformed_train
Out[35]:
date RGDP0000 PC_0 PC_0_1 PC_0_2 RGDP0000_l1 RGDP0000_l2
0 2015-03-01 0.477115 0.274108 0.002770 0.005319 0.472352 0.488665
1 2015-06-01 -1.270239 0.086945 -0.250956 0.112061 0.477115 0.488665
2 2015-09-01 -0.988166 3.023573 -0.044457 0.603317 -1.270239 0.477115
3 2015-12-01 0.515743 0.031364 0.391376 -0.171456 -0.988166 -1.270239
4 2016-03-01 2.572249 0.345397 -0.076448 0.062946 0.515743 -0.988166
5 2016-06-01 0.376227 0.056120 -0.029241 0.095254 2.572249 0.515743
6 2016-09-01 0.481795 -0.631007 -0.476203 0.015039 0.376227 2.572249
7 2016-12-01 0.377205 0.202950 0.152533 0.155131 0.481795 0.376227
8 2017-03-01 0.558470 0.733819 0.329005 0.101917 0.377205 0.481795
9 2017-06-01 0.278591 -0.158065 -0.081536 -0.057012 0.558470 0.377205
10 2017-09-01 1.644420 -2.043313 -1.807598 -0.705708 0.278591 0.558470
11 2017-12-01 0.264170 -0.103078 1.195116 0.582402 1.644420 0.278591
12 2018-03-01 0.258148 -0.645802 -0.064074 0.016744 0.264170 1.644420
13 2018-06-01 0.708712 0.018259 -0.184621 -0.063280 0.258148 0.264170
14 2018-09-01 0.909166 -1.900543 -0.879999 3.507623 0.708712 0.258148
15 2018-12-01 0.652833 0.416049 0.869942 0.759895 0.909166 0.708712
16 2019-03-01 0.188423 -0.977965 0.116948 -0.711308 0.652833 0.909166
17 2019-06-01 0.415626 -0.046141 -0.098648 0.000716 0.188423 0.652833
18 2019-09-01 0.313624 -1.354224 -1.053962 -0.713807 0.415626 0.188423
19 2019-12-01 0.093842 -0.060561 10.645788 0.840889 0.313624 0.415626
20 2020-03-01 -0.591655 -0.848246 0.513478 0.440003 0.093842 0.313624
21 2020-06-01 -16.689321 -0.067624 -1.058634 0.038459 -0.591655 0.093842
22 2020-09-01 9.557435 -1.319565 -2.140412 -0.739627 -16.689321 -0.591655
23 2020-12-01 2.073340 0.189140 0.474064 0.756060 9.557435 -16.689321
24 2021-03-01 0.787645 -0.953202 -0.064767 0.557820 2.073340 9.557435
25 2021-06-01 2.794481 -0.109026 -0.208039 0.848926 0.787645 2.073340
26 2021-09-01 1.001584 -1.727148 -1.127072 -0.454161 2.794481 0.787645
27 2021-12-01 3.411424 0.532588 0.121950 0.650857 1.001584 2.794481
28 2022-03-01 0.339268 -0.939623 -1.368314 0.135421 3.411424 1.001584
29 2022-06-01 1.719328 -0.228787 0.406830 -0.429631 0.339268 3.411424
30 2022-09-01 1.917136 -2.102984 -0.837849 -1.079423 1.719328 0.339268
31 2022-12-01 -0.033343 0.074104 0.825841 1.335292 1.917136 1.719328
32 2023-03-01 0.629504 -0.265951 -0.239374 0.056413 -0.033343 1.917136

Estimate the model.

In [36]:
model = LinearRegression()   # create a fresh OLS model object
# Build X (predictors) by dropping the date and GDP target columns
x = transformed_train.drop(["date", target_variable], axis=1)
y = transformed_train[target_variable]
model.fit(x, y)   # estimate OLS coefficients; the fitted model is returned (and stored in 'model')
Out[36]:
LinearRegression()

With the trained model, it is time to work on the test sample.

The test sample includes observations that would not have been available in real time because of publication lags. These publication lags are defined in the metadata.

We artificially introduce missing values into the test data according to each series' publication lag.
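
gen_lagged_data is provided by the course utilities; a hedged sketch of the idea, assuming months_lag in the metadata records each series' publication delay:

# Hedged sketch of the gen_lagged_data idea: mask, series by series, any
# observation that would not yet be published at the artificial vintage.
def gen_lagged_data_sketch(metadata, df, date, lag):
    out = df.copy()
    vintage = pd.to_datetime(date) + pd.tseries.offsets.DateOffset(months=lag)
    for col in out.columns.drop("date"):
        row = metadata.loc[metadata["ticket"] == col, "months_lag"]
        months_lag = int(row.iloc[0]) if len(row) else 0
        cutoff = vintage - pd.tseries.offsets.DateOffset(months=months_lag)
        out.loc[out["date"] > cutoff, col] = np.nan     # not yet published
    return out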

In [37]:
metadata[metadata['ticket'] == "UGTR0000"]
Out[37]:
series ticket description description 2 freq first_date last_date No Obs months_lag nominal sa unstructured type agg_method file_name
1014 gtrends_mean UGTR0000 Google Trends Mean NaN MS 1/1/2004 4/1/2026 268 1 0.0 0.0 1.0 U mean gtrends.csv
In [38]:
date
Out[38]:
'2023-06-01'
In [39]:
lag = -2   # -2 means 2 months before the GDP reference quarter (early vintage, less data available)
# gen_lagged_data() artificially sets future observations to NaN to simulate publication lags
tmp_data = gen_lagged_data(metadata, test, date, lag)
tmp_data
Out[39]:
date RGDP0000 MLOA9306 MLOA9305 MLOAB002 MLOAB004 MLOA9803 MNPL0004 MNPL0008 MNPL0009 MNPL0012 MNPL0013 MNPL0014 MXRT0101 MRES0302 XEMP0002 XIMP0002 UGTR0039 REXC0002 XIMP0003
0 2015-03-01 NaN 70.581649 -156.014000 206.382511 -13.394708 -103.833087 NaN NaN NaN NaN NaN NaN -24.530991 7.942238 82.842399 114.706300 29.627432 NaN NaN
1 2015-04-01 NaN -22.066866 -54.147714 2.246387 -3.972572 260.282773 NaN NaN NaN NaN NaN NaN -24.176115 -0.395257 -27.938881 10.384304 -41.261853 NaN NaN
2 2015-05-01 NaN -60.563864 -820.679555 29.434202 6.992200 281.927439 NaN NaN NaN NaN NaN NaN -8.874198 -0.732601 -38.716723 -58.806569 -10.064627 NaN NaN
3 2015-06-01 -1.270239 49.866245 5.172589 2.981190 11.281302 -53.577912 NaN NaN NaN NaN NaN NaN -9.746181 -0.738007 14.083589 59.983465 34.924762 NaN NaN
4 2015-07-01 NaN 80.752667 -64.199888 6.484053 8.233454 121.231796 NaN NaN NaN NaN NaN NaN -10.791980 -0.743494 168.512788 174.104441 12.253482 NaN NaN
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
95 2023-02-01 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 297.499819 12.037037 NaN NaN -29.238343 -35.172703 NaN
96 2023-03-01 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 6.163746 -41.767104 NaN
97 2023-04-01 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
98 2023-05-01 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
99 2023-06-01 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN

100 rows × 20 columns

In [40]:
tmp_data[['date', 'UGTR0039']]
Out[40]:
date UGTR0039
0 2015-03-01 29.627432
1 2015-04-01 -41.261853
2 2015-05-01 -10.064627
3 2015-06-01 34.924762
4 2015-07-01 12.253482
... ... ...
95 2023-02-01 -29.238343
96 2023-03-01 6.163746
97 2023-04-01 NaN
98 2023-05-01 NaN
99 2023-06-01 NaN

100 rows × 2 columns

Fill missing values with the training-sample means and estimate the factors.
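
mean_fill_dataset is a course utility; judging from the call below, a minimal sketch of the idea (signature assumed):

# Fill NaN in `df` with column means computed on `source` (the training
# sample), so the test vintage never borrows information from the future.
def mean_fill_dataset_sketch(source, df):
    means = source.drop(columns=["date"]).mean()
    return df.fillna(means)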

In [41]:
# Fill NaN in test data using means computed from the training data (avoids leakage)
tmp_data = mean_fill_dataset(train, tmp_data)
# Since False == 0 in Python, both tests are identical and the whole condition
# reduces to `param_pca != 0`: apply PCA only when components are requested
if param_pca:
    X = pd.DataFrame(scaler.fit_transform(tmp_data.set_index('date').drop(target_variable, axis=1)))
    X.columns = tmp_data.set_index('date').drop(target_variable, axis=1).columns
    X = X.dropna(axis=1)
    pca = PCA(n_components=param_pca)
    
    X = pca.fit_transform(X)   # note: PCA is re-fitted on this vintage rather than reusing the training components
    X = pd.DataFrame(X)
    X.rename(columns=lambda x: f'PC_{x}', inplace=True)
    
    # Reassemble: date + GDP (target) columns on the left, PCA factors on the right
    tmp_data = pd.concat( [ tmp_data[['date', target_variable]] , X ] ,axis=1)
tmp_data
Out[41]:
date RGDP0000 PC_0
0 2015-03-01 0.477115 0.317943
1 2015-04-01 0.477115 0.113131
2 2015-05-01 0.477115 -0.234214
3 2015-06-01 -1.270239 0.096767
4 2015-07-01 0.477115 0.674946
... ... ... ...
95 2023-02-01 0.477115 0.210000
96 2023-03-01 0.477115 -0.011387
97 2023-04-01 0.477115 -0.005684
98 2023-05-01 0.477115 -0.005684
99 2023-06-01 0.477115 -0.005684

100 rows × 3 columns

To predict on the test set with the coefficients from the training model, we need to reproduce the same columns as in the training sample, so we apply the same transformations as before.

In [42]:
# Apply the same flattening and lagging transformations as were done during training
tmp_data = flatten_data(tmp_data, target_variable, flats )
tmp_data = lagged_target(tmp_data, gdp_lags )
# Keep only the single row for the target date; drop date and GDP (not needed for prediction)
tmp_data = tmp_data.loc[tmp_data['date']== date, :].drop(["date", target_variable], axis=1)
tmp_data
Out[42]:
PC_0 PC_0_1 PC_0_2 RGDP0000_l1 RGDP0000_l2
99 -0.005684 -0.005684 -0.005684 0.477115 0.477115

Now get the predicted value for a test date under a given information-set lag.

In [43]:
# model.predict() returns an array; [0] pulls out the scalar nowcast value
pred = model.predict(tmp_data)[0]
# Displaying a tuple prints all three values side by side in the notebook output
date, lag, pred
Out[43]:
('2023-06-01', -2, np.float64(0.24135180451979205))

Estimation and Prediction Loop¶

The estimation will use PCA factors of the selected variables; param_pca sets how many (a single factor in this run).

The model is re-trained at each test date using only the data available up to that point.

transformed_train is a dataset that holds the selected variables, summarized as factors, with no missing values.
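
predict_ols, used inside the loop, is also a course utility; a hedged sketch of its logic, mirroring the manual steps of the previous cells (signature taken from the call below, internals assumed):

# Hedged sketch of the predict_ols logic: replicate the training
# transformations, align columns, and append the nowcast for this vintage.
def predict_ols_sketch(model, X, train_vars, date, l, pred_dict,
                       flat_lags, gdp_lags):
    tmp = flatten_data(X, target_variable, flat_lags)   # same flattening as training
    tmp = lagged_target(tmp, gdp_lags)                  # same AR lags as training
    row = tmp.loc[tmp["date"] == date, train_vars]      # align with training columns
    pred_dict[l].append(model.predict(row)[0])          # store nowcast for this vintage
    return pred_dict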

In [44]:
# param_pca is the number of factors included.
param_pca = 1
# " ".join(list) builds a single string from list items — used as a readable label for this spec
spec = " ".join(XVars)

# Dictionary comprehension: create {lag: []} for each lag in lags
# This will store predicted GDP values for each information-set vintage
pred_ols = { k: [] for k in lags }
# Empty DataFrames that will be filled by concatenating results in the loop
performance_table = pd.DataFrame()
result_table = pd.DataFrame()

for date in dates:    # outer loop: iterate over each quarterly test period

    # Rolling training set: everything before the current test quarter
    train = test.loc[test.date <= str(pd.to_datetime(date) - pd.tseries.offsets.DateOffset(months=3))[:10],:]
    transformed_train = mean_fill_dataset(train, train)   # fill NaN with training means
    
    # -----------------------------------------------------------------
    # Standardize and run PCA
    if param_pca:   # equivalent to the original `param_pca != 0` check (False == 0 in Python)
        if param_pca is True: param_pca = 1   # a bare True means one component

        X = pd.DataFrame(scaler.fit_transform(transformed_train.set_index('date').drop(target_variable, axis=1)))
        X.columns = transformed_train.set_index('date').drop(target_variable, axis=1).columns
        X = X.dropna(axis=1)
        pca = PCA(n_components=param_pca)
        
        X = pca.fit_transform(X)   # fit PCA on this rolling training window
        X = pd.DataFrame(X)
        X.rename(columns=lambda x: f'PC_{x}', inplace=True)
        transformed_train = pd.concat( [ transformed_train[['date', target_variable]] , X ] ,axis=1)

        
        ols_fit, ols_vars = fit_ols(   # fit_ols returns the model object and the column list
            transformed_train , 
            gdp_lags = gdp_lags,
            flat_lags = flats
        )

        # -----------------------------------------------------------------
        # Publication Lags in Test data
        for lag in lags:   # inner loop: iterate over each vintage (information set)
            # the data available for this date at this artificial vintage
            tmp_data = gen_lagged_data(metadata, test, date, lag)
            # get data in format necessary for model
            tmp_data = mean_fill_dataset(train, tmp_data)
            
            if param_pca:   # again reduces to `param_pca != 0`
                X = pd.DataFrame(scaler.fit_transform(tmp_data.set_index('date').drop(target_variable, axis=1)))
                X.columns = tmp_data.set_index('date').drop(target_variable, axis=1).columns
                X = X.dropna(axis=1)
                pca = PCA(n_components=param_pca)
                
                X = pca.fit_transform(X)   # PCA re-fitted on this vintage's data, as above
                X = pd.DataFrame(X)
                X.rename(columns=lambda x: f'PC_{x}', inplace=True)
                
                tmp_data = pd.concat( [ tmp_data[['date', target_variable]] , X ] ,axis=1)

            # -----------------------------------------------------------------
            # Predicted values: append nowcast for this (date, lag) to pred_ols
            pred_ols = predict_ols(
                model=ols_fit,
                X = tmp_data ,
                train_vars = ols_vars,
                date = date,
                l = lag,
                pred_dict = pred_ols ,   # pred_dict is updated in-place and returned
                flat_lags = flats ,
                gdp_lags = gdp_lags,
                )
            
# pd.concat stacks the per-specification tables vertically into one summary table
performance_table = pd.concat( [ performance_table , perform_tab(pred_ols, "OLS", spec , actuals) ], axis=0)
result_table = pd.concat( [ result_table , forecast_table(pred_ols, "OLS", spec , actuals, dates) ], axis=0)

We can now inspect the OLS nowcasting results across the different information sets.

In [45]:
result_table
Out[45]:
observed two_back one_back zero_back one_ahead two_ahead estimator spec
2023-06-01 -0.066758 0.241352 0.228400 0.227395 0.202704 0.247751 OLS MNPL0009 XIMP0002 MXRT0101 MNPL0014 XEMP0002 R...
2023-09-01 1.988200 0.233756 0.237726 0.227992 0.144382 0.735062 OLS MNPL0009 XIMP0002 MXRT0101 MNPL0014 XEMP0002 R...
2023-12-01 -0.778219 0.225436 0.225853 0.333641 0.312621 0.684162 OLS MNPL0009 XIMP0002 MXRT0101 MNPL0014 XEMP0002 R...
2024-03-01 -0.054704 0.025233 0.081679 0.159636 0.090580 0.866465 OLS MNPL0009 XIMP0002 MXRT0101 MNPL0014 XEMP0002 R...
2024-06-01 -0.997453 0.067071 0.090062 0.094643 -0.113808 0.007407 OLS MNPL0009 XIMP0002 MXRT0101 MNPL0014 XEMP0002 R...
2024-09-01 -1.055556 0.028028 0.073419 0.030234 -0.227382 0.604104 OLS MNPL0009 XIMP0002 MXRT0101 MNPL0014 XEMP0002 R...
2024-12-01 1.691583 0.020713 0.026770 0.369550 0.224744 0.185128 OLS MNPL0009 XIMP0002 MXRT0101 MNPL0014 XEMP0002 R...
2025-03-01 1.086629 0.012091 0.045920 0.120176 -0.043362 0.505321 OLS MNPL0009 XIMP0002 MXRT0101 MNPL0014 XEMP0002 R...
2025-06-01 0.255034 0.081326 -0.003036 0.003558 -0.209346 -0.192858 OLS MNPL0009 XIMP0002 MXRT0101 MNPL0014 XEMP0002 R...
2025-09-01 0.350465 0.116619 0.134579 0.101413 -0.135728 0.476378 OLS MNPL0009 XIMP0002 MXRT0101 MNPL0014 XEMP0002 R...
2025-12-01 -7.272064 0.083480 0.074779 0.322412 0.084462 -0.028520 OLS MNPL0009 XIMP0002 MXRT0101 MNPL0014 XEMP0002 R...
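
The performance table below aggregates errors by vintage. perform_tab is a course utility; a hedged sketch of how a per-vintage RMSE could be computed from pred_ols and the observed values:

# Hedged sketch of a per-vintage RMSE; assumes `actuals` is aligned with
# each vintage's list of predictions.
def rmse_by_vintage_sketch(pred_dict, actuals):
    rows = []
    for lag, preds in pred_dict.items():
        rmse = np.sqrt(mean_squared_error(actuals[:len(preds)], preds))
        rows.append({"Vintage": lag, "RMSE": rmse})
    return pd.DataFrame(rows).set_index("Vintage")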
In [46]:
performance_table
Out[46]:
RMSE estimator spec
Vintage
-2 2.423923 OLS MNPL0009 XIMP0002 MXRT0101 MNPL0014 XEMP0002 R...
-1 2.422983 OLS MNPL0009 XIMP0002 MXRT0101 MNPL0014 XEMP0002 R...
0 2.473896 OLS MNPL0009 XIMP0002 MXRT0101 MNPL0014 XEMP0002 R...
1 2.414897 OLS MNPL0009 XIMP0002 MXRT0101 MNPL0014 XEMP0002 R...
2 2.406682 OLS MNPL0009 XIMP0002 MXRT0101 MNPL0014 XEMP0002 R...
In [47]:
result_table.plot()
Out[47]:
<Axes: >
[Plot: result_table — observed quarterly GDP growth and the OLS nowcasts at each vintage.]