import glob, os, re          # standard library modules: file search, OS paths, regular expressions
import pandas as pd            # pandas is the main data-manipulation library; aliased as pd

d = os.getcwd()+"/"           # os.getcwd() returns the current working directory as a string
PATH_RAW = f"{d}/../raw"      # f-string: inserts d into the string; /../raw goes one folder up then into raw/

# ----------------------------------
# Append Google Trends

# glob.glob finds all files matching a pattern; * is a wildcard
csv_files = glob.glob(os.path.join(PATH_RAW, "*multi*.csv"))   # files whose name contains "multi"
ts_files  = glob.glob(os.path.join(PATH_RAW, "time_series_*.csv"))  # files starting with "time_series_"

if len(csv_files) > 0:         # len() returns the number of items in a list; > 0 means "not empty"
    df0 = pd.DataFrame()       # create an empty DataFrame to collect results across loop iterations
    for csv in csv_files:      # loop: csv takes the value of each file path in the list one by one
        dfi = pd.read_csv(csv, skiprows=1)   # read one CSV into a DataFrame; skiprows=1 skips the header Google adds
        dfi.rename(columns={"Month": 'date'}, inplace=True)   # rename a column; inplace=True modifies dfi directly
        dfi.date = pd.to_datetime(dfi['date'])                 # convert the date column from string to datetime objects
        dfi.columns = dfi.columns.astype(str).str.lower().str.strip()  # make all column names lowercase and remove whitespace
        dfi.columns = [re.sub(r'\W+', '_', col).strip('_') for col in dfi.columns]  # list comprehension: replace non-word chars with _ in each column name
        dfi.set_index('date', inplace=True)   # use the date column as the row index instead of 0,1,2,…

        df0 = pd.concat([df0, dfi], axis=1)  # pd.concat stacks DataFrames; axis=1 means side-by-side (add columns)
    df0.columns = ["gtrends_" + col for col in df0.columns]  # list comprehension: prepend "gtrends_" to every column name

    # Replace zeroes with NaN
    df0.replace(0, pd.NA, inplace=True)   # pd.NA is pandas' missing-value sentinel; zeroes in Google Trends mean no data

    df0.to_csv(f'{PATH_RAW}/gtrends.csv')           # write the combined DataFrame to a new CSV file
    print(f'Last updated: {df0.index.max()}')        # df0.index.max() returns the most recent date in the index

    for file in csv_files:
        os.remove(file)   # delete each raw downloaded file now that it has been merged

elif len(ts_files) > 0:   # elif = "else if"; runs only when csv_files was empty but ts_files is not
    df0 = pd.DataFrame()
    for csv in ts_files:
        # Extract country code from filename: time_series_TT_... -> "tt"
        basename = os.path.basename(csv)                        # strip directory path, keep just the filename
        match = re.match(r'time_series_([A-Za-z]+)_', basename) # re.match tries to match a regex at the start of the string; () captures the country code
        country_suffix = f"_{match.group(1).lower()}" if match else ""  # ternary expression: use captured group if match succeeded, else empty string

        dfi = pd.read_csv(csv, skiprows=0)   # skiprows=0 means keep all rows (no extra header to skip here)
        dfi.rename(columns={"Time": 'date'}, inplace=True)
        dfi.date = pd.to_datetime(dfi['date'])
        dfi.columns = dfi.columns.astype(str).str.lower().str.strip()
        dfi.columns = [re.sub(r'\W+', '_', col).strip('_') for col in dfi.columns]
        dfi.set_index('date', inplace=True)

        # Append country code to each column
        if country_suffix:                                          # non-empty string is truthy in Python
            dfi.columns = [col + country_suffix for col in dfi.columns]  # add e.g. "_tt" to every column name

        df0 = pd.concat([df0, dfi], axis=1)
    df0.columns = ["gtrends_" + col for col in df0.columns]

    # Replace zeroes with NaN
    df0.replace(0, pd.NA, inplace=True)

    df0.to_csv(f'{PATH_RAW}/gtrends.csv')
    print(f'Last updated: {df0.index.max()}')

    for file in ts_files:
        os.remove(file)

df0 = df0.apply(pd.to_numeric, errors='ignore')  # convert all columns to numeric if possible; ignore errors for non-numeric columns
df0.head(5)   # display the first 5 rows of the final DataFrame (Jupyter shows this automatically as output)

C:\Users\guerr\AppData\Local\Temp\ipykernel_17564\540321788.py:67: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_numeric without passing `errors` and catch exceptions explicitly instead
  df0 = df0.apply(pd.to_numeric, errors='ignore')  # convert all columns to numeric if possible; ignore errors for non-numeric columns

df0.plot()

<Axes: xlabel='date'>

	gtrends_restaurants_jm	gtrends_jamaica_fl_worldwide
date
2004-01-01	NaN	NaN
2004-02-01	NaN	NaN
2004-03-01	NaN	NaN
2004-04-01	NaN	NaN
2004-05-01	NaN	NaN