In [3]:
import glob, os, re # standard library modules: file search, OS paths, regular expressions
import pandas as pd # pandas is the main data-manipulation library; aliased as pd
d = os.getcwd()+"/" # os.getcwd() returns the current working directory as a string
PATH_RAW = f"{d}/../raw" # f-string: inserts d into the string; /../raw goes one folder up then into raw/
# ----------------------------------
# Append Google Trends
# glob.glob finds all files matching a pattern; * is a wildcard
csv_files = glob.glob(os.path.join(PATH_RAW, "*multi*.csv")) # files whose name contains "multi"
ts_files = glob.glob(os.path.join(PATH_RAW, "time_series_*.csv")) # files starting with "time_series_"
if len(csv_files) > 0: # len() returns the number of items in a list; > 0 means "not empty"
df0 = pd.DataFrame() # create an empty DataFrame to collect results across loop iterations
for csv in csv_files: # loop: csv takes the value of each file path in the list one by one
dfi = pd.read_csv(csv, skiprows=1) # read one CSV into a DataFrame; skiprows=1 skips the header Google adds
dfi.rename(columns={"Month": 'date'}, inplace=True) # rename a column; inplace=True modifies dfi directly
dfi.date = pd.to_datetime(dfi['date']) # convert the date column from string to datetime objects
dfi.columns = dfi.columns.astype(str).str.lower().str.strip() # make all column names lowercase and remove whitespace
dfi.columns = [re.sub(r'\W+', '_', col).strip('_') for col in dfi.columns] # list comprehension: replace non-word chars with _ in each column name
dfi.set_index('date', inplace=True) # use the date column as the row index instead of 0,1,2,…
df0 = pd.concat([df0, dfi], axis=1) # pd.concat stacks DataFrames; axis=1 means side-by-side (add columns)
df0.columns = ["gtrends_" + col for col in df0.columns] # list comprehension: prepend "gtrends_" to every column name
# Replace zeroes with NaN
df0.replace(0, pd.NA, inplace=True) # pd.NA is pandas' missing-value sentinel; zeroes in Google Trends mean no data
df0.to_csv(f'{PATH_RAW}/gtrends.csv') # write the combined DataFrame to a new CSV file
print(f'Last updated: {df0.index.max()}') # df0.index.max() returns the most recent date in the index
for file in csv_files:
os.remove(file) # delete each raw downloaded file now that it has been merged
elif len(ts_files) > 0: # elif = "else if"; runs only when csv_files was empty but ts_files is not
df0 = pd.DataFrame()
for csv in ts_files:
# Extract country code from filename: time_series_TT_... -> "tt"
basename = os.path.basename(csv) # strip directory path, keep just the filename
match = re.match(r'time_series_([A-Za-z]+)_', basename) # re.match tries to match a regex at the start of the string; () captures the country code
country_suffix = f"_{match.group(1).lower()}" if match else "" # ternary expression: use captured group if match succeeded, else empty string
dfi = pd.read_csv(csv, skiprows=0) # skiprows=0 means keep all rows (no extra header to skip here)
dfi.rename(columns={"Time": 'date'}, inplace=True)
dfi.date = pd.to_datetime(dfi['date'])
dfi.columns = dfi.columns.astype(str).str.lower().str.strip()
dfi.columns = [re.sub(r'\W+', '_', col).strip('_') for col in dfi.columns]
dfi.set_index('date', inplace=True)
# Append country code to each column
if country_suffix: # non-empty string is truthy in Python
dfi.columns = [col + country_suffix for col in dfi.columns] # add e.g. "_tt" to every column name
df0 = pd.concat([df0, dfi], axis=1)
df0.columns = ["gtrends_" + col for col in df0.columns]
# Replace zeroes with NaN
df0.replace(0, pd.NA, inplace=True)
df0.to_csv(f'{PATH_RAW}/gtrends.csv')
print(f'Last updated: {df0.index.max()}')
for file in ts_files:
os.remove(file)
df0 = df0.apply(pd.to_numeric, errors='ignore') # convert all columns to numeric if possible; ignore errors for non-numeric columns
df0.head(5) # display the first 5 rows of the final DataFrame (Jupyter shows this automatically as output)
C:\Users\guerr\AppData\Local\Temp\ipykernel_17564\540321788.py:67: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_numeric without passing `errors` and catch exceptions explicitly instead df0 = df0.apply(pd.to_numeric, errors='ignore') # convert all columns to numeric if possible; ignore errors for non-numeric columns
Out[3]:
| gtrends_restaurants_jm | gtrends_jamaica_fl_worldwide | |
|---|---|---|
| date | ||
| 2004-01-01 | NaN | NaN |
| 2004-02-01 | NaN | NaN |
| 2004-03-01 | NaN | NaN |
| 2004-04-01 | NaN | NaN |
| 2004-05-01 | NaN | NaN |
In [4]:
df0.plot()
Out[4]:
<Axes: xlabel='date'>