Change date to number and normalize | 1 hot encode symbol

This commit is contained in:
2026-02-16 23:02:36 -08:00
parent a24ea523a8
commit 99f0d96858
+8 -5
View File
@@ -15,7 +15,7 @@ def pull():
# Scrape the data # Scrape the data
all_data = [] all_data = []
for i, symbol in enumerate(tickers): # Try first 20 for i, symbol in enumerate(tickers):
print(f"Processing: {i} of {len(tickers)}") print(f"Processing: {i} of {len(tickers)}")
df = yf.download(symbol, period="max", auto_adjust=True) df = yf.download(symbol, period="max", auto_adjust=True)
if not df.empty: if not df.empty:
@@ -24,11 +24,10 @@ def pull():
# Make sure Date is actually a Date Object # Make sure Date is actually a Date Object
df = df.reset_index() df = df.reset_index()
df['Date'] = pd.to_datetime(df['Date'], format="%Y-%m-%d") df['Date'] = pd.to_numeric(pd.to_datetime(df['Date']))
df.set_index('Date', inplace=True)
# Add the Symbol column for tracking # Add the Symbol column for tracking | as an integer label (ticker index; not one-hot encoded)
df['Symbol'] = symbol df['Symbol'] = i
# Add feature Spread # Add feature Spread
df['Spread'] = abs( df['High'] - df['Low'] ) df['Spread'] = abs( df['High'] - df['Low'] )
@@ -48,6 +47,10 @@ def pull():
print("Processing data") print("Processing data")
final_df = pd.concat(all_data) final_df = pd.concat(all_data)
# Normalize the Date
final_df['Date'] = (final_df['Date'] - final_df['Date'].min()) / (final_df['Date'].max() - final_df['Date'].min())
final_df.set_index('Date', inplace=True)
# Drop rows with null values # Drop rows with null values
final_df.dropna(inplace=True) final_df.dropna(inplace=True)