diff --git a/WebServer/AIPython/datapuller.py b/WebServer/AIPython/datapuller.py index 333ef732..ffaedfd7 100644 --- a/WebServer/AIPython/datapuller.py +++ b/WebServer/AIPython/datapuller.py @@ -15,7 +15,7 @@ def pull(): # Scrape the data all_data = [] - for i, symbol in enumerate(tickers): # Try first 20 + for i, symbol in enumerate(tickers): print(f"Processing: {i} of {len(tickers)}") df = yf.download(symbol, period="max", auto_adjust=True) if not df.empty: @@ -24,11 +24,10 @@ def pull(): # Make sure Date is actually a Date Object df = df.reset_index() - df['Date'] = pd.to_datetime(df['Date'], format="%Y-%m-%d") - df.set_index('Date', inplace=True) + df['Date'] = pd.to_numeric(pd.to_datetime(df['Date'])) - # Add the Symbol column for tracking - df['Symbol'] = symbol + # Add the Symbol column for tracking, as the ticker's integer index (label-encoded, not one-hot) + df['Symbol'] = i # Add feature Spread df['Spread'] = abs( df['High'] - df['Low'] ) @@ -48,6 +47,10 @@ def pull(): print("Processing data") final_df = pd.concat(all_data) + # Normalize the Date to [0, 1] (min-max scaling) + final_df['Date'] = (final_df['Date'] - final_df['Date'].min()) / (final_df['Date'].max() - final_df['Date'].min()) + final_df.set_index('Date', inplace=True) + # Drop rows with null values final_df.dropna(inplace=True)