# Provenance: Data/datapuller.py, added as a new file by commit
#   af7f1927e1836f15cc326852af97bc8590bbf321
#   Author:  Derek Holloway
#   Date:    Mon Feb 9 20:15:00 2026 -0800
#   Message: "Add in my working data source"
# The same commit also adds the binary workbook Data/stock_symbols.xlsx.
"""Download price history for the listed tickers and cache it with features.

The script reads ticker symbols from an Excel workbook (described by the
original author as the S&P 500 symbols), downloads the maximum available
history for each ticker through yfinance, derives a few per-ticker features
(spread, return, rolling volatility), drops incomplete rows, and writes the
result to a parquet file plus a small CSV preview.

All paths are relative to the current working directory, so the script is
expected to be launched from the repository root.
"""

from typing import List, Optional, Sequence

import pandas as pd

SYMBOLS_PATH = "./Data/stock_symbols.xlsx"
PARQUET_PATH = "./Data/stocks.parquet"
PREVIEW_PATH = "./Data/stocks_preview.csv"

# Number of rows written to the CSV preview.
PREVIEW_ROWS = 200

# Only the first MAX_TICKERS symbols are downloaded. The original loop sliced
# `tickers[:2]` while its comment said "first 20"; the code's value is kept
# here. Set to None to process every ticker in the workbook.
MAX_TICKERS: Optional[int] = 2

# Rolling-window lengths (in rows) used for the Volatility_<n> columns.
VOLATILITY_WINDOWS = (5, 20)


def load_tickers(path: str = SYMBOLS_PATH) -> List[str]:
    """Return the ticker symbols listed in the ``Symbol`` column of *path*.

    Header names are stripped of surrounding whitespace before the lookup.
    Blank cells are skipped and symbols are stripped as well, so stray
    spreadsheet whitespace does not turn into a failed download.

    Raises:
        KeyError: if the workbook has no ``Symbol`` column.
    """
    symbols = pd.read_excel(path)
    symbols.columns = symbols.columns.str.strip()
    return symbols["Symbol"].dropna().astype(str).str.strip().tolist()


def download_history(symbol: str) -> pd.DataFrame:
    """Download the full available price history for *symbol* via yfinance."""
    # Imported lazily so the pure-pandas helpers in this module can be
    # imported (and tested) on machines without yfinance or network access.
    import yfinance as yf

    return yf.download(symbol, period="max")


def add_features(
    df: pd.DataFrame,
    symbol: str,
    windows: Sequence[int] = VOLATILITY_WINDOWS,
) -> pd.DataFrame:
    """Return a copy of *df* with tracking and feature columns appended.

    Args:
        df: Price history for a single ticker. It must provide ``High``,
            ``Low`` and ``Close`` in the first level of its columns, and its
            index must hold the row dates.
        symbol: Ticker stored in the ``Symbol`` column for tracking.
        windows: Rolling-window lengths; one ``Volatility_<n>`` column is
            added per entry, in order.

    Returns:
        A new frame indexed by ``Date`` with the original columns followed by
        ``Symbol``, ``Spread``, ``Return`` and the volatility columns. The
        input frame is left untouched.
    """
    out = df.copy()

    # The downloaded frame carries an extra ticker level on its columns (per
    # the original "Remove the ticker column" step); keep only the first
    # level. This is a no-op for columns that are already flat.
    out.columns = out.columns.get_level_values(0)

    # Make sure the index really holds datetimes and is named "Date".
    out.index = pd.to_datetime(out.index)
    out.index.name = "Date"

    out["Symbol"] = symbol
    out["Spread"] = (out["High"] - out["Low"]).abs()
    out["Return"] = out["Close"].pct_change()

    # Rolling standard deviation of returns over each window. The first
    # rows of every window are NaN until enough observations are available.
    for window in windows:
        out[f"Volatility_{window}"] = out["Return"].rolling(window).std()

    return out


def combine_frames(frames: Sequence[pd.DataFrame]) -> pd.DataFrame:
    """Concatenate per-ticker frames and drop every row containing a null.

    Raises:
        ValueError: if *frames* is empty (raised by ``pd.concat``).
    """
    return pd.concat(frames).dropna()


def main() -> None:
    """Run the download -> feature -> cache pipeline."""
    tickers = load_tickers()
    selected = tickers if MAX_TICKERS is None else tickers[:MAX_TICKERS]

    frames = []
    for position, symbol in enumerate(selected, start=1):
        # Report progress against the tickers actually being processed.
        print(f"Processing: {position} of {len(selected)}")
        raw = download_history(symbol)
        if raw.empty:
            # Nothing came back for this ticker; skip it.
            continue
        frames.append(add_features(raw, symbol))

    if not frames:
        # Fail with a clear message instead of pd.concat's
        # "No objects to concatenate", and before touching the cache files.
        raise SystemExit("No price data was downloaded; nothing to write.")

    # Concatenate into a combined frame and cache it.
    print("Processing data")
    final_df = combine_frames(frames)

    print("Writing data to file")
    final_df.to_parquet(PARQUET_PATH)
    final_df.head(PREVIEW_ROWS).to_csv(PREVIEW_PATH)


if __name__ == "__main__":
    main()