import yfinance as yf
import pandas as pd


class DataPuller:
    """Download price history for a list of tickers and cache it with derived features."""

    @staticmethod
    def pull(
        symbols_path="./data/stock_symbols.xlsx",
        parquet_path="./data/stocks.parquet",
        preview_path="./data/stocks_preview.csv",
        preview_rows=200,
    ):
        """Download history for every listed symbol, add features, and write the result.

        Reads the ``Symbol`` column of the Excel sheet at ``symbols_path``,
        downloads the maximum available auto-adjusted history for each symbol
        through ``yfinance``, adds the derived columns described in
        ``_add_features``, concatenates all symbols into one frame, drops every
        row containing a null value, and writes the frame to ``parquet_path``
        plus its first ``preview_rows`` rows to ``preview_path``.

        Args:
            symbols_path: Excel file holding a ``Symbol`` column (header
                whitespace is stripped before the lookup).
            parquet_path: Destination of the full combined dataset.
            preview_path: Destination of the CSV preview.
            preview_rows: Number of leading rows written to the preview.

        Returns:
            None. The results are written to disk.

        Raises:
            ValueError: If no symbol produced any data, so there is nothing
                to combine or write.
        """
        # Load the ticker list; header names may carry stray whitespace.
        symbols = pd.read_excel(symbols_path)
        symbols.columns = symbols.columns.str.strip()
        # Blank cells are read as NaN and are not valid tickers, so skip them.
        tickers = symbols['Symbol'].dropna().tolist()

        # Download and enrich each symbol, skipping those with no data.
        all_data = []
        for i, symbol in enumerate(tickers, start=1):
            print(f"Processing: {i} of {len(tickers)}")
            df = yf.download(symbol, period="max", auto_adjust=True)
            if df.empty:
                continue
            all_data.append(DataPuller._add_features(df, symbol))

        # pd.concat raises an opaque ValueError on an empty list; fail with a
        # clear message (same exception type) before touching the output files.
        if not all_data:
            raise ValueError(
                "No price data was downloaded for any symbol; nothing to write."
            )

        # Concatenate into a single frame and cache it.
        print("Processing data")
        final_df = pd.concat(all_data)

        # Drop rows with null values (this includes each symbol's leading rows,
        # where the return and rolling windows are not yet defined).
        final_df.dropna(inplace=True)

        print("Writing data to file")
        final_df.to_parquet(parquet_path)
        final_df.head(preview_rows).to_csv(preview_path)

    @staticmethod
    def _add_features(df, symbol):
        """Return one symbol's downloaded frame, date-indexed, with derived columns.

        Adds ``Symbol``, ``Spread`` (absolute High - Low), ``Return``
        (percentage change of Close), and ``Volatility_5`` / ``Volatility_20``
        (rolling standard deviation of ``Return`` over 5 and 20 rows).

        Note: the column labels of ``df`` are flattened in place.
        """
        # Keep only the first level of the column labels; presumably this drops
        # the per-ticker level yfinance attaches to downloaded frames.
        df.columns = df.columns.get_level_values(0)

        # Make sure the index is a real datetime named 'Date'.
        df = df.reset_index()
        df['Date'] = pd.to_datetime(df['Date'], format="%Y-%m-%d")
        df = df.set_index('Date')

        # Tag rows so symbols stay distinguishable after concatenation.
        df['Symbol'] = symbol

        df['Spread'] = (df['High'] - df['Low']).abs()
        df['Return'] = df['Close'].pct_change()
        df['Volatility_5'] = df['Return'].rolling(5).std()
        df['Volatility_20'] = df['Return'].rolling(20).std()
        return df