diff --git a/datapuller.py b/datapuller.py index 8f5e6b26..78f7120a 100644 --- a/datapuller.py +++ b/datapuller.py @@ -1,49 +1,53 @@ import yfinance as yf import pandas as pd -# Import the S&P 500 symbols -symbols = pd.read_excel("./data/stock_symbols.xlsx") -symbols.columns = symbols.columns.str.strip() -tickers = symbols['Symbol'].tolist() +class DataPuller: -# Scrape the data -all_data = [] -for i, symbol in enumerate(tickers): # Try first 20 - print(f"Processing: {i} of {len(tickers)}") - df = yf.download(symbol, period="max") - if not df.empty: - # Remove the ticker column - df.columns = df.columns.get_level_values(0) - - # Make sure Date is actually a Date Object - df = df.reset_index() - df['Date'] = pd.to_datetime(df['Date'], format="%Y-%m-%d") - df.set_index('Date', inplace=True) + @staticmethod + def pull(): + # Import the S&P 500 symbols + symbols = pd.read_excel("./data/stock_symbols.xlsx") + symbols.columns = symbols.columns.str.strip() + tickers = symbols['Symbol'].tolist() - # Add the Symbol column for tracking - df['Symbol'] = symbol - - # Add feature Spread - df['Spread'] = abs( df['High'] - df['Low'] ) + # Scrape the data + all_data = [] + for i, symbol in enumerate(tickers): # Try first 20 + print(f"Processing: {i} of {len(tickers)}") + df = yf.download(symbol, period="max", auto_adjust=True) + if not df.empty: + # Remove the ticker column + df.columns = df.columns.get_level_values(0) + + # Make sure Date is actually a Date Object + df = df.reset_index() + df['Date'] = pd.to_datetime(df['Date'], format="%Y-%m-%d") + df.set_index('Date', inplace=True) - # Add feature for Returns - df['Return'] = df['Close'].pct_change() + # Add the Symbol column for tracking + df['Symbol'] = symbol + + # Add feature Spread + df['Spread'] = abs( df['High'] - df['Low'] ) - # Add feature for volitility last 5 - df['Volatility_5'] = df['Return'].transform(lambda x: x.rolling(5).std()) + # Add feature for Returns + df['Return'] = df['Close'].pct_change() - # Add feature for volitility last 20 - df['Volatility_20'] = df['Return'].transform(lambda x: x.rolling(20).std()) + # Add feature for volitility last 5 + df['Volatility_5'] = df['Return'].transform(lambda x: x.rolling(5).std()) - all_data.append(df) + # Add feature for volitility last 20 + df['Volatility_20'] = df['Return'].transform(lambda x: x.rolling(20).std()) -# Concatinate into a combined list and cache -print("Processing data") -final_df = pd.concat(all_data) + all_data.append(df) -# Drop rows with null values -final_df.dropna(inplace=True) + # Concatinate into a combined list and cache + print("Processing data") + final_df = pd.concat(all_data) -print("Writing data to file") -final_df.to_parquet("./data/stocks.parquet") -final_df.head(200).to_csv("./data/stocks_preview.csv") \ No newline at end of file + # Drop rows with null values + final_df.dropna(inplace=True) + + print("Writing data to file") + final_df.to_parquet("./data/stocks.parquet") + final_df.head(200).to_csv("./data/stocks_preview.csv") \ No newline at end of file