Wrap in a class and set function as static

This commit is contained in:
2026-02-11 20:03:05 -08:00
parent cccf4650c2
commit 0fcdfc91bf
+39 -35
View File
@@ -1,49 +1,53 @@
import yfinance as yf import yfinance as yf
import pandas as pd import pandas as pd
# Import the S&P 500 symbols class DataPuller:
symbols = pd.read_excel("./data/stock_symbols.xlsx")
symbols.columns = symbols.columns.str.strip()
tickers = symbols['Symbol'].tolist()
# Scrape the data @staticmethod
all_data = [] def pull():
for i, symbol in enumerate(tickers): # Try first 20 # Import the S&P 500 symbols
print(f"Processing: {i} of {len(tickers)}") symbols = pd.read_excel("./data/stock_symbols.xlsx")
df = yf.download(symbol, period="max") symbols.columns = symbols.columns.str.strip()
if not df.empty: tickers = symbols['Symbol'].tolist()
# Remove the ticker column
df.columns = df.columns.get_level_values(0)
# Make sure Date is actually a Date Object # Scrape the data
df = df.reset_index() all_data = []
df['Date'] = pd.to_datetime(df['Date'], format="%Y-%m-%d") for i, symbol in enumerate(tickers): # Try first 20
df.set_index('Date', inplace=True) print(f"Processing: {i} of {len(tickers)}")
df = yf.download(symbol, period="max", auto_adjust=True)
if not df.empty:
# Remove the ticker column
df.columns = df.columns.get_level_values(0)
# Add the Symbol column for tracking # Make sure Date is actually a Date Object
df['Symbol'] = symbol df = df.reset_index()
df['Date'] = pd.to_datetime(df['Date'], format="%Y-%m-%d")
df.set_index('Date', inplace=True)
# Add feature Spread # Add the Symbol column for tracking
df['Spread'] = abs( df['High'] - df['Low'] ) df['Symbol'] = symbol
# Add feature for Returns # Add feature Spread
df['Return'] = df['Close'].pct_change() df['Spread'] = abs( df['High'] - df['Low'] )
# Add feature for volatility last 5 # Add feature for Returns
df['Volatility_5'] = df['Return'].transform(lambda x: x.rolling(5).std()) df['Return'] = df['Close'].pct_change()
# Add feature for volatility last 20 # Add feature for volatility last 5
df['Volatility_20'] = df['Return'].transform(lambda x: x.rolling(20).std()) df['Volatility_5'] = df['Return'].transform(lambda x: x.rolling(5).std())
all_data.append(df) # Add feature for volatility last 20
df['Volatility_20'] = df['Return'].transform(lambda x: x.rolling(20).std())
# Concatenate into a combined list and cache all_data.append(df)
print("Processing data")
final_df = pd.concat(all_data)
# Drop rows with null values # Concatenate into a combined list and cache
final_df.dropna(inplace=True) print("Processing data")
final_df = pd.concat(all_data)
print("Writing data to file") # Drop rows with null values
final_df.to_parquet("./data/stocks.parquet") final_df.dropna(inplace=True)
final_df.head(200).to_csv("./data/stocks_preview.csv")
print("Writing data to file")
final_df.to_parquet("./data/stocks.parquet")
final_df.head(200).to_csv("./data/stocks_preview.csv")