59 lines
2.0 KiB
Python
59 lines
2.0 KiB
Python
import os
|
|
import yfinance as yf
|
|
import pandas as pd
|
|
|
|
def pull():
    """Download price history for every listed symbol and cache it to disk.

    Reads the ticker list from ``data/stock_symbols.xlsx`` (column
    ``Symbol``) located next to this script, downloads the full
    auto-adjusted history of each ticker through yfinance, derives a few
    per-symbol features (``Spread``, ``Return``, ``Volatility_5``,
    ``Volatility_20``), and writes the combined table to
    ``data/stocks.parquet`` together with a 200-row
    ``data/stocks.preview.csv``.

    Raises:
        ValueError: If no symbol returned any rows, so there is nothing to
            combine or write.
    """
    # Resolve paths from this script's location rather than the working
    # directory, because the script is now launched from C#.
    script_dir = os.path.dirname(os.path.abspath(__file__))
    data_dir = os.path.join(script_dir, "data")

    # Load the symbol list; header whitespace is stripped so that the
    # 'Symbol' column is found even when the sheet header has stray spaces.
    symbols = pd.read_excel(os.path.join(data_dir, "stock_symbols.xlsx"))
    symbols.columns = symbols.columns.str.strip()
    tickers = symbols['Symbol'].tolist()

    # Download and featurize each symbol independently so that rolling
    # windows and returns never span two different tickers.
    frames = []
    total = len(tickers)
    for i, symbol in enumerate(tickers):
        print(f"Processing: {i} of {total}")
        df = yf.download(symbol, period="max", auto_adjust=True)
        if df.empty:
            # Nothing came back for this ticker; skip it.
            continue

        # Keep only the first column level, which drops the ticker level
        # when the download comes back with two-level column labels.
        df.columns = df.columns.get_level_values(0)

        # Move the date index into a column and store it as a number so it
        # can be min-max scaled once all symbols are combined.
        df = df.reset_index()
        df['Date'] = pd.to_numeric(pd.to_datetime(df['Date']))

        # Tag every row with the symbol's position in the ticker list.
        # NOTE: this is a plain integer label, not a one-hot vector.
        df['Symbol'] = i

        # Daily high/low range.
        df['Spread'] = (df['High'] - df['Low']).abs()

        # Day-over-day fractional change of the closing price.
        df['Return'] = df['Close'].pct_change()

        # Rolling standard deviation of returns over 5 and 20 rows. Calling
        # rolling() directly avoids Series.transform, which first tries the
        # callable element-wise before falling back to the whole Series.
        returns = df['Return']
        df['Volatility_5'] = returns.rolling(5).std()
        df['Volatility_20'] = returns.rolling(20).std()

        frames.append(df)

    # Concatenate into one combined table and cache it.
    print("Processing data")
    if not frames:
        # pd.concat would raise an opaque ValueError on an empty list; fail
        # with the same exception type but an actionable message.
        raise ValueError("No price data was downloaded for any symbol")
    final_df = pd.concat(frames)

    # Min-max normalize Date across all symbols, then use it as the index.
    date_min = final_df['Date'].min()
    date_max = final_df['Date'].max()
    final_df['Date'] = (final_df['Date'] - date_min) / (date_max - date_min)
    final_df.set_index('Date', inplace=True)

    # Drop rows with null values (e.g. the leading rows of each symbol where
    # the return and rolling windows are not yet defined).
    final_df.dropna(inplace=True)

    print("Writing data to file")
    final_df.to_parquet(os.path.join(data_dir, "stocks.parquet"))
    final_df.head(200).to_csv(os.path.join(data_dir, "stocks.preview.csv"))