import numpy as np
import pandas as pd
from sqlalchemy import create_engine
from joblib import load
import yfinance as yf
from datetime import datetime
import os.path
from alpaca.trading.client import TradingClient
from alpaca.trading.requests import MarketOrderRequest, GetAssetsRequest, AssetClass
from alpaca.trading.enums import OrderSide, TimeInForce
server = 'fs.rice.edu'
database = 'stocks'
username = 'stocks'
password = '6LAZH1'
driver = 'SQL+Server'
string = f"mssql+pyodbc://{username}:{password}@{server}/{database}"
try:
conn = create_engine(string + "?driver='SQL+Server'").connect()
except:
try:
conn = create_engine(string + "?driver='ODBC+Driver+18+for+SQL+Server'").connect()
except:
import pymssql
string = f"mssql+pymssql://{username}:{password}@{server}/{database}"
conn = create_engine(string).connect()
sep_weekly = pd.read_sql(
"""
select date, ticker, closeadj, closeunadj, volume, lastupdated from sep_weekly
where date >= '2022-01-01'
order by ticker, date, lastupdated
""",
conn,
)
sep_weekly = sep_weekly.groupby(["ticker", "date"]).last()
sep_weekly = sep_weekly.drop(columns=["lastupdated"])
ret = sep_weekly.groupby("ticker", group_keys=False).closeadj.pct_change()
ret.name = "ret"
price = sep_weekly.closeunadj
price.name = "price"
volume = sep_weekly.volume
volume.name = "volume"
ret_annual = sep_weekly.groupby("ticker", group_keys=False).closeadj.pct_change(52)
ret_monthly = sep_weekly.groupby("ticker", group_keys=False).closeadj.pct_change(4)
mom = (1 + ret_annual) / (1 + ret_monthly) - 1
mom.name = "mom"
weekly = pd.read_sql(
"""
select date, ticker, pb, marketcap, lastupdated from weekly
where date>='2022-01-01'
order by ticker, date, lastupdated
""",
conn,
)
weekly = weekly.groupby(["ticker", "date"]).last()
weekly = weekly.drop(columns=["lastupdated"])
pb = weekly.pb
pb.name = "pb"
marketcap = weekly.marketcap
marketcap.name = "marketcap"
sf1 = pd.read_sql(
"""
select datekey as date, ticker, assets, netinc, equity, lastupdated from sf1
where datekey>='2022-01-01' and dimension='ARY' and assets>0 and equity>0
order by ticker, datekey, lastupdated
""",
conn,
)
sf1 = sf1.groupby(["ticker", "date"]).last()
sf1 = sf1.drop(columns=["lastupdated"])
# change dates to Fridays
from datetime import timedelta
sf1 = sf1.reset_index()
sf1.date =sf1.date.map(
lambda x: x + timedelta(4 - x.weekday())
)
sf1 = sf1.set_index(["ticker", "date"])
sf1 = sf1[~sf1.index.duplicated()]
assets = sf1.assets
assets.name = "assets"
netinc = sf1.netinc
netinc.name = "netinc"
equity = sf1.equity
equity.name = "equity"
equity = equity.groupby("ticker", group_keys=False).shift()
roe = netinc / equity
roe.name = "roe"
assetgr = assets.groupby("ticker", group_keys=False).pct_change()
assetgr.name = "assetgr"
df = pd.concat(
(
mom,
volume,
price,
pb,
marketcap,
roe,
assetgr
),
axis=1
)
df["roe"] = df.groupby("ticker", group_keys=False).roe.ffill()
df["assetgr"] = df.groupby("ticker", group_keys=False).assetgr.ffill()
df = df.reset_index()
df.date = df.date.astype(str)
df = df[df.date==df.date.max()]
df = df[df.price >= 5]
df = df.dropna()
features = [
"mom",
"volume",
"pb",
"marketcap",
"roe",
"assetgr"
]
industries = pd.read_sql(
"""
select ticker, famaindustry as industry from tickers
""",
conn,
)
industries["industry"] = industries.industry.fillna("Almost Nothing")
df = df.merge(industries, on="ticker", how="left")
df = df.dropna()
for x in features:
df[f"{x}_industry"] = df.groupby(
["industry"],
group_keys=False
)[x].apply(
lambda x: x - x.median()
)
features += [f"{x}_industry" for x in features]
for f in features:
df[f] = df[f].rank(pct=True)
with open("keys.txt", "r") as f:
keys = f.readlines()
key, secret_key = [x.strip() for x in keys]
trading_client = TradingClient(key, secret_key, paper=True)
search_params = GetAssetsRequest(asset_class=AssetClass.US_EQUITY)
assets = trading_client.get_all_assets(search_params)
not_shortable = [x.symbol for x in assets if not x.shortable]
df["not_shortable"] = df.ticker.isin(not_shortable)
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
confusion_matrix,
ConfusionMatrixDisplay,
roc_curve,
RocCurveDisplay
)
from sklearn.model_selection import train_test_split, GridSearchCV
rcf = RandomForestClassifier(class_weight="balanced")
param_grid = {"max_depth": range(2, 22, 2)}
cv = GridSearchCV(
rcf,
param_grid = param_grid
)
train, test = train_test_split(df, test_size=0.2)
X_train = train[["volume", "marketcap"]].to_numpy()
X_test = test[["volume", "marketcap"]].to_numpy()
y_train = train["not_shortable"].to_numpy()
y_test = test["not_shortable"].to_numpy()
cv.fit(X_train, y_train)
print(f"best max_depth is {cv.best_params_['max_depth']}")
best max_depth is 20
RocCurveDisplay.from_estimator(
estimator=cv,
X=X_test,
y=y_test
)
<sklearn.metrics._plot.roc_curve.RocCurveDisplay at 0x2157bf5cc40>
threshold_prob = 0.15
prob = cv.predict_proba(X_test)[:, 1]
predict = 1 * (prob > threshold_prob)
cm = confusion_matrix(1 * y_test, predict)
cmd = ConfusionMatrixDisplay(cm, display_labels=["Shortable", "Not Shortable"])
cmd.plot()
<sklearn.metrics._plot.confusion_matrix.ConfusionMatrixDisplay at 0x21510e7be80>
from joblib import dump
dump(cv.best_estimator_, "not_shortable.joblib")
['not_shortable.joblib']