Use this code to connect to the stocks database; it tries several SQL Server drivers in turn and falls back to pymssql.
# Connect to the stocks database, trying drivers from most to least preferred:
# 1) the generic "SQL Server" ODBC driver, 2) ODBC Driver 18, 3) pymssql.
# NOTE(review): credentials are hard-coded in source; move them to environment
# variables or a config file before distributing this script.
from sqlalchemy import create_engine
server = "fs.rice.edu"
database = "stocks"
username = "stocks"
password = "6LAZH1"
string = f"mssql+pyodbc://{username}:{password}@{server}/{database}"
try:
    # Driver names in the URL query string must NOT be quoted: quotes become
    # part of the ODBC driver name and the connection fails to match a driver.
    conn = create_engine(string + "?driver=SQL+Server").connect()
except Exception:
    try:
        conn = create_engine(
            string + "?driver=ODBC+Driver+18+for+SQL+Server"
        ).connect()
    except Exception:
        # Fall back to the pymssql dialect (no ODBC driver required).
        # The import verifies pymssql is installed before building the engine.
        import pymssql  # noqa: F401
        string = f"mssql+pymssql://{username}:{password}@{server}/{database}"
        conn = create_engine(string).connect()
import pandas as pd
# List every table in the database via SQL Server's INFORMATION_SCHEMA view.
pd.read_sql("select * from information_schema.tables", conn)
TABLE_CATALOG | TABLE_SCHEMA | TABLE_NAME | TABLE_TYPE | |
---|---|---|---|---|
0 | stocks | dbo | sf1 | BASE TABLE |
1 | stocks | dbo | sep_weekly | BASE TABLE |
2 | stocks | dbo | weekly | BASE TABLE |
3 | stocks | dbo | today | BASE TABLE |
4 | stocks | dbo | ghz | BASE TABLE |
5 | stocks | dbo | indicators | BASE TABLE |
6 | stocks | dbo | tickers | BASE TABLE |
tickers has one row for each ticker, with general company information
# Peek at the tickers table (one row per ticker with company information).
tickers = pd.read_sql("select top 3 * from tickers", conn)
tickers
permaticker | siccode | lastupdated | firstadded | firstpricedate | lastpricedate | firstquarter | lastquarter | isdelisted | ticker | ... | famaindustry | sector | industry | scalemarketcap | scalerevenue | relatedtickers | currency | location | secfilings | companysite | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 196290 | 3826 | 2023-12-20 | 2014-09-26 | 1999-11-18 | 2024-01-30 | 1997-06-30 | 2023-09-30 | N | A | ... | Measuring and Control Equipment | Healthcare | Diagnostics & Research | 5 - Large | 5 - Large | None | USD | California; U.S.A | https://www.sec.gov/cgi-bin/browse-edgar?actio... | https://www.agilent.com |
1 | 124392 | 3334 | 2023-10-26 | 2016-11-01 | 2016-11-01 | 2024-01-30 | 2014-12-31 | 2023-09-30 | N | AA | ... | Steel Works Etc | Basic Materials | Aluminum | 5 - Large | 5 - Large | None | USD | Pennsylvania; U.S.A | https://www.sec.gov/cgi-bin/browse-edgar?actio... | http://www.alcoa.com |
2 | 122827 | 6022 | 2019-07-29 | 2017-09-09 | 1998-09-25 | 2003-01-28 | 1997-09-30 | 2002-09-30 | Y | AAAB | ... | Banking | Financial Services | Banks - Regional | 2 - Micro | 1 - Nano | None | USD | Florida; U.S.A | https://www.sec.gov/cgi-bin/browse-edgar?actio... | None |
3 rows × 26 columns
# Show the tickers column names, one per line.
print(*tickers.columns, sep="\n")
permaticker siccode lastupdated firstadded firstpricedate lastpricedate firstquarter lastquarter isdelisted ticker name exchange cusips sicsector sicindustry famasector famaindustry sector industry scalemarketcap scalerevenue relatedtickers currency location secfilings companysite
indicators has one row for each variable in the other tables with definitions
# indicators is the data dictionary: one row per column of the other tables.
indicators = pd.read_sql("select * from indicators", conn)
indicators.head()
tbl | indicator | isfilter | isprimarykey | title | description | unittype | |
---|---|---|---|---|---|---|---|
0 | SF1 | revenue | N | N | Revenues | [Income Statement] The amount of Revenue recog... | currency |
1 | SF1 | cor | N | N | Cost of Revenue | [Income Statement] The aggregate cost of goods... | currency |
2 | SF1 | sgna | N | N | Selling General and Administrative Expense | [Income Statement] A component of [OpEx] repre... | currency |
3 | SF1 | rnd | N | N | Research and Development Expense | [Income Statement] A component of [OpEx] repre... | currency |
4 | SF1 | opex | N | N | Operating Expenses | [Income Statement] Operating expenses represen... | currency |
# Save the data dictionary to a spreadsheet for offline reference.
indicators.to_excel("indicators.xlsx")
# Show the indicators column names, one per line.
print(*indicators.columns, sep="\n")
tbl indicator isfilter isprimarykey title description unittype
sf1 has annual and quarterly reports for all NYSE/Nasdaq stocks since 2000
# Peek at the sf1 financial-statement table.
sf1 = pd.read_sql("select top 3 * from sf1", conn)
sf1
ticker | dimension | calendardate | datekey | reportperiod | lastupdated | accoci | assets | assetsavg | assetsc | ... | sharesbas | shareswa | shareswadil | sps | tangibles | taxassets | taxexp | taxliabilities | tbvps | workingcapital | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | MET | ARQ | 2011-03-31 | 2011-05-10 | 2011-03-31 | 2023-11-02 | 1.115000e+09 | 7.513410e+11 | None | None | ... | 1.057040e+09 | 1.057040e+09 | None | 15.053 | 7.393950e+11 | 0.0 | 4.280000e+08 | 2.351000e+09 | 699.495 | None |
1 | MET | ARQ | 2011-06-30 | 2011-08-05 | 2011-06-30 | 2023-11-02 | 3.356000e+09 | 7.714830e+11 | None | None | ... | 1.057494e+09 | 1.057494e+09 | None | 16.218 | 7.594470e+11 | 0.0 | 5.190000e+08 | 3.897000e+09 | 718.158 | None |
2 | MET | ARQ | 2011-09-30 | 2011-11-04 | 2011-09-30 | 2023-11-02 | 6.813000e+09 | 7.852300e+11 | None | None | ... | 1.057634e+09 | 1.057634e+09 | None | 19.343 | 7.732240e+11 | 0.0 | 1.734000e+09 | 7.599000e+09 | 731.088 | None |
3 rows × 111 columns
# Show every sf1 column name, one per line.
print(*sf1.columns, sep="\n")
ticker dimension calendardate datekey reportperiod lastupdated accoci assets assetsavg assetsc assetsnc assetturnover bvps capex cashneq cashnequsd cor consolinc currentratio de debt debtc debtnc debtusd deferredrev depamor deposits divyield dps ebit ebitda ebitdamargin ebitdausd ebitusd ebt eps epsdil epsusd equity equityavg equityusd ev evebit evebitda fcf fcfps fxusd gp grossmargin intangibles intexp invcap invcapavg inventory investments investmentsc investmentsnc liabilities liabilitiesc liabilitiesnc marketcap ncf ncfbus ncfcommon ncfdebt ncfdiv ncff ncfi ncfinv ncfo ncfx netinc netinccmn netinccmnusd netincdis netincnci netmargin opex opinc payables payoutratio pb pe pe1 ppnenet prefdivis price ps ps1 receivables retearn revenue revenueusd rnd roa roe roic ros sbcomp sgna sharefactor sharesbas shareswa shareswadil sps tangibles taxassets taxexp taxliabilities tbvps workingcapital
sep_weekly has weekly open (opn), high, low, closeadj, closeunadj, and average daily volume
# Peek at the weekly price table.
sep_weekly = pd.read_sql("select top 3 * from sep_weekly", conn)
weekly has end-of-week enterprise value, enterprise value to ebit, enterprise value to ebitda, marketcap, price to book, price to earnings, and price to sales
# Peek at the weekly valuation-ratio table.
pd.read_sql("select top 3 * from weekly", conn)
ticker | date | lastupdated | ev | evebit | evebitda | marketcap | pb | pe | ps | |
---|---|---|---|---|---|---|---|---|---|---|
0 | A | 2000-01-07 | 2019-03-28 | 32040.0 | 47.9 | 28.9 | 32040.0 | 10.0 | 62.6 | 4.7 |
1 | A | 2000-01-14 | 2019-03-28 | 30678.3 | 45.9 | 27.7 | 30678.3 | 9.5 | 59.9 | 4.5 |
2 | A | 2000-01-21 | 2019-03-28 | 31817.5 | 47.6 | 28.7 | 31817.5 | 9.9 | 62.1 | 4.7 |
# Pull weekly adjusted/unadjusted closes since 2010; rows are ordered so that
# the last row within each (ticker, date) group carries the newest lastupdated.
sep_weekly = pd.read_sql(
"""
select date, ticker, closeadj, closeunadj, lastupdated from sep_weekly
where date >= '2010-01-01'
order by ticker, date, lastupdated
""",
conn,
)
# Keep only the most recent vintage of each (ticker, date) observation.
sep_weekly = (
    sep_weekly.groupby(["ticker", "date"])
    .last()
    .drop(columns=["lastupdated"])
)
by_ticker = sep_weekly.groupby("ticker", group_keys=False)
# One-week return from adjusted closes.
ret = by_ticker.closeadj.pct_change().rename("ret")
# Unadjusted close, used later as a price screen.
price = sep_weekly.closeunadj.rename("price")
# Momentum: the past year's return excluding the most recent four weeks,
# i.e. (1 + 52-week return) / (1 + 4-week return) - 1.
mom = (
    (1 + by_ticker.closeadj.pct_change(52))
    / (1 + by_ticker.closeadj.pct_change(4))
    - 1
).rename("mom")
# Weekly price-to-book and market cap since 2010; ordering by lastupdated lets
# groupby(...).last() keep the freshest record per (ticker, date).
weekly = pd.read_sql(
"""
select date, ticker, pb, marketcap, lastupdated from weekly
where date>='2010-01-01'
order by ticker, date, lastupdated
""",
conn,
)
weekly = (
    weekly.groupby(["ticker", "date"])
    .last()
    .drop(columns=["lastupdated"])
)
# Extract the two series used in the sorts below.
pb = weekly.pb.rename("pb")
marketcap = weekly.marketcap.rename("marketcap")
# Annual (dimension='ARY') fundamentals since 2010, requiring positive assets
# and book equity; keep the newest vintage per (ticker, datekey).
sf1 = pd.read_sql(
"""
select datekey as date, ticker, assets, netinc, equity, lastupdated from sf1
where datekey>='2010-01-01' and dimension='ARY' and assets>0 and equity>0
order by ticker, datekey, lastupdated
""",
conn,
)
sf1 = sf1.groupby(["ticker", "date"]).last().drop(columns=["lastupdated"])
# Move each report date to that week's Friday (weekday 4) so the fundamentals
# line up with the weekly price dates; weekend dates shift back to Friday.
from datetime import timedelta
sf1 = sf1.reset_index()
sf1["date"] = sf1["date"].map(lambda d: d + timedelta(4 - d.weekday()))
sf1 = sf1.set_index(["ticker", "date"])
# If two filings map to the same Friday, keep the first occurrence.
sf1 = sf1[~sf1.index.duplicated()]
# Extract the series used in the sorts below.
assets = sf1.assets.rename("assets")
netinc = sf1.netinc.rename("netinc")
equity = sf1.equity.rename("equity")
# Combine next-week returns, momentum, and price; screen at a $5 price.
df = pd.concat((ret, mom, price), axis=1)
df["ret"] = df.groupby("ticker", group_keys=False).ret.shift(-1)
df = df[df.price >= 5].dropna()
# Each week, assign momentum deciles (1 = losers, 10 = winners).
df["mom10"] = df.groupby("date", group_keys=False).mom.apply(
    lambda s: pd.qcut(s, 10, labels=range(1, 11))
)
# Mean next-week return per (week, decile): weeks in rows, deciles in columns.
mom10 = (
    df.groupby(["date", "mom10"], observed=True, group_keys=True)
    .ret.mean()
    .unstack()
)
mom10.head()
mom10 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 |
---|---|---|---|---|---|---|---|---|---|---|
date | ||||||||||
2010-12-31 | 0.016544 | 0.014900 | 0.008000 | 0.008447 | 0.005864 | 0.005382 | 0.008048 | 0.015140 | 0.008667 | 0.016340 |
2011-01-07 | -0.002317 | -0.002727 | -0.001495 | -0.005026 | -0.005653 | -0.005736 | -0.001783 | -0.005401 | -0.000301 | 0.004876 |
2011-01-14 | 0.016414 | 0.018445 | 0.015778 | 0.014730 | 0.012619 | 0.011439 | 0.012473 | 0.014187 | 0.014875 | 0.011677 |
2011-01-21 | -0.023235 | -0.016761 | -0.016063 | -0.008631 | -0.012379 | -0.011164 | -0.015339 | -0.014652 | -0.022201 | -0.038894 |
2011-01-28 | -0.006885 | -0.002044 | 0.001404 | -0.000768 | 0.001841 | 0.002883 | 0.005465 | 0.008136 | 0.010907 | 0.010893 |
# Annualized (x52) average weekly return in percent, by momentum decile.
(100 * 52 * mom10.mean()).round(2)
mom10 1 3.83 2 8.82 3 10.68 4 11.91 5 13.25 6 12.76 7 11.34 8 11.34 9 13.91 10 14.47 dtype: float64
Repeat for small caps, defined as not in the top 1,000 by marketcap.
# Small-cap subsample: exclude the 1,000 largest stocks by market cap each week.
df = pd.concat((ret, mom, price, marketcap), axis=1)
df["ret"] = df.groupby("ticker", group_keys=False).ret.shift(-1)
df = df[df.price >= 5]
# Rank 1 = largest market cap; keep only ranks beyond 1,000.
df["rnk"] = df.groupby("date", group_keys=False).marketcap.rank(ascending=False)
df = df[df.rnk > 1000].dropna()
# Weekly momentum deciles within the small-cap universe.
df["mom10"] = df.groupby("date", group_keys=False).mom.apply(
    lambda s: pd.qcut(s, 10, labels=range(1, 11))
)
mom10 = (
    df.groupby(["date", "mom10"], observed=True, group_keys=True)
    .ret.mean()
    .unstack()
)
# Annualized percent mean return per decile.
(100 * 52 * mom10.mean()).round(2)
mom10 1 2.02 2 7.71 3 10.56 4 10.37 5 12.59 6 12.99 7 11.68 8 10.90 9 13.44 10 14.58 dtype: float64
# Double sort: weekly quintiles on momentum crossed with price-to-book.
df = pd.concat((ret, mom, pb, price), axis=1)
df["ret"] = df.groupby("ticker", group_keys=False).ret.shift(-1)
df = df[df.price >= 5].dropna()
# Assign quintiles (1-5) each week for both characteristics.
for col, label in (("mom", "mom5"), ("pb", "pb5")):
    df[label] = df.groupby("date", group_keys=False)[col].apply(
        lambda s: pd.qcut(s, 5, labels=range(1, 6))
    )
# Mean next-week return for each (week, mom quintile, pb quintile) cell.
mom5_pb5 = (
    df.groupby(["date", "mom5", "pb5"], observed=True, group_keys=True)
    .ret.mean()
    .unstack(level=["pb5", "mom5"])
)
# Annualized percent returns: rows = pb quintile, columns = mom quintile.
(100 * 52 * mom5_pb5.mean()).round(2).unstack()
mom5 | 1 | 2 | 3 | 4 | 5 |
---|---|---|---|---|---|
pb5 | |||||
1 | 4.20 | 13.23 | 16.01 | 13.85 | 15.04 |
2 | 7.47 | 10.68 | 10.72 | 10.71 | 12.76 |
3 | 8.19 | 10.27 | 11.47 | 11.12 | 14.55 |
4 | 7.42 | 11.34 | 13.33 | 11.56 | 11.82 |
5 | 2.63 | 10.90 | 12.64 | 12.29 | 15.23 |
Compute return on equity (net income over lagged book equity), sort into deciles, and compute mean portfolio returns.
# Return on equity: net income over the prior year's book equity (lag avoids
# dividing by equity that already reflects the same year's income).
equity = equity.groupby("ticker", group_keys=False).shift()
roe = (netinc / equity).rename("roe")
df = pd.concat((ret, roe, price), axis=1)
df["ret"] = df.groupby("ticker", group_keys=False).ret.shift(-1)
# Carry each annual roe observation forward across the following weeks.
df["roe"] = df.groupby("ticker", group_keys=False).roe.ffill()
df = df[df.price >= 5]
# Restrict to dates from 2012 on.
df = df[df.index.get_level_values("date").astype(str) >= "2012-01-01"]
df = df.dropna()
# Weekly roe deciles (1 = lowest, 10 = highest).
df["roe10"] = df.groupby("date", group_keys=False).roe.apply(
    lambda s: pd.qcut(s, 10, labels=range(1, 11))
)
roe10 = (
    df.groupby(["date", "roe10"], observed=True, group_keys=True)
    .ret.mean()
    .unstack()
)
# Annualized percent mean return per roe decile.
(100 * 52 * roe10.mean()).round(2)
roe10 1 6.78 2 10.69 3 11.34 4 12.54 5 12.78 6 13.17 7 13.42 8 13.98 9 14.71 10 13.21 dtype: float64
# Year-over-year asset growth per ticker.
assetgr = assets.groupby("ticker", group_keys=False).pct_change().rename("assetgr")
df = pd.concat((ret, assetgr, price), axis=1)
df["ret"] = df.groupby("ticker", group_keys=False).ret.shift(-1)
# Carry each annual growth figure forward across the following weeks.
df["assetgr"] = df.groupby("ticker", group_keys=False).assetgr.ffill()
df = df[df.price >= 5]
# Restrict to dates from 2012 on.
df = df[df.index.get_level_values("date").astype(str) >= "2012-01-01"]
df = df.dropna()
# Weekly asset-growth deciles (1 = lowest growth, 10 = highest).
df["assetgr10"] = df.groupby("date", group_keys=False).assetgr.apply(
    lambda s: pd.qcut(s, 10, labels=range(1, 11))
)
assetgr10 = (
    df.groupby(["date", "assetgr10"], observed=True, group_keys=True)
    .ret.mean()
    .unstack()
)
# Annualized percent mean return per asset-growth decile.
(100 * 52 * assetgr10.mean()).round(2)
assetgr10 1 11.65 2 12.88 3 11.04 4 12.78 5 13.88 6 13.69 7 13.94 8 13.58 9 12.21 10 7.01 dtype: float64