In [1]:
import pandas as pd
import numpy as np
from datetime import date
from datetime import datetime
from selenium import webdriver
from bs4 import BeautifulSoup
import urllib3
from requests import get
import time
import re
import os
import math

pd.options.display.max_rows = 8

Auto pull oil futures price data from CME

Open web browser, navigate to site, pull raw data

In [ ]:
#build the quote URL; the trailing "_" parameter is a cache-buster (epoch seconds as an int)
url = "https://www.cmegroup.com/CmeWS/mvc/Quotes/Future/425/G?pageSize=50&_=" + str(int(time.time()))

# create a new Firefox session and load the quotes page
# NOTE(review): hard-coded absolute local path — consider putting geckodriver on PATH instead
driver = webdriver.Firefox(executable_path=r'C:/Users/yourdirectory/geckodriver-v0.23.0-win64/geckodriver.exe')
driver.get(url)

#re-request the final URL (after any redirects) with urllib3 so we get the raw response body
urltext = driver.current_url
httptext = urllib3.PoolManager()
responsetext = httptext.request('GET', urltext)

#quit() (rather than close()) shuts down the geckodriver process as well as the window
driver.quit()

#parse the body (explicit parser keeps results deterministic) and keep a plain-text copy
#for the regex parsing in the next section
souptext = BeautifulSoup(responsetext.data, 'html.parser')
souptext2 = str(souptext)

Parse and clean pulled raw text

In [3]:
#regex patterns for the prior-settle price field in the raw JSON text
#(raw strings so \d is a regex escape, and \. matches a literal decimal point;
# the site shows "-" as a placeholder when no settle price exists yet)
pricetxt2 = r'"priorSettle":"\d+\.\d\d"'
pricetxt3 = r'"priorSettle":"-"'

#pull every price and expiration date out of the page text, in document order
p = re.compile("(%s|%s)" % (pricetxt2, pricetxt3)).findall(souptext2)
d = re.findall(r'"expirationDate":"\d{8}"', souptext2)

#pair dates with prices and convert to a dataframe
pricelist = pd.DataFrame({'Date': d, 'Oil_Price': p})

#strip the JSON field names, split the YYYYMMDD dates into year/month, and turn
#the "-" placeholders into 0.00 so the price column can be made numeric
pricelist['Date'] = pricelist['Date'].map(lambda x: x.replace('"expirationDate":"', ""))
pricelist['Date'] = [datetime(year=int(x[0:4]), month=int(x[4:6]), day=int(x[6:8])) for x in pricelist['Date']]
pricelist['Month'] = pricelist['Date'].dt.month.astype(int)
pricelist['Year'] = pricelist['Date'].dt.year.astype(int)
pricelist['Oil_Price'] = pricelist['Oil_Price'].map(lambda x: x.replace('"priorSettle":"', "").rstrip('"'))
pricelist['Oil_Price'] = pricelist['Oil_Price'].replace('-', '0.00')
pricelist['Oil_Price'] = pricelist['Oil_Price'].astype(float)

#final dataframe
pricelist = pricelist[['Year', 'Month', 'Oil_Price']]
In [4]:
pricelist
Out[4]:
Year Month Oil_Price
0 2019 8 60.43
1 2019 9 60.52
2 2019 10 60.46
3 2019 11 60.34
... ... ... ...
123 2029 11 55.62
124 2029 12 55.62
125 2030 1 55.62
126 2030 2 55.62

127 rows × 3 columns

Optional conversion to a yearly deck

In [5]:
#the 0.00 entries are placeholders from the site; mask them as NaN so that
#.mean() (which skips NaN) excludes them from the yearly average
pricelist['Oil_Price'] = pricelist['Oil_Price'].mask(pricelist['Oil_Price'] == 0, np.nan)

#collapse the monthly deck to one average price per year
yearlyprice = pricelist.groupby('Year', as_index=False).agg({'Oil_Price': 'mean'})
In [6]:
yearlyprice
Out[6]:
Year Oil_Price
0 2019 60.378000
1 2020 58.386667
2 2021 56.021667
3 2022 55.008333
... ... ...
8 2027 55.553333
9 2028 55.566667
10 2029 55.620000
11 2030 55.620000

12 rows × 2 columns

Auto pull historic prices from the EIA website

Open web browser, navigate to site, pull raw data

In [ ]:
#EIA monthly WTI spot price history page
url = "https://www.eia.gov/dnav/pet/hist/LeafHandler.ashx?n=pet&s=rwtc&f=m"

# open a new Firefox session and load the page
# NOTE(review): hard-coded absolute local path — consider putting geckodriver on PATH instead
driver = webdriver.Firefox(executable_path=r'C:/Users/yourdirectory/geckodriver-v0.23.0-win64/geckodriver.exe')
driver.get(url)

#re-request the final URL (after any redirects) with urllib3 to get the raw HTML
urltext = driver.current_url
httptext = urllib3.PoolManager()
responsetext = httptext.request('GET', urltext)

#parse the HTML (explicit parser keeps results deterministic) and grab the price table
souptext = BeautifulSoup(responsetext.data, 'html.parser')
Price_table = souptext.find("table", {"class": "FloatTitle"})

#quit() (rather than close()) shuts down the geckodriver process as well as the window
driver.quit()

Parse and clean pulled raw text

In this case, the text parsing is different from the section above because of the data format. In HTML, 'tr' marks a table row and 'td' marks a cell within that row, so for each row 'tr' of the table we find all of its cells 'td'.

In [8]:
#In this case, the text parsing is different from the section above because of the
#data format: in HTML, 'tr' marks a table row and 'td' marks a cell within it.
#Collect one DataFrame per table row and concatenate once at the end
#(concatenating inside the loop, as before, is quadratic in the number of rows).
row_frames = []
for row in Price_table.find_all('tr'):
    cols = [cell.text.strip() for cell in row.find_all('td')]
    row_frames.append(pd.DataFrame(cols))
Histprice = pd.concat(row_frames, ignore_index=True, axis=1) if row_frames else pd.DataFrame()

#drop incomplete columns, then promote the first data row (the year labels) to the header
Histprice = Histprice.dropna(axis=1)
Header = Histprice.iloc[0]
Histprice = Histprice[1:]
Histprice.columns = Header

#the 12 remaining rows are Jan..Dec; index them by month name and keep a
#name -> number mapping for the numeric Month column below
months = np.asarray(['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'])
monthsdict = {'Jan':1, 'Feb':2, 'Mar':3, 'Apr':4, 'May':5, 'Jun':6, 'Jul':7, 'Aug':8, 'Sep':9, 'Oct':10, 'Nov':11, 'Dec':12}

Histprice = Histprice.set_index(months, drop=True)

#reshape the (month x year) grid into long form: one row per (year, month) pair
Histprice2 = pd.DataFrame(Histprice.unstack(level=0))
Histprice2.index = Histprice2.index.set_names(['Year', 'Monthstr'])
Histprice2 = Histprice2.reset_index()
Histprice2['Month'] = Histprice2['Monthstr'].map(monthsdict)
Histprice2.columns = ['Year', 'Monthstr', 'Oil_Price', 'Month']
Histprice2 = Histprice2[['Year', 'Month', 'Oil_Price']]
Histprice2[['Year', 'Month', 'Oil_Price']] = Histprice2[['Year', 'Month', 'Oil_Price']].apply(pd.to_numeric)
Histprice2 = Histprice2.dropna()
In [9]:
Histprice2
Out[9]:
Year Month Oil_Price
0 1986 1 22.93
1 1986 2 15.46
2 1986 3 12.61
3 1986 4 12.84
... ... ... ...
398 2019 3 58.15
399 2019 4 63.86
400 2019 5 60.83
401 2019 6 54.66

402 rows × 3 columns

Optional conversion to a yearly deck

In [10]:
#average the monthly history into a single price per calendar year
Histprice2yrly = (
    Histprice2
    .groupby('Year', as_index=False)
    .agg({'Oil_Price': 'mean'})
)
Histprice2yrly
Out[10]:
Year Oil_Price
0 1986 15.036667
1 1987 19.171667
2 1988 15.982500
3 1989 19.640833
... ... ...
30 2016 43.144167
31 2017 50.884167
32 2018 64.938333
33 2019 57.305000

34 rows × 2 columns