In [1]:
import pandas as pd
import numpy as np
from datetime import date
from datetime import datetime
from selenium import webdriver
from bs4 import BeautifulSoup
import urllib3
from requests import get
import time
import re
import os
import math

pd.options.display.max_rows = 8

Auto pull oil futures price data from CME

Open web browser, navigate to site, pull raw data

In [ ]:
#build the quote URL; the trailing "_" parameter is a cache-buster (epoch seconds as an int)
url = "https://www.cmegroup.com/CmeWS/mvc/Quotes/Future/425/G?pageSize=50&_=" + str(int(time.time()))

# create a new Firefox session and load the quotes page
# NOTE(review): hard-coded absolute local path — consider putting geckodriver on PATH instead
driver = webdriver.Firefox(executable_path=r'C:/Users/yourdirectory/geckodriver-v0.23.0-win64/geckodriver.exe')
driver.get(url)

#re-request the final URL (after any redirects) with urllib3 so we get the raw response body
urltext = driver.current_url
httptext = urllib3.PoolManager()
responsetext = httptext.request('GET', urltext)

#quit() (rather than close()) shuts down the geckodriver process as well as the window
driver.quit()

#parse the body (explicit parser keeps results deterministic) and keep a plain-text copy
#for the regex parsing in the next section
souptext = BeautifulSoup(responsetext.data, 'html.parser')
souptext2 = str(souptext)

Parse and clean pulled raw text

In [3]:
#regex patterns for the prior-settle price field in the raw JSON text
#(raw strings so \d is a regex escape, and \. matches a literal decimal point;
# the site shows "-" as a placeholder when no settle price exists yet)
pricetxt2 = r'"priorSettle":"\d+\.\d\d"'
pricetxt3 = r'"priorSettle":"-"'

#pull every price and expiration date out of the page text, in document order
p = re.compile("(%s|%s)" % (pricetxt2, pricetxt3)).findall(souptext2)
d = re.findall(r'"expirationDate":"\d{8}"', souptext2)

#pair dates with prices and convert to a dataframe
pricelist = pd.DataFrame({'Date': d, 'Oil_Price': p})

#strip the JSON field names, split the YYYYMMDD dates into year/month, and turn
#the "-" placeholders into 0.00 so the price column can be made numeric
pricelist['Date'] = pricelist['Date'].map(lambda x: x.replace('"expirationDate":"', ""))
pricelist['Date'] = [datetime(year=int(x[0:4]), month=int(x[4:6]), day=int(x[6:8])) for x in pricelist['Date']]
pricelist['Month'] = pricelist['Date'].dt.month.astype(int)
pricelist['Year'] = pricelist['Date'].dt.year.astype(int)
pricelist['Oil_Price'] = pricelist['Oil_Price'].map(lambda x: x.replace('"priorSettle":"', "").rstrip('"'))
pricelist['Oil_Price'] = pricelist['Oil_Price'].replace('-', '0.00')
pricelist['Oil_Price'] = pricelist['Oil_Price'].astype(float)

#final dataframe
pricelist = pricelist[['Year', 'Month', 'Oil_Price']]
In [4]:
pricelist
Out[4]:
Year Month Oil_Price
0 2019 8 60.43
1 2019 9 60.52
2 2019 10 60.46
3 2019 11 60.34
... ... ... ...
123 2029 11 55.62
124 2029 12 55.62
125 2030 1 55.62
126 2030 2 55.62

127 rows × 3 columns

Optional conversion to a yearly deck

In [5]:
#the 0.00 entries are placeholders from the site; mask them as NaN so that
#.mean() (which skips NaN) excludes them from the yearly average
pricelist['Oil_Price'] = pricelist['Oil_Price'].mask(pricelist['Oil_Price'] == 0, np.nan)

#collapse the monthly deck to one average price per year
yearlyprice = pricelist.groupby('Year', as_index=False).agg({'Oil_Price': 'mean'})
In [6]:
yearlyprice
Out[6]:
Year Oil_Price
0 2019 60.378000
1 2020 58.386667
2 2021 56.021667
3 2022 55.008333
... ... ...
8 2027 55.553333
9 2028 55.566667
10 2029 55.620000
11 2030 55.620000

12 rows × 2 columns

Auto pull historic prices from the EIA website

Open web browser, navigate to site, pull raw data

In [ ]:
#EIA monthly WTI spot price history page
url = "https://www.eia.gov/dnav/pet/hist/LeafHandler.ashx?n=pet&s=rwtc&f=m"

# open a new Firefox session and load the page
# NOTE(review): hard-coded absolute local path — consider putting geckodriver on PATH instead
driver = webdriver.Firefox(executable_path=r'C:/Users/yourdirectory/geckodriver-v0.23.0-win64/geckodriver.exe')
driver.get(url)

#re-request the final URL (after any redirects) with urllib3 to get the raw HTML
urltext = driver.current_url
httptext = urllib3.PoolManager()
responsetext = httptext.request('GET', urltext)

#parse the HTML (explicit parser keeps results deterministic) and grab the price table
souptext = BeautifulSoup(responsetext.data, 'html.parser')
Price_table = souptext.find("table", {"class": "FloatTitle"})

#quit() (rather than close()) shuts down the geckodriver process as well as the window
driver.quit()

Parse and clean pulled raw text

In this case, the text parsing is different from the section above because of the data format. In HTML, 'tr' marks a table row and 'td' marks a cell within that row, so for each row 'tr' of the table we find all of its cells 'td'.

In [8]:
#In this case, the text parsing is different from the section above because of the
#data format: in HTML, 'tr' marks a table row and 'td' marks a cell within it.
#Collect one DataFrame per table row and concatenate once at the end
#(concatenating inside the loop, as before, is quadratic in the number of rows).
row_frames = []
for row in Price_table.find_all('tr'):
    cols = [cell.text.strip() for cell in row.find_all('td')]
    row_frames.append(pd.DataFrame(cols))
Histprice = pd.concat(row_frames, ignore_index=True, axis=1) if row_frames else pd.DataFrame()

#drop incomplete columns, then promote the first data row (the year labels) to the header
Histprice = Histprice.dropna(axis=1)
Header = Histprice.iloc[0]
Histprice = Histprice[1:]
Histprice.columns = Header

#the 12 remaining rows are Jan..Dec; index them by month name and keep a
#name -> number mapping for the numeric Month column below
months = np.asarray(['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'])
monthsdict = {'Jan':1, 'Feb':2, 'Mar':3, 'Apr':4, 'May':5, 'Jun':6, 'Jul':7, 'Aug':8, 'Sep':9, 'Oct':10, 'Nov':11, 'Dec':12}

Histprice = Histprice.set_index(months, drop=True)

#reshape the (month x year) grid into long form: one row per (year, month) pair
Histprice2 = pd.DataFrame(Histprice.unstack(level=0))
Histprice2.index = Histprice2.index.set_names(['Year', 'Monthstr'])
Histprice2 = Histprice2.reset_index()
Histprice2['Month'] = Histprice2['Monthstr'].map(monthsdict)
Histprice2.columns = ['Year', 'Monthstr', 'Oil_Price', 'Month']
Histprice2 = Histprice2[['Year', 'Month', 'Oil_Price']]
Histprice2[['Year', 'Month', 'Oil_Price']] = Histprice2[['Year', 'Month', 'Oil_Price']].apply(pd.to_numeric)
Histprice2 = Histprice2.dropna()
In [9]:
Histprice2
Out[9]:
Year Month Oil_Price
0 1986 1 22.93
1 1986 2 15.46
2 1986 3 12.61
3 1986 4 12.84
... ... ... ...
398 2019 3 58.15
399 2019 4 63.86
400 2019 5 60.83
401 2019 6 54.66

402 rows × 3 columns

Optional conversion to a yearly deck

In [10]:
#average the monthly history into a single price per calendar year
Histprice2yrly = (
    Histprice2
    .groupby('Year', as_index=False)
    .agg({'Oil_Price': 'mean'})
)
Histprice2yrly
Out[10]:
Year Oil_Price
0 1986 15.036667
1 1987 19.171667
2 1988 15.982500
3 1989 19.640833
... ... ...
30 2016 43.144167
31 2017 50.884167
32 2018 64.938333
33 2019 57.305000

34 rows × 2 columns