import pandas as pd
import numpy as np
from datetime import date
from datetime import datetime
from selenium import webdriver
from bs4 import BeautifulSoup
import urllib3
from requests import get
import time
import re
import os
import math
pd.options.display.max_rows = 8
#write url based on current time
url = "https://www.cmegroup.com/CmeWS/mvc/Quotes/Future/425/G?pageSize=50&_="
urlext = np.round(time.time(), decimals=0)
urlext = str(urlext)
url = url + urlext
# create a new Firefox session
driver = webdriver.Firefox(executable_path=r'C:/Users/yourdirectory/geckodriver-v0.23.0-win64/geckodriver.exe')
driver.get(url)
soup = BeautifulSoup(url)
#scrape site source page
urltext = soup.findAll(text=True)
urltext = driver.current_url
httptext = urllib3.PoolManager()
responsetext = httptext.request('GET', urltext)
#close browser
driver.close()
#convert BeautifulSoup object to text
souptext = BeautifulSoup(responsetext.data)
souptext2 = str(souptext)
#create lists for pricing and dates
settlePri = []
settleDate = []
#state the patterns to search on within raw data
pricetxt2 = '"priorSettle":"\d+.\d\d"'
pricetxt3 = '"priorSettle":"-"'
#find the above patterns in the text and append them to their corresponding lists from above
p = re.compile("(%s|%s)" % (pricetxt2, pricetxt3)).findall(souptext2)
d = re.findall('"expirationDate":"\d{8}"', souptext2)
if p:
settlePri.append(p)
if d:
settleDate.append(d)
#combine those two lists
pricelist = {'Date':d,'Oil_Price':p}
#convert to dataframe
pricelist = pd.DataFrame(pricelist)
#remove unnecessary text, format numbers and dates, remove non-numeric price place holders from web site
pricelist['Date'] = pricelist['Date'].map(lambda x: x.replace('"expirationDate":"', ""))
pricelist['Date'] = [datetime(year=int(x[0:4]), month=int(x[4:6]), day=int(x[6:8])) for x in pricelist['Date']]
pricelist['Month'] = pricelist['Date'].dt.month.astype(int)
pricelist['Year'] = pricelist['Date'].dt.year.astype(int)
pricelist['Oil_Price'] = pricelist['Oil_Price'].map(lambda x: x.replace('"priorSettle":"', "").rstrip('"'))
pricelist['Oil_Price'] = pricelist['Oil_Price'].replace('-', '0.00')
pricelist['Oil_Price'] = pricelist['Oil_Price'].astype(float)
#final dataframe
pricelist = pricelist[['Year', 'Month', 'Oil_Price']]
pricelist
#replace 0s with "nan" as .mean() does not include "nan's" in the calculation
pricelist['Oil_Price'] = pricelist['Oil_Price'].replace(0, np.nan)
#group by the year and average each the price by year
yearlyprice = pricelist.groupby('Year', as_index=False)['Oil_Price'].mean()
yearlyprice
url = "https://www.eia.gov/dnav/pet/hist/LeafHandler.ashx?n=pet&s=rwtc&f=m"
driver = webdriver.Firefox(executable_path=r'C:/Users/yourdirectory/geckodriver-v0.23.0-win64/geckodriver.exe')
driver.get(url)
soup = BeautifulSoup(url)
urltext = soup.findAll(text=True)
urltext = driver.current_url
httptext = urllib3.PoolManager()
responsetext = httptext.request('GET', urltext)
souptext = BeautifulSoup(responsetext.data)
Price_table = souptext.find("table", {"class": "FloatTitle"})
driver.close()
#In this case, the text parsing is going to be different than above because of the data format
#'tr' is the beginning of a table, while 'td' is the beginning of a cell in HTML
# so, for the table, 'tr', find all of the cells 'td'
Histprice = pd.DataFrame()
rowsp = Price_table.find_all('tr')
for row in rowsp:
cols=row.find_all('td')
cols=[x.text.strip() for x in cols]
cols = pd.DataFrame(cols)
Histprice = pd.concat([Histprice,cols], ignore_index=True, axis=1)
Histprice = Histprice.dropna(axis = 1)
Header = Histprice.iloc[0]
Histprice = Histprice[1:]
Histprice.columns = Header
months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
monthsdict = {'Jan':1, 'Feb':2, 'Mar':3, 'Apr':4, 'May':5, 'Jun':6, 'Jul':7, 'Aug':8, 'Sep':9, 'Oct':10, 'Nov':11, 'Dec':12}
months = np.asarray(months)
Histprice = Histprice.set_index(months, drop=True)
Histprice2 = pd.DataFrame(Histprice.unstack(level=0))
Histprice2.index = Histprice2.index.set_names(['Year', 'Monthstr'])
Histprice2 = Histprice2.reset_index()
Histprice2['Month']= Histprice2['Monthstr'].map(monthsdict)
Histprice2.columns = ['Year', 'Monthstr', 'Oil_Price', 'Month']
Histprice2 = Histprice2[['Year', 'Month', 'Oil_Price']]
Histprice2[['Year', 'Month', 'Oil_Price']] = Histprice2[['Year', 'Month','Oil_Price']].apply(pd.to_numeric)
Histprice2 = Histprice2.dropna()
Histprice2
Histprice2yrly = Histprice2.groupby('Year', as_index=False)['Oil_Price'].mean()
Histprice2yrly