import xml.etree.ElementTree as ET
import pandas as pd
import numpy as np
#################### Open the file and get an idea of what is inside.
#################### These files have a sort of dictionary at the top and
#################### you can see the pattern of the file all the way through
with open('D:/NewMexicoData/OCDExport/T_WC.xml') as xmlf:
print(xmlf.read(1000)) #use a much larger number here - I am just keeping it small to fit on a page
#################### From the sizable list, we care about:
#################### 'api_st_cde', 'api_cnty_cde', 'api_well_idn', 'dpth_perf_top_num', 'dpth_perf_btm_num'
#################### api state code, api county code, api well id, initial perf, final perf
tree = ET.parse('D:/NewMexicoData/OCDExport/T_WC.xml')
root = tree.getroot()
#################### Get all of your raw data from this loop
apistdf = pd.DataFrame()
apicntydf = pd.DataFrame()
apiwelldf = pd.DataFrame()
topdepth = pd.DataFrame()
btmdepth= pd.DataFrame()
APIstate, APIcounty, APIwellid, BTMdepth, TOPdepth = ([], [], [], [], [])
vararrays = [APIstate, APIcounty, APIwellid, BTMdepth, TOPdepth]
varnames = ['api_st_cde', 'api_cnty_cde', 'api_well_idn', 'dpth_perf_top_num', 'dpth_perf_btm_num']
dfnames = [apistdf, apicntydf, apiwelldf, topdepth, btmdepth]
for j in range(0, len(varnames)):
for i in root.iter(varnames[j]):
a = (i.text)
vararrays[j] = np.append(vararrays[j], a)
dfnames[j] = pd.DataFrame(vararrays[j], columns = [varnames[j]])
Perfresult = pd.concat([dfnames[0],dfnames[1],dfnames[2],dfnames[3],dfnames[4]], axis=1)
#Perfresult.to_pickle('D:/NewMexicoData/Perfresult.pkl') #<- Optional - use this to save your dataframe result somewhere
Perfresult = pd.read_pickle("D:/NewMexicoData/Perfresult.pkl", compression='infer') #<- to read that saved dataframe
pd.options.display.max_rows = 12 #<- Controls how many lines are shown when you run this cell
Perfresult
#Calculates effective lateral length
Perfresult = Perfresult.astype(int)
Perfresult['perfdist'] = Perfresult['dpth_perf_btm_num'] - Perfresult['dpth_perf_top_num']
Perfresult = Perfresult.sort_values(by = 'perfdist')
#Drops wells above 17,000' in length (your call)
Perfresult2 = Perfresult[Perfresult["perfdist"]>=-17000]
Perfresult2 = Perfresult2[Perfresult2["perfdist"]<=17000]
#Properly formats values to combine into full API number
Perfresult2['api_cnty_cde'] = Perfresult2['api_cnty_cde'].apply(lambda x: '{0:0>3}'.format(x))
Perfresult2['api_well_idn'] = Perfresult2['api_well_idn'].apply(lambda x: '{0:0>5}'.format(x))
#Sets data type for each column
Perfresult2[['api_cnty_cde', 'api_well_idn', 'api_st_cde']] = Perfresult2[['api_cnty_cde', 'api_well_idn',
'api_st_cde']].astype(str)
Perfresult2[['dpth_perf_top_num', 'dpth_perf_btm_num', 'perfdist']] = Perfresult2[['dpth_perf_top_num',
'dpth_perf_btm_num',
'perfdist']].astype(float)
Perfresult2['API'] = Perfresult2['api_st_cde'] + '-' + Perfresult2['api_cnty_cde'] + '-' + Perfresult2['api_well_idn']
Perfresult3 = Perfresult2[['API', 'dpth_perf_top_num', 'dpth_perf_btm_num', 'perfdist']]
#Accounts for reversal of top and bottom perf data entry from state data
Perfresult3 = Perfresult3.reset_index(drop = True)
Perfresult3['FinalPerfDist'] = [((Perfresult3['perfdist'].iloc[x])*-1) if ((Perfresult3['perfdist'].iloc[x])<0) else
(Perfresult3['perfdist'].iloc[x]) for x in range(0, len(Perfresult3))]
Perfresult3
#Get an idea of how many wells have a certain measurment for the depth of the top perf
Perfresult3.sort_values(by="dpth_perf_top_num", ascending = False)
Perfcheck = pd.value_counts(Perfresult3["dpth_perf_top_num"])
Perfcheck