In [1]:
import xml.etree.ElementTree as ET
import pandas as pd
import numpy as np
In [2]:
#################### Open the file and get an idea of what is inside.

#################### These files have a sort of dictionary at the top and 
#################### you can see the pattern of the file all the way through

with open('D:/NewMexicoData/OCDExport/T_WC.xml') as xmlf:
    print(xmlf.read(1000))  #use a much larger number here - I am just keeping it small to fit on a page
<?xml version="1.0" standalone="yes"?>
<T_WC>
  <xs:schema id="T_WC" xmlns="" xmlns:xs="http://www.w3.org/2001/XMLSchema" xmlns:msdata="urn:schemas-microsoft-com:xml-msdata">
    <xs:element name="T_WC" msdata:IsDataSet="true" msdata:UseCurrentLocale="true">
      <xs:complexType>
        <xs:choice minOccurs="0" maxOccurs="unbounded">
          <xs:element name="Table">
            <xs:complexType>
              <xs:sequence>
                <xs:element name="api_st_cde" type="xs:short" minOccurs="0" />
                <xs:element name="api_cnty_cde" type="xs:short" minOccurs="0" />
                <xs:element name="api_well_idn" type="xs:int" minOccurs="0" />
                <xs:element name="pool_idn" type="xs:int" minOccurs="0" />
                <xs:element name="eff_dte" type="xs:dateTime" minOccurs="0" />
                <xs:element name="ogrid_cde" type="xs:int" minOccurs="0" />
                <xs:element name="well_typ_cde" type="xs:string" minOccurs="0" />
                <x
In [3]:
#################### From the sizable list, we care about:
#################### 'api_st_cde', 'api_cnty_cde', 'api_well_idn', 'dpth_perf_top_num', 'dpth_perf_btm_num'
#################### api state code, api county code, api well id, initial perf, final perf
In [4]:
tree = ET.parse('D:/NewMexicoData/OCDExport/T_WC.xml')
root = tree.getroot()
In [ ]:
#################### Get all of your raw data from this loop

apistdf = pd.DataFrame() 
apicntydf = pd.DataFrame() 
apiwelldf = pd.DataFrame() 
topdepth = pd.DataFrame() 
btmdepth= pd.DataFrame() 
APIstate, APIcounty, APIwellid, BTMdepth, TOPdepth = ([], [], [], [], [])
vararrays = [APIstate, APIcounty, APIwellid, BTMdepth, TOPdepth]
varnames = ['api_st_cde', 'api_cnty_cde', 'api_well_idn', 'dpth_perf_top_num', 'dpth_perf_btm_num']
dfnames = [apistdf, apicntydf, apiwelldf, topdepth, btmdepth]

for j in range(0, len(varnames)):    
    for i in root.iter(varnames[j]):
        a = (i.text)
        vararrays[j] = np.append(vararrays[j], a)
    dfnames[j] = pd.DataFrame(vararrays[j], columns = [varnames[j]])
    
Perfresult = pd.concat([dfnames[0],dfnames[1],dfnames[2],dfnames[3],dfnames[4]], axis=1)
In [5]:
#Perfresult.to_pickle('D:/NewMexicoData/Perfresult.pkl')  #<- Optional - use this to save your dataframe result somewhere
Perfresult = pd.read_pickle("D:/NewMexicoData/Perfresult.pkl", compression='infer') #<- to read that saved dataframe
In [6]:
pd.options.display.max_rows = 12 #<- Controls how many lines are shown when you run this cell
Perfresult
Out[6]:
api_st_cde api_cnty_cde api_well_idn dpth_perf_top_num dpth_perf_btm_num
0 30 1 5002 1 1
1 30 1 20007 1 1
2 30 1 20010 1 1
3 30 1 20012 1 0
4 30 1 20014 1 1
5 30 3 20019 999 999
... ... ... ... ... ...
122464 30 39 30953 4868 5563
122465 30 39 30954 7157 7324
122466 30 39 30954 4993 5464
122467 30 39 30956 8026 8108
122468 30 39 30956 4432 6145
122469 30 39 30957 8332 8542

122470 rows × 5 columns

In [7]:
#Calculates effective lateral length
Perfresult = Perfresult.astype(int)
Perfresult['perfdist'] = Perfresult['dpth_perf_btm_num'] - Perfresult['dpth_perf_top_num']
Perfresult = Perfresult.sort_values(by = 'perfdist')

#Drops wells above 17,000' in length (your call)
Perfresult2 = Perfresult[Perfresult["perfdist"]>=-17000]
Perfresult2 = Perfresult2[Perfresult2["perfdist"]<=17000]

#Properly formats values to combine into full API number
Perfresult2['api_cnty_cde'] = Perfresult2['api_cnty_cde'].apply(lambda x: '{0:0>3}'.format(x))
Perfresult2['api_well_idn'] = Perfresult2['api_well_idn'].apply(lambda x: '{0:0>5}'.format(x))

#Sets data type for each column
Perfresult2[['api_cnty_cde', 'api_well_idn', 'api_st_cde']] = Perfresult2[['api_cnty_cde', 'api_well_idn', 
                                                                           'api_st_cde']].astype(str)
Perfresult2[['dpth_perf_top_num', 'dpth_perf_btm_num', 'perfdist']] = Perfresult2[['dpth_perf_top_num', 
                                                                                   'dpth_perf_btm_num', 
                                                                                   'perfdist']].astype(float)

Perfresult2['API'] = Perfresult2['api_st_cde'] + '-' + Perfresult2['api_cnty_cde'] + '-' + Perfresult2['api_well_idn']
Perfresult3 = Perfresult2[['API', 'dpth_perf_top_num', 'dpth_perf_btm_num', 'perfdist']]
In [8]:
#Accounts for reversal of top and bottom perf data entry from state data
Perfresult3 = Perfresult3.reset_index(drop = True)
Perfresult3['FinalPerfDist'] = [((Perfresult3['perfdist'].iloc[x])*-1) if ((Perfresult3['perfdist'].iloc[x])<0) else 
                            (Perfresult3['perfdist'].iloc[x]) for x in range(0, len(Perfresult3))]
Perfresult3
Out[8]:
API dpth_perf_top_num dpth_perf_btm_num perfdist FinalPerfDist
0 30-015-27351 18902.0 2266.0 -16636.0 16636.0
1 30-025-36016 12199.0 211.0 -11988.0 11988.0
2 30-015-35557 12406.0 710.0 -11696.0 11696.0
3 30-015-35346 12184.0 1494.0 -10690.0 10690.0
4 30-015-29322 11980.0 1990.0 -9990.0 9990.0
5 30-015-23079 20908.0 10954.0 -9954.0 9954.0
... ... ... ... ... ...
122327 30-015-27085 489.0 9999.0 9510.0 9510.0
122328 30-015-39972 9318.0 18912.0 9594.0 9594.0
122329 30-015-40155 3843.0 13597.0 9754.0 9754.0
122330 30-015-34756 1200.0 12421.0 11221.0 11221.0
122331 30-015-24041 503.0 11795.0 11292.0 11292.0
122332 30-015-29661 1349.0 18081.0 16732.0 16732.0

122333 rows × 5 columns

In [9]:
#Get an idea of how many wells have a certain measurment for the depth of the top perf
Perfresult3.sort_values(by="dpth_perf_top_num", ascending = False)
Perfcheck = pd.value_counts(Perfresult3["dpth_perf_top_num"])
Perfcheck
Out[9]:
99999.0    33178
0.0        12327
9999.0       967
9.0          748
99.0         716
999.0        621
           ...  
13314.0        1
10.0           1
12964.0        1
11776.0        1
11306.0        1
8738.0         1
Name: dpth_perf_top_num, Length: 11703, dtype: int64
In [ ]: