VD Parsing and Arrangement

Ed Chao
3 min read · Jul 27, 2020

Original code:

%%time
## complete works
import pandas as pd
import numpy as np
import glob
import xml.etree.ElementTree as et
from xml.etree.ElementTree import parse
import re

def level1(root):
    """Build the per-row VD descriptor strings for one parsed XML document.

    For every ``Info`` element found under *root*, the string
    ``"<vdid>/<status>/<datacollecttime>"`` is produced and repeated
    ``len(info) * 3`` times, where ``len(info)`` is the number of direct
    children of that element.

    Args:
        root: An ``xml.etree.ElementTree`` element to search.

    Returns:
        A flat list of descriptor strings, in document order.

    Raises:
        KeyError: If an ``Info`` element lacks ``vdid``, ``status`` or
            ``datacollecttime``.
    """
    # NOTE(review): the factor 3 presumably matches the number of rows the
    # other parsers emit per lane - confirm against the XML schema.
    rows_per_child = 3

    flat_list = []
    for info in root.iter('Info'):
        attrs = info.attrib
        descriptor = '/'.join(
            (attrs['vdid'], attrs['status'], attrs['datacollecttime'])
        )
        flat_list.extend([descriptor] * (len(info) * rows_per_child))
    return flat_list

def level2(root):
    """Collect lane-level measurements as a three-rows-per-lane DataFrame.

    For every ``lane`` element under *root*, the ``vsrid``, ``speed`` and
    ``laneoccupy`` attributes are written to the first of three consecutive
    rows; the two following rows hold ``0`` in all three columns.

    Args:
        root: An ``xml.etree.ElementTree`` element to search.

    Returns:
        A DataFrame with ``int32`` columns ``vsrid``, ``speed`` and
        ``laneoccupy``.

    Raises:
        KeyError: If a ``lane`` element lacks one of the three attributes.
        ValueError: If an attribute value cannot be converted to an integer.
    """
    columns = ('vsrid', 'speed', 'laneoccupy')
    data = {name: [] for name in columns}
    for lane in root.iter('lane'):
        for name in columns:
            # Real value on the first row, zero padding on the next two.
            data[name].extend((lane.attrib[name], '0', '0'))
    return pd.DataFrame(data).astype('int32')
def level3(root):
    """Collect the per-vehicle-class counts as a DataFrame.

    Every ``cars`` element under *root* contributes one row built from its
    ``carid`` and ``volume`` attributes.

    Args:
        root: An ``xml.etree.ElementTree`` element to search.

    Returns:
        A DataFrame with a categorical ``carstype`` column (taken from
        ``carid``) and an ``int32`` ``volume`` column.

    Raises:
        KeyError: If a ``cars`` element lacks ``carid`` or ``volume``.
        ValueError: If a ``volume`` value cannot be converted to an integer.
    """
    cars = list(root.iter('cars'))
    level3_df = pd.DataFrame({
        'carstype': [car.attrib['carid'] for car in cars],
        'volume': [car.attrib['volume'] for car in cars],
    })
    return level3_df.astype({'carstype': 'category', 'volume': 'int32'})
##combination
def combination(flat_list, level2_df, level3_df):
    """Merge the three parsed levels of one file into a single DataFrame.

    *level2_df* and *level3_df* are concatenated column-wise, and three
    columns derived from the descriptor strings in *flat_list* are placed
    in front of them:

    * ``datetime`` - the first ``YYYY/MM/DD HH:MM:SS`` substring.
    * ``waybound`` - the first substring matching ``N<digit>-<A-Z>``,
      stored as a category.
    * ``milage``   - characters 9..16 of the descriptor with everything
      except digits, ``*`` and ``.`` removed, converted to ``float32`` and
      made non-negative.

    A descriptor without a match yields a missing ``datetime``/``waybound``
    value, and a descriptor whose mileage slice is not a number yields
    ``0`` - the fallback the ``fillna(0)`` step is there to provide.

    Args:
        flat_list: Descriptor strings, one per output row.
        level2_df: Lane-level frame; must contain a ``vsrid`` column.
        level3_df: Vehicle-class frame with the same number of rows.

    Returns:
        A new DataFrame with columns ``datetime``, ``waybound``, ``milage``
        followed by the columns of *level2_df* and *level3_df*.

    Raises:
        ValueError: If ``len(flat_list)`` differs from the row count.
    """
    level_df = pd.concat([level2_df, level3_df], axis=1)
    level_df['vsrid'] = level_df['vsrid'].astype('int32')

    # Work on a Series so the extraction is vectorised and tolerant of
    # rows with no match (NaN) instead of breaking frame construction.
    vdinfo = pd.Series(flat_list, index=level_df.index, dtype='object')

    # Raw strings: "\d" in a normal literal is an invalid escape sequence.
    timestamp = vdinfo.str.extract(
        r"(\d{4}/\d{2}/\d{2} \d{2}:\d{2}:\d{2})", expand=False
    )
    waybound = vdinfo.str.extract(r"(N\d-[A-Z])", expand=False)
    waybound = waybound.astype('category')

    # The fixed slice assumes the mileage sits at a constant offset in the
    # vdid - NOTE(review): confirm against the real identifiers.
    mileage_text = vdinfo.str.slice(9, 17).str.replace(
        r"[^\d*.]", "", regex=True
    )
    milage = (
        pd.to_numeric(mileage_text, errors='coerce')
        .abs()
        .fillna(0)
        .astype('float32')
    )

    level_df.insert(loc=0, column='datetime', value=timestamp)
    level_df.insert(loc=1, column='waybound', value=waybound)
    level_df.insert(loc=2, column='milage', value=milage)
    return level_df
## main section
# Parse every XML file matched by the glob pattern, combine the three
# levels of each file into one frame, stack all frames, rename the columns
# to their Chinese report headers and write the result to ``vd.csv``.
files = glob.glob('D:/00_我在管院研究所的日子/97_ETC資料區/2019VD/20191231VD/*.xml')
print('待處理檔案數:%d' %len(files))
if not files:
    # pd.concat([]) would otherwise fail later with the unhelpful
    # "No objects to concatenate".
    raise FileNotFoundError('no XML files matched the VD glob pattern')

df_all = []
for file in files:
    root = et.parse(file).getroot()
    df_all.append(combination(level1(root), level2(root), level3(root)))

# NOTE(review): the source's indentation was lost; the completion message
# is assumed to be printed once after the loop rather than once per file.
count = len(df_all)
print('已完成檔案數:%d' %count)

df = pd.concat(df_all, ignore_index = True)
df = df.rename(columns = {
    'datetime': '資料時間',
    'waybound': '國道編號及方向',
    'milage': '安裝位置(里程數)',
    'vsrid': '車道編號',
    'speed': '1分鐘平均速度',
    'laneoccupy': '車道佔有率',
    'carstype': '車種代碼',
    'volume': '流量',
})
# utf_8_sig writes a BOM at the start of the file.
df.to_csv('vd.csv', index = False, encoding = 'utf_8_sig')
df

Better one:

%%time
## complete works
import pandas as pd
import numpy as np
import glob
import xml.etree.ElementTree as et
from xml.etree.ElementTree import parse
import re

def level1(root):
    """Build the per-row VD header columns for one parsed XML document.

    Every ``Info`` element under *root* contributes ``len(info) * 3``
    identical rows, where ``len(info)`` is the number of its direct
    children. The ``vdid`` attribute is normalised (``--`` collapsed to
    ``-``, the ``nfbVD-`` prefix removed) and split on ``-`` into at most
    four parts.

    Args:
        root: An ``xml.etree.ElementTree`` element to search.

    Returns:
        A DataFrame of strings with columns ``datatime``, ``status``,
        ``roadcode``, ``direction``, ``milage`` and ``vdtype``. Parts that
        a short ``vdid`` does not provide are left missing.

    Raises:
        KeyError: If an ``Info`` element lacks ``vdid``, ``status`` or
            ``datacollecttime``.
    """
    # NOTE(review): the factor 3 presumably matches the number of rows the
    # other parsers emit per lane - confirm against the XML schema.
    rows_per_child = 3

    # Keep the fields as tuples instead of joining and re-splitting them on
    # a separator character that could occur inside an attribute value.
    rows = []
    for info in root.iter('Info'):
        attrs = info.attrib
        record = (attrs['datacollecttime'], attrs['vdid'], attrs['status'])
        rows.extend([record] * (len(info) * rows_per_child))
    level1_df = pd.DataFrame(rows, columns=['datatime', 'vdinfo', 'status'])

    # Literal replacements; regex=False pins the meaning across pandas
    # versions, whose default for this flag has changed.
    vdinfo = (
        level1_df.pop('vdinfo')
        .str.replace('--', '-', regex=False)
        .str.replace('nfbVD-', '', regex=False)
    )

    # reindex guarantees four columns even when no vdid has four parts.
    parts = vdinfo.str.split(pat='-', n=3, expand=True)
    parts = parts.reindex(columns=range(4))
    parts.columns = ['roadcode', 'direction', 'milage', 'vdtype']
    return pd.concat([level1_df, parts], axis=1)

def level2(root):
    """Collect lane-level attributes, repeating each lane on three rows.

    For every ``lane`` element under *root*, the ``vsrdir``, ``vsrid``,
    ``speed`` and ``laneoccupy`` attributes are copied onto three
    consecutive rows. Values are kept as the strings read from the XML.

    Args:
        root: An ``xml.etree.ElementTree`` element to search.

    Returns:
        A DataFrame with columns ``vsrdir``, ``vsrid``, ``speed`` and
        ``laneoccupy``.

    Raises:
        KeyError: If a ``lane`` element lacks one of the four attributes.
    """
    columns = ['vsrdir', 'vsrid', 'speed', 'laneoccupy']
    # NOTE(review): 3 presumably equals the number of ``cars`` entries per
    # lane so the rows line up with level3 - confirm against the schema.
    rows_per_lane = 3

    rows = []
    for lane in root.iter('lane'):
        record = tuple(lane.attrib[name] for name in columns)
        rows.extend([record] * rows_per_lane)
    return pd.DataFrame(rows, columns=columns)
def level3(root):
    """Collect the per-vehicle-class counts as a DataFrame.

    Every ``cars`` element under *root* contributes one row built from its
    ``carid`` and ``volume`` attributes.

    Args:
        root: An ``xml.etree.ElementTree`` element to search.

    Returns:
        A DataFrame with a categorical ``carsid`` column and an ``int32``
        ``volume`` column.

    Raises:
        KeyError: If a ``cars`` element lacks ``carid`` or ``volume``.
        ValueError: If a ``volume`` value cannot be converted to an integer.
    """
    cars = list(root.iter('cars'))
    level3_df = pd.DataFrame({
        'carsid': [car.attrib['carid'] for car in cars],
        'volume': [car.attrib['volume'] for car in cars],
    })
    return level3_df.astype({'carsid': 'category', 'volume': 'int32'})
## main section
# Parse every XML file matched by the glob pattern, place the three levels
# of each file side by side, stack all files, write the raw (string) table
# to ``vd.csv`` and then convert the in-memory frame to compact dtypes.
files = glob.glob('D:/00_我在管院研究所的日子/97_ETC資料區/2019VD/20191231VD/*.xml')
print('待處理檔案數:%d' %len(files))
if not files:
    # pd.concat([]) would otherwise fail later with the unhelpful
    # "No objects to concatenate".
    raise FileNotFoundError('no XML files matched the VD glob pattern')

frames = []
for file in files:
    root = et.parse(file).getroot()
    df = pd.concat([level1(root), level2(root), level3(root)], axis = 1)
    frames.append(df)

# NOTE(review): the source's indentation was lost; the completion message
# is assumed to be printed once after the loop rather than once per file.
count = len(frames)
print('已完成檔案數:%d' %count)

df_all = pd.concat(frames, axis = 0, ignore_index = True)
# The CSV is written before the dtype conversion below.
df_all.to_csv('vd.csv', index = False, encoding = 'utf_8_sig')
df_all = df_all.astype({
    # A unit is required: pandas 2.x rejects the unit-less 'datetime64'.
    'datatime': 'datetime64[ns]',
    'status': 'category',
    'roadcode': 'category',
    'direction': 'category',
    'milage': 'float32',
    'vsrdir': 'category',
    'vsrid': 'category',
    'speed': 'int32',
    'laneoccupy': 'int32',
    'volume': 'int32',
})
df_all

--

--

Ed Chao

Playground for an old student. Records about learning, life, and interesting stuff.