2016-08-23 5 views
0

Ich habe versucht, mit Python einige Informationen aus einem XML-Tag auszulesen. Mein Ziel ist ein Wörterbuch, das für jede Situation die Tag-ID zusammen mit allen Daten der Kindelemente speichert, aber ich weiß nicht, wie ich die Daten aus den Textknoten extrahieren soll. Danke. Python XML DOM: Daten sammeln

Mein Code:

from xml.dom.minidom import * 
import requests 

# Download the XML feed, store it on disk and parse the saved file with minidom.
print("GETTING XML...")
resp = requests.get('http://infocar.dgt.es/datex2/dgt/SituationPublication/all/content.xml', stream=True)  # XML that I need
if resp.status_code != 200:
    # RuntimeError is used because this script defines no custom API error
    # type; the message reports the URL that was actually requested.
    raise RuntimeError('GET {} returned HTTP {}'.format(resp.url, resp.status_code))
print("XML RECIBIDO 200 OK")
print("GUARDANDO XML")
with open("DGT_DATEX.xml", "wb") as handle:
    # iter_content() yields 1-byte chunks by default; ask for larger chunks
    # so the file is not written one byte at a time.
    for data in resp.iter_content(chunk_size=8192):
        handle.write(data)

print("XML GUARDADO")
print("INICIANDO PARSEO..")
dom3 = parse("DGT_DATEX.xml")
print(dom3)  # shows the Document object (memory address)
print("DATEX PARSEADO")




def getText(nodelist):
    """Map each element's ``id`` attribute to the text found beneath it.

    Args:
        nodelist: iterable of DOM nodes, e.g. ``element.childNodes``.

    Returns:
        A dictionary whose keys are the ``id`` attribute values of the
        element nodes in *nodelist* (``''`` when an element has no ``id``,
        so such elements overwrite each other) and whose values are the
        lists of text-node data gathered by ``goDeep`` from each element's
        descendants.
    """
    result = {}
    for node in nodelist:
        # Only element nodes carry attributes; text, comment and other node
        # types are skipped.
        if node.nodeType != node.ELEMENT_NODE:
            continue
        # Start from a fresh list for every element so the entries do not
        # all alias one shared accumulator.
        result[node.getAttribute('id')] = goDeep(node.childNodes, [])

    print(str.format("El diccionario antes de ser retornado es {0}", result))
    return result

def goDeep(childsOfElement, l):
    """Append the data of every text node under *childsOfElement* to *l*.

    The nodes are visited depth-first in the order they appear; the same
    list object that was passed in is returned.
    """
    for child in childsOfElement:
        if child.nodeType == child.TEXT_NODE:
            l.append(child.data)
            continue
        # Any other node type: descend into its children.
        goDeep(child.childNodes, l)
    return l

def getSituation(payloadTag):
    """Return the result of ``getText`` applied to *payloadTag*'s children.

    Args:
        payloadTag: DOM node whose ``childNodes`` are handed to ``getText``.

    Returns:
        Whatever ``getText`` returns for those child nodes.
    """
    # Propagate the result; without the return callers only ever see None.
    return getText(payloadTag.childNodes)



def getPayLoad(dom):
    """Process the first ``payloadPublication`` element of *dom*.

    Args:
        dom: parsed DOM document (or node) to search, in any namespace.

    Returns:
        Whatever ``getSituation`` returns for that element.

    Raises:
        IndexError: if *dom* contains no ``payloadPublication`` element.
    """
    # Look the element up once and reuse it for both the log line and the call.
    payload = dom.getElementsByTagNameNS('*', 'payloadPublication')[0]
    print(str.format("Tag to be processed:{0}", payload))
    return getSituation(payload)


# Call getPayLoad once and reuse the result: a second call would repeat the
# whole DOM traversal and its printed output.
payload_data = getPayLoad(dom3)
print(str.format("Verificando que el dato retornado es un diccionario, {0}, y contiene {1}", type(payload_data), payload_data))
+0

Haben Sie es mit lxml.etree versucht? Und mit .xpath("//*[name()='_0:situation']")? –

Antwort

0

Hier ist der Weg, der es mir ermöglicht hat, die Daten der Kindelemente zu sammeln. Danke.

import xml.etree.ElementTree as ET 

from xml.dom.minidom import * 

import requests 

# Download the XML feed, store it on disk and parse the saved file with minidom.
print("GETTING XML...")
resp = requests.get('http://infocar.dgt.es/datex2/dgt/SituationPublication/all/content.xml', stream=True)  # XML that I need
if resp.status_code != 200:
    # RuntimeError is used because this script defines no custom API error
    # type; the message reports the URL that was actually requested.
    raise RuntimeError('GET {} returned HTTP {}'.format(resp.url, resp.status_code))
print("XML RECIBIDO 200 OK")
print("GUARDANDO XML")
with open("DGT_DATEX.xml", "wb") as handle:
    # iter_content() yields 1-byte chunks by default; ask for larger chunks
    # so the file is not written one byte at a time.
    for data in resp.iter_content(chunk_size=8192):
        handle.write(data)

print("XML GUARDADO")
print("INICIANDO PARSEO..")
dom3 = parse("DGT_DATEX.xml")
print(dom3)  # shows the Document object (memory address)
print("DATEX PARSEADO")

def getAttributeID(element):
    """Return the value of *element*'s ``id`` attribute."""
    attribute_name = 'id'
    return element.getAttribute(attribute_name)

def getText(element):
    """Return the character data of a text node, or '' when there is none.

    Args:
        element: a DOM node exposing ``.data`` (such as a text node), or
            ``None`` — which is what ``firstChild`` yields for an empty
            element.

    Returns:
        The node's ``data``; an empty string when *element* is ``None`` or
        has no ``data`` attribute.
    """
    data = getattr(element, 'data', None)
    if data is None:
        return ''
    return data

def _first_text(scope, tag):
    """Return the text of the first ``tag`` element below *scope*, or ''.

    '' is returned when no such element exists or when its first child is
    not a text/CDATA node (for example when the element is empty).
    """
    matches = scope.getElementsByTagNameNS('*', tag)
    first = matches[0].firstChild if matches else None
    if first is None or first.nodeType not in (first.TEXT_NODE, first.CDATA_SECTION_NODE):
        return ''
    return getText(first)


def getPayLoad(dom):
    """Print the id and selected fields of every situation and its records.

    For each ``situation`` element (any namespace) the situation id and
    running number are printed, followed by its ``confidentiality`` and
    ``informationStatus`` values. Each direct child element of the situation
    that carries an ``id`` attribute is then reported as a situation record:
    its id and running number, one line per record field, and finally the
    ``descriptor`` value. A field that is missing or empty prints as an
    empty line so the output keeps the same line layout for every record.

    Args:
        dom: parsed DOM document to search.

    Returns:
        None; all results are printed.
    """
    situation_fields = ('confidentiality', 'informationStatus')
    record_fields = (
        'situationRecordCreationReference',
        'situationRecordCreationTime',
        'situationRecordVersion',
        'situationRecordVersionTime',
        'situationRecordFirstSupplierVersionTime',
        'probabilityOfOccurrence',
        'sourceCountry',
        'sourceIdentification',
        'validityStatus',
        'overallStartTime',
        'overallEndTime',
        'impactOnTraffic',
        'locationDescriptor',
        'tpegDirection',
        'latitude',
        'longitude',
    )
    # Running number of the records across all situations (display only).
    record_number = 1
    # The 1-based situation number is used for display only; every lookup is
    # scoped to the current node instead of indexing document-wide lists.
    for number, situation in enumerate(dom.getElementsByTagNameNS('*', 'situation'), 1):
        print(str.format("Situation ID: {0} numero {1}", getAttributeID(situation), number))
        for tag in situation_fields:
            print(_first_text(situation, tag))
        for record in situation.childNodes:
            # The situation records are the child elements that carry an id.
            if record.nodeType != record.ELEMENT_NODE or not record.hasAttribute('id'):
                continue
            print(str.format("SituationRecord ID: {0} numero {1}", getAttributeID(record), record_number))
            for tag in record_fields:
                print(_first_text(record, tag))
            print(str.format("VALUE FIELD: {0}", _first_text(record, 'descriptor')))
            record_number += 1

# Run the extraction on the parsed document; getPayLoad prints its results.
getPayLoad(dom3)
1

Ich bin auf diesen Code gekommen. Ist es das, was Sie suchen?

def getText(element):
    """Return a node's character data, UTF-8 encoded and stripped.

    Args:
        element: a DOM node exposing ``.data`` (such as a text node), or
            ``None`` — which is what ``firstChild`` yields for an empty
            element.

    Returns:
        The node's ``data`` encoded as UTF-8 with surrounding whitespace
        removed; an empty byte string when *element* is ``None`` or has no
        ``data`` attribute.
    """
    data = getattr(element, 'data', None)
    if data is None:
        return b''
    return data.encode('utf-8').strip()


def getPayLoad(dom):
    """Print the id of every ``situation`` element and its non-empty fields.

    For each situation (any namespace) a header line with its id and 1-based
    position is printed. Then, for every field name in ``attrs``, the first
    matching descendant element is looked up and printed as ``name: value``
    when ``getText`` returns a non-empty value for its first child.
    """
    attrs = (
        'confidentiality',
        'informationStatus',
        'situationRecordCreationReference',
        'situationRecordCreationTime',
        'situationRecordVersion',
        'situationRecordVersionTime',
        'situationRecordFirstSupplierVersionTime',
        'probabilityOfOccurrence',
        'sourceCountry',
        'sourceIdentification',
        'validityStatus',
        'overallStartTime',
        'overallEndTime',
        'impactOnTraffic',
        'locationDescriptor',
        'tpegDirection',
        'latitude',
        'longitude',
        'tpegDescriptorType',
        'from',
    )

    situations = dom.getElementsByTagNameNS('*', 'situation')
    for position, situation in enumerate(situations, 1):
        header = "\nSituation ID: {0} numero {1}".format(getAttributeID(situation), position)
        print(header)
        for field_name in attrs:
            matches = situation.getElementsByTagNameNS('*', field_name)
            if not matches:
                continue
            text = getText(matches[0].firstChild)
            if not text:
                continue
            print('{0}: {1}'.format(field_name, text))
+0

Danke für Ihren cleveren Code, er hat mir einen guten Ansatzpunkt gegeben – Datex2