2016-08-26 5 views
0

Hallo, ich habe eine große XML-Datei mit mehreren Start-Tags. Bitte helfen Sie mir, sie zu parsen und die Inhalte zu gruppieren. Meine XML-Datei sieht wie unten gezeigt aus. Thema: Parsen von XML mit dem StAX-Parser in Java

Ich möchte jeden Benutzernamen und Inhalt zusammen mit der Konversations-ID ausgeben: Alles, was zur selben Konversations-ID gehört, soll in einer einzigen Zeile stehen; bei einer anderen Konversations-ID soll eine zweite Zeile begonnen werden.

Zum Beispiel sollte die Ausgabedatei so aussehen: 1 converID username Gespräch + username Gespräch .... 2 anotherConvID username Gespräch + username Gespräch ....

<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!-- Data provided by Bloomberg LP. --> 
<FileDump> 
<Version>IBXML 1.3</Version> 
<Conversation Perspective=" " RoomType="P"> 
<RoomID>PCHAT-0x3000001CA8361</RoomID> 
<StartTime>03/31/2016 13:39:01</StartTime> 
<StartTimeUTC>1459431541</StartTimeUTC> 
<ParticipantEntered InteractionType="N" DeviceType="M"> 
<User> 
<LoginName>SWONG00</LoginName> 
<FirstName>STEPHEN</FirstName> 
<LastName>WONG</LastName> 
<UUID>4397109</UUID> 
<FirmNumber>13133</FirmNumber> 
<AccountNumber>231115</AccountNumber> 
<CompanyName>DBS BANK LIMITED HON</CompanyName> 
<EmailAddress>[email protected]</EmailAddress> 
<CorporateEmailAddress>[email protected]</CorporateEmailAddress> 
</User> 
<DateTime>03/31/2016 13:39:01</DateTime> 
<DateTimeUTC>1459431541</DateTimeUTC> 
<ConversationID>PCHAT-0x3000001CA8361</ConversationID> 
</ParticipantEntered> 
<ParticipantEntered InteractionType="N" DeviceType="M"> 
<User> 
<LoginName>G_LO</LoginName> 
<FirstName>GARY</FirstName> 
<LastName>LO</LastName> 
<UUID>7054548</UUID> 
<FirmNumber>13133</FirmNumber> 
<AccountNumber>91189</AccountNumber> 
<CompanyName>DBS BANK (HONG KONG)</CompanyName> 
<EmailAddress>[email protected]</EmailAddress> 
<CorporateEmailAddress>[email protected]</CorporateEmailAddress> 
</User> 
<DateTime>03/31/2016 14:56:22</DateTime> 
<DateTimeUTC>1459436182</DateTimeUTC> 
<ConversationID>PCHAT-0x3000001CA8361</ConversationID> 
</ParticipantEntered> 
<ParticipantLeft InteractionType="N" DeviceType="M"> 
<User> 
<LoginName>G_LO</LoginName> 
<FirstName>GARY</FirstName> 
<LastName>LO</LastName> 
<UUID>7054548</UUID> 
<FirmNumber>13133</FirmNumber> 
<AccountNumber>91189</AccountNumber> 
<CompanyName>DBS BANK (HONG KONG)</CompanyName> 
<EmailAddress>[email protected]</EmailAddress> 
<CorporateEmailAddress>[email protected]</CorporateEmailAddress> 
</User> 
<DateTime>03/31/2016 19:30:01</DateTime> 
<DateTimeUTC>1459452601</DateTimeUTC> 
<ConversationID>PCHAT-0x3000001CA8361</ConversationID> 
</ParticipantLeft> 
<ParticipantLeft InteractionType="N" DeviceType="M"> 
<User> 
<LoginName>SWONG00</LoginName> 
<FirstName>STEPHEN</FirstName> 
<LastName>WONG</LastName> 
<UUID>4397109</UUID> 
<FirmNumber>13133</FirmNumber> 
<AccountNumber>231115</AccountNumber> 
<CompanyName>DBS BANK LIMITED HON</CompanyName> 
<EmailAddress>[email protected]</EmailAddress> 
<CorporateEmailAddress>[email protected]</CorporateEmailAddress> 
</User> 
<DateTime>03/31/2016 19:33:56</DateTime> 
<DateTimeUTC>1459452836</DateTimeUTC> 
<ConversationID>PCHAT-0x3000001CA8361</ConversationID> 
</ParticipantLeft> 
<ParticipantEntered InteractionType="N" DeviceType="M"> 
<User> 
<LoginName>SWONG00</LoginName> 
<FirstName>STEPHEN</FirstName> 
<LastName>WONG</LastName> 
<UUID>4397109</UUID> 
<FirmNumber>13133</FirmNumber> 
<AccountNumber>231115</AccountNumber> 
<CompanyName>DBS BANK LIMITED HON</CompanyName> 
<EmailAddress>[email protected]</EmailAddress> 
<CorporateEmailAddress>[email protected]</CorporateEmailAddress> 
</User> 
<DateTime>03/31/2016 19:45:16</DateTime> 
<DateTimeUTC>1459453516</DateTimeUTC> 
<ConversationID>PCHAT-0x3000001CA8361</ConversationID> 
</ParticipantEntered> 
<ParticipantLeft InteractionType="N" DeviceType="M"> 
<User> 
<LoginName>SWONG00</LoginName> 
<FirstName>STEPHEN</FirstName> 
<LastName>WONG</LastName> 
<UUID>4397109</UUID> 
<FirmNumber>13133</FirmNumber> 
<AccountNumber>231115</AccountNumber> 
<CompanyName>DBS BANK LIMITED HON</CompanyName> 
<EmailAddress>[email protected]</EmailAddress> 
<CorporateEmailAddress>[email protected]</CorporateEmailAddress> 
</User> 
<DateTime>03/31/2016 23:08:09</DateTime> 
<DateTimeUTC>1459465689</DateTimeUTC> 
<ConversationID>PCHAT-0x3000001CA8361</ConversationID> 
</ParticipantLeft> 
<ParticipantEntered InteractionType="N" DeviceType="M"> 
<User> 
<LoginName>G_LO</LoginName> 
<FirstName>GARY</FirstName> 
<LastName>LO</LastName> 
<UUID>7054548</UUID> 
<FirmNumber>13133</FirmNumber> 
<AccountNumber>91189</AccountNumber> 
<CompanyName>DBS BANK (HONG KONG)</CompanyName> 
<EmailAddress>[email protected]</EmailAddress> 
<CorporateEmailAddress>[email protected]</CorporateEmailAddress> 
</User> 
<DateTime>03/31/2016 23:14:23</DateTime> 
<DateTimeUTC>1459466063</DateTimeUTC> 
<ConversationID>PCHAT-0x3000001CA8361</ConversationID> 
</ParticipantEntered> 
<Message InteractionType="N"> 
<User> 
<LoginName>G_LO</LoginName> 
<FirstName>GARY</FirstName> 
<LastName>LO</LastName> 
<UUID>7054548</UUID> 
<FirmNumber>13133</FirmNumber> 
<AccountNumber>91189</AccountNumber> 
<CompanyName>DBS BANK (HONG KONG)</CompanyName> 
<EmailAddress>[email protected]</EmailAddress> 
<CorporateEmailAddress>[email protected]</CorporateEmailAddress> 
</User> 
<DateTime>04/01/2016 00:10:57</DateTime> 
<DateTimeUTC>1459469457</DateTimeUTC> 
<Content> 
abcdefgghhhhhh 
</Content> 
<ConversationID>PCHAT-0x3000001CA8361</ConversationID> 
</Message> 
<ParticipantEntered InteractionType="N" DeviceType="M"> 
<User> 
<LoginName>WVU</LoginName> 
<FirstName>WHEELOCK</FirstName> 
<LastName>VU</LastName> 
<UUID>8266852</UUID> 
<FirmNumber>13133</FirmNumber> 
<AccountNumber>91189</AccountNumber> 
<CompanyName>DBS BANK (HONG KONG)</CompanyName> 
<EmailAddress>[email protected]</EmailAddress> 
<CorporateEmailAddress>[email protected]</CorporateEmailAddress> 
</User> 
<DateTime>04/01/2016 00:14:05</DateTime> 
<DateTimeUTC>1459469645</DateTimeUTC> 
<ConversationID>PCHAT-0x3000001CA8361</ConversationID> 
</ParticipantEntered> 
<ParticipantEntered InteractionType="N"> 
<User> 
<LoginName>FCHAN95</LoginName> 
<FirstName>FLORENCE</FirstName> 
<LastName>CHAN</LastName> 
<CompanyName>GOLDMAN SACHS (ASIA)</CompanyName> 
<EmailAddress>[email protected]</EmailAddress> 
<CorporateEmailAddress></CorporateEmailAddress> 
</User> 
<DateTime>04/01/2016 00:29:19</DateTime> 
<DateTimeUTC>1459470559</DateTimeUTC> 
<ConversationID>PCHAT-0x3000001CA8361</ConversationID> 
</ParticipantEntered> 
<Message InteractionType="N"> 
<User> 
<LoginName>FCHAN95</LoginName> 
<FirstName>FLORENCE</FirstName> 
<LastName>CHAN</LastName> 
<CompanyName>GOLDMAN SACHS (ASIA)</CompanyName> 
<EmailAddress>[email protected]</EmailAddress> 
<CorporateEmailAddress></CorporateEmailAddress> 
</User> 
<DateTime>04/01/2016 00:29:19</DateTime> 
<DateTimeUTC>1459470559</DateTimeUTC> 
<Content> 
ajdakjgdljsgdsafhkafa 
</Content> 
<ConversationID>PCHAT-0x3000001CA8361</ConversationID> 
</Message> 
<Message InteractionType="N"> 
<User> 
<LoginName>FCHAN95</LoginName> 
<FirstName>FLORENCE</FirstName> 
<LastName>CHAN</LastName> 
<CompanyName>GOLDMAN SACHS (ASIA)</CompanyName> 
<EmailAddress>[email protected]</EmailAddress> 
<CorporateEmailAddress></CorporateEmailAddress> 
</User> 
<DateTime>04/01/2016 00:29:19</DateTime> 
<DateTimeUTC>1459470559</DateTimeUTC> 
<Content> 
akjdgljsafdlshf;kdsjf 
</Content> 
<ConversationID>PCHAT-0x3000001CA8361</ConversationID> 
</Message> 
<Message InteractionType="N"> 
<User> 
<LoginName>WVU</LoginName> 
<FirstName>WHEELOCK</FirstName> 
<LastName>VU</LastName> 
<UUID>8266852</UUID> 
<FirmNumber>13133</FirmNumber> 
<AccountNumber>91189</AccountNumber> 
<CompanyName>DBS BANK (HONG KONG)</CompanyName> 
<EmailAddress>[email protected]</EmailAddress> 
<CorporateEmailAddress>[email protected]</CorporateEmailAddress> 
</User> 
<DateTime>04/01/2016 00:39:32</DateTime> 
<DateTimeUTC>1459471172</DateTimeUTC> 
<Content> 
sagdksajdlsahd 
</Content> 
<ConversationID>PCHAT-0x3000001CA8361</ConversationID> 
</Message> 
<ParticipantEntered InteractionType="N" DeviceType="M"> 
<User> 
<LoginName>SWONG00</LoginName> 
<FirstName>STEPHEN</FirstName> 
<LastName>WONG</LastName> 
<UUID>4397109</UUID> 
<FirmNumber>13133</FirmNumber> 
<AccountNumber>231115</AccountNumber> 
<CompanyName>DBS BANK LIMITED HON</CompanyName> 
<EmailAddress>[email protected]</EmailAddress> 
<CorporateEmailAddress>[email protected]</CorporateEmailAddress> 
</User> 
<DateTime>04/01/2016 01:01:27</DateTime> 
<DateTimeUTC>1459472487</DateTimeUTC> 
<ConversationID>PCHAT-0x3000001CA8361</ConversationID> 
</ParticipantEntered> 
<Message InteractionType="N"> 
<User> 
<LoginName>SWONG00</LoginName> 
<FirstName>STEPHEN</FirstName> 
<LastName>WONG</LastName> 
<UUID>4397109</UUID> 
<FirmNumber>13133</FirmNumber> 
<AccountNumber>231115</AccountNumber> 
<CompanyName>DBS BANK LIMITED HON</CompanyName> 
<EmailAddress>[email protected]</EmailAddress> 
<CorporateEmailAddress>[email protected]</CorporateEmailAddress> 
</User> 
<DateTime>04/01/2016 01:31:29</DateTime> 
<DateTimeUTC>1459474289</DateTimeUTC> 
<Content> 
ajdslsahdsj;a 
</Content> 
<ConversationID>PCHAT-0x3000001CA8361</ConversationID> 
</Message> 
<Message InteractionType="N" DeviceType="M"> 
<User> 
<LoginName>FCHAN95</LoginName> 
<FirstName>FLORENCE</FirstName> 
<LastName>CHAN</LastName> 
<CompanyName>GOLDMAN SACHS (ASIA)</CompanyName> 
<EmailAddress>[email protected]</EmailAddress> 
<CorporateEmailAddress></CorporateEmailAddress> 
</User> 
<DateTime>04/01/2016 02:49:46</DateTime> 
<DateTimeUTC>1459478986</DateTimeUTC> 
<Content> 
sagdkjsagdkjashdlasjd 
</Content> 
<ConversationID>PCHAT-0x3000001CA8361</ConversationID> 
</Message> 
<Message InteractionType="N" DeviceType="M"> 
<User> 
<LoginName>FCHAN95</LoginName> 
<FirstName>FLORENCE</FirstName> 
<LastName>CHAN</LastName> 
<CompanyName>GOLDMAN SACHS (ASIA)</CompanyName> 
<EmailAddress>[email protected]</EmailAddress> 
<CorporateEmailAddress></CorporateEmailAddress> 
</User> 
<DateTime>04/01/2016 02:49:46</DateTime> 
<DateTimeUTC>1459478986</DateTimeUTC> 
<Content> 
jsdhkshdksjdlsjdlks 
</Content> 
<ConversationID>PCHAT-0x3000001CA8361</ConversationID> 
</Message> 
<Message InteractionType="N" DeviceType="M"> 
<User> 
<LoginName>FCHAN95</LoginName> 
<FirstName>FLORENCE</FirstName> 
<LastName>CHAN</LastName> 
<CompanyName>GOLDMAN SACHS (ASIA)</CompanyName> 
<EmailAddress>[email protected]</EmailAddress> 
<CorporateEmailAddress></CorporateEmailAddress> 
</User> 
<DateTime>04/01/2016 03:47:37</DateTime> 
<DateTimeUTC>1459482457</DateTimeUTC> 
<Content> 
jshdkshdksjdlskld 
</Content> 
<ConversationID>PCHAT-0x3000001CA8361</ConversationID> 
</Message> 
<Message InteractionType="N" DeviceType="M"> 
<User> 
<LoginName>FCHAN95</LoginName> 
<FirstName>FLORENCE</FirstName> 
<LastName>CHAN</LastName> 
<CompanyName>GOLDMAN SACHS (ASIA)</CompanyName> 
<EmailAddress>[email protected]</EmailAddress> 
<CorporateEmailAddress></CorporateEmailAddress> 
</User> 
<DateTime>04/01/2016 03:47:37</DateTime> 
<DateTimeUTC>1459482457</DateTimeUTC> 
<Content> 
aasasasasas 
</Content> 
<ConversationID>PCHAT-0x3000001CA8361</ConversationID> 
</Message> 
<EndTime>04/01/2016 03:47:37</EndTime> 
<EndTimeUTC>1459482457</EndTimeUTC> 
</Conversation> 
</FileDump> 
+0

Ich habe die Eingabedatei geändert; es sind noch einige Änderungen erforderlich, bitte helfen Sie –

Antwort

1

Wenn der Text aller Knoten "Content", "LoginName" und "ConversationID" in den Speicher passt, könnte die Lösung so aussehen wie die, die ich unten poste (sie liest aus input.xml und schreibt nach output.txt). Außerdem nehme ich an, dass Ihre Zeilen mit 1, 2, ... nummeriert werden sollen und dass das Zeichen "+" die Daten verschiedener Nachrichten trennen soll, wie Sie in der Frage angegeben haben.

Wenn diese Daten jedoch nicht in den Speicher passen, müssen Sie sie beispielsweise mit StAX im Format (ConversationID, LoginName, Content) in eine Datei extrahieren, diese Datei dann im externen Speicher sortieren (auf der Festplatte oder auf mehreren Rechnern) und anschließend aufeinanderfolgende Zeilen mit derselben ConversationID zusammenführen. Oder teilen Sie einfach die ursprüngliche XML-Datei in mehrere Teile auf und verarbeiten Sie jeden Teil mit dem folgenden Programm. Sie müssen dann die resultierenden Dateien zusammenführen, was jedoch einfacher ist.

import java.io.BufferedWriter;
import java.io.FileInputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamConstants;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamReader;

import org.apache.commons.lang3.StringUtils;

/**
 * Summarises a chat dump with a StAX cursor parser.
 *
 * <p>Reads {@code input.xml} from the working directory, collects every
 * {@code <Conversation>} element found under {@code <FileDump>}, and writes one
 * numbered line per conversation to {@code output.txt}. Each line has the form
 * {@code N startTime roomId (distinctUserCount) user content + user content ...}.
 *
 * <p>All conversations are held in memory before the output file is written.
 */
public class Solution {

    private static final String ROOM_ID = "RoomID";
    private static final String CONTENT = "Content";
    private static final String LOGIN_NAME = "LoginName";
    private static final String CONVERSATION_ID = "ConversationID";
    private static final String FILE_DUMP = "FileDump";
    private static final String MESSAGE = "Message";
    private static final String CONVERSATION = "Conversation";
    private static final String START_TIME = "StartTime";

    /** Data gathered from a single {@code <Conversation>} element. */
    static class ConversationInfo {
        // Text of the <StartTime> element; stays null if the element is absent.
        private String startTimeStr;

        // Text of the <RoomID> element; stays null if the element is absent.
        private String conversationId;

        // Distinct login names of users who sent at least one message.
        private final Set<String> users = new HashSet<>();

        // Messages in document order.
        private final List<Message> messages = new ArrayList<>();

        /**
         * Renders the conversation as
         * {@code startTime roomId (userCount) msg1 + msg2 + ...}.
         */
        @Override
        public String toString() {
            StringBuilder joined = new StringBuilder();
            String separator = "";
            for (Message message : messages) {
                joined.append(separator).append(message);
                separator = " + ";
            }
            return String.format("%s %s (%d) %s", startTimeStr, conversationId, users.size(),
                    joined);
        }
    }

    /** One chat message: the sender's login name and the trimmed message text. */
    static class Message {

        public final String userName;

        public final String content;

        public Message(String name, String content) {
            this.userName = name;
            this.content = content;
        }

        /** Renders the message as {@code userName content}. */
        @Override
        public String toString() {
            return userName + " " + content;
        }
    }

    /**
     * Parses {@code input.xml} and writes the per-conversation summary to
     * {@code output.txt} (UTF-8, one {@code \n}-terminated line per conversation).
     *
     * @param args ignored
     * @throws XMLStreamException    if the parser reports an error or an element is left open
     * @throws IOException           if the input cannot be read or the output cannot be written
     * @throws IllegalStateException if the document ends before {@code </FileDump>} is seen
     */
    public static void main(String[] args)
            throws XMLStreamException, IOException {
        XMLInputFactory xf = XMLInputFactory.newFactory();
        // The dump is plain data and needs no DTD; switching DTD support and external
        // entity resolution off keeps a crafted input file from triggering XXE.
        xf.setProperty(XMLInputFactory.SUPPORT_DTD, Boolean.FALSE);
        xf.setProperty(XMLInputFactory.IS_SUPPORTING_EXTERNAL_ENTITIES, Boolean.FALSE);

        List<ConversationInfo> conversations = new ArrayList<>();
        try (FileInputStream fin = new FileInputStream("input.xml")) {
            XMLStreamReader xr = xf.createXMLStreamReader(fin);
            try {
                LOOP: while (xr.hasNext()) {
                    int event = xr.next();
                    switch (event) {
                        case XMLStreamConstants.START_ELEMENT: {
                            if (CONVERSATION.equals(xr.getLocalName())) {
                                conversations.add(parseConversation(xr));
                            }
                            break;
                        }
                        case XMLStreamConstants.END_ELEMENT: {
                            // The closing root tag ends the scan.
                            if (FILE_DUMP.equals(xr.getLocalName())) {
                                break LOOP;
                            }
                            break;
                        }
                        case XMLStreamConstants.END_DOCUMENT:
                            throw new IllegalStateException("xml not well-formed: <"
                                    + FILE_DUMP + "> tag not closed");
                    }
                }
            } finally {
                // XMLStreamReader is not AutoCloseable, so release it by hand; the
                // underlying stream is closed by the enclosing try-with-resources.
                xr.close();
            }
        }

        // Write with an explicit charset: the input is UTF-8, and the platform default
        // used by FileWriter may not be able to represent every character.
        try (BufferedWriter w = Files.newBufferedWriter(Paths.get("output.txt"),
                StandardCharsets.UTF_8)) {
            int i = 1;
            for (ConversationInfo convInfo : conversations) {
                w.write(String.format("%d %s\n", i++, convInfo));
            }
        }
    }

    /**
     * Consumes events up to and including the closing {@code </Conversation>} tag.
     *
     * @param xr reader positioned on the {@code <Conversation>} start tag
     * @return the start time, room id, messages and distinct message senders found
     * @throws XMLStreamException if the document ends before the conversation is closed
     */
    private static ConversationInfo parseConversation(XMLStreamReader xr)
            throws XMLStreamException {
        ConversationInfo convInfo = new ConversationInfo();
        while (xr.hasNext()) {
            int event = xr.next();
            switch (event) {
                case XMLStreamConstants.START_ELEMENT: {
                    String elName = xr.getLocalName();
                    if (MESSAGE.equals(elName)) {
                        Message message = parseMessage(xr);
                        convInfo.messages.add(message);
                        convInfo.users.add(message.userName);
                    } else if (START_TIME.equals(elName)) {
                        convInfo.startTimeStr = xr.getElementText();
                    } else if (ROOM_ID.equals(elName)) {
                        convInfo.conversationId = xr.getElementText();
                    }
                    // Any other element (e.g. ParticipantEntered) is skipped; its
                    // children are simply walked over by the loop.
                    break;
                }
                case XMLStreamConstants.END_ELEMENT: {
                    if (CONVERSATION.equals(xr.getLocalName())) {
                        return convInfo;
                    }
                    break;
                }
                case XMLStreamConstants.END_DOCUMENT:
                    throw new XMLStreamException("xml not well-formed: <"
                            + CONVERSATION + "> tag not closed");
            }
        }
        throw new XMLStreamException(
                "unexpected end of xml file while parsing a conversation");
    }

    /**
     * Consumes events up to and including the closing {@code </Message>} tag.
     *
     * @param xr reader positioned on the {@code <Message>} start tag
     * @return the message; {@code userName} or {@code content} is {@code null} when the
     *         corresponding {@code <LoginName>} or {@code <Content>} element is missing
     * @throws XMLStreamException if the document ends before the message is closed
     */
    private static Message parseMessage(XMLStreamReader xr)
            throws XMLStreamException {
        String userName = null;
        String content = null;
        while (xr.hasNext()) {
            int event = xr.next();
            switch (event) {
                case XMLStreamConstants.START_ELEMENT: {
                    String elName = xr.getLocalName();
                    if (LOGIN_NAME.equals(elName)) {
                        userName = xr.getElementText();
                    } else if (CONTENT.equals(elName)) {
                        // Content is surrounded by line breaks in the dump; strip them.
                        content = xr.getElementText().trim();
                    }
                    break;
                }
                case XMLStreamConstants.END_ELEMENT: {
                    if (MESSAGE.equals(xr.getLocalName())) {
                        return new Message(userName, content);
                    }
                    break;
                }
                case XMLStreamConstants.END_DOCUMENT:
                    throw new XMLStreamException("xml not well-formed: <"
                            + MESSAGE + "> tag not closed");
            }
        }
        throw new XMLStreamException(
                "unexpected end of xml file while parsing a message");
    }
}
+0

Multimap stammt aus der Guava-Bibliothek; die neueste Version davon ist derzeit 19 – starikoff

+0

Ich habe die Lösung aktualisiert, sodass Multimap und Guava überhaupt nicht mehr verwendet werden – starikoff

+0

Ich habe Fragen zu den Datenbeziehungen in diesem XML. 1) Können wir annehmen, dass alle Nachrichten (alles, was in <Message>...</Message> steht) in einer Konversation (<Conversation>...</Conversation>) nur zu dieser Konversation gehören? 2) Können wir annehmen, dass die ConversationID einer Nachricht, die innerhalb einer Konversation verschachtelt ist, gleich deren RoomID ist? 3) Können wir annehmen, dass es in der Datei keine Konversationen mit doppelter RoomID gibt? – starikoff