2016-07-02 2 views
0

Ich möchte doppelte Einträge aus einer langen BLAST-Ausgabedatei anhand des Feldes "Hsp_query-from" herausfiltern, d. h. meine XML-Ausgabedatei ist nach diesem Feld sortiert, und ich möchte nur den ersten Eintrag für jeden eindeutigen "Hsp_query-from"-Wert auswählen. Dies sollte für „Hsp_num“ 1 und getrennt davon für „Hsp_num“ 2 geschehen. Meine Beispiel-Eingabedatei sieht wie folgt aus (Titel der Frage: „Duplikate aus BLAST-XML filtern“):

<?xml version="1.0" encoding="UTF-8" ?> 
    <!DOCTYPE BlastOutput PUBLIC "-//NCBI//NCBI BlastOutput/EN" "http://www.ncbi.nlm.nih.gov/dtd/NCBI_BlastOutput.dtd"> 
    <BlastOutput> 
     <BlastOutput_program>blastn</BlastOutput_program> 
    <BlastOutput_version>BLASTN 2.3.0+</BlastOutput_version> 
     <BlastOutput_reference>Zheng Zhang, Scott Schwartz, Lukas Wagner, and Webb Miller (2000), "A greedy algorithm for aligning DNA sequences", J Comput Biol 2000; 7(1-2):203-14.</BlastOutput_reference> 
     <BlastOutput_db>ABC</BlastOutput_db> 
     <BlastOutput_query-ID>Query_1</BlastOutput_query-ID> 
     <BlastOutput_query-def>m151221</BlastOutput_query-def> 
     <BlastOutput_query-len>1790</BlastOutput_query-len> 
     <BlastOutput_param> 
     <Parameters> 
      <Parameters_expect>0.001</Parameters_expect> 
      <Parameters_sc-match>1</Parameters_sc-match> 
      <Parameters_sc-mismatch>-2</Parameters_sc-mismatch> 
      <Parameters_gap-open>0</Parameters_gap-open> 
      <Parameters_gap-extend>0</Parameters_gap-extend> 
      <Parameters_filter>L;m;</Parameters_filter> 
     </Parameters> 
     </BlastOutput_param> 
     <BlastOutput_iterations> 
     <Iteration> 
      <Iteration_iter-num>1</Iteration_iter-num> 
      <Iteration_query-ID>Query_1</Iteration_query-ID> 
      <Iteration_query-def>m151221</Iteration_query-def> 
      <Iteration_query-len>1790</Iteration_query-len> 
      <Iteration_hits> 
      <Hit> 
       <Hit_num>14</Hit_num> 
       <Hit_id>A1</Hit_id> 
       <Hit_def>A1-def</Hit_def> 
       <Hit_accession>A1</Hit_accession> 
       <Hit_len>249</Hit_len> 
       <Hit_hsps> 
       <Hsp> 
        <Hsp_num>1</Hsp_num> 
        <Hsp_bit-score>130.386</Hsp_bit-score> 
        <Hsp_score>70</Hsp_score> 
        <Hsp_evalue>5.24249e-32</Hsp_evalue> 
        <Hsp_query-from>1</Hsp_query-from> 
        <Hsp_query-to>73</Hsp_query-to> 
        <Hsp_hit-from>74</Hsp_hit-from> 
        <Hsp_hit-to>1</Hsp_hit-to> 
        <Hsp_query-frame>1</Hsp_query-frame> 
        <Hsp_hit-frame>-1</Hsp_hit-frame> 
        <Hsp_identity>73</Hsp_identity> 
        <Hsp_positive>73</Hsp_positive> 
        <Hsp_gaps>1</Hsp_gaps> 
        <Hsp_align-len>74</Hsp_align-len> 
        <Hsp_qseq>TATATGATAATCATCGCAAGACCGGCAACAGGAT-CAATCTTAAGAAACTTTATTGCCAAATGTTTGAACGATC</Hsp_qseq> 
        <Hsp_hseq>TATATGATAATCATCGCAAGACCGGCAACAGGATTCAATCTTAAGAAACTTTATTGCCAAATGTTTGAACGATC</Hsp_hseq> 
        <Hsp_midline>|||||||||||||||||||||||||||||||||| |||||||||||||||||||||||||||||||||||||||</Hsp_midline> 
       </Hsp> 
       </Hit_hsps> 
      </Hit> 
      <Hit> 
       <Hit_num>15</Hit_num> 
       <Hit_id>D1</Hit_id> 
       <Hit_def>D1-def</Hit_def> 
       <Hit_accession>D1</Hit_accession> 
       <Hit_len>261</Hit_len> 
       <Hit_hsps> 
       <Hsp> 
        <Hsp_num>1</Hsp_num> 
        <Hsp_bit-score>130.386</Hsp_bit-score> 
        <Hsp_score>70</Hsp_score> 
        <Hsp_evalue>5.24249e-32</Hsp_evalue> 
        <Hsp_query-from>1</Hsp_query-from> 
        <Hsp_query-to>73</Hsp_query-to> 
        <Hsp_hit-from>80</Hsp_hit-from> 
        <Hsp_hit-to>7</Hsp_hit-to> 
        <Hsp_query-frame>1</Hsp_query-frame> 
        <Hsp_hit-frame>-1</Hsp_hit-frame> 
        <Hsp_identity>73</Hsp_identity> 
        <Hsp_positive>73</Hsp_positive> 
        <Hsp_gaps>1</Hsp_gaps> 
        <Hsp_align-len>74</Hsp_align-len> 
        <Hsp_qseq>TATATGATAATCATCGCAAGACCGGCAACAGGAT-CAATCTTAAGAAACTTTATTGCCAAATGTTTGAACGATC</Hsp_qseq> 
        <Hsp_hseq>TATATGATAATCATCGCAAGACCGGCAACAGGATTCAATCTTAAGAAACTTTATTGCCAAATGTTTGAACGATC</Hsp_hseq> 
        <Hsp_midline>|||||||||||||||||||||||||||||||||| |||||||||||||||||||||||||||||||||||||||</Hsp_midline> 
       </Hsp> 
       </Hit_hsps> 
      </Hit> 
      <Hit> 
       <Hit_num>16</Hit_num> 
       <Hit_id>B1</Hit_id> 
       <Hit_def>B1-def</Hit_def> 
       <Hit_accession>B1</Hit_accession> 
       <Hit_len>253</Hit_len> 
       <Hit_hsps> 
       <Hsp> 
        <Hsp_num>1</Hsp_num> 
        <Hsp_bit-score>130.386</Hsp_bit-score> 
        <Hsp_score>70</Hsp_score> 
        <Hsp_evalue>5.24249e-32</Hsp_evalue> 
        <Hsp_query-from>1</Hsp_query-from> 
        <Hsp_query-to>73</Hsp_query-to> 
        <Hsp_hit-from>74</Hsp_hit-from> 
        <Hsp_hit-to>1</Hsp_hit-to> 
        <Hsp_query-frame>1</Hsp_query-frame> 
        <Hsp_hit-frame>-1</Hsp_hit-frame> 
        <Hsp_identity>73</Hsp_identity> 
        <Hsp_positive>73</Hsp_positive> 
        <Hsp_gaps>1</Hsp_gaps> 
        <Hsp_align-len>74</Hsp_align-len> 
        <Hsp_qseq>TATATGATAATCATCGCAAGACCGGCAACAGGAT-CAATCTTAAGAAACTTTATTGCCAAATGTTTGAACGATC</Hsp_qseq> 
        <Hsp_hseq>TATATGATAATCATCGCAAGACCGGCAACAGGATTCAATCTTAAGAAACTTTATTGCCAAATGTTTGAACGATC</Hsp_hseq> 
        <Hsp_midline>|||||||||||||||||||||||||||||||||| |||||||||||||||||||||||||||||||||||||||</Hsp_midline> 
       </Hsp> 
       <Hsp> 
        <Hsp_num>2</Hsp_num> 
        <Hsp_bit-score>71.293</Hsp_bit-score> 
        <Hsp_score>38</Hsp_score> 
        <Hsp_evalue>3.22284e-14</Hsp_evalue> 
        <Hsp_query-from>1735</Hsp_query-from> 
        <Hsp_query-to>1783</Hsp_query-to> 
        <Hsp_hit-from>233</Hsp_hit-from> 
        <Hsp_hit-to>188</Hsp_hit-to> 
        <Hsp_query-frame>1</Hsp_query-frame> 
        <Hsp_hit-frame>-1</Hsp_hit-frame> 
        <Hsp_identity>46</Hsp_identity> 
        <Hsp_positive>46</Hsp_positive> 
        <Hsp_gaps>3</Hsp_gaps> 
        <Hsp_align-len>49</Hsp_align-len> 
        <Hsp_qseq>ACCGCGCGCGATAATTTATCCTAGTTTGCGCGCGCTAATATTTTGTTTT</Hsp_qseq> 
        <Hsp_hseq>ACCGCGCGCGATAATTTATCCTAGTTTGCGCGC--TA-TATTTTGTTTT</Hsp_hseq> 
        <Hsp_midline>||||||||||||||||||||||||||||||||| || |||||||||||</Hsp_midline> 
       </Hsp> 
       </Hit_hsps> 
      </Hit> 
      <Hit> 
       <Hit_num>17</Hit_num> 
       <Hit_id>E1</Hit_id> 
       <Hit_def>E1-def</Hit_def> 
       <Hit_accession>E1</Hit_accession> 
       <Hit_len>267</Hit_len> 
       <Hit_hsps> 
       <Hsp> 
        <Hsp_num>1</Hsp_num> 
        <Hsp_bit-score>130.386</Hsp_bit-score> 
        <Hsp_score>70</Hsp_score> 
        <Hsp_evalue>5.24249e-32</Hsp_evalue> 
        <Hsp_query-from>1</Hsp_query-from> 
        <Hsp_query-to>73</Hsp_query-to> 
        <Hsp_hit-from>81</Hsp_hit-from> 
        <Hsp_hit-to>8</Hsp_hit-to> 
        <Hsp_query-frame>1</Hsp_query-frame> 
        <Hsp_hit-frame>-1</Hsp_hit-frame> 
        <Hsp_identity>73</Hsp_identity> 
        <Hsp_positive>73</Hsp_positive> 
        <Hsp_gaps>1</Hsp_gaps> 
        <Hsp_align-len>74</Hsp_align-len> 
        <Hsp_qseq>TATATGATAATCATCGCAAGACCGGCAACAGGAT-CAATCTTAAGAAACTTTATTGCCAAATGTTTGAACGATC</Hsp_qseq> 
        <Hsp_hseq>TATATGATAATCATCGCAAGACCGGCAACAGGATTCAATCTTAAGAAACTTTATTGCCAAATGTTTGAACGATC</Hsp_hseq> 
        <Hsp_midline>|||||||||||||||||||||||||||||||||| |||||||||||||||||||||||||||||||||||||||</Hsp_midline> 
       </Hsp> 
       <Hsp> 
        <Hsp_num>2</Hsp_num> 
        <Hsp_bit-score>71.293</Hsp_bit-score> 
        <Hsp_score>38</Hsp_score> 
        <Hsp_evalue>3.22284e-14</Hsp_evalue> 
        <Hsp_query-from>1735</Hsp_query-from> 
        <Hsp_query-to>1783</Hsp_query-to> 
        <Hsp_hit-from>240</Hsp_hit-from> 
        <Hsp_hit-to>195</Hsp_hit-to> 
        <Hsp_query-frame>1</Hsp_query-frame> 
        <Hsp_hit-frame>-1</Hsp_hit-frame> 
        <Hsp_identity>46</Hsp_identity> 
        <Hsp_positive>46</Hsp_positive> 
        <Hsp_gaps>3</Hsp_gaps> 
        <Hsp_align-len>49</Hsp_align-len> 
        <Hsp_qseq>ACCGCGCGCGATAATTTATCCTAGTTTGCGCGCGCTAATATTTTGTTTT</Hsp_qseq> 
        <Hsp_hseq>ACCGCGCGCGATAATTTATCCTAGTTTGCGCGC--TA-TATTTTGTTTT</Hsp_hseq> 
        <Hsp_midline>||||||||||||||||||||||||||||||||| || |||||||||||</Hsp_midline> 
       </Hsp> 
       </Hit_hsps> 
      </Hit> 
      <Hit> 
       <Hit_num>18</Hit_num> 
       <Hit_id>F1</Hit_id> 
       <Hit_def>F1-def</Hit_def> 
       <Hit_accession>F1</Hit_accession> 
       <Hit_len>274</Hit_len> 
       <Hit_hsps> 
       <Hsp> 
        <Hsp_num>1</Hsp_num> 
        <Hsp_bit-score>130.386</Hsp_bit-score> 
        <Hsp_score>70</Hsp_score> 
        <Hsp_evalue>5.24249e-32</Hsp_evalue> 
        <Hsp_query-from>1</Hsp_query-from> 
        <Hsp_query-to>73</Hsp_query-to> 
        <Hsp_hit-from>87</Hsp_hit-from> 
        <Hsp_hit-to>14</Hsp_hit-to> 
        <Hsp_query-frame>1</Hsp_query-frame> 
        <Hsp_hit-frame>-1</Hsp_hit-frame> 
        <Hsp_identity>73</Hsp_identity> 
        <Hsp_positive>73</Hsp_positive> 
        <Hsp_gaps>1</Hsp_gaps> 
        <Hsp_align-len>74</Hsp_align-len> 
        <Hsp_qseq>TATATGATAATCATCGCAAGACCGGCAACAGGAT-CAATCTTAAGAAACTTTATTGCCAAATGTTTGAACGATC</Hsp_qseq> 
        <Hsp_hseq>TATATGATAATCATCGCAAGACCGGCAACAGGATTCAATCTTAAGAAACTTTATTGCCAAATGTTTGAACGATC</Hsp_hseq> 
        <Hsp_midline>|||||||||||||||||||||||||||||||||| |||||||||||||||||||||||||||||||||||||||</Hsp_midline> 
       </Hsp> 
       <Hsp> 
        <Hsp_num>2</Hsp_num> 
        <Hsp_bit-score>71.293</Hsp_bit-score> 
        <Hsp_score>38</Hsp_score> 
        <Hsp_evalue>3.22284e-14</Hsp_evalue> 
        <Hsp_query-from>1735</Hsp_query-from> 
        <Hsp_query-to>1783</Hsp_query-to> 
        <Hsp_hit-from>246</Hsp_hit-from> 
        <Hsp_hit-to>201</Hsp_hit-to> 
        <Hsp_query-frame>1</Hsp_query-frame> 
        <Hsp_hit-frame>-1</Hsp_hit-frame> 
        <Hsp_identity>46</Hsp_identity> 
        <Hsp_positive>46</Hsp_positive> 
        <Hsp_gaps>3</Hsp_gaps> 
        <Hsp_align-len>49</Hsp_align-len> 
        <Hsp_qseq>ACCGCGCGCGATAATTTATCCTAGTTTGCGCGCGCTAATATTTTGTTTT</Hsp_qseq> 
        <Hsp_hseq>ACCGCGCGCGATAATTTATCCTAGTTTGCGCGC--TA-TATTTTGTTTT</Hsp_hseq> 
        <Hsp_midline>||||||||||||||||||||||||||||||||| || |||||||||||</Hsp_midline> 
       </Hsp> 
       </Hit_hsps> 
      </Hit> 
      <Hit> 
       <Hit_num>19</Hit_num> 
       <Hit_id>G1</Hit_id> 
       <Hit_def>G1-def</Hit_def> 
       <Hit_accession>G1</Hit_accession> 
       <Hit_len>267</Hit_len> 
       <Hit_hsps> 
       <Hsp> 
        <Hsp_num>1</Hsp_num> 
        <Hsp_bit-score>130.386</Hsp_bit-score> 
        <Hsp_score>70</Hsp_score> 
        <Hsp_evalue>5.24249e-32</Hsp_evalue> 
        <Hsp_query-from>1</Hsp_query-from> 
        <Hsp_query-to>73</Hsp_query-to> 
        <Hsp_hit-from>80</Hsp_hit-from> 
        <Hsp_hit-to>7</Hsp_hit-to> 
        <Hsp_query-frame>1</Hsp_query-frame> 
        <Hsp_hit-frame>-1</Hsp_hit-frame> 
        <Hsp_identity>73</Hsp_identity> 
        <Hsp_positive>73</Hsp_positive> 
        <Hsp_gaps>1</Hsp_gaps> 
        <Hsp_align-len>74</Hsp_align-len> 
        <Hsp_qseq>TATATGATAATCATCGCAAGACCGGCAACAGGAT-CAATCTTAAGAAACTTTATTGCCAAATGTTTGAACGATC</Hsp_qseq> 
        <Hsp_hseq>TATATGATAATCATCGCAAGACCGGCAACAGGATTCAATCTTAAGAAACTTTATTGCCAAATGTTTGAACGATC</Hsp_hseq> 
        <Hsp_midline>|||||||||||||||||||||||||||||||||| |||||||||||||||||||||||||||||||||||||||</Hsp_midline> 
       </Hsp> 
       <Hsp> 
        <Hsp_num>2</Hsp_num> 
        <Hsp_bit-score>71.293</Hsp_bit-score> 
        <Hsp_score>38</Hsp_score> 
        <Hsp_evalue>3.22284e-14</Hsp_evalue> 
        <Hsp_query-from>1735</Hsp_query-from> 
        <Hsp_query-to>1783</Hsp_query-to> 
        <Hsp_hit-from>239</Hsp_hit-from> 
        <Hsp_hit-to>194</Hsp_hit-to> 
        <Hsp_query-frame>1</Hsp_query-frame> 
        <Hsp_hit-frame>-1</Hsp_hit-frame> 
        <Hsp_identity>46</Hsp_identity> 
        <Hsp_positive>46</Hsp_positive> 
        <Hsp_gaps>3</Hsp_gaps> 
        <Hsp_align-len>49</Hsp_align-len> 
        <Hsp_qseq>ACCGCGCGCGATAATTTATCCTAGTTTGCGCGCGCTAATATTTTGTTTT</Hsp_qseq> 
        <Hsp_hseq>ACCGCGCGCGATAATTTATCCTAGTTTGCGCGC--TA-TATTTTGTTTT</Hsp_hseq> 
        <Hsp_midline>||||||||||||||||||||||||||||||||| || |||||||||||</Hsp_midline> 
       </Hsp> 
       </Hit_hsps> 
      </Hit> 
      <Hit> 
       <Hit_num>1</Hit_num> 
       <Hit_id>C1</Hit_id> 
       <Hit_def>C1-def</Hit_def> 
       <Hit_accession>C1</Hit_accession> 
       <Hit_len>568</Hit_len> 
       <Hit_hsps> 
       <Hsp> 
        <Hsp_num>1</Hsp_num> 
        <Hsp_bit-score>1037.09</Hsp_bit-score> 
        <Hsp_score>561</Hsp_score> 
        <Hsp_evalue>0</Hsp_evalue> 
        <Hsp_query-from>74</Hsp_query-from> 
        <Hsp_query-to>639</Hsp_query-to> 
        <Hsp_hit-from>568</Hsp_hit-from> 
        <Hsp_hit-to>1</Hsp_hit-to> 
        <Hsp_query-frame>1</Hsp_query-frame> 
        <Hsp_hit-frame>-1</Hsp_hit-frame> 
        <Hsp_identity>566</Hsp_identity> 
        <Hsp_positive>566</Hsp_positive> 
        <Hsp_gaps>2</Hsp_gaps> 
        <Hsp_align-len>568</Hsp_align-len> 
        <Hsp_qseq>TGCAGGTCGACTCAGATCTGGGTAACTGGCCTAACTGGCCTTGGAGGAGCTGGCAACTCAAAATCCCTTTGCCAAAA-CCAACATCATGCCATCCACCATGCTTGTATCCAGCTGCGCGCAATGTACCCCGGGCTGTGTATCC-AAAGCCTCATGCAACCTAACAGATGGATCGTTTGGAAGGCCTATAACAGCAACCACAGACTTAAAACCTTGCGCCTCCATAGACTTAAGCAAATGTGTGTACAATGTGGATCCTAGGCCCAACCTTTGATGCCTATGTGACACGTAAACAGTACTCTCAACTGTCCAATCGTAAGCGTTCCTAGCCTTCCAGGGCCCAGCGTAAGCAATACCAGCCACAACACCCTCAACCTCAGCAACCAACCAAGGGTATCTATCTTGCAACCTCTCTAGATCATCAATCCACTCTTGTGGTGTTTGTGGCTCTGTCCTAAAGTTCACTGTAGACGTCTCAATGTAATGGTTAACGATATCACAAACCGCGGCCATATCAGCTGCTGTAGCTGGCCTAATCTCAACTGGTCTCCTCTCCGGAGACATGTCGA</Hsp_qseq> 
        <Hsp_hseq>TGCAGGTCGACTCAGATCTGGGTAACTGGCCTAACTGGCCTTGGAGGAGCTGGCAACTCAAAATCCCTTTGCCAAAAACCAACATCATGCCATCCACCATGCTTGTATCCAGCTGCGCGCAATGTACCCCGGGCTGTGTATCCCAAAGCCTCATGCAACCTAACAGATGGATCGTTTGGAAGGCCTATAACAGCAACCACAGACTTAAAACCTTGCGCCTCCATAGACTTAAGCAAATGTGTGTACAATGTGGATCCTAGGCCCAACCTTTGATGCCTATGTGACACGTAAACAGTACTCTCAACTGTCCAATCGTAAGCGTTCCTAGCCTTCCAGGGCCCAGCGTAAGCAATACCAGCCACAACACCCTCAACCTCAGCAACCAACCAAGGGTATCTATCTTGCAACCTCTCTAGATCATCAATCCACTCTTGTGGTGTTTGTGGCTCTGTCCTAAAGTTCACTGTAGACGTCTCAATGTAATGGTTAACGATATCACAAACCGCGGCCATATCAGCTGCTGTAGCTGGCCTAATCTCAACTGGTCTCCTCTCCGGAGACATGTCGA</Hsp_hseq> 
        <Hsp_midline>||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||| ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||| ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||</Hsp_midline> 
       </Hsp> 
       </Hit_hsps> 
      </Hit> 
      </Iteration_hits> 
      <Iteration_stat> 
      <Statistics> 
       <Statistics_db-num>78</Statistics_db-num> 
       <Statistics_db-len>54018</Statistics_db-len> 
       <Statistics_hsp-len>18</Statistics_hsp-len> 
       <Statistics_eff-space>93232008</Statistics_eff-space> 
       <Statistics_kappa>0.46</Statistics_kappa> 
       <Statistics_lambda>1.28</Statistics_lambda> 
       <Statistics_entropy>0.85</Statistics_entropy> 
      </Statistics> 
      </Iteration_stat> 
     </Iteration> 
     </BlastOutput_iterations> 
    </BlastOutput> 

Die resultierende Ausgabe sollte wie folgt aussehen:

<?xml version="1.0" encoding="UTF-8"?> 
<!DOCTYPE BlastOutput PUBLIC "-//NCBI//NCBI BlastOutput/EN" "http://www.ncbi.nlm.nih.gov/dtd/NCBI_BlastOutput.dtd"> 
<BlastOutput> 
    <BlastOutput_program>blastn</BlastOutput_program> 
    <BlastOutput_version>BLASTN 2.3.0+</BlastOutput_version> 
    <BlastOutput_reference>Zheng Zhang, Scott Schwartz, Lukas Wagner, and Webb Miller (2000), "A greedy algorithm for aligning DNA sequences", J Comput Biol 2000; 7(1-2):203-14.</BlastOutput_reference> 
    <BlastOutput_db>ABC</BlastOutput_db> 
    <BlastOutput_query-ID>Query_1</BlastOutput_query-ID> 
    <BlastOutput_query-def>m151221</BlastOutput_query-def> 
    <BlastOutput_query-len>1790</BlastOutput_query-len> 
    <BlastOutput_param> 
    <Parameters> 
     <Parameters_expect>0.001</Parameters_expect> 
     <Parameters_sc-match>1</Parameters_sc-match> 
     <Parameters_sc-mismatch>-2</Parameters_sc-mismatch> 
     <Parameters_gap-open>0</Parameters_gap-open> 
     <Parameters_gap-extend>0</Parameters_gap-extend> 
     <Parameters_filter>L;m;</Parameters_filter> 
    </Parameters> 
    </BlastOutput_param> 
    <BlastOutput_iterations> 
    <Iteration> 
     <Iteration_iter-num>1</Iteration_iter-num> 
     <Iteration_query-ID>Query_1</Iteration_query-ID> 
     <Iteration_query-def>m151221</Iteration_query-def> 
     <Iteration_query-len>1790</Iteration_query-len> 
     <Iteration_hits> 
     <Hit> 
      <Hit_num>14</Hit_num> 
      <Hit_id>A1</Hit_id> 
      <Hit_def>A1-def</Hit_def> 
      <Hit_accession>A1</Hit_accession> 
      <Hit_len>249</Hit_len> 
      <Hit_hsps> 
      <Hsp> 
       <Hsp_num>1</Hsp_num> 
       <Hsp_bit-score>130.386</Hsp_bit-score> 
       <Hsp_score>70</Hsp_score> 
       <Hsp_evalue>5.24249e-32</Hsp_evalue> 
       <Hsp_query-from>1</Hsp_query-from> 
       <Hsp_query-to>73</Hsp_query-to> 
       <Hsp_hit-from>74</Hsp_hit-from> 
       <Hsp_hit-to>1</Hsp_hit-to> 
       <Hsp_query-frame>1</Hsp_query-frame> 
       <Hsp_hit-frame>-1</Hsp_hit-frame> 
       <Hsp_identity>73</Hsp_identity> 
       <Hsp_positive>73</Hsp_positive> 
       <Hsp_gaps>1</Hsp_gaps> 
       <Hsp_align-len>74</Hsp_align-len> 
       <Hsp_qseq>TATATGATAATCATCGCAAGACCGGCAACAGGAT-CAATCTTAAGAAACTTTATTGCCAAATGTTTGAACGATC</Hsp_qseq> 
       <Hsp_hseq>TATATGATAATCATCGCAAGACCGGCAACAGGATTCAATCTTAAGAAACTTTATTGCCAAATGTTTGAACGATC</Hsp_hseq> 
       <Hsp_midline>|||||||||||||||||||||||||||||||||| |||||||||||||||||||||||||||||||||||||||</Hsp_midline> 
      </Hsp> 
      </Hit_hsps> 
     </Hit> 
     <Hit> 
      <Hit_num>16</Hit_num> 
      <Hit_id>B1</Hit_id> 
      <Hit_def>B1-def</Hit_def> 
      <Hit_accession>B1</Hit_accession> 
      <Hit_len>253</Hit_len> 
      <Hit_hsps> 
      <Hsp> 
       <Hsp_num>2</Hsp_num> 
       <Hsp_bit-score>71.293</Hsp_bit-score> 
       <Hsp_score>38</Hsp_score> 
       <Hsp_evalue>3.22284e-14</Hsp_evalue> 
       <Hsp_query-from>1735</Hsp_query-from> 
       <Hsp_query-to>1783</Hsp_query-to> 
       <Hsp_hit-from>233</Hsp_hit-from> 
       <Hsp_hit-to>188</Hsp_hit-to> 
       <Hsp_query-frame>1</Hsp_query-frame> 
       <Hsp_hit-frame>-1</Hsp_hit-frame> 
       <Hsp_identity>46</Hsp_identity> 
       <Hsp_positive>46</Hsp_positive> 
       <Hsp_gaps>3</Hsp_gaps> 
       <Hsp_align-len>49</Hsp_align-len> 
       <Hsp_qseq>ACCGCGCGCGATAATTTATCCTAGTTTGCGCGCGCTAATATTTTGTTTT</Hsp_qseq> 
       <Hsp_hseq>ACCGCGCGCGATAATTTATCCTAGTTTGCGCGC--TA-TATTTTGTTTT</Hsp_hseq> 
       <Hsp_midline>||||||||||||||||||||||||||||||||| || |||||||||||</Hsp_midline> 
      </Hsp> 
      </Hit_hsps> 
     </Hit> 
     <Hit> 
      <Hit_num>1</Hit_num> 
      <Hit_id>C1</Hit_id> 
      <Hit_def>C1-def</Hit_def> 
      <Hit_accession>C1</Hit_accession> 
      <Hit_len>568</Hit_len> 
      <Hit_hsps> 
      <Hsp> 
       <Hsp_num>1</Hsp_num> 
       <Hsp_bit-score>1037.09</Hsp_bit-score> 
       <Hsp_score>561</Hsp_score> 
       <Hsp_evalue>0</Hsp_evalue> 
       <Hsp_query-from>74</Hsp_query-from> 
       <Hsp_query-to>639</Hsp_query-to> 
       <Hsp_hit-from>568</Hsp_hit-from> 
       <Hsp_hit-to>1</Hsp_hit-to> 
       <Hsp_query-frame>1</Hsp_query-frame> 
       <Hsp_hit-frame>-1</Hsp_hit-frame> 
       <Hsp_identity>566</Hsp_identity> 
       <Hsp_positive>566</Hsp_positive> 
       <Hsp_gaps>2</Hsp_gaps> 
       <Hsp_align-len>568</Hsp_align-len> 
       <Hsp_qseq>TGCAGGTCGACTCAGATCTGGGTAACTGGCCTAACTGGCCTTGGAGGAGCTGGCAACTCAAAATCCCTTTGCCAAAA-CCAACATCATGCCATCCACCATGCTTGTATCCAGCTGCGCGCAATGTACCCCGGGCTGTGTATCC-AAAGCCTCATGCAACCTAACAGATGGATCGTTTGGAAGGCCTATAACAGCAACCACAGACTTAAAACCTTGCGCCTCCATAGACTTAAGCAAATGTGTGTACAATGTGGATCCTAGGCCCAACCTTTGATGCCTATGTGACACGTAAACAGTACTCTCAACTGTCCAATCGTAAGCGTTCCTAGCCTTCCAGGGCCCAGCGTAAGCAATACCAGCCACAACACCCTCAACCTCAGCAACCAACCAAGGGTATCTATCTTGCAACCTCTCTAGATCATCAATCCACTCTTGTGGTGTTTGTGGCTCTGTCCTAAAGTTCACTGTAGACGTCTCAATGTAATGGTTAACGATATCACAAACCGCGGCCATATCAGCTGCTGTAGCTGGCCTAATCTCAACTGGTCTCCTCTCCGGAGACATGTCGA</Hsp_qseq> 
       <Hsp_hseq>TGCAGGTCGACTCAGATCTGGGTAACTGGCCTAACTGGCCTTGGAGGAGCTGGCAACTCAAAATCCCTTTGCCAAAAACCAACATCATGCCATCCACCATGCTTGTATCCAGCTGCGCGCAATGTACCCCGGGCTGTGTATCCCAAAGCCTCATGCAACCTAACAGATGGATCGTTTGGAAGGCCTATAACAGCAACCACAGACTTAAAACCTTGCGCCTCCATAGACTTAAGCAAATGTGTGTACAATGTGGATCCTAGGCCCAACCTTTGATGCCTATGTGACACGTAAACAGTACTCTCAACTGTCCAATCGTAAGCGTTCCTAGCCTTCCAGGGCCCAGCGTAAGCAATACCAGCCACAACACCCTCAACCTCAGCAACCAACCAAGGGTATCTATCTTGCAACCTCTCTAGATCATCAATCCACTCTTGTGGTGTTTGTGGCTCTGTCCTAAAGTTCACTGTAGACGTCTCAATGTAATGGTTAACGATATCACAAACCGCGGCCATATCAGCTGCTGTAGCTGGCCTAATCTCAACTGGTCTCCTCTCCGGAGACATGTCGA</Hsp_hseq> 
       <Hsp_midline>||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||| ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||| ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||</Hsp_midline> 
      </Hsp> 
      </Hit_hsps> 
     </Hit> 
     </Iteration_hits> 
     <Iteration_stat> 
     <Statistics> 
      <Statistics_db-num>78</Statistics_db-num> 
      <Statistics_db-len>54018</Statistics_db-len> 
      <Statistics_hsp-len>18</Statistics_hsp-len> 
      <Statistics_eff-space>93232008</Statistics_eff-space> 
      <Statistics_kappa>0.46</Statistics_kappa> 
      <Statistics_lambda>1.28</Statistics_lambda> 
      <Statistics_entropy>0.85</Statistics_entropy> 
     </Statistics> 
     </Iteration_stat> 
    </Iteration> 
    </BlastOutput_iterations> 
</BlastOutput> 

Ich bitte um Hilfe. Mein bisheriges XSLT-Stylesheet:

<?xml version="1.0" encoding="UTF-8"?>
<!--
  Removes duplicate HSPs from NCBI BLAST XML output.

  Within each Iteration, Hsp elements are grouped by the pair
  (Hsp_num, Hsp_query-from). Only the first Hsp of every group, in document
  order, is kept. A Hit whose Hsp elements were all removed is dropped as
  well. Every other node is copied unchanged by the identity template.
-->
<xsl:stylesheet version="1.0"
                xmlns:xsl="http://www.w3.org/1999/XSL/Transform">

<xsl:strip-space elements="*"/>

<xsl:output method="xml" encoding="UTF-8" indent="yes"
            doctype-public="-//NCBI//NCBI BlastOutput/EN"
            doctype-system="http://www.ncbi.nlm.nih.gov/dtd/NCBI_BlastOutput.dtd"/>

<!--
  Grouping key (Muenchian grouping). generate-id() of the enclosing Iteration
  scopes the groups to one query, so identical coordinates in different
  iterations are not treated as duplicates. The '|' separators keep distinct
  value pairs such as (1, 11) and (11, 1) from producing the same key string.
-->
<xsl:key name="TOP_query_from"
         match="Iteration_hits/Hit/Hit_hsps/Hsp"
         use="concat(generate-id(ancestor::Iteration), '|', Hsp_num, '|', Hsp_query-from)"/>

<!-- Identity template: copies any node that no more specific rule handles. -->
<xsl:template match="@*|node()">
  <xsl:copy>
    <xsl:apply-templates select="@*|node()"/>
  </xsl:copy>
</xsl:template>

<!--
  Suppress every Hsp that is not the first member of its group. The empty
  template body means the matched element produces no output.
-->
<xsl:template match="Iteration_hits/Hit/Hit_hsps/Hsp[
    generate-id() != generate-id(key('TOP_query_from',
        concat(generate-id(ancestor::Iteration), '|', Hsp_num, '|', Hsp_query-from))[1])]"/>

<!--
  Suppress a Hit that has Hsp children but none of them is the first member
  of its group; otherwise an empty Hit_hsps shell would remain in the output.
  A Hit without any Hsp at all is left to the identity template.
-->
<xsl:template match="Iteration_hits/Hit[Hit_hsps/Hsp][not(Hit_hsps/Hsp[
    generate-id() = generate-id(key('TOP_query_from',
        concat(generate-id(ancestor::Iteration), '|', Hsp_num, '|', Hsp_query-from))[1])])]"/>

</xsl:stylesheet>

Antwort

0

Verketten Sie für den Schlüssel <Iteration_query-ID>, <Hsp_num> und <Hsp_query-from>, wie im folgenden Code gezeigt. Die Identitätstransformation wäre zwar prägnanter, doch für die Struktur Ihres XML bietet sich die längere Lösung an, das Schema explizit nachzubauen. Die Identitätstransformation zwingt Sie dazu, nicht benötigte Knoten zu entfernen, und Sie haben die zusätzliche Schwierigkeit, eines der <Hsp>-Geschwister zu behalten und die anderen nicht:

<?xml version="1.0" encoding="UTF-8"?>
<!--
  Keeps only the first Hsp, in document order, for each combination of
  Iteration_query-ID, Hsp_num and Hsp_query-from in NCBI BLAST XML output,
  and omits every Hit that is left without an Hsp.

  Children are dispatched by element name rather than by position, so
  optional elements of the BlastOutput DTD (for example BlastOutput_mbstat
  or Iteration_message) are copied through without duplicating or losing
  content.
-->
<xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
<xsl:output method="xml" encoding="UTF-8" indent="yes"
    doctype-public="-//NCBI//NCBI BlastOutput/EN"
    doctype-system="http://www.ncbi.nlm.nih.gov/dtd/NCBI_BlastOutput.dtd"/>
<xsl:strip-space elements="*"/>

<!--
  Grouping key. The '|' separators prevent collisions between different
  tuples whose plain concatenation is identical, e.g. Hsp_num 1 with
  Hsp_query-from 11 versus Hsp_num 11 with Hsp_query-from 1. Because the last
  two fields are plain integers, a '|' inside the query ID cannot make two
  different tuples look alike.
  NOTE(review): assumes Iteration_query-ID differs between iterations of the
  same file; confirm for the BLAST version in use.
-->
<xsl:key name="hspkey" match="Hsp"
     use="concat(ancestor::Iteration/Iteration_query-ID, '|', Hsp_num, '|', Hsp_query-from)"/>

<!-- Default rule: any element without a dedicated template is deep-copied. -->
<xsl:template match="*">
    <xsl:copy-of select="."/>
</xsl:template>

<!--
  Containers on the path from the root down to Hit: recreate the element and
  process its child elements in their original order, so that only the
  filtered descendants differ from the input.
-->
<xsl:template match="BlastOutput | BlastOutput_iterations | Iteration | Hit">
    <xsl:copy>
    <xsl:apply-templates select="*"/>
    </xsl:copy>
</xsl:template>

<!-- Keep only the Hit elements that own at least one first-in-group Hsp. -->
<xsl:template match="Iteration_hits">
    <xsl:copy>
    <xsl:apply-templates select="Hit[Hit_hsps/Hsp[generate-id(.) =
       generate-id(key('hspkey', concat(ancestor::Iteration/Iteration_query-ID, '|',
          Hsp_num, '|', Hsp_query-from))[1])]]"/>
    </xsl:copy>
</xsl:template>

<!-- Inside a surviving Hit, copy only the Hsp elements that are first in their group. -->
<xsl:template match="Hit_hsps">
    <xsl:copy>
    <xsl:copy-of select="Hsp[generate-id(.) =
       generate-id(key('hspkey', concat(ancestor::Iteration/Iteration_query-ID, '|',
          Hsp_num, '|', Hsp_query-from))[1])]"/>
    </xsl:copy>
</xsl:template>

</xsl:stylesheet>
+0

Es funktioniert perfekt bei XML-Dateien mit einer einzigen BLAST-Iteration (Abfrage), aber bei Multi-Query-Dateien scheint es fälschlicherweise einen oder mehrere Hits auszulassen. Das lässt sich beispielsweise nachvollziehen, indem man den Iterationsblock aus dem obigen Beispiel dupliziert, d. h. eine neue Abfrage (zum Beispiel query_2) hinzufügt. Irgendwelche Vorschläge? –

+0

Siehe das Update, in dem 'Iteration_query-ID' zum Gruppierungsschlüssel hinzugefügt wurde. – Parfait