Ich möchte doppelte Einträge aus einer langen BLAST-Ausgabedatei mit Hilfe des Feldes "Hsp_query-from" herausfiltern, dh meine XML-Ausgabedatei ist nach diesem Feld sortiert und ich möchte nur die auswählen erster Eintrag für jeden eindeutigen "Hsp_query-from" -Wert. Auch dies sollte für „Hsp_num“ 1, und separat für „Hsp_num“ 2. Mein Beispiel Eingabedatei sieht wie folgt geschehen:Filterduplikate von BLAST XML
<?xml version="1.0" encoding="UTF-8" ?>
<!DOCTYPE BlastOutput PUBLIC "-//NCBI//NCBI BlastOutput/EN" "http://www.ncbi.nlm.nih.gov/dtd/NCBI_BlastOutput.dtd">
<BlastOutput>
<BlastOutput_program>blastn</BlastOutput_program>
<BlastOutput_version>BLASTN 2.3.0+</BlastOutput_version>
<BlastOutput_reference>Zheng Zhang, Scott Schwartz, Lukas Wagner, and Webb Miller (2000), "A greedy algorithm for aligning DNA sequences", J Comput Biol 2000; 7(1-2):203-14.</BlastOutput_reference>
<BlastOutput_db>ABC</BlastOutput_db>
<BlastOutput_query-ID>Query_1</BlastOutput_query-ID>
<BlastOutput_query-def>m151221</BlastOutput_query-def>
<BlastOutput_query-len>1790</BlastOutput_query-len>
<BlastOutput_param>
<Parameters>
<Parameters_expect>0.001</Parameters_expect>
<Parameters_sc-match>1</Parameters_sc-match>
<Parameters_sc-mismatch>-2</Parameters_sc-mismatch>
<Parameters_gap-open>0</Parameters_gap-open>
<Parameters_gap-extend>0</Parameters_gap-extend>
<Parameters_filter>L;m;</Parameters_filter>
</Parameters>
</BlastOutput_param>
<BlastOutput_iterations>
<Iteration>
<Iteration_iter-num>1</Iteration_iter-num>
<Iteration_query-ID>Query_1</Iteration_query-ID>
<Iteration_query-def>m151221</Iteration_query-def>
<Iteration_query-len>1790</Iteration_query-len>
<Iteration_hits>
<Hit>
<Hit_num>14</Hit_num>
<Hit_id>A1</Hit_id>
<Hit_def>A1-def</Hit_def>
<Hit_accession>A1</Hit_accession>
<Hit_len>249</Hit_len>
<Hit_hsps>
<Hsp>
<Hsp_num>1</Hsp_num>
<Hsp_bit-score>130.386</Hsp_bit-score>
<Hsp_score>70</Hsp_score>
<Hsp_evalue>5.24249e-32</Hsp_evalue>
<Hsp_query-from>1</Hsp_query-from>
<Hsp_query-to>73</Hsp_query-to>
<Hsp_hit-from>74</Hsp_hit-from>
<Hsp_hit-to>1</Hsp_hit-to>
<Hsp_query-frame>1</Hsp_query-frame>
<Hsp_hit-frame>-1</Hsp_hit-frame>
<Hsp_identity>73</Hsp_identity>
<Hsp_positive>73</Hsp_positive>
<Hsp_gaps>1</Hsp_gaps>
<Hsp_align-len>74</Hsp_align-len>
<Hsp_qseq>TATATGATAATCATCGCAAGACCGGCAACAGGAT-CAATCTTAAGAAACTTTATTGCCAAATGTTTGAACGATC</Hsp_qseq>
<Hsp_hseq>TATATGATAATCATCGCAAGACCGGCAACAGGATTCAATCTTAAGAAACTTTATTGCCAAATGTTTGAACGATC</Hsp_hseq>
<Hsp_midline>|||||||||||||||||||||||||||||||||| |||||||||||||||||||||||||||||||||||||||</Hsp_midline>
</Hsp>
</Hit_hsps>
</Hit>
<Hit>
<Hit_num>15</Hit_num>
<Hit_id>D1</Hit_id>
<Hit_def>D1-def</Hit_def>
<Hit_accession>D1</Hit_accession>
<Hit_len>261</Hit_len>
<Hit_hsps>
<Hsp>
<Hsp_num>1</Hsp_num>
<Hsp_bit-score>130.386</Hsp_bit-score>
<Hsp_score>70</Hsp_score>
<Hsp_evalue>5.24249e-32</Hsp_evalue>
<Hsp_query-from>1</Hsp_query-from>
<Hsp_query-to>73</Hsp_query-to>
<Hsp_hit-from>80</Hsp_hit-from>
<Hsp_hit-to>7</Hsp_hit-to>
<Hsp_query-frame>1</Hsp_query-frame>
<Hsp_hit-frame>-1</Hsp_hit-frame>
<Hsp_identity>73</Hsp_identity>
<Hsp_positive>73</Hsp_positive>
<Hsp_gaps>1</Hsp_gaps>
<Hsp_align-len>74</Hsp_align-len>
<Hsp_qseq>TATATGATAATCATCGCAAGACCGGCAACAGGAT-CAATCTTAAGAAACTTTATTGCCAAATGTTTGAACGATC</Hsp_qseq>
<Hsp_hseq>TATATGATAATCATCGCAAGACCGGCAACAGGATTCAATCTTAAGAAACTTTATTGCCAAATGTTTGAACGATC</Hsp_hseq>
<Hsp_midline>|||||||||||||||||||||||||||||||||| |||||||||||||||||||||||||||||||||||||||</Hsp_midline>
</Hsp>
</Hit_hsps>
</Hit>
<Hit>
<Hit_num>16</Hit_num>
<Hit_id>B1</Hit_id>
<Hit_def>B1-def</Hit_def>
<Hit_accession>B1</Hit_accession>
<Hit_len>253</Hit_len>
<Hit_hsps>
<Hsp>
<Hsp_num>1</Hsp_num>
<Hsp_bit-score>130.386</Hsp_bit-score>
<Hsp_score>70</Hsp_score>
<Hsp_evalue>5.24249e-32</Hsp_evalue>
<Hsp_query-from>1</Hsp_query-from>
<Hsp_query-to>73</Hsp_query-to>
<Hsp_hit-from>74</Hsp_hit-from>
<Hsp_hit-to>1</Hsp_hit-to>
<Hsp_query-frame>1</Hsp_query-frame>
<Hsp_hit-frame>-1</Hsp_hit-frame>
<Hsp_identity>73</Hsp_identity>
<Hsp_positive>73</Hsp_positive>
<Hsp_gaps>1</Hsp_gaps>
<Hsp_align-len>74</Hsp_align-len>
<Hsp_qseq>TATATGATAATCATCGCAAGACCGGCAACAGGAT-CAATCTTAAGAAACTTTATTGCCAAATGTTTGAACGATC</Hsp_qseq>
<Hsp_hseq>TATATGATAATCATCGCAAGACCGGCAACAGGATTCAATCTTAAGAAACTTTATTGCCAAATGTTTGAACGATC</Hsp_hseq>
<Hsp_midline>|||||||||||||||||||||||||||||||||| |||||||||||||||||||||||||||||||||||||||</Hsp_midline>
</Hsp>
<Hsp>
<Hsp_num>2</Hsp_num>
<Hsp_bit-score>71.293</Hsp_bit-score>
<Hsp_score>38</Hsp_score>
<Hsp_evalue>3.22284e-14</Hsp_evalue>
<Hsp_query-from>1735</Hsp_query-from>
<Hsp_query-to>1783</Hsp_query-to>
<Hsp_hit-from>233</Hsp_hit-from>
<Hsp_hit-to>188</Hsp_hit-to>
<Hsp_query-frame>1</Hsp_query-frame>
<Hsp_hit-frame>-1</Hsp_hit-frame>
<Hsp_identity>46</Hsp_identity>
<Hsp_positive>46</Hsp_positive>
<Hsp_gaps>3</Hsp_gaps>
<Hsp_align-len>49</Hsp_align-len>
<Hsp_qseq>ACCGCGCGCGATAATTTATCCTAGTTTGCGCGCGCTAATATTTTGTTTT</Hsp_qseq>
<Hsp_hseq>ACCGCGCGCGATAATTTATCCTAGTTTGCGCGC--TA-TATTTTGTTTT</Hsp_hseq>
<Hsp_midline>||||||||||||||||||||||||||||||||| || |||||||||||</Hsp_midline>
</Hsp>
</Hit_hsps>
</Hit>
<Hit>
<Hit_num>17</Hit_num>
<Hit_id>E1</Hit_id>
<Hit_def>E1-def</Hit_def>
<Hit_accession>E1</Hit_accession>
<Hit_len>267</Hit_len>
<Hit_hsps>
<Hsp>
<Hsp_num>1</Hsp_num>
<Hsp_bit-score>130.386</Hsp_bit-score>
<Hsp_score>70</Hsp_score>
<Hsp_evalue>5.24249e-32</Hsp_evalue>
<Hsp_query-from>1</Hsp_query-from>
<Hsp_query-to>73</Hsp_query-to>
<Hsp_hit-from>81</Hsp_hit-from>
<Hsp_hit-to>8</Hsp_hit-to>
<Hsp_query-frame>1</Hsp_query-frame>
<Hsp_hit-frame>-1</Hsp_hit-frame>
<Hsp_identity>73</Hsp_identity>
<Hsp_positive>73</Hsp_positive>
<Hsp_gaps>1</Hsp_gaps>
<Hsp_align-len>74</Hsp_align-len>
<Hsp_qseq>TATATGATAATCATCGCAAGACCGGCAACAGGAT-CAATCTTAAGAAACTTTATTGCCAAATGTTTGAACGATC</Hsp_qseq>
<Hsp_hseq>TATATGATAATCATCGCAAGACCGGCAACAGGATTCAATCTTAAGAAACTTTATTGCCAAATGTTTGAACGATC</Hsp_hseq>
<Hsp_midline>|||||||||||||||||||||||||||||||||| |||||||||||||||||||||||||||||||||||||||</Hsp_midline>
</Hsp>
<Hsp>
<Hsp_num>2</Hsp_num>
<Hsp_bit-score>71.293</Hsp_bit-score>
<Hsp_score>38</Hsp_score>
<Hsp_evalue>3.22284e-14</Hsp_evalue>
<Hsp_query-from>1735</Hsp_query-from>
<Hsp_query-to>1783</Hsp_query-to>
<Hsp_hit-from>240</Hsp_hit-from>
<Hsp_hit-to>195</Hsp_hit-to>
<Hsp_query-frame>1</Hsp_query-frame>
<Hsp_hit-frame>-1</Hsp_hit-frame>
<Hsp_identity>46</Hsp_identity>
<Hsp_positive>46</Hsp_positive>
<Hsp_gaps>3</Hsp_gaps>
<Hsp_align-len>49</Hsp_align-len>
<Hsp_qseq>ACCGCGCGCGATAATTTATCCTAGTTTGCGCGCGCTAATATTTTGTTTT</Hsp_qseq>
<Hsp_hseq>ACCGCGCGCGATAATTTATCCTAGTTTGCGCGC--TA-TATTTTGTTTT</Hsp_hseq>
<Hsp_midline>||||||||||||||||||||||||||||||||| || |||||||||||</Hsp_midline>
</Hsp>
</Hit_hsps>
</Hit>
<Hit>
<Hit_num>18</Hit_num>
<Hit_id>F1</Hit_id>
<Hit_def>F1-def</Hit_def>
<Hit_accession>F1</Hit_accession>
<Hit_len>274</Hit_len>
<Hit_hsps>
<Hsp>
<Hsp_num>1</Hsp_num>
<Hsp_bit-score>130.386</Hsp_bit-score>
<Hsp_score>70</Hsp_score>
<Hsp_evalue>5.24249e-32</Hsp_evalue>
<Hsp_query-from>1</Hsp_query-from>
<Hsp_query-to>73</Hsp_query-to>
<Hsp_hit-from>87</Hsp_hit-from>
<Hsp_hit-to>14</Hsp_hit-to>
<Hsp_query-frame>1</Hsp_query-frame>
<Hsp_hit-frame>-1</Hsp_hit-frame>
<Hsp_identity>73</Hsp_identity>
<Hsp_positive>73</Hsp_positive>
<Hsp_gaps>1</Hsp_gaps>
<Hsp_align-len>74</Hsp_align-len>
<Hsp_qseq>TATATGATAATCATCGCAAGACCGGCAACAGGAT-CAATCTTAAGAAACTTTATTGCCAAATGTTTGAACGATC</Hsp_qseq>
<Hsp_hseq>TATATGATAATCATCGCAAGACCGGCAACAGGATTCAATCTTAAGAAACTTTATTGCCAAATGTTTGAACGATC</Hsp_hseq>
<Hsp_midline>|||||||||||||||||||||||||||||||||| |||||||||||||||||||||||||||||||||||||||</Hsp_midline>
</Hsp>
<Hsp>
<Hsp_num>2</Hsp_num>
<Hsp_bit-score>71.293</Hsp_bit-score>
<Hsp_score>38</Hsp_score>
<Hsp_evalue>3.22284e-14</Hsp_evalue>
<Hsp_query-from>1735</Hsp_query-from>
<Hsp_query-to>1783</Hsp_query-to>
<Hsp_hit-from>246</Hsp_hit-from>
<Hsp_hit-to>201</Hsp_hit-to>
<Hsp_query-frame>1</Hsp_query-frame>
<Hsp_hit-frame>-1</Hsp_hit-frame>
<Hsp_identity>46</Hsp_identity>
<Hsp_positive>46</Hsp_positive>
<Hsp_gaps>3</Hsp_gaps>
<Hsp_align-len>49</Hsp_align-len>
<Hsp_qseq>ACCGCGCGCGATAATTTATCCTAGTTTGCGCGCGCTAATATTTTGTTTT</Hsp_qseq>
<Hsp_hseq>ACCGCGCGCGATAATTTATCCTAGTTTGCGCGC--TA-TATTTTGTTTT</Hsp_hseq>
<Hsp_midline>||||||||||||||||||||||||||||||||| || |||||||||||</Hsp_midline>
</Hsp>
</Hit_hsps>
</Hit>
<Hit>
<Hit_num>19</Hit_num>
<Hit_id>G1</Hit_id>
<Hit_def>G1-def</Hit_def>
<Hit_accession>G1</Hit_accession>
<Hit_len>267</Hit_len>
<Hit_hsps>
<Hsp>
<Hsp_num>1</Hsp_num>
<Hsp_bit-score>130.386</Hsp_bit-score>
<Hsp_score>70</Hsp_score>
<Hsp_evalue>5.24249e-32</Hsp_evalue>
<Hsp_query-from>1</Hsp_query-from>
<Hsp_query-to>73</Hsp_query-to>
<Hsp_hit-from>80</Hsp_hit-from>
<Hsp_hit-to>7</Hsp_hit-to>
<Hsp_query-frame>1</Hsp_query-frame>
<Hsp_hit-frame>-1</Hsp_hit-frame>
<Hsp_identity>73</Hsp_identity>
<Hsp_positive>73</Hsp_positive>
<Hsp_gaps>1</Hsp_gaps>
<Hsp_align-len>74</Hsp_align-len>
<Hsp_qseq>TATATGATAATCATCGCAAGACCGGCAACAGGAT-CAATCTTAAGAAACTTTATTGCCAAATGTTTGAACGATC</Hsp_qseq>
<Hsp_hseq>TATATGATAATCATCGCAAGACCGGCAACAGGATTCAATCTTAAGAAACTTTATTGCCAAATGTTTGAACGATC</Hsp_hseq>
<Hsp_midline>|||||||||||||||||||||||||||||||||| |||||||||||||||||||||||||||||||||||||||</Hsp_midline>
</Hsp>
<Hsp>
<Hsp_num>2</Hsp_num>
<Hsp_bit-score>71.293</Hsp_bit-score>
<Hsp_score>38</Hsp_score>
<Hsp_evalue>3.22284e-14</Hsp_evalue>
<Hsp_query-from>1735</Hsp_query-from>
<Hsp_query-to>1783</Hsp_query-to>
<Hsp_hit-from>239</Hsp_hit-from>
<Hsp_hit-to>194</Hsp_hit-to>
<Hsp_query-frame>1</Hsp_query-frame>
<Hsp_hit-frame>-1</Hsp_hit-frame>
<Hsp_identity>46</Hsp_identity>
<Hsp_positive>46</Hsp_positive>
<Hsp_gaps>3</Hsp_gaps>
<Hsp_align-len>49</Hsp_align-len>
<Hsp_qseq>ACCGCGCGCGATAATTTATCCTAGTTTGCGCGCGCTAATATTTTGTTTT</Hsp_qseq>
<Hsp_hseq>ACCGCGCGCGATAATTTATCCTAGTTTGCGCGC--TA-TATTTTGTTTT</Hsp_hseq>
<Hsp_midline>||||||||||||||||||||||||||||||||| || |||||||||||</Hsp_midline>
</Hsp>
</Hit_hsps>
</Hit>
<Hit>
<Hit_num>1</Hit_num>
<Hit_id>C1</Hit_id>
<Hit_def>C1-def</Hit_def>
<Hit_accession>C1</Hit_accession>
<Hit_len>568</Hit_len>
<Hit_hsps>
<Hsp>
<Hsp_num>1</Hsp_num>
<Hsp_bit-score>1037.09</Hsp_bit-score>
<Hsp_score>561</Hsp_score>
<Hsp_evalue>0</Hsp_evalue>
<Hsp_query-from>74</Hsp_query-from>
<Hsp_query-to>639</Hsp_query-to>
<Hsp_hit-from>568</Hsp_hit-from>
<Hsp_hit-to>1</Hsp_hit-to>
<Hsp_query-frame>1</Hsp_query-frame>
<Hsp_hit-frame>-1</Hsp_hit-frame>
<Hsp_identity>566</Hsp_identity>
<Hsp_positive>566</Hsp_positive>
<Hsp_gaps>2</Hsp_gaps>
<Hsp_align-len>568</Hsp_align-len>
<Hsp_qseq>TGCAGGTCGACTCAGATCTGGGTAACTGGCCTAACTGGCCTTGGAGGAGCTGGCAACTCAAAATCCCTTTGCCAAAA-CCAACATCATGCCATCCACCATGCTTGTATCCAGCTGCGCGCAATGTACCCCGGGCTGTGTATCC-AAAGCCTCATGCAACCTAACAGATGGATCGTTTGGAAGGCCTATAACAGCAACCACAGACTTAAAACCTTGCGCCTCCATAGACTTAAGCAAATGTGTGTACAATGTGGATCCTAGGCCCAACCTTTGATGCCTATGTGACACGTAAACAGTACTCTCAACTGTCCAATCGTAAGCGTTCCTAGCCTTCCAGGGCCCAGCGTAAGCAATACCAGCCACAACACCCTCAACCTCAGCAACCAACCAAGGGTATCTATCTTGCAACCTCTCTAGATCATCAATCCACTCTTGTGGTGTTTGTGGCTCTGTCCTAAAGTTCACTGTAGACGTCTCAATGTAATGGTTAACGATATCACAAACCGCGGCCATATCAGCTGCTGTAGCTGGCCTAATCTCAACTGGTCTCCTCTCCGGAGACATGTCGA</Hsp_qseq>
<Hsp_hseq>TGCAGGTCGACTCAGATCTGGGTAACTGGCCTAACTGGCCTTGGAGGAGCTGGCAACTCAAAATCCCTTTGCCAAAAACCAACATCATGCCATCCACCATGCTTGTATCCAGCTGCGCGCAATGTACCCCGGGCTGTGTATCCCAAAGCCTCATGCAACCTAACAGATGGATCGTTTGGAAGGCCTATAACAGCAACCACAGACTTAAAACCTTGCGCCTCCATAGACTTAAGCAAATGTGTGTACAATGTGGATCCTAGGCCCAACCTTTGATGCCTATGTGACACGTAAACAGTACTCTCAACTGTCCAATCGTAAGCGTTCCTAGCCTTCCAGGGCCCAGCGTAAGCAATACCAGCCACAACACCCTCAACCTCAGCAACCAACCAAGGGTATCTATCTTGCAACCTCTCTAGATCATCAATCCACTCTTGTGGTGTTTGTGGCTCTGTCCTAAAGTTCACTGTAGACGTCTCAATGTAATGGTTAACGATATCACAAACCGCGGCCATATCAGCTGCTGTAGCTGGCCTAATCTCAACTGGTCTCCTCTCCGGAGACATGTCGA</Hsp_hseq>
<Hsp_midline>||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||| ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||| ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||</Hsp_midline>
</Hsp>
</Hit_hsps>
</Hit>
</Iteration_hits>
<Iteration_stat>
<Statistics>
<Statistics_db-num>78</Statistics_db-num>
<Statistics_db-len>54018</Statistics_db-len>
<Statistics_hsp-len>18</Statistics_hsp-len>
<Statistics_eff-space>93232008</Statistics_eff-space>
<Statistics_kappa>0.46</Statistics_kappa>
<Statistics_lambda>1.28</Statistics_lambda>
<Statistics_entropy>0.85</Statistics_entropy>
</Statistics>
</Iteration_stat>
</Iteration>
</BlastOutput_iterations>
</BlastOutput>
Die resultierende Ausgabe sein sollte:
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE BlastOutput PUBLIC "-//NCBI//NCBI BlastOutput/EN" "http://www.ncbi.nlm.nih.gov/dtd/NCBI_BlastOutput.dtd">
<BlastOutput>
<BlastOutput_program>blastn</BlastOutput_program>
<BlastOutput_version>BLASTN 2.3.0+</BlastOutput_version>
<BlastOutput_reference>Zheng Zhang, Scott Schwartz, Lukas Wagner, and Webb Miller (2000), "A greedy algorithm for aligning DNA sequences", J Comput Biol 2000; 7(1-2):203-14.</BlastOutput_reference>
<BlastOutput_db>ABC</BlastOutput_db>
<BlastOutput_query-ID>Query_1</BlastOutput_query-ID>
<BlastOutput_query-def>m151221</BlastOutput_query-def>
<BlastOutput_query-len>1790</BlastOutput_query-len>
<BlastOutput_param>
<Parameters>
<Parameters_expect>0.001</Parameters_expect>
<Parameters_sc-match>1</Parameters_sc-match>
<Parameters_sc-mismatch>-2</Parameters_sc-mismatch>
<Parameters_gap-open>0</Parameters_gap-open>
<Parameters_gap-extend>0</Parameters_gap-extend>
<Parameters_filter>L;m;</Parameters_filter>
</Parameters>
</BlastOutput_param>
<BlastOutput_iterations>
<Iteration>
<Iteration_iter-num>1</Iteration_iter-num>
<Iteration_query-ID>Query_1</Iteration_query-ID>
<Iteration_query-def>m151221</Iteration_query-def>
<Iteration_query-len>1790</Iteration_query-len>
<Iteration_hits>
<Hit>
<Hit_num>14</Hit_num>
<Hit_id>A1</Hit_id>
<Hit_def>A1-def</Hit_def>
<Hit_accession>A1</Hit_accession>
<Hit_len>249</Hit_len>
<Hit_hsps>
<Hsp>
<Hsp_num>1</Hsp_num>
<Hsp_bit-score>130.386</Hsp_bit-score>
<Hsp_score>70</Hsp_score>
<Hsp_evalue>5.24249e-32</Hsp_evalue>
<Hsp_query-from>1</Hsp_query-from>
<Hsp_query-to>73</Hsp_query-to>
<Hsp_hit-from>74</Hsp_hit-from>
<Hsp_hit-to>1</Hsp_hit-to>
<Hsp_query-frame>1</Hsp_query-frame>
<Hsp_hit-frame>-1</Hsp_hit-frame>
<Hsp_identity>73</Hsp_identity>
<Hsp_positive>73</Hsp_positive>
<Hsp_gaps>1</Hsp_gaps>
<Hsp_align-len>74</Hsp_align-len>
<Hsp_qseq>TATATGATAATCATCGCAAGACCGGCAACAGGAT-CAATCTTAAGAAACTTTATTGCCAAATGTTTGAACGATC</Hsp_qseq>
<Hsp_hseq>TATATGATAATCATCGCAAGACCGGCAACAGGATTCAATCTTAAGAAACTTTATTGCCAAATGTTTGAACGATC</Hsp_hseq>
<Hsp_midline>|||||||||||||||||||||||||||||||||| |||||||||||||||||||||||||||||||||||||||</Hsp_midline>
</Hsp>
</Hit_hsps>
</Hit>
<Hit>
<Hit_num>16</Hit_num>
<Hit_id>B1</Hit_id>
<Hit_def>B1-def</Hit_def>
<Hit_accession>B1</Hit_accession>
<Hit_len>253</Hit_len>
<Hit_hsps>
<Hsp>
<Hsp_num>2</Hsp_num>
<Hsp_bit-score>71.293</Hsp_bit-score>
<Hsp_score>38</Hsp_score>
<Hsp_evalue>3.22284e-14</Hsp_evalue>
<Hsp_query-from>1735</Hsp_query-from>
<Hsp_query-to>1783</Hsp_query-to>
<Hsp_hit-from>233</Hsp_hit-from>
<Hsp_hit-to>188</Hsp_hit-to>
<Hsp_query-frame>1</Hsp_query-frame>
<Hsp_hit-frame>-1</Hsp_hit-frame>
<Hsp_identity>46</Hsp_identity>
<Hsp_positive>46</Hsp_positive>
<Hsp_gaps>3</Hsp_gaps>
<Hsp_align-len>49</Hsp_align-len>
<Hsp_qseq>ACCGCGCGCGATAATTTATCCTAGTTTGCGCGCGCTAATATTTTGTTTT</Hsp_qseq>
<Hsp_hseq>ACCGCGCGCGATAATTTATCCTAGTTTGCGCGC--TA-TATTTTGTTTT</Hsp_hseq>
<Hsp_midline>||||||||||||||||||||||||||||||||| || |||||||||||</Hsp_midline>
</Hsp>
</Hit_hsps>
</Hit>
<Hit>
<Hit_num>1</Hit_num>
<Hit_id>C1</Hit_id>
<Hit_def>C1-def</Hit_def>
<Hit_accession>C1</Hit_accession>
<Hit_len>568</Hit_len>
<Hit_hsps>
<Hsp>
<Hsp_num>1</Hsp_num>
<Hsp_bit-score>1037.09</Hsp_bit-score>
<Hsp_score>561</Hsp_score>
<Hsp_evalue>0</Hsp_evalue>
<Hsp_query-from>74</Hsp_query-from>
<Hsp_query-to>639</Hsp_query-to>
<Hsp_hit-from>568</Hsp_hit-from>
<Hsp_hit-to>1</Hsp_hit-to>
<Hsp_query-frame>1</Hsp_query-frame>
<Hsp_hit-frame>-1</Hsp_hit-frame>
<Hsp_identity>566</Hsp_identity>
<Hsp_positive>566</Hsp_positive>
<Hsp_gaps>2</Hsp_gaps>
<Hsp_align-len>568</Hsp_align-len>
<Hsp_qseq>TGCAGGTCGACTCAGATCTGGGTAACTGGCCTAACTGGCCTTGGAGGAGCTGGCAACTCAAAATCCCTTTGCCAAAA-CCAACATCATGCCATCCACCATGCTTGTATCCAGCTGCGCGCAATGTACCCCGGGCTGTGTATCC-AAAGCCTCATGCAACCTAACAGATGGATCGTTTGGAAGGCCTATAACAGCAACCACAGACTTAAAACCTTGCGCCTCCATAGACTTAAGCAAATGTGTGTACAATGTGGATCCTAGGCCCAACCTTTGATGCCTATGTGACACGTAAACAGTACTCTCAACTGTCCAATCGTAAGCGTTCCTAGCCTTCCAGGGCCCAGCGTAAGCAATACCAGCCACAACACCCTCAACCTCAGCAACCAACCAAGGGTATCTATCTTGCAACCTCTCTAGATCATCAATCCACTCTTGTGGTGTTTGTGGCTCTGTCCTAAAGTTCACTGTAGACGTCTCAATGTAATGGTTAACGATATCACAAACCGCGGCCATATCAGCTGCTGTAGCTGGCCTAATCTCAACTGGTCTCCTCTCCGGAGACATGTCGA</Hsp_qseq>
<Hsp_hseq>TGCAGGTCGACTCAGATCTGGGTAACTGGCCTAACTGGCCTTGGAGGAGCTGGCAACTCAAAATCCCTTTGCCAAAAACCAACATCATGCCATCCACCATGCTTGTATCCAGCTGCGCGCAATGTACCCCGGGCTGTGTATCCCAAAGCCTCATGCAACCTAACAGATGGATCGTTTGGAAGGCCTATAACAGCAACCACAGACTTAAAACCTTGCGCCTCCATAGACTTAAGCAAATGTGTGTACAATGTGGATCCTAGGCCCAACCTTTGATGCCTATGTGACACGTAAACAGTACTCTCAACTGTCCAATCGTAAGCGTTCCTAGCCTTCCAGGGCCCAGCGTAAGCAATACCAGCCACAACACCCTCAACCTCAGCAACCAACCAAGGGTATCTATCTTGCAACCTCTCTAGATCATCAATCCACTCTTGTGGTGTTTGTGGCTCTGTCCTAAAGTTCACTGTAGACGTCTCAATGTAATGGTTAACGATATCACAAACCGCGGCCATATCAGCTGCTGTAGCTGGCCTAATCTCAACTGGTCTCCTCTCCGGAGACATGTCGA</Hsp_hseq>
<Hsp_midline>||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||| ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||| ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||</Hsp_midline>
</Hsp>
</Hit_hsps>
</Hit>
</Iteration_hits>
<Iteration_stat>
<Statistics>
<Statistics_db-num>78</Statistics_db-num>
<Statistics_db-len>54018</Statistics_db-len>
<Statistics_hsp-len>18</Statistics_hsp-len>
<Statistics_eff-space>93232008</Statistics_eff-space>
<Statistics_kappa>0.46</Statistics_kappa>
<Statistics_lambda>1.28</Statistics_lambda>
<Statistics_entropy>0.85</Statistics_entropy>
</Statistics>
</Iteration_stat>
</Iteration>
</BlastOutput_iterations>
</BlastOutput>
Bitte helfen
<?xml version="1.0" encoding="UTF-8"?>
<xsl:stylesheet
version="1.0"
xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
>
<xsl:strip-space elements="*"/>
<xsl:output method="xml" encoding="UTF-8" indent="yes" doctype-public="-//NCBI//NCBI BlastOutput/EN" doctype-system="http://www.ncbi.nlm.nih.gov/dtd/NCBI_BlastOutput.dtd"/>
<!-- Identity template -->
<xsl:template match="@*|node()">
<xsl:copy>
<xsl:apply-templates select="@*|node()"/>
</xsl:copy>
</xsl:template>
<xsl:key name="TOP_query_from" match="Iteration_hits/Hit/Hit_hsps/Hsp" use="Hsp_query-from"/>
<xsl:template match="Iteration_hits/Hit/">
<xsl:copy>
<xsl:apply-templates select="*[generate-id(.) = generate-id(key ('TOP_query_from', Hsp_query-from))]"/>
</xsl:copy>
</xsl:template>
</xsl:stylesheet>
Es funktioniert perfekt auf XML-Dateien mit einer einzigen Explosion Iteration (Abfrage), aber Bei Multi-Query-Dateien scheint es fälschlicherweise ein oder mehrere HITs auszulassen. Dies kann beispielsweise durch Duplizieren des Iterationsblocks aus dem obigen Beispiel, z. Hinzufügen einer neuen Abfrage (zum Beispiel query_2). Irgendwelche Vorschläge? –
Siehe update, wo 'Iteration_query-ID' zum Schlüssel zum Gruppieren hinzugefügt wird. – Parfait