2017-01-09 2 views
-1

Ich habe ein Programm in Python 3, das Dateien mit gleichem Namen in den zwei Ordnern „gold“ und „prediction“ liest und vergleicht. Beim Dekodieren tritt ein Fehler auf (das problematische Zeichen erscheint in ANSI als „â€“).

Es wird aber der folgende Fehler erzeugt, obwohl meine Dateien im UTF-8-Format sind. Die Bytes, die den Fehler erzeugen, sind \xE2\x80 (in ANSI als „â€“ dargestellt):

Traceback (most recent call last): 
    File "C:\scienceie2017_train\test.py", line 215, in <module> 
    calculateMeasures(folder_gold, folder_pred, remove_anno) 
    File "C:\scienceie2017_train\test.py", line 34, in calculateMeasures 
    res_full_pred, res_pred, spans_pred, rels_pred = normaliseAnnotations(f_pred, remove_anno) 
    File "C:\scienceie2017_train\test.py", line 132, in normaliseAnnotations 
    for l in file_anno: 
    File "C:\Users\chedi\Anaconda3\lib\codecs.py", line 321, in decode 
    (result, consumed) = self._buffer_decode(data, self.errors, final) 
UnicodeDecodeError: 'utf-8' codec can't decode bytes in position 915-916: invalid continuation byte 

der Code:

#!/usr/bin/python 
# by Mattew Peters, who spotted that sklearn does macro averaging not 
# micro averaging correctly and changed it 

import os 
from sklearn.metrics import precision_recall_fscore_support 
import sys 


def calculateMeasures(folder_gold="data/dev/", folder_pred="data_pred/dev/", remove_anno=""):
    '''
    Calculate P, R, F1, Macro F for keyphrase annotations.

    :param folder_gold: folder containing gold standard .ann files
    :param folder_pred: folder containing prediction .ann files
    :param remove_anno: if set to "rel", relations will be ignored. Use this setting to only
        evaluate keyphrase boundary recognition and keyphrase classification.
        If set to "types", only keyphrase boundary recognition is evaluated.
    :return: dict mapping each observed label (plus 'overall') to a dict with
        'precision', 'recall', 'f1-score' and 'support'
    '''
    res_all_gold = []
    res_all_pred = []
    targets = []

    for f in os.listdir(folder_gold):
        # Ignore non-.ann files, should there be any.
        if not str(f).endswith(".ann"):
            continue

        try:
            # Annotation files are expected to be UTF-8 encoded; `with` guarantees
            # the handle is closed even if decoding fails partway through.
            with open(os.path.join(folder_pred, f), "r", encoding="utf-8") as f_pred:
                res_full_pred, res_pred, spans_pred, rels_pred = normaliseAnnotations(f_pred, remove_anno)
        except IOError:
            print(f + " file missing in " + folder_pred + ". Assuming no predictions are available for this file.")
            res_full_pred, res_pred, spans_pred, rels_pred = [], [], [], []

        with open(os.path.join(folder_gold, f), "r", encoding="utf-8") as f_gold:
            res_full_gold, res_gold, spans_gold, rels_gold = normaliseAnnotations(f_gold, remove_anno)

        # Union of spans seen on either side; a span present on only one side
        # contributes a "NONE" label on the other, penalising precision/recall.
        spans_all = set(spans_gold + spans_pred)

        for r in spans_all:
            if r in spans_gold:
                # First token of the annotation string is the keyphrase type.
                target = res_gold[spans_gold.index(r)].split(" ")[0]
                res_all_gold.append(target)
                if target not in targets:
                    targets.append(target)
            else:
                res_all_gold.append("NONE")

            if r in spans_pred:
                target_pred = res_pred[spans_pred.index(r)].split(" ")[0]
                res_all_pred.append(target_pred)
            else:
                res_all_pred.append("NONE")

    # Per-class measures over all accumulated (y_true, y_pred) pairs.
    prec, recall, f1, support = precision_recall_fscore_support(
        res_all_gold, res_all_pred, labels=targets, average=None)
    metrics = {}
    for k, target in enumerate(targets):
        metrics[target] = {
            'precision': prec[k],
            'recall': recall[k],
            'f1-score': f1[k],
            'support': support[k]
        }

    if remove_anno != 'types':
        # Micro-averaged overall scores.
        prec, recall, f1, s = precision_recall_fscore_support(
            res_all_gold, res_all_pred, labels=targets, average='micro')
        metrics['overall'] = {
            'precision': prec,
            'recall': recall,
            'f1-score': f1,
            'support': sum(support)
        }
    else:
        # Just binary classification (keyphrase vs. not), nothing to average.
        metrics['overall'] = metrics['KEYPHRASE-NOTYPES']

    print_report(metrics, targets)
    return metrics

def print_report(metrics, targets, digits=2):
    """Pretty-print a per-label precision/recall/F1/support table, then an overall row."""

    columns = ['precision', 'recall', 'f1-score', 'support']
    fmt = '%11s' + '%9s' * 4 + '\n'

    def format_row(results, label):
        # Float columns get fixed-point formatting; support is printed verbatim.
        cells = [label]
        cells.extend("{0:0.{1}f}".format(results[c], digits) for c in columns[:-1])
        cells.append("%s" % results[columns[-1]])
        return cells

    lines = [fmt % tuple([''] + columns), '\n']
    for label in targets:
        lines.append(fmt % tuple(format_row(metrics[label], label)))
    lines.append('\n')

    # Overall (micro-averaged) summary row.
    lines.append(fmt % tuple(format_row(metrics['overall'], 'avg/total')))
    lines.append('\n')

    print(''.join(lines))

def normaliseAnnotations(file_anno, remove_anno):
    '''
    Parse annotations from a brat-style .ann file: remove relations (if requested)
    and convert relation argument IDs to the corresponding entity spans.
    :param file_anno: open file handle iterated line by line (tab-separated .ann format)
    :param remove_anno: "" keeps everything; any other value drops "-of" relation lines;
        "types" additionally collapses all entity labels to "KEYPHRASE-NOTYPES"
    :return: (res_full_anno_new, res_anno_new, spans_anno_new, rels_anno)
    '''
    res_full_anno = []  # raw annotation lines kept after filtering
    res_anno = []       # per-annotation label string (parallel to res_full_anno)
    spans_anno = []     # "start end" span per annotation (parallel to res_full_anno)
    rels_anno = []      # relation entries only

    for l in file_anno:
     # Debug output left in by the original author.
     print(l)
     print(l.strip('\n'))
     r_g = l.strip('\n').split("\t")
     print(r_g)
     print(len(r_g))
     r_g_offs = r_g[1].split()
     print(r_g_offs)
     # Skip relation lines ("Hyponym-of", "Synonym-of", ...) when any filtering is on.
     if remove_anno != "" and r_g_offs[0].endswith("-of"):
      continue

     res_full_anno.append(l.strip())

     if r_g_offs[0].endswith("-of"):
      # Relation line: resolve Arg1/Arg2 entity IDs to their "Type start end" fields.
      arg1 = r_g_offs[1].replace("Arg1:", "")
      arg2 = r_g_offs[2].replace("Arg2:", "")
      # NOTE(review): this inner loop rebinds the outer loop variable `l` (harmless
      # for the file iterator, but fragile), and `ent1`/`ent2` stay unbound — causing
      # a NameError below — if an argument ID refers to an entity not seen yet;
      # relations are assumed to appear after both of their entities.
      for l in res_full_anno:
       r_g_tmp = l.strip().split("\t")
       if r_g_tmp[0] == arg1:
        ent1 = r_g_tmp[1].replace(" ", "_")
       if r_g_tmp[0] == arg2:
        ent2 = r_g_tmp[1].replace(" ", "_")

      spans_anno.append(" ".join([ent1, ent2]))
      res_anno.append(" ".join([r_g_offs[0], ent1, ent2]))
      rels_anno.append(" ".join([r_g_offs[0], ent1, ent2]))

     else:
      # Entity line: record its "start end" character span; the label keeps the full
      # "Type start end" field (first token = type) unless types are collapsed.
      spans_anno.append(" ".join([r_g_offs[1], r_g_offs[2]]))
      keytype = r_g[1]
      if remove_anno == "types":
       keytype = "KEYPHRASE-NOTYPES"
      res_anno.append(keytype)

    for r in rels_anno:
     r_offs = r.split(" ")
     # Reorder synonym arguments so the entity with the smaller start offset comes first.
     # NOTE(review): the rebinding `r = ...` below changes only the loop variable, not
     # the entry inside rels_anno, and the offsets are compared as strings rather than
     # ints — both look like bugs, but behavior is preserved here as-is.
     if r_offs[0] == "Synonym-of" and r_offs[2].split("_")[1] < r_offs[1].split("_")[1]:
      r = " ".join([r_offs[0], r_offs[2], r_offs[1]])
     if r_offs[0] == "Synonym-of":
      # Propagate synonyms into hyponym relations: wherever the synonym's first
      # entity participates in a Hyponym-of relation, substitute its second entity.
      for r2 in rels_anno:
       r2_offs = r2.split(" ")
       if r2_offs[0] == "Hyponym-of" and r_offs[1] == r2_offs[1]:
        r_new = " ".join([r2_offs[0], r_offs[2], r2_offs[2]])
        rels_anno[rels_anno.index(r2)] = r_new

       if r2_offs[0] == "Hyponym-of" and r_offs[1] == r2_offs[2]:
        r_new = " ".join([r2_offs[0], r2_offs[1], r_offs[2]])
        rels_anno[rels_anno.index(r2)] = r_new

    # Deduplicate relations (set conversion; original order is not preserved).
    rels_anno = list(set(rels_anno))

    res_full_anno_new = []
    res_anno_new = []
    spans_anno_new = []

    # Keep only entity annotations (drop relation "R..." and "*" lines), in order.
    for r in res_full_anno:
     r_g = r.strip().split("\t")
     if r_g[0].startswith("R") or r_g[0] == "*":
      continue
     ind = res_full_anno.index(r)
     res_full_anno_new.append(r)
     res_anno_new.append(res_anno[ind])
     spans_anno_new.append(spans_anno[ind])

    # Re-append the rewritten, deduplicated relations with a synthetic "R" ID.
    for r in rels_anno:
     res_full_anno_new.append("R\t" + r)
     res_anno_new.append(r)
     spans_anno_new.append(" ".join([r.split(" ")[1], r.split(" ")[2]]))

    return res_full_anno_new, res_anno_new, spans_anno_new, rels_anno

if __name__ == '__main__':
    # CLI: [folder_gold] [folder_pred] [remove_anno]; defaults mirror the dev layout.
    args = sys.argv[1:]
    folder_gold = args[0] if len(args) >= 1 else "data/dev/"
    folder_pred = args[1] if len(args) >= 2 else "data_pred/dev/"
    remove_anno = args[2] if len(args) == 3 else ""  # "", "rel" or "types"

    calculateMeasures(folder_gold, folder_pred, remove_anno)

Beispiel der Vorhersage Datei

T1 Task 4 20 particular phase 
T2 Task 4 26 particular phase field 
T3 Task 15 26 phase field 
T4 Task 15 32 phase field model 
T5 Task 21 32 field model 
T6 Task 93 118 dimensional thermal phase 
T7 Task 105 118 thermal phase 
T8 Task 105 124 thermal phase field 
T9 Task 15 26 phase field 
T10 Task 15 32 phase field model 
T11 Task 21 32 field model 
T12 Task 146 179 dimensional thermal-solutal phase 
T13 Task 158 179 thermal-solutal phase 
T14 Task 158 185 thermal-solutal phase field 
T15 Task 15 26 phase field 
T16 Task 15 32 phase field model 
T17 Task 21 32 field model 
T18 Task 219 235 physical problem 
T19 Task 300 330 natural relaxational phenomena 
T20 Task 308 330 relaxational phenomena 
T21 Task 340 354 resulting PDEs 
T22 Task 362 374 Allen–Cahn 
T23 Task 383 403 Carn–Hilliard type 
T24 Task 445 461 time derivatives 
T25 Task 509 532 variational derivatives 
T26 Task 541 554 functional †
T27 Task 570 581 free energy 
T28 Task 570 592 free energy functional 
T29 Task 575 592 energy functional 
T30 Task 570 581 free energy 
T31 Task 702 717 domain boundary 
T32 Task 780 797 difficult aspects 
T33 Task 817 836 relaxational aspect 
T34 Task 874 898 stable numerical schemes 
T35 Task 881 898 numerical schemes 

Beispiel Gold-Datei

T1 Material 2 20 fluctuating vacuum 
T2 Process 45 59 quantum fields 
T3 Task 45 59 quantum fields 
T4 Process 74 92 free Maxwell field 
T5 Process 135 151 Fermionic fields 
T6 Process 195 222 undergo vacuum fluctuations 
T7 Process 257 272 Casimir effects 
T8 Task 396 411 nuclear physics 
T9 Task 434 464 “MIT bag model” of the nucleon 
T10 Task 518 577 a collection of fermionic fields describing confined quarks 
T11 Process 732 804 the bag boundary condition modifies the vacuum fluctuations of the field 
T12 Task 983 998 nuclear physics 
T13 Material 1063 1080 bag-model nucleon 
T14 Material 507 514 nucleon 
T15 Task 843 856 Casimir force 
T16 Process 289 300 such fields 
+1

"ist kein ANSI-Zeichen. – martineau

+0

ich notepade, wenn ich zu ansi ändere es ist, wenn ich zu UTF8 ändere es ist XE2 X80 –

Antwort

0

"â€“".encode("cp1256").decode("utf8") = "–", ein Gedankenstrich.

Die Datei, die Sie öffnen, scheint in UTF-8 codiert zu sein und Sie geben nicht die Codierung an, die verwendet werden soll (fügen Sie einfach encoding="utf8" hinzu).

Python wird die Standardzeichencodierung des Betriebssystems verwenden und Sie scheinen Windows zu verwenden, wo es immer etwas anderes als UTF-8 ist. Werfen Sie einen Blick auf

, um herauszufinden, welche Kodierung Python standardmäßig beim Lesen und Schreiben von Dateien verwenden wird.

+0

Sorry, ich habe vergessen, die Änderung im Code zu machen; ich habe auch encoding="utf8" benutzt, aber die Lösung, die ich gefunden habe, ist Latin-1 zu verwenden: encoding="latin-1"! Wo kann ich die encode-/decode-Anweisung hinzufügen? –

+0

Sie müssen das Kodier-/Dekodier-Snippet nirgendwo hinzufügen; ich wollte nur zeigen, woher diese Zeichenfolge vermutlich kommt (das heißt, Sie bekommen „â€““, indem Sie einen Gedankenstrich in UTF-8 kodieren und das Ergebnis als cp1256 dekodieren). –

Verwandte Themen