ich ein Programm in Python 3, die lesen und vergleichen Sie Dateien (die den gleichen Namen haben) im Schlepptau Ordner „Gold“ und „predcition“Fehler beim Dekodieren: ANSI caracter â €
aber diesen Fehler erzeugen, meine Datei sind in UTF-8-Format, so dass die caracter, die den Fehler erzeugen, ist XE2 X80 (in ANSI es â € ist):
Traceback (most recent call last):
File "C:\scienceie2017_train\test.py", line 215, in <module>
calculateMeasures(folder_gold, folder_pred, remove_anno)
File "C:\scienceie2017_train\test.py", line 34, in calculateMeasures
res_full_pred, res_pred, spans_pred, rels_pred = normaliseAnnotations(f_pred, remove_anno)
File "C:\scienceie2017_train\test.py", line 132, in normaliseAnnotations
for l in file_anno:
File "C:\Users\chedi\Anaconda3\lib\codecs.py", line 321, in decode
(result, consumed) = self._buffer_decode(data, self.errors, final)
UnicodeDecodeError: 'utf-8' codec can't decode bytes in position 915-916: invalid continuation byte
der Code:
#!/usr/bin/python
# by Mattew Peters, who spotted that sklearn does macro averaging not
# micro averaging correctly and changed it
import os
from sklearn.metrics import precision_recall_fscore_support
import sys
def calculateMeasures(folder_gold="data/dev/", folder_pred="data_pred/dev/", remove_anno=""):
'''
Calculate P, R, F1, Macro F
:param folder_gold: folder containing gold standard .ann files
:param folder_pred: folder containing prediction .ann files
:param remove_anno: if set if "rel", relations will be ignored. Use this setting to only evaluate
keyphrase boundary recognition and keyphrase classification. If set to "types", only keyphrase boundary recognition is evaluated.
Note that for the later, false positive
:return:
'''
flist_gold = os.listdir(folder_gold)
res_all_gold = []
res_all_pred = []
targets = []
for f in flist_gold:
# ignoring non-.ann files, should there
# be any
if not str(f).endswith(".ann"):
continue
f_gold = open(os.path.join(folder_gold, f), "r", encoding="utf")
try:
f_pred = open(os.path.join(folder_pred, f), "r", encoding="utf8")
res_full_pred, res_pred, spans_pred, rels_pred = normaliseAnnotations(f_pred, remove_anno)
except IOError:
print(f + " file missing in " + folder_pred + ". Assuming no predictions are available for this file.")
res_full_pred, res_pred, spans_pred, rels_pred = [], [], [], []
res_full_gold, res_gold, spans_gold, rels_gold = normaliseAnnotations(f_gold, remove_anno)
spans_all = set(spans_gold + spans_pred)
for i, r in enumerate(spans_all):
if r in spans_gold:
target = res_gold[spans_gold.index(r)].split(" ")[0]
res_all_gold.append(target)
if not target in targets:
targets.append(target)
else:
res_all_gold.append("NONE")
if r in spans_pred:
target_pred = res_pred[spans_pred.index(r)].split(" ")[0]
res_all_pred.append(target_pred)
else:
res_all_pred.append("NONE")
#y_true, y_pred, labels, targets
prec, recall, f1, support = precision_recall_fscore_support(res_all_gold, res_all_pred, labels=targets, average=None)
metrics = {}
for k, target in enumerate(targets):
metrics[target] = {
'precision': prec[k],
'recall': recall[k],
'f1-score': f1[k],
'support': support[k]
}
# now
# micro-averaged
if remove_anno != 'types':
prec, recall, f1, s = precision_recall_fscore_support(res_all_gold, res_all_pred, labels=targets, average='micro')
metrics['overall'] = {
'precision': prec,
'recall': recall,
'f1-score': f1,
'support': sum(support)
}
else:
# just
# binary
# classification,
# nothing
# to
# average
metrics['overall'] = metrics['KEYPHRASE-NOTYPES']
print_report(metrics, targets)
return metrics
def print_report(metrics, targets, digits=2):
def _get_line(results, target, columns):
line = [target]
for column in columns[:-1]:
line.append("{0:0.{1}f}".format(results[column], digits))
line.append("%s" % results[columns[-1]])
return line
columns = ['precision', 'recall', 'f1-score', 'support']
fmt = '%11s' + '%9s' * 4 + '\n'
report = [fmt % tuple([''] + columns)]
report.append('\n')
for target in targets:
results = metrics[target]
line = _get_line(results, target, columns)
report.append(fmt % tuple(line))
report.append('\n')
# overall
line = _get_line(
metrics['overall'], 'avg/total', columns)
report.append(fmt % tuple(line))
report.append('\n')
print(''.join(report))
def normaliseAnnotations(file_anno, remove_anno):
'''
Parse annotations from the annotation files: remove relations (if requested), convert rel IDs to entity spans
:param file_anno:
:param remove_anno:
:return:
'''
res_full_anno = []
res_anno = []
spans_anno = []
rels_anno = []
for l in file_anno:
print(l)
print(l.strip('\n'))
r_g = l.strip('\n').split("\t")
print(r_g)
print(len(r_g))
r_g_offs = r_g[1].split()
print(r_g_offs)
if remove_anno != "" and r_g_offs[0].endswith("-of"):
continue
res_full_anno.append(l.strip())
if r_g_offs[0].endswith("-of"):
arg1 = r_g_offs[1].replace("Arg1:", "")
arg2 = r_g_offs[2].replace("Arg2:", "")
for l in res_full_anno:
r_g_tmp = l.strip().split("\t")
if r_g_tmp[0] == arg1:
ent1 = r_g_tmp[1].replace(" ", "_")
if r_g_tmp[0] == arg2:
ent2 = r_g_tmp[1].replace(" ", "_")
spans_anno.append(" ".join([ent1, ent2]))
res_anno.append(" ".join([r_g_offs[0], ent1, ent2]))
rels_anno.append(" ".join([r_g_offs[0], ent1, ent2]))
else:
spans_anno.append(" ".join([r_g_offs[1], r_g_offs[2]]))
keytype = r_g[1]
if remove_anno == "types":
keytype = "KEYPHRASE-NOTYPES"
res_anno.append(keytype)
for r in rels_anno:
r_offs = r.split(" ")
# reorder hyponyms to start with smallest index
# 1, 2
if r_offs[0] == "Synonym-of" and r_offs[2].split("_")[1] < r_offs[1].split("_")[1]:
r = " ".join([r_offs[0], r_offs[2], r_offs[1]])
if r_offs[0] == "Synonym-of":
for r2 in rels_anno:
r2_offs = r2.split(" ")
if r2_offs[0] == "Hyponym-of" and r_offs[1] == r2_offs[1]:
r_new = " ".join([r2_offs[0], r_offs[2], r2_offs[2]])
rels_anno[rels_anno.index(r2)] = r_new
if r2_offs[0] == "Hyponym-of" and r_offs[1] == r2_offs[2]:
r_new = " ".join([r2_offs[0], r2_offs[1], r_offs[2]])
rels_anno[rels_anno.index(r2)] = r_new
rels_anno = list(set(rels_anno))
res_full_anno_new = []
res_anno_new = []
spans_anno_new = []
for r in res_full_anno:
r_g = r.strip().split("\t")
if r_g[0].startswith("R") or r_g[0] == "*":
continue
ind = res_full_anno.index(r)
res_full_anno_new.append(r)
res_anno_new.append(res_anno[ind])
spans_anno_new.append(spans_anno[ind])
for r in rels_anno:
res_full_anno_new.append("R\t" + r)
res_anno_new.append(r)
spans_anno_new.append(" ".join([r.split(" ")[1], r.split(" ")[2]]))
return res_full_anno_new, res_anno_new, spans_anno_new, rels_anno
if __name__ == '__main__':
folder_gold = "data/dev/"
folder_pred = "data_pred/dev/"
remove_anno = "" # "", "rel" or "types"
if len(sys.argv) >= 2:
folder_gold = sys.argv[1]
if len(sys.argv) >= 3:
folder_pred = sys.argv[2]
if len(sys.argv) == 4:
remove_anno = sys.argv[3]
calculateMeasures(folder_gold, folder_pred, remove_anno)
Beispiel der Vorhersage Datei
T1 Task 4 20 particular phase
T2 Task 4 26 particular phase field
T3 Task 15 26 phase field
T4 Task 15 32 phase field model
T5 Task 21 32 field model
T6 Task 93 118 dimensional thermal phase
T7 Task 105 118 thermal phase
T8 Task 105 124 thermal phase field
T9 Task 15 26 phase field
T10 Task 15 32 phase field model
T11 Task 21 32 field model
T12 Task 146 179 dimensional thermal-solutal phase
T13 Task 158 179 thermal-solutal phase
T14 Task 158 185 thermal-solutal phase field
T15 Task 15 26 phase field
T16 Task 15 32 phase field model
T17 Task 21 32 field model
T18 Task 219 235 physical problem
T19 Task 300 330 natural relaxational phenomena
T20 Task 308 330 relaxational phenomena
T21 Task 340 354 resulting PDEs
T22 Task 362 374 Allen–Cahn
T23 Task 383 403 Carn–Hilliard type
T24 Task 445 461 time derivatives
T25 Task 509 532 variational derivatives
T26 Task 541 554 functional â€
T27 Task 570 581 free energy
T28 Task 570 592 free energy functional
T29 Task 575 592 energy functional
T30 Task 570 581 free energy
T31 Task 702 717 domain boundary
T32 Task 780 797 difficult aspects
T33 Task 817 836 relaxational aspect
T34 Task 874 898 stable numerical schemes
T35 Task 881 898 numerical schemes
Beispiel Gold-Datei
T1 Material 2 20 fluctuating vacuum
T2 Process 45 59 quantum fields
T3 Task 45 59 quantum fields
T4 Process 74 92 free Maxwell field
T5 Process 135 151 Fermionic fields
T6 Process 195 222 undergo vacuum fluctuations
T7 Process 257 272 Casimir effects
T8 Task 396 411 nuclear physics
T9 Task 434 464 “MIT bag model” of the nucleon
T10 Task 518 577 a collection of fermionic fields describing confined quarks
T11 Process 732 804 the bag boundary condition modifies the vacuum fluctuations of the field
T12 Task 983 998 nuclear physics
T13 Material 1063 1080 bag-model nucleon
T14 Material 507 514 nucleon
T15 Task 843 856 Casimir force
T16 Process 289 300 such fields
"ist kein ANSI-Zeichen. – martineau
ich notepade, wenn ich zu ansi ändere es ist, wenn ich zu UTF8 ändere es ist XE2 X80 –