2016-10-15 5 views
1

Ich habe zwei Dateien. Man erstellt eine numpy Array in komprimierter spärlichen Zeilenformatnumpy.load() falscher Magic String Fehler

from sklearn.feature_extraction.text import TfidfTransformer 
import pdb 

def stem_document(document): 
    translatedict = "" 
    stemmer = PorterStemmer() 
    for word in string.punctuation: 
     translatedict = translatedict + word 
    doc_stemmed = [] 
    for word in document.split(): 
     lowerstrippedword = ''.join(c for c in word.lower() if c not in translatedict) 
     try: 
      stemmed_word = stemmer.stem(lowerstrippedword) 
      doc_stemmed.append(stemmed_word) 
     except: 
      print lowerstrippedword + " could not be stemmed." 
    return ' '.join(doc_stemmed) 

def readFileandStem(filestring): 
    with open(filestring, 'r') as file: 
     reader = csv.reader(file) 
     file_extras = [] 
     vector_data = []   
     error = False 
     while (error == False): 
      try: 
       next = reader.next() 
       if len(next) == 3 and next[2] != "": 
        document = next[2] 
        stemmed_document = stem_document(document) 
        vector_data.append(stemmed_document) 
        file_extra = [] 
        file_extra.append(next[0]) 
        file_extra.append(next[1]) 
        file_extras.append(file_extra) 
      except: 
       error = True 
    return [vector_data, file_extras] 

filestring = 'Data.csv' 
print "Reading File" 
data = readFileandStem(filestring) 
documents = data[0] 
file_extras = data[1] 
print "Vectorizing Data" 
vectorizer = CountVectorizer() 
matrix = vectorizer.fit_transform(documents) 
tf_idf_transform = TfidfTransformer(use_idf=False).fit(matrix) 
tf_idf_matrix = tf_idf_transform.transform(matrix) 
with open('matrix/matrix.npy', 'w') as matrix_file: 
    np.save(matrix_file, tf_idf_matrix) 
file_json_map = {} 
file_json_map['extras'] = file_extras 
with open('matrix/extras.json', 'w') as extras_file: 
    extras_file.write(json.dumps(file_json_map)) 
print "finished" 

Die nächste Datei die gleiche Datei laden soll ...

import numpy as np 
from scipy.cluster.hierarchy import dendrogram, linkage 
import json 
import pdb 

with open('matrix/matrix.npy', 'r') as matrix_file: 
    matrix = np.load(matrix_file) 

hcluster = linkage(matrix, "complete") 

Allerdings habe ich die folgende Fehlermeldung erhalten:

File "Cluster.py", line 7, in <module> 
    matrix = np.load(matrix_file) 
    File "C:\Users\jarek\Anaconda2\lib\site-packages\numpy\lib\npyio.py", line 406, in load 
    pickle_kwargs=pickle_kwargs) 
    File "C:\Users\jarek\Anaconda2\lib\site-packages\numpy\lib\format.py", line 620, in read_array 
    version = read_magic(fp) 
    File "C:\Users\jarek\Anaconda2\lib\site-packages\numpy\lib\format.py", line 216, in read_magic 
    raise ValueError(msg % (MAGIC_PREFIX, magic_str[:-2])) 
ValueError: the magic string is not correct; expected '\x93NUMPY', got '\x00\x00I\x1c\x00\x00' 

Ich weiß nicht, warum die magische Zeichenkette falsch wäre, denn aus dem, was ich mir angesehen habe, sollen alle .npy-Dateien den gleichen magischen String "\ x93NUMPY" haben.

Ideen?

+0

nicht mit 'mit offenem (blahblah) als matrix_file'. Versuchen Sie einfach 'np.load (blahblah)' – Jeon

+0

Kein Glück mit dieser Lösung. Versucht: "matrix = np.load ('matrix/matrix.npy')" –

Antwort

1

Ich stieß auf ähnliche Problem vor.

open('matrix/matrix.npy', 'w') 
... 
open('matrix/matrix.npy', 'r') 

zu

open('matrix/matrix.npy', 'wb') 
... 
open('matrix/matrix.npy', 'rb') 

mein Problem gelöst ändern.