Basierend auf diesem Artikel: http://nadbordrozd.github.io/blog/2016/05/20/text-classification-with-word2vec/ Ich versuche, ein Gensim Word2vec-Modell mit den vortrainierten Vektoren von GloVe in einer Textklassifizierung Aufgabe zu implementieren. Allerdings möchte ich FeatureSelection auch in meinen Textdaten durchführen. Ich habe mehrere Sequenzen in der Pipeline ausprobiert, aber ich bekomme schnell einen Speicherfehler, der auf den Transformationsteil von TfidfEmbeddingVectorizer zeigt.Kombinieren w2vec und Feature-Auswahl in der Pipeline
return np.array([
np.mean([self.word2vec[w] * self.word2weight[w]
for w in words if w in self.word2vec] or
[np.zeros(self.dim)], axis=0)
for words in X
Wenn ich die TfidfEmbeddingVectorizer Klasse mit einem regelmäßigen TfIdfVectorizer ersetzen es richtig funktioniert. Gibt es eine Möglichkeit, SelectFromModel und W2vec in der Pipeline zu kombinieren?
from sklearn.base import BaseEstimator, TransformerMixin
import pandas as pd
import numpy as np
import itertools
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import precision_recall_fscore_support as score, f1_score
import pickle
from sklearn.externals import joblib
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.pipeline import FeatureUnion
from sklearn.feature_extraction import DictVectorizer
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.svm import LinearSVC
import gensim
import collections
class ItemSelector(BaseEstimator, TransformerMixin):
def __init__(self, column):
self.column = column
def fit(self, X, y=None, **fit_params):
return self
def transform(self, X):
return (X[self.column])
class TextStats(BaseEstimator, TransformerMixin):
"""Extract features from each document for DictVectorizer"""
def fit(self, x, y=None):
return self
def transform(self, posts):
return [{'REPORT_M': text}
for text in posts]
class TfidfEmbeddingVectorizer(object):
def __init__(self, word2vec):
self.word2vec = word2vec
self.word2weight = None
self.dim = len(word2vec.values())
def fit(self, X, y):
tfidf = TfidfVectorizer(analyzer=lambda x: x)
tfidf.fit(X)
# if a word was never seen - it must be at least as infrequent
# as any of the known words - so the default idf is the max of
# known idf's
max_idf = max(tfidf.idf_)
self.word2weight = collections.defaultdict(
lambda: max_idf,
[(w, tfidf.idf_[i]) for w, i in tfidf.vocabulary_.items()])
return self
def transform(self, X):
return np.array([
np.mean([self.word2vec[w] * self.word2weight[w]
for w in words if w in self.word2vec] or
[np.zeros(self.dim)], axis=0)
for words in X
])
# training model
def train(data_train, data_val):
with open("glove.6B/glove.6B.50d.txt", "rb") as lines:
w2v = {line.split()[0]: np.array(map(float, line.split()[1:]))
for line in lines}
classifier = Pipeline([
('union', FeatureUnion([
('text', Pipeline([
('selector', ItemSelector(column='TEXT')),
("word2vec vectorizer", TfidfEmbeddingVectorizer(w2v)),
('feature_selection', SelectFromModel(LinearSVC(penalty="l1", dual=False),threshold=0.01))
])),
('category', Pipeline([
('selector', ItemSelector(column='category')),
('stats', TextStats()),
('vect', DictVectorizer())
]))
])),
('clf',ExtraTreesClassifier(n_estimators=200, max_depth=500, min_samples_split=6, class_weight= 'balanced'))])
classifier.fit(data_train,data_train.CLASSES)
predicted = classifier.predict(data_val)