Ich versuche, die Ausgabe des Skripts in eine 2-Serie-Balkendiagramm mit Matplotlib in Python 2.7 zu plotten.Matplotlib Fehler: 'Höhe' muss Länge 5 oder Skalar sein
Mein 'msg' Skript druckt, die in der folgenden Ausgabe führt:
KNN: 90,000000 (0,322734)
LDA: 83,641395 (0,721210)
CART: 92,600996 (0,399870)
NB: 29,214167 (1,743959)
Zufall Forest: 92,617598 (0,323824)
Nach dem Code gibt die Ergebnisse des ‚msg‘, ich versuche, die Ergebnisse in eine 2-Serie Balkendiagramm plotten matplotlib mit und bin dann mit dem folgenden Fehler zurückgegeben:
Traceback (most recent call last):
File "comparison.py", line 113, in <module>
label='mean')
File "C:\Users\Scot\Anaconda2\lib\site-packages\matplotlib\pyplot.py", line 2650, in bar
**kwargs)
File "C:\Users\Scot\Anaconda2\lib\site-packages\matplotlib\__init__.py", line 1818, in inner
return func(ax, *args, **kwargs)
File "C:\Users\Scot\Anaconda2\lib\site-packages\matplotlib\axes\_axes.py", line 2038, in bar
"must be length %d or scalar" % nbars)
ValueError: incompatible sizes: argument 'height' must be length 5 or scalar
Ich bin nicht sicher, wie sie zu beheben Dies, denke ich, kann es daran liegen, dass die Werte der Ergebnisse ein Gleitkommawert sind. Jede Hilfe würde sehr geschätzt werden. Hier ist mein Code:
# Modules
import pandas
import numpy
import os
from pandas.tools.plotting import scatter_matrix
import matplotlib.pyplot as plt
from matplotlib import style
plt.rcdefaults()
from sklearn import preprocessing
from sklearn import cross_validation
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, precision_recall_curve, average_precision_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from scipy.stats import ttest_ind, ttest_ind_from_stats
from scipy.special import stdtr
from sklearn.svm import SVC
from collections import defaultdict
from sklearn.preprocessing import LabelEncoder
import warnings
# Load KDD dataset
data_set = "NSL-KDD/KDDTest+.arff"
import os
os.system("cls")
print "Loading: ", data_set
with warnings.catch_warnings():
warnings.simplefilter("ignore")
names = ['duration', 'protocol_type', 'service', 'flag', 'src_bytes', 'dst_bytes', 'land', 'wrong_fragment', 'urgent', 'hot', 'num_failed_logins', 'logged_in', 'num_compromised', 'su_attempted', 'num_root', 'num_file_creations',
'num_shells', 'num_access_files', 'num_outbound_cmds', 'is_host_login', 'is_guest_login', 'count', 'srv_count', 'serror_rate', 'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate',
'dst_host_count', 'dst_host_srv_count', 'dst_host_same_srv_rate', 'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate', 'dst_host_serror_rate', 'dst_host_srv_serror_rate', 'dst_host_rerror_rate', 'class',
'dst_host_srv_rerror_rate']
dataset = pandas.read_csv(data_set, names=names)
for column in dataset.columns:
if dataset[column].dtype == type(object):
le = LabelEncoder()
dataset[column] = le.fit_transform(dataset[column])
array = dataset.values
X = array[:, 0:40]
Y = array[:, 40]
# Split-out validation dataset
validation_size = 0.20
seed = 7
X_train, X_validation, Y_train, Y_validation = cross_validation.train_test_split(
X, Y, test_size=validation_size, random_state=seed)
# Test options and evaluation metric
num_folds = 10
num_instances = len(X_train)
seed = 10
scoring = 'accuracy'
# Algorithms
models = []
models.append(('KNN', KNeighborsClassifier()))
models.append(('LDA', LinearDiscriminantAnalysis()))
models.append(('CART', DecisionTreeClassifier()))
models.append(('NB', GaussianNB()))
models.append(('Random Forest', RandomForestClassifier()))
# models.append(('LR', LogisticRegression()))
# evaluate each model in turn
results = []
names = []
for name, model in models:
kfold = cross_validation.KFold(n=num_instances, n_folds=num_folds, random_state=seed)
cv_results = cross_validation.cross_val_score(
model, X_train, Y_train, cv=kfold, scoring=scoring)
results.append(cv_results)
names.append(name)
msg = "%s: %f (%f)" % (name, cv_results.mean() * 100, cv_results.std()
* 100) # multiplying by 100 to show percentage
print(msg)
# print cv_results * 100 # plots all values that make the average
print ("\n")
# Perform T Test on each iteration of models.
for i in range(len(results) - 1):
for j in range(i, len(results)):
t, p = ttest_ind(results[i], results[j], equal_var=False)
print("T_Test between {} & {}: T Value = {}, P Value = {}".format(
names[i], names[j], t, p))
print("\n")
plt.style.use('ggplot')
n_groups = 5
# create plot
fig, ax = plt.subplots()
index = numpy.arange(n_groups)
bar_width = 0.35
opacity = 0.8
rects1 = plt.bar(index, cv_results, bar_width,
alpha=opacity,
# color='b',
label='mean') # Line 113
rects2 = plt.bar(index + bar_width, cv_results.std(), bar_width,
alpha=opacity,
color='g',
label='standard_d')
plt.xlabel('Models')
plt.ylabel('Percentage')
plt.title('All Model Performance')
plt.xticks(index + bar_width, (names))
plt.legend()
plt.tight_layout()
plt.show()
EDIT
Druck cv_results
wie die folgende erscheint und 7 oder 8 Dezimalstellen:
[ 90.48146099 90.48146099 89.42999447 89.5960155 90.03873824
89.9833979 89.9833979 89.76203652 90.09407858 90.14941893]
[ 83.34255672 84.94742667 82.2910902 83.78527947 84.3386829
83.9513005 82.78915329 84.06198118 83.39789707 83.50857775]
[ 93.1931378 92.69507471 91.92030991 92.52905368 92.69507471
92.41837299 92.58439402 92.25235196 92.19701162 92.14167128]
[ 29.05368013 26.89540675 31.54399557 28.22357499 29.27504151
27.94687327 33.20420587 28.99833979 28.55561704 28.44493636]
[ 93.35915883 93.02711677 92.25235196 91.69894853 93.02711677
92.63973437 92.58439402 92.14167128 92.47371334 92.69507471]
Was ist die Länge von 'cv_results'? – Goyo
@Goyo Ich habe die Frage aktualisiert, um die Ergebnisse von 'cv_results' zu zeigen – Scott
Ich habe keine Ahnung, welche Art von Objekt würde diese Zeichenfolge Darstellung haben. Aber wenn Sie möchten, dass der erste Aufruf von "bar" 5 Balken plottet, macht das keinen Sinn. Du solltest deinen Code besser auf einen [mcve] kürzen. – Goyo