1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55
import pandas as pd
import idsw.hdfs as ih

# HDFS client shared by every open()/tail() call in this script.
hdfs = ih.client()

# Read the training CSV inside a `with` block so the HDFS handle is closed
# once parsing finishes, instead of staying open for the rest of the session
# (the result file further down is written with the same pattern).
with hdfs.open('/user/bf_ssglf_user01/idsw//jiangmanhua/dataset/dimension-jmh_train.csv', 'rb') as reader:
    df = pd.read_csv(reader)

# Preview of the first rows; only rendered in an interactive session.
df.head()
# Feature matrix: the training-frame columns at positions 2 through 41.
X = df.iloc[:, 2:42]
X.head()

# Target vector: the training-frame column at position 1.
Y = df.iloc[:, 1]
Y.head()
from sklearn.model_selection import train_test_split
import numpy as np

# Hold out a quarter of the rows for evaluation.
# NOTE(review): no random_state is passed, so the split presumably differs
# from run to run - confirm whether reproducibility matters here.
(
    X_train,
    X_test,
    y_train,
    y_test,
) = train_test_split(X, Y, test_size=0.25)
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score

# Baseline: a decision tree with a fixed seed, scored with cv=10 on the
# full feature matrix X and target Y.
clf = DecisionTreeClassifier(random_state=0)
scores = cross_val_score(clf, X, Y, cv=10)

# Report the mean score together with twice the standard deviation.
mean_score = scores.mean()
score_spread = scores.std() * 2
print("Accuracy: %0.2f (+/- %0.2f)" % (mean_score, score_spread))
from sklearn.ensemble import RandomForestClassifier

# Random forest with 10 trees and the Gini split criterion.
model = RandomForestClassifier(n_estimators=10, criterion="gini")

# Train on the training part of the split made earlier in this script
# (X_train / y_train) and predict on the held-out part (X_test), so the
# features line up with the y_train / y_test labels from the same split.
model = model.fit(X_train, y_train)
y_predicted = model.predict(X_test)

# Percentage of held-out rows whose predicted label equals the true label.
accuracy = np.mean(y_predicted == y_test) * 100

print("y_test\n", y_test)
print("y_predicted\n", y_predicted)
print("accuracy:", accuracy)
# Load the CSV to be scored. The HDFS handle is opened in a `with` block so
# it is closed once pandas has finished parsing, rather than left open
# (the result file further down is written with the same pattern).
with hdfs.open('/user/bf_ssglf_user01/idsw//jiangmanhua/dataset/dimension-jmh_202202_0203.csv', 'rb') as reader2:
    testdf = pd.read_csv(reader2)

# Preview of the first rows; only rendered in an interactive session.
testdf.head()
# Features of the scoring frame: columns at positions 1 through 40.
testX = testdf.iloc[:, 1:41]

# Predicted label for every row of the scoring frame.
testY = model.predict(testX)
testY
# Bare expression kept from the original; it has no effect outside an
# interactive display.
testdf.iloc[:, 0]

# Pair the first column of the scoring frame with the predictions (placed in
# a column named 'A') by joining the two frames on their row index.
first_column_frame = testdf.iloc[:, 0].to_frame()
prediction_frame = pd.DataFrame(testY, columns=['A'])
out = pd.merge(
    first_column_frame,
    prediction_frame,
    left_index=True,
    right_index=True,
)
out.head()
# HDFS destination of the prediction output.
result_path = '/user/bf_ssglf_user01/idsw//jiangmanhua/dataset/dimension-jmh_train_result.csv'

# Write `out` as CSV with neither the index column nor a header row.
with hdfs.open(result_path, 'wb') as ff:
    out.to_csv(ff, index=False, header=False)
# Read the result file back as a sanity check. It was written without a
# header row, hence header=None. The handle is opened in a `with` block so it
# is closed after parsing instead of being left open.
with hdfs.open('/user/bf_ssglf_user01/idsw//jiangmanhua/dataset/dimension-jmh_train_result.csv', 'rb') as result_reader:
    pd.read_csv(result_reader, header=None).head()

# NOTE(review): this path has a single '/' after 'idsw' where every other
# path in the script has '//' - presumably the same file; confirm.
# hdfs.tail presumably returns the end of the file - confirm against the
# idsw.hdfs client documentation.
hdfs.tail('/user/bf_ssglf_user01/idsw/jiangmanhua/dataset/dimension-jmh_train_result.csv')
|