Contents
  1. 1. Miniforge/micromamba
  2. 2. train Example
  3. 3. 数据类型
    1. 3.1. Series 一维数组结构 (列)
    2. 3.2. DataFrame 二维数组结构
      1. 3.2.1. 创建
      2. 3.2.2. 属性
      3. 3.2.3. 访问定位
      4. 3.2.4. 修改
      5. 3.2.5. 统计分析
  4. 4. 读写文件

Miniforge/micromamba

1
2
3
4
5
6
7
https://micro.mamba.pm/api/micromamba/win-64/latest 
micromamba.tar.bz2

pip install jupyter
jupyter notebook
jupyter lab

Pandas 库基于 Python NumPy 库开发

train Example

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
import pandas as pd
import idsw.hdfs as ih
hdfs = ih.client()
reader = hdfs.open('/user/bf_ssglf_user01/idsw//jiangmanhua/dataset/dimension-jmh_train.csv', 'rb')
df = pd.read_csv(reader)
df.head()

X=df.iloc[:,2:42]
X.iloc[0:5]

Y=df.iloc[:,1]
Y.iloc[0:5]
# Y = pd.Categorical(Y).codes

from sklearn.model_selection import train_test_split
import numpy as np
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.25)

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
clf = DecisionTreeClassifier(random_state=0)
#cross_val_score(clf, X_train, y_train, cv=10)
scores = cross_val_score(clf, X, Y, cv=10)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))


from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(n_estimators = 10, criterion="gini")
model = model.fit(Xd_train, y_train)
y_predicted = model.predict(Xd_test)
accuracy = np.mean(y_predicted == y_test) * 100
print ("y_test\n",y_test)
print ("y_predicted\n",y_predicted)
print ("accuracy:",accuracy)


reader2 = hdfs.open('/user/bf_ssglf_user01/idsw//jiangmanhua/dataset/dimension-jmh_202202_0203.csv', 'rb')
testdf = pd.read_csv(reader2)
testdf.head()

testX=testdf.iloc[:,1:41]
testY = model.predict(testX)
testY


testdf.iloc[:,0]
out=pd.merge(testdf.iloc[:,0].to_frame(), pd.DataFrame(testY,columns=list('A')), left_index=True, right_index=True)
out.head()

with hdfs.open('/user/bf_ssglf_user01/idsw//jiangmanhua/dataset/dimension-jmh_train_result.csv', 'wb') as ff:
out.to_csv(ff,index=0,header=0)

pd.read_csv(hdfs.open('/user/bf_ssglf_user01/idsw//jiangmanhua/dataset/dimension-jmh_train_result.csv', 'rb'),header=None).head()
hdfs.tail('/user/bf_ssglf_user01/idsw/jiangmanhua/dataset/dimension-jmh_train_result.csv')

1
2
3
import pandas as pd

import numpy as np

数据类型

Series 一维数组结构 (列)

ages = pd.Series([22, 35, 58], name="Age")

DataFrame 二维数组结构

行索引是 index,列索引是 columns

创建

1
2
3
4
5
6
7
8
9
10
11
12
13
# 从字典创建
data = {'Site':['Google', 'Runoob', 'Wiki'], 'Age':[10, 12, 13]}
df = pd.DataFrame(data)

# 从列表创建
data = [['Google', 10], [np.nan, 0], ['Wiki', 13]]
df2 = pd.DataFrame(data, columns=['Site', 'Age'])

# 从series创建
s1 = pd.Series(['Alice', 'Bob', 'Charlie'])
s2 = pd.Series([25, 30, 35])
s3 = pd.Series(['New York', 'Los Angeles', 'Chicago'])
df3 = pd.DataFrame({'Name': s1, 'Age': s2, 'City': s3})

属性

1
2
3
4
5
6
7
8
# DataFrame 的属性和方法
print(df.shape) # 形状
print(df.columns) # 列名
print(df.index) # 索引
print(df.head()) # 前几行数据,默认是前 5 行
print(df.tail()) # 后几行数据,默认是后 5 行
print(df.info()) # 数据信息
print(df.describe())# 描述统计信息

访问定位

1
2
3
4
5
6
7
8
9
10
11
12
13
# 行
df.loc[0, :]
df.iloc[0, ]

# 列
df['Age']
df.Age
df.loc[:, 'Age']
df.iloc[:, 1]

# 单元格
df.loc[0, 'Age']
df.iloc[0, 0]

修改

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
# 添加行
## 使用 loc 为特定索引添加新行
df.loc[3] = ['DG', 16]

## 使用concat添加新行
new_row = pd.DataFrame([['GD', 7]], columns=['Site', 'Age'])
df = pd.concat([df, new_row], ignore_index=True)

# 添加/修改列 长度需一致
df['Column1'] = [10, 11, 12, 13, 14]

# 删除行
df_dropped = df.drop(0)

# 删除列
df_dropped = df.drop('Age', axis=1)

统计分析

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
df['Column1'].sum()
df.mean()

# 选择行
df[df['Column1'] > 2]
df[df["Pclass"].isin([2, 3])]
df[(df["Pclass"] == 2) | (df["Pclass"] == 3)]

df['Column1'] = df['Column1'].astype('float64')

titanic[["Sex", "Age"]].groupby("Sex").mean()
titanic.groupby("Sex").mean(numeric_only=True)

titanic["Pclass"].value_counts()
titanic.groupby("Pclass")["Pclass"].count()

titanic.sort_values(by=['Pclass', 'Age'], ascending=False).head()

读写文件

1
2
3
titanic = pd.read_csv("data/titanic.csv")

titanic.to_excel("titanic.xlsx", sheet_name="passengers", index=False)