Article January 23, 2024

pandas

Words count 17k Reading time 15 mins. Read count 0

Miniforge/micromamba

1
2
3
4
5
6
7
https://micro.mamba.pm/api/micromamba/win-64/latest 
micromamba.tar.bz2

pip install jupyter
jupyter notebook
jupyter lab

Pandas 库基于 Python NumPy 库开发

train Example

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
import pandas as pd
import idsw.hdfs as ih
hdfs = ih.client()
reader = hdfs.open('/user/bf_ssglf_user01/idsw//jiangmanhua/dataset/dimension-jmh_train.csv', 'rb')
df = pd.read_csv(reader)
df.head()

X=df.iloc[:,2:42]
X.iloc[0:5]

Y=df.iloc[:,1]
Y.iloc[0:5]
# Y = pd.Categorical(Y).codes

from sklearn.model_selection import train_test_split
import numpy as np
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.25)

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
clf = DecisionTreeClassifier(random_state=0)
#cross_val_score(clf, X_train, y_train, cv=10)
scores = cross_val_score(clf, X, Y, cv=10)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))


from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(n_estimators = 10, criterion="gini")
model = model.fit(Xd_train, y_train)
y_predicted = model.predict(Xd_test)
accuracy = np.mean(y_predicted == y_test) * 100
print ("y_test\n",y_test)
print ("y_predicted\n",y_predicted)
print ("accuracy:",accuracy)


reader2 = hdfs.open('/user/bf_ssglf_user01/idsw//jiangmanhua/dataset/dimension-jmh_202202_0203.csv', 'rb')
testdf = pd.read_csv(reader2)
testdf.head()

testX=testdf.iloc[:,1:41]
testY = model.predict(testX)
testY


testdf.iloc[:,0]
out=pd.merge(testdf.iloc[:,0].to_frame(), pd.DataFrame(testY,columns=list('A')), left_index=True, right_index=True)
out.head()

with hdfs.open('/user/bf_ssglf_user01/idsw//jiangmanhua/dataset/dimension-jmh_train_result.csv', 'wb') as ff:
out.to_csv(ff,index=0,header=0)

pd.read_csv(hdfs.open('/user/bf_ssglf_user01/idsw//jiangmanhua/dataset/dimension-jmh_train_result.csv', 'rb'),header=None).head()
hdfs.tail('/user/bf_ssglf_user01/idsw/jiangmanhua/dataset/dimension-jmh_train_result.csv')

1
2
3
import pandas as pd

import numpy as np

数据类型

Series 一维数组结构 (列)

ages = pd.Series([22, 35, 58], name="Age")

DataFrame 二维数组结构

行索引是 index,列索引是 columns

创建

1
2
3
4
5
6
7
8
9
10
11
12
13
# 从字典创建
data = {'Site':['Google', 'Runoob', 'Wiki'], 'Age':[10, 12, 13]}
df = pd.DataFrame(data)

# 从列表创建
data = [['Google', 10], [np.nan, 0], ['Wiki', 13]]
df2 = pd.DataFrame(data, columns=['Site', 'Age'])

# 从series创建
s1 = pd.Series(['Alice', 'Bob', 'Charlie'])
s2 = pd.Series([25, 30, 35])
s3 = pd.Series(['New York', 'Los Angeles', 'Chicago'])
df3 = pd.DataFrame({'Name': s1, 'Age': s2, 'City': s3})

属性

1
2
3
4
5
6
7
8
# DataFrame 的属性和方法
print(df.shape) # 形状
print(df.columns) # 列名
print(df.index) # 索引
print(df.head()) # 前几行数据,默认是前 5 行
print(df.tail()) # 后几行数据,默认是后 5 行
print(df.info()) # 数据信息
print(df.describe())# 描述统计信息

访问定位

1
2
3
4
5
6
7
8
9
10
11
12
13
# 行
df.loc[0, :]
df.iloc[0, ]

# 列
df['Age']
df.Age
df.loc[:, 'Age']
df.iloc[:, 1]

# 单元格
df.loc[0, 'Age']
df.iloc[0, 0]

修改

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
# 添加行
## 使用 loc 为特定索引添加新行
df.loc[3] = ['DG', 16]

## 使用concat添加新行
new_row = pd.DataFrame([['GD', 7]], columns=['Site', 'Age'])
df = pd.concat([df, new_row], ignore_index=True)

# 添加/修改列 长度需一致
df['Column1'] = [10, 11, 12, 13, 14]

# 删除行
df_dropped = df.drop(0)

# 删除列
df_dropped = df.drop('Age', axis=1)

统计分析

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
df['Column1'].sum()
df.mean()

# 选择行
df[df['Column1'] > 2]
df[df["Pclass"].isin([2, 3])]
df[(df["Pclass"] == 2) | (df["Pclass"] == 3)]

df['Column1'] = df['Column1'].astype('float64')

titanic[["Sex", "Age"]].groupby("Sex").mean()
titanic.groupby("Sex").mean(numeric_only=True)

titanic["Pclass"].value_counts()
titanic.groupby("Pclass")["Pclass"].count()

titanic.sort_values(by=['Pclass', 'Age'], ascending=False).head()

读写文件

1
2
3
titanic = pd.read_csv("data/titanic.csv")

titanic.to_excel("titanic.xlsx", sheet_name="passengers", index=False)
0%