中信技术考察之基金推荐模型
1
2
3
4
5
6
import csv
import copy
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
1. 处理客户数据
1
2
3
4
5
6
7
8
9
10
11
12
# Read the customer-data CSV; the first column of every row is a running
# index that we drop, keeping the remaining 47 columns.
with open('./data/客户数据.csv', 'r', encoding='utf-8') as file:
    csv_reader = csv.reader(file)
    # First row is the header (index column stripped).
    customer_headers = next(csv_reader)[1:]
    # Remaining rows: strip the index column from each.
    customer_list = [row[1:] for row in csv_reader]
customer_df = pd.DataFrame(customer_list, columns=customer_headers)
customer_df
客户编号 | 客户公司名称 | 客户公司编号 | feature1 | feature2 | feature3 | feature4 | feature5 | feature6 | feature7 | ... | feature35 | feature36 | feature37 | feature38 | feature39 | feature40 | 第一选择基金 | 第二选择基金 | 第三选择基金 | 第四选择基金 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | C0001 | 公司20 | 4040100 | 9 | 9 | 15.44036767 | 193 | 0.166081948 | 1 | 11 | ... | 1 | 3 | 10 | 43 | 56 | 44.99447821 | ||||
1 | C0002 | 公司3 | 90601 | 10 | 69 | 24.19789929 | 86 | 0.327464351 | 1 | 17 | ... | 8 | 3 | 11 | 22 | 74 | 54.38582584 | ||||
2 | C0003 | 公司18 | 3272481 | 3 | 61 | 6.773532386 | 201 | 0.482634837 | 1 | 2 | ... | 5 | 6 | 10 | 44 | 19 | 9.35864196 | ||||
3 | C0004 | 公司9 | 817216 | 3 | 9 | 6.375025793 | 92 | 0.545685448 | 1 | 3 | ... | 10 | 5 | 3 | 71 | 82 | 55.20456675 | ||||
4 | C0005 | 公司6 | 363609 | 3 | 65 | 4.183100481 | 195 | 0.657033027 | 1 | 16 | ... | 3 | 7 | 11 | 3 | 31 | 41.24379255 | ||||
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
11995 | C11996 | 公司16 | 2585664 | 7 | 37 | 7.110256464 | 173 | 0.320702523 | 1 | 4 | ... | 6 | 6 | 12 | 75 | 6 | 86.15755992 | ||||
11996 | C11997 | 公司4 | 161604 | 5 | 60 | 10.05886805 | 74 | 0.986376089 | 1 | 6 | ... | 12 | 4 | 12 | 82 | 16 | 11.28849184 | ||||
11997 | C11998 | 公司19 | 3644281 | 4 | 37 | 25.60124899 | 66 | 0.990199579 | 1 | 14 | ... | 8 | 5 | 4 | 34 | 85 | 48.17353183 | ||||
11998 | C11999 | 公司7 | 494209 | 10 | 60 | 30.56264901 | 34 | 0.775651603 | 1 | 6 | ... | 12 | 7 | 10 | 18 | 6.35029667 | J0076 | ||||
11999 | C12000 | 公司14 | 1979649 | 1 | 30 | 12.6276525 | 86 | 0.233256336 | 1 | 12 | ... | 4 | 5 | 9 | 46 | 45 | 18.21188006 | J0139 | J0034 |
12000 rows × 47 columns
1
2
3
4
5
# Step 1: keep only rows usable for training — those whose first fund
# choice (4th-from-last column) is filled with a fund code ('J....').
customer_list = customer_df.values.tolist()
print(len(customer_list))
customer_list = [record for record in customer_list if record[-4].startswith('J')]
print(len(customer_list))
1
2
12000
4780
1
2
# Step 2: drop the customer/company identifier columns (first three),
# keeping only the 40 features plus the four choice columns.
customer_list = [record[3:] for record in customer_list]
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
# Step 3: data augmentation by replication.
# 4780 customers have at least a first-choice fund; pairing each with all
# 200 funds would give ~1,000,000 rows. To cut training cost, only
# customers with at least two chosen funds are expanded (1704 customers,
# 200 rows each).
new_customer_list = []
# All 200 fund codes: 'J0001' .. 'J0200'.
fund_codes = ['J{:04d}'.format(i) for i in range(1, 201)]

def _expand(row, n_choices):
    """Return the 200 training records for one customer.

    The customer's chosen funds (the first `n_choices` of the four choice
    columns) get labels 1..n_choices (preference rank); every other fund
    code gets label 0. The fund code is written into position -4 of a deep
    copy of the row and the label is appended at the end.
    """
    records = []
    start = len(row) - 4                 # index of the first choice column
    chosen = row[start:start + n_choices]
    # One record per chosen fund, labelled with its preference rank.
    for rank in range(1, n_choices + 1):
        tmp = copy.deepcopy(row)
        tmp[-4] = row[-5 + rank]
        tmp.append(rank)
        records.append(tmp)
    # One record per non-chosen fund, labelled 0.
    for code in fund_codes:
        if code not in chosen:
            tmp = copy.deepcopy(row)
            tmp[-4] = code
            tmp.append(0)
            records.append(tmp)
    return records

# First pass: customers with all four choices filled (17800 = 89 * 200).
for row in customer_list:
    if row[-1][0] == 'J':
        # e.g. ['J0116', 'J0029', 'J0133', 'J0156']
        new_customer_list.extend(_expand(row, 4))
# Remove a known bad record — source row 10753 of 客户数据.csv has choices
# [J0163 J0012 J0139 J0012] where the 2nd and 4th choices duplicate.
del new_customer_list[16403]
# Second pass: customers with exactly two or exactly three choices.
for row in customer_list:
    # Exactly two choices (256000 = 1280 * 200), e.g. ['J0132', 'J0081', ' ', ' '].
    if row[-3][0] == 'J' and row[-2][0] != 'J' and row[-1][0] != 'J':
        new_customer_list.extend(_expand(row, 2))
    # Exactly three choices (67000 = 335 * 200), e.g. ['J0188', 'J0191', 'J0068', ' '].
    if row[-2][0] == 'J' and row[-1][0] != 'J':
        new_customer_list.extend(_expand(row, 3))
print(f'一共生成{len(new_customer_list)}条数据')
1
一共生成340800条数据
1
2
3
4
5
6
7
8
# Rebuild the header row: prefix the 40 customer features with 'customer_',
# rename column 40 to '选择基金' (chosen fund), and append the target
# column '选择排序' (choice rank) at the end.
new_customer_headers = customer_headers[3:] + ['选择排序']
new_customer_headers[:40] = ['customer_' + name for name in new_customer_headers[:40]]
new_customer_headers[40] = '选择基金'
new_customer_df = pd.DataFrame(new_customer_list, columns=new_customer_headers)
# Columns -4..-2 are the leftover 2nd/3rd/4th-choice columns; drop them.
new_customer_df.drop(new_customer_df.columns[-4:-1], axis=1, inplace=True)
new_customer_df
customer_feature1 | customer_feature2 | customer_feature3 | customer_feature4 | customer_feature5 | customer_feature6 | customer_feature7 | customer_feature8 | customer_feature9 | customer_feature10 | ... | customer_feature33 | customer_feature34 | customer_feature35 | customer_feature36 | customer_feature37 | customer_feature38 | customer_feature39 | customer_feature40 | 选择基金 | 选择排序 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2 | 37 | 23.88129863 | 71 | 0.698777259 | 1 | 13 | 194 | 6.217688451 | 55 | ... | 4.810028598 | 1 | 7 | 2 | 10 | 80 | 40 | 46.45630119 | J0045 | 1 |
1 | 2 | 37 | 23.88129863 | 71 | 0.698777259 | 1 | 13 | 194 | 6.217688451 | 55 | ... | 4.810028598 | 1 | 7 | 2 | 10 | 80 | 40 | 46.45630119 | J0109 | 2 |
2 | 2 | 37 | 23.88129863 | 71 | 0.698777259 | 1 | 13 | 194 | 6.217688451 | 55 | ... | 4.810028598 | 1 | 7 | 2 | 10 | 80 | 40 | 46.45630119 | J0063 | 3 |
3 | 2 | 37 | 23.88129863 | 71 | 0.698777259 | 1 | 13 | 194 | 6.217688451 | 55 | ... | 4.810028598 | 1 | 7 | 2 | 10 | 80 | 40 | 46.45630119 | J0099 | 4 |
4 | 2 | 37 | 23.88129863 | 71 | 0.698777259 | 1 | 13 | 194 | 6.217688451 | 55 | ... | 4.810028598 | 1 | 7 | 2 | 10 | 80 | 40 | 46.45630119 | J0001 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
340795 | 1 | 30 | 12.6276525 | 86 | 0.233256336 | 1 | 12 | 85 | 3.740509451 | 55 | ... | 9.660924457 | 1 | 4 | 5 | 9 | 46 | 45 | 18.21188006 | J0196 | 0 |
340796 | 1 | 30 | 12.6276525 | 86 | 0.233256336 | 1 | 12 | 85 | 3.740509451 | 55 | ... | 9.660924457 | 1 | 4 | 5 | 9 | 46 | 45 | 18.21188006 | J0197 | 0 |
340797 | 1 | 30 | 12.6276525 | 86 | 0.233256336 | 1 | 12 | 85 | 3.740509451 | 55 | ... | 9.660924457 | 1 | 4 | 5 | 9 | 46 | 45 | 18.21188006 | J0198 | 0 |
340798 | 1 | 30 | 12.6276525 | 86 | 0.233256336 | 1 | 12 | 85 | 3.740509451 | 55 | ... | 9.660924457 | 1 | 4 | 5 | 9 | 46 | 45 | 18.21188006 | J0199 | 0 |
340799 | 1 | 30 | 12.6276525 | 86 | 0.233256336 | 1 | 12 | 85 | 3.740509451 | 55 | ... | 9.660924457 | 1 | 4 | 5 | 9 | 46 | 45 | 18.21188006 | J0200 | 0 |
340800 rows × 42 columns
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
# Convert every feature cell (all but the last two columns: fund code and
# rank) to float, mapping missing values ('') and extreme sentinels
# (e.g. 999999999 — anything outside [-1000, 1000]) to np.nan.
# The original code did this in two separate passes over the whole list;
# one pass is equivalent and halves the traversal work.
new_customer_list = new_customer_df.values.tolist()
for row in new_customer_list:
    for i in range(len(row) - 2):
        if row[i] == '':
            # Missing value.
            row[i] = np.nan
        else:
            value = float(row[i])
            # Sentinel extremes become NaN so they don't skew the column means.
            row[i] = np.nan if (value > 1000 or value < -1000) else value
1
2
3
4
# Compute the per-column mean of the 40 customer features (NaNs skipped);
# these means are used below to impute the missing values.
# Header list: drop the 3 removed choice columns, keep '选择排序' at the end.
new_customer_df = pd.DataFrame(new_customer_list, columns = new_customer_headers[:-4] +new_customer_headers[-1:])
column_means = new_customer_df.iloc[:,:-2].mean(skipna=True)
print(column_means)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
customer_feature1 6.360880
customer_feature2 36.540284
customer_feature3 17.201055
customer_feature4 105.000593
customer_feature5 0.498709
customer_feature6 1.000000
customer_feature7 9.405694
customer_feature8 100.980251
customer_feature9 16.913861
customer_feature10 34.972123
customer_feature11 0.498026
customer_feature12 6.518365
customer_feature13 35.883363
customer_feature14 10.118418
customer_feature15 16.726703
customer_feature16 47.693500
customer_feature17 16.657743
customer_feature18 48.414837
customer_feature19 46.688786
customer_feature20 9.528190
customer_feature21 10.009301
customer_feature22 14.929339
customer_feature23 49.011289
customer_feature24 35.924674
customer_feature25 9.861468
customer_feature26 0.000000
customer_feature27 45.171292
customer_feature28 9.976480
customer_feature29 45.946405
customer_feature30 2.985731
customer_feature31 1.000000
customer_feature32 49.080166
customer_feature33 10.037611
customer_feature34 1.000000
customer_feature35 6.550535
customer_feature36 4.053039
customer_feature37 6.507101
customer_feature38 49.062278
customer_feature39 48.582398
customer_feature40 46.878168
dtype: float64
1
2
3
4
5
# Impute: replace each remaining NaN feature with its column mean.
for row in new_customer_list:
    for i in range(0, len(row) - 2):
        if np.isnan(row[i]):
            # column_means is a Series labelled by column *name*, so
            # positional access must go through .iloc — plain integer
            # indexing (column_means[i]) is deprecated/removed in pandas 2.x.
            row[i] = column_means.iloc[i]
1
2
# Convert the imputed list back into a DataFrame (same reduced header set).
new_customer_df = pd.DataFrame(new_customer_list, columns = new_customer_headers[:-4] +new_customer_headers[-1:])
1
2
3
4
5
6
# Step 5: standardise the 40 customer features to zero mean / unit
# variance, then re-attach the last two columns (chosen fund, rank).
feature_part = new_customer_df.iloc[:, :-2]
scaled_values = StandardScaler().fit_transform(feature_part)
new_customer_df_scaled = pd.DataFrame(scaled_values, columns=feature_part.columns)
new_customer_df = pd.concat([new_customer_df_scaled, new_customer_df.iloc[:, -2:]], axis=1)
new_customer_df
customer_feature1 | customer_feature2 | customer_feature3 | customer_feature4 | customer_feature5 | customer_feature6 | customer_feature7 | customer_feature8 | customer_feature9 | customer_feature10 | ... | customer_feature33 | customer_feature34 | customer_feature35 | customer_feature36 | customer_feature37 | customer_feature38 | customer_feature39 | customer_feature40 | 选择基金 | 选择排序 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | -1.255141 | 0.022313 | 0.686722 | -0.580099 | 0.696518 | 0.0 | 0.701266 | 1.584568 | -1.094460 | 0.973191 | ... | -0.937251 | 0.0 | 0.130054 | -1.037672 | 1.008106 | 1.113990 | -0.309102 | -0.016149 | J0045 | 1 |
1 | -1.255141 | 0.022313 | 0.686722 | -0.580099 | 0.696518 | 0.0 | 0.701266 | 1.584568 | -1.094460 | 0.973191 | ... | -0.937251 | 0.0 | 0.130054 | -1.037672 | 1.008106 | 1.113990 | -0.309102 | -0.016149 | J0109 | 2 |
2 | -1.255141 | 0.022313 | 0.686722 | -0.580099 | 0.696518 | 0.0 | 0.701266 | 1.584568 | -1.094460 | 0.973191 | ... | -0.937251 | 0.0 | 0.130054 | -1.037672 | 1.008106 | 1.113990 | -0.309102 | -0.016149 | J0063 | 3 |
3 | -1.255141 | 0.022313 | 0.686722 | -0.580099 | 0.696518 | 0.0 | 0.701266 | 1.584568 | -1.094460 | 0.973191 | ... | -0.937251 | 0.0 | 0.130054 | -1.037672 | 1.008106 | 1.113990 | -0.309102 | -0.016149 | J0099 | 4 |
4 | -1.255141 | 0.022313 | 0.686722 | -0.580099 | 0.696518 | 0.0 | 0.701266 | 1.584568 | -1.094460 | 0.973191 | ... | -0.937251 | 0.0 | 0.130054 | -1.037672 | 1.008106 | 1.113990 | -0.309102 | -0.016149 | J0001 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
340795 | -1.542959 | -0.317437 | -0.470141 | -0.324177 | -0.924150 | 0.0 | 0.506162 | -0.272220 | -1.347931 | 0.973191 | ... | -0.067536 | 0.0 | -0.738007 | 0.478624 | 0.719490 | -0.110265 | -0.129023 | -1.097325 | J0196 | 0 |
340796 | -1.542959 | -0.317437 | -0.470141 | -0.324177 | -0.924150 | 0.0 | 0.506162 | -0.272220 | -1.347931 | 0.973191 | ... | -0.067536 | 0.0 | -0.738007 | 0.478624 | 0.719490 | -0.110265 | -0.129023 | -1.097325 | J0197 | 0 |
340797 | -1.542959 | -0.317437 | -0.470141 | -0.324177 | -0.924150 | 0.0 | 0.506162 | -0.272220 | -1.347931 | 0.973191 | ... | -0.067536 | 0.0 | -0.738007 | 0.478624 | 0.719490 | -0.110265 | -0.129023 | -1.097325 | J0198 | 0 |
340798 | -1.542959 | -0.317437 | -0.470141 | -0.324177 | -0.924150 | 0.0 | 0.506162 | -0.272220 | -1.347931 | 0.973191 | ... | -0.067536 | 0.0 | -0.738007 | 0.478624 | 0.719490 | -0.110265 | -0.129023 | -1.097325 | J0199 | 0 |
340799 | -1.542959 | -0.317437 | -0.470141 | -0.324177 | -0.924150 | 0.0 | 0.506162 | -0.272220 | -1.347931 | 0.973191 | ... | -0.067536 | 0.0 | -0.738007 | 0.478624 | 0.719490 | -0.110265 | -0.129023 | -1.097325 | J0200 | 0 |
340800 rows × 42 columns
2. 处理基金数据
1
2
3
4
5
6
7
8
9
10
11
12
# Read the fund-data CSV; as with the customer file, drop the leading
# index column from the header and from every row.
with open('./data/基金数据.csv', 'r', encoding='utf-8') as file:
    csv_reader = csv.reader(file)
    fund_headers = next(csv_reader)[1:]
    fund_list = [row[1:] for row in csv_reader]
fund_df = pd.DataFrame(fund_list, columns=fund_headers)
fund_df.head()
基金代码 | 基金公司 | 基金成立时间 | 净值 | feature1 | feature2 | feature3 | feature4 | feature5 | feature6 | ... | feature21 | feature22 | feature23 | feature24 | feature25 | feature26 | feature27 | feature28 | feature29 | feature30 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | J0001 | 基金公司32 | 2021/10/28 | -3.483643378 | 1 | 6 | 21 | 7 | 3 | 14.0063104 | ... | 1.096101546 | 1.245983443 | 2 | -2.562946447 | 4 | 6 | 12 | 12 | 2.24522696 | -1.74427907 |
1 | J0002 | 基金公司7 | 2023/4/7 | -2.01121916 | 2 | 9 | 19 | 4 | 10 | -3.581032341 | ... | -3.471400767 | 1.557628419 | 5 | -0.183853669 | 6 | 3 | 11 | 9 | 2.491505767 | 2.959278664 |
2 | J0003 | 基金公司32 | 2021/6/24 | 1.54954112 | 3 | 7 | 2 | 8 | 7 | 9.535030668 | ... | -0.223731795 | 2.230497658 | 19 | 8.76975046 | 7 | 12 | 21 | 6 | 2.787080936 | 5.513654932 |
3 | J0004 | 基金公司3 | 2022/11/22 | -3.000416018 | 3 | 20 | 3 | 9 | 4 | -6.844948334 | ... | -5.377407589 | 0.111784599 | 12 | 4.246656241 | 1 | 11 | 19 | 3 | 1.531418859 | 5.766965363 |
4 | J0005 | 基金公司4 | 2023/9/22 | -2.388125665 | 1 | 16 | 8 | 8 | 1 | 9.637912079 | ... | -1.487962684 | 2.326857149 | 6 | 7.025243136 | 2 | 9 | 5 | 2 | 2.884185111 | 2.918918908 |
5 rows × 34 columns
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
# One-hot-encode '基金成立时间' (inception date) as year/month/day features.
# .copy() is required: fund_df.iloc[:, 2:3] returns a view-like slice, and
# assigning new columns to it raises SettingWithCopyWarning / may silently
# fail under pandas copy-on-write.
date = fund_df.iloc[:, 2:3].copy()
# Parse the date string into a datetime column.
date['date'] = pd.to_datetime(date['基金成立时间'])
# Extract year / month / day as separate features.
date['year'] = date['date'].dt.year
date['month'] = date['date'].dt.month
date['day'] = date['date'].dt.day
date_encoded = pd.get_dummies(date, columns=['year', 'month', 'day'])  # 200 rows × 48 columns
# One-hot-encode '基金公司' (fund company).
company = fund_df.iloc[:, 1:2]
company_encoded = pd.get_dummies(company)  # 200 rows × 58 columns
# Merge the encoded features back and move '净值' (NAV) to the last column.
fund_df.drop(['基金公司', '基金成立时间'], axis=1, inplace=True)
fund_df = pd.concat([fund_df, company_encoded, date_encoded], axis=1)
fund_df['净值'] = fund_df.pop('净值')
fund_df
基金代码 | feature1 | feature2 | feature3 | feature4 | feature5 | feature6 | feature7 | feature8 | feature9 | ... | day_23 | day_24 | day_25 | day_26 | day_27 | day_28 | day_29 | day_30 | day_31 | 净值 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | J0001 | 1 | 6 | 21 | 7 | 3 | 14.0063104 | 3 | 3 | 8 | ... | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | -3.483643378 |
1 | J0002 | 2 | 9 | 19 | 4 | 10 | -3.581032341 | 1 | 5 | 10 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -2.01121916 |
2 | J0003 | 3 | 7 | 2 | 8 | 7 | 9.535030668 | 2 | 2 | 3 | ... | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1.54954112 |
3 | J0004 | 3 | 20 | 3 | 9 | 4 | -6.844948334 | 2 | 10 | 10 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -3.000416018 |
4 | J0005 | 1 | 16 | 8 | 8 | 1 | 9.637912079 | 1 | 5 | 3 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -2.388125665 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
195 | J0196 | 3 | 17 | 13 | 5 | 2 | -1.537449111 | 1 | 11 | 9 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 8.075655166 |
196 | J0197 | 3 | 16 | 13 | 9 | 11 | -3.686967574 | 3 | 1 | 3 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 7.724697415 |
197 | J0198 | 2 | 19 | 17 | 1 | 3 | 11.25684561 | 1 | 1 | 8 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -1.767100969 |
198 | J0199 | 2 | 18 | 21 | 3 | 13 | -1.866035653 | 3 | 3 | 7 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -2.661014584 |
199 | J0200 | 2 | 18 | 14 | 1 | 2 | 4.948720634 | 1 | 7 | 8 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4.231475045 |
200 rows × 138 columns
1
2
3
4
5
6
7
8
9
10
# Step: build one fund-information row per (customer, chosen fund) pair,
# aligned with the '选择基金' column of new_customer_df.
# A dict lookup keyed on the fund code replaces the original nested scan:
# O(choices + funds) instead of O(choices * funds) over 340k choices.
fund_list = fund_df.values.tolist()
fund_by_code = {fund_row[0]: fund_row for fund_row in fund_list}
fund_choice = new_customer_df.iloc[:, -2].tolist()
# Unknown codes are skipped, matching the original scan's behaviour.
concat_fund_list = [fund_by_code[code] for code in fund_choice if code in fund_by_code]
concat_fund_df = pd.DataFrame(concat_fund_list, columns=fund_df.columns)
1
2
3
# Standardise the 30 numeric fund features (columns 1..30; column 0 is the
# fund code and the trailing columns are one-hot / NAV).
numeric_part = concat_fund_df.iloc[:, 1:31]
concat_fund_df_scaled = pd.DataFrame(StandardScaler().fit_transform(numeric_part),
                                     columns=numeric_part.columns)
concat_fund_df_scaled
feature1 | feature2 | feature3 | feature4 | feature5 | feature6 | feature7 | feature8 | feature9 | feature10 | ... | feature21 | feature22 | feature23 | feature24 | feature25 | feature26 | feature27 | feature28 | feature29 | feature30 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1.200640 | 0.712252 | -1.076222 | -1.248437 | -0.334457 | 0.956536 | -1.278443 | -0.047607 | 0.958710 | -1.256648 | ... | 0.025146 | 0.877876 | 0.460757 | -0.839008 | 1.528116 | -1.419769 | 1.612192 | 0.510028 | 0.438328 | -0.922319 |
1 | 1.200640 | -0.500829 | -1.231746 | -0.938266 | -0.981792 | -0.654327 | -1.278443 | 0.269773 | 0.958710 | 1.051785 | ... | 0.098985 | -1.406773 | 0.797690 | 1.288207 | -1.517962 | -0.292968 | 0.374425 | -0.568255 | -1.662697 | 0.292734 |
2 | 1.200640 | -0.327532 | -0.765175 | -0.317925 | -0.981792 | 0.304544 | 1.203971 | 0.587154 | 1.619890 | 0.681958 | ... | 0.868716 | -0.049282 | -1.055444 | -1.229350 | 0.512756 | 0.157752 | -0.553901 | 1.588311 | 0.999667 | 0.447293 |
3 | -0.012128 | -1.367315 | 0.634536 | 0.302416 | -0.766014 | -0.291953 | 1.203971 | 0.587154 | 0.958710 | -0.065375 | ... | -0.909839 | -0.816945 | -1.560845 | 0.579045 | 0.005077 | 0.157752 | 0.838587 | 0.078715 | 1.013894 | 0.352525 |
4 | -1.224895 | -1.194018 | 1.412153 | 0.302416 | -0.981792 | 1.769586 | 1.203971 | -0.999748 | 0.628121 | 0.535437 | ... | 1.513066 | -0.372056 | -1.560845 | -0.900752 | 0.005077 | -0.518328 | 0.219704 | 0.725685 | 0.831718 | -1.828897 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
340795 | 1.200640 | 0.712252 | 0.167965 | -0.317925 | -1.197571 | -0.769214 | -1.278443 | 1.539295 | 0.958710 | -0.344894 | ... | 0.909669 | 1.288355 | -1.223911 | -0.251674 | 1.020436 | -0.969049 | 0.838587 | 0.078715 | 1.035668 | 0.077653 |
340796 | 1.200640 | 0.538954 | 0.167965 | 0.922758 | 0.744436 | -1.120300 | 1.203971 | -1.634509 | -1.024828 | -1.678996 | ... | 0.649671 | 0.835020 | 1.303090 | 1.078315 | 0.512756 | 0.833833 | 1.302750 | -1.215225 | 0.154496 | 0.301785 |
340797 | -0.012128 | 1.058846 | 0.790059 | -1.558608 | -0.981792 | 1.320509 | -1.278443 | -1.634509 | 0.628121 | 0.141773 | ... | -0.219062 | 0.936870 | 0.629223 | -0.091423 | -1.517962 | 1.059193 | 0.529146 | -1.646538 | -0.543317 | 0.626878 |
340798 | -0.012128 | 0.885549 | 1.412153 | -0.938266 | 1.175993 | -0.822883 | 1.203971 | -0.999748 | 0.297531 | -0.051350 | ... | -0.315874 | -1.397704 | -1.223911 | 0.448527 | 1.528116 | -0.518328 | -1.327506 | 1.156998 | 0.509574 | -0.429399 |
340799 | -0.012128 | 0.885549 | 0.323489 | -1.558608 | -1.197571 | 0.290188 | -1.278443 | 0.269773 | 0.628121 | -0.465884 | ... | -1.931259 | 1.438801 | -0.044644 | -0.074317 | -1.010282 | -0.518328 | -1.327506 | 0.725685 | 1.391821 | -0.452211 |
340800 rows × 30 columns
1
2
# Re-assemble the fund table: fund code | scaled numeric features |
# remaining one-hot columns and NAV ('净值').
concat_fund_df = pd.concat([concat_fund_df.iloc[:, :1], concat_fund_df_scaled, concat_fund_df.iloc[:, 31:]], axis=1)
concat_fund_df
基金代码 | feature1 | feature2 | feature3 | feature4 | feature5 | feature6 | feature7 | feature8 | feature9 | ... | day_23 | day_24 | day_25 | day_26 | day_27 | day_28 | day_29 | day_30 | day_31 | 净值 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | J0045 | 1.200640 | 0.712252 | -1.076222 | -1.248437 | -0.334457 | 0.956536 | -1.278443 | -0.047607 | 0.958710 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -4.897256487 |
1 | J0109 | 1.200640 | -0.500829 | -1.231746 | -0.938266 | -0.981792 | -0.654327 | -1.278443 | 0.269773 | 0.958710 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -4.547777652 |
2 | J0063 | 1.200640 | -0.327532 | -0.765175 | -0.317925 | -0.981792 | 0.304544 | 1.203971 | 0.587154 | 1.619890 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | -4.306791468 |
3 | J0099 | -0.012128 | -1.367315 | 0.634536 | 0.302416 | -0.766014 | -0.291953 | 1.203971 | 0.587154 | 0.958710 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 12.98199228 |
4 | J0001 | -1.224895 | -1.194018 | 1.412153 | 0.302416 | -0.981792 | 1.769586 | 1.203971 | -0.999748 | 0.628121 | ... | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | -3.483643378 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
340795 | J0196 | 1.200640 | 0.712252 | 0.167965 | -0.317925 | -1.197571 | -0.769214 | -1.278443 | 1.539295 | 0.958710 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 8.075655166 |
340796 | J0197 | 1.200640 | 0.538954 | 0.167965 | 0.922758 | 0.744436 | -1.120300 | 1.203971 | -1.634509 | -1.024828 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 7.724697415 |
340797 | J0198 | -0.012128 | 1.058846 | 0.790059 | -1.558608 | -0.981792 | 1.320509 | -1.278443 | -1.634509 | 0.628121 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -1.767100969 |
340798 | J0199 | -0.012128 | 0.885549 | 1.412153 | -0.938266 | 1.175993 | -0.822883 | 1.203971 | -0.999748 | 0.297531 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -2.661014584 |
340799 | J0200 | -0.012128 | 0.885549 | 0.323489 | -1.558608 | -1.197571 | 0.290188 | -1.278443 | 0.269773 | 0.628121 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4.231475045 |
340800 rows × 138 columns
3.合并客户数据和基金数据
1
2
# Step 6: join the customer table and the per-choice fund table
# column-wise (both are aligned row-for-row by construction).
df = pd.concat([new_customer_df, concat_fund_df], axis=1)
1
2
3
4
5
6
7
# Drop identifier / leftover columns (axis=1 means columns).
df = df.drop(['选择基金', '基金代码', '基金成立时间', 'date'], axis=1)
# Move the target column '选择排序' to the end.
df['选择排序'] = df.pop('选择排序')
# Prefix the 30 fund feature columns (positions 40..69) with 'fund_';
# a single rename mapping is equivalent to renaming them one by one.
df = df.rename(columns={name: 'fund_' + name for name in df.columns[40:70]})
1
2
# 输出csv
# df.to_csv('output.csv', encoding='utf-8')
4.预测
1
2
3
4
5
6
7
8
9
from sklearn.model_selection import train_test_split
# Separate the features from the target column '选择排序' (choice rank),
# then hold out 20% of the rows as a test set.
y = df['选择排序']
X = df.drop('选择排序', axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
1
2
3
4
5
6
7
8
9
# Perceptron model — linear online baseline for the 0-4 rank target.
from sklearn.linear_model import Perceptron
perceptron_classifier = Perceptron(random_state=42, verbose=True, max_iter=1000)
perceptron_classifier.fit(X_train, y_train)
y_pred_pc = perceptron_classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred_pc)
print(f"Accuracy using Perceptron: {accuracy}, y_pred is {y_pred_pc}")
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
-- Epoch 1
Norm: 92.46, NNZs: 167, Bias: 68.000000, T: 272640, Avg. loss: 1.252741
Total training time: 0.09 seconds.
-- Epoch 2
Norm: 106.06, NNZs: 164, Bias: 68.000000, T: 545280, Avg. loss: 1.228747
Total training time: 0.25 seconds.
-- Epoch 3
Norm: 121.13, NNZs: 166, Bias: 67.000000, T: 817920, Avg. loss: 1.230478
Total training time: 0.37 seconds.
-- Epoch 4
Norm: 126.41, NNZs: 165, Bias: 63.000000, T: 1090560, Avg. loss: 1.230669
Total training time: 0.53 seconds.
-- Epoch 5
Norm: 120.86, NNZs: 167, Bias: 62.000000, T: 1363200, Avg. loss: 1.225637
Total training time: 0.68 seconds.
-- Epoch 6
Norm: 128.14, NNZs: 170, Bias: 69.000000, T: 1635840, Avg. loss: 1.224242
Total training time: 0.79 seconds.
-- Epoch 7
Norm: 125.84, NNZs: 167, Bias: 61.000000, T: 1908480, Avg. loss: 1.222632
Total training time: 0.89 seconds.
-- Epoch 8
Norm: 123.26, NNZs: 163, Bias: 69.000000, T: 2181120, Avg. loss: 1.218295
Total training time: 1.00 seconds.
-- Epoch 9
Norm: 126.76, NNZs: 167, Bias: 64.000000, T: 2453760, Avg. loss: 1.227991
Total training time: 1.10 seconds.
-- Epoch 10
Norm: 131.06, NNZs: 169, Bias: 65.000000, T: 2726400, Avg. loss: 1.234568
Total training time: 1.23 seconds.
-- Epoch 11
Norm: 132.18, NNZs: 166, Bias: 65.000000, T: 2999040, Avg. loss: 1.217471
Total training time: 1.33 seconds.
-- Epoch 12
Norm: 131.57, NNZs: 165, Bias: 63.000000, T: 3271680, Avg. loss: 1.235366
Total training time: 1.44 seconds.
-- Epoch 13
Norm: 136.27, NNZs: 166, Bias: 63.000000, T: 3544320, Avg. loss: 1.225213
Total training time: 1.55 seconds.
Convergence after 13 epochs took 1.55 seconds
-- Epoch 1
Norm: 78.94, NNZs: 162, Bias: -60.000000, T: 272640, Avg. loss: 0.565198
Total training time: 0.13 seconds.
-- Epoch 2
Norm: 93.65, NNZs: 167, Bias: -60.000000, T: 545280, Avg. loss: 0.548994
Total training time: 0.28 seconds.
-- Epoch 3
Norm: 107.45, NNZs: 167, Bias: -63.000000, T: 817920, Avg. loss: 0.543448
Total training time: 0.38 seconds.
-- Epoch 4
Norm: 112.42, NNZs: 162, Bias: -66.000000, T: 1090560, Avg. loss: 0.542047
Total training time: 0.49 seconds.
-- Epoch 5
Norm: 115.04, NNZs: 164, Bias: -62.000000, T: 1363200, Avg. loss: 0.550648
Total training time: 0.60 seconds.
-- Epoch 6
Norm: 128.83, NNZs: 165, Bias: -66.000000, T: 1635840, Avg. loss: 0.540281
Total training time: 0.71 seconds.
-- Epoch 7
Norm: 132.78, NNZs: 169, Bias: -63.000000, T: 1908480, Avg. loss: 0.542232
Total training time: 0.82 seconds.
-- Epoch 8
Norm: 136.39, NNZs: 167, Bias: -64.000000, T: 2181120, Avg. loss: 0.546548
Total training time: 0.92 seconds.
-- Epoch 9
Norm: 137.78, NNZs: 166, Bias: -70.000000, T: 2453760, Avg. loss: 0.539798
Total training time: 1.04 seconds.
-- Epoch 10
Norm: 143.50, NNZs: 166, Bias: -69.000000, T: 2726400, Avg. loss: 0.544055
Total training time: 1.27 seconds.
-- Epoch 11
Norm: 143.70, NNZs: 167, Bias: -64.000000, T: 2999040, Avg. loss: 0.543328
Total training time: 1.36 seconds.
Convergence after 11 epochs took 1.36 seconds
-- Epoch 1
Norm: 87.99, NNZs: 162, Bias: -65.000000, T: 272640, Avg. loss: 0.567236
Total training time: 0.12 seconds.
-- Epoch 2
Norm: 93.60, NNZs: 166, Bias: -65.000000, T: 545280, Avg. loss: 0.552740
Total training time: 0.23 seconds.
-- Epoch 3
Norm: 105.11, NNZs: 162, Bias: -63.000000, T: 817920, Avg. loss: 0.552061
Total training time: 0.32 seconds.
-- Epoch 4
Norm: 110.40, NNZs: 164, Bias: -73.000000, T: 1090560, Avg. loss: 0.544508
Total training time: 0.48 seconds.
-- Epoch 5
Norm: 111.62, NNZs: 167, Bias: -67.000000, T: 1363200, Avg. loss: 0.551475
Total training time: 0.59 seconds.
-- Epoch 6
Norm: 112.71, NNZs: 169, Bias: -65.000000, T: 1635840, Avg. loss: 0.544028
Total training time: 0.68 seconds.
-- Epoch 7
Norm: 117.17, NNZs: 169, Bias: -64.000000, T: 1908480, Avg. loss: 0.547210
Total training time: 0.82 seconds.
-- Epoch 8
Norm: 117.80, NNZs: 168, Bias: -63.000000, T: 2181120, Avg. loss: 0.546045
Total training time: 0.92 seconds.
-- Epoch 9
Norm: 125.14, NNZs: 170, Bias: -69.000000, T: 2453760, Avg. loss: 0.545288
Total training time: 1.05 seconds.
Convergence after 9 epochs took 1.05 seconds
-- Epoch 1
Norm: 69.14, NNZs: 156, Bias: -65.000000, T: 272640, Avg. loss: 0.151736
Total training time: 0.12 seconds.
-- Epoch 2
Norm: 79.10, NNZs: 162, Bias: -72.000000, T: 545280, Avg. loss: 0.138197
Total training time: 0.26 seconds.
-- Epoch 3
Norm: 79.31, NNZs: 164, Bias: -70.000000, T: 817920, Avg. loss: 0.140124
Total training time: 0.37 seconds.
-- Epoch 4
Norm: 82.79, NNZs: 165, Bias: -68.000000, T: 1090560, Avg. loss: 0.137800
Total training time: 0.49 seconds.
-- Epoch 5
Norm: 90.31, NNZs: 166, Bias: -68.000000, T: 1363200, Avg. loss: 0.137556
Total training time: 0.59 seconds.
-- Epoch 6
Norm: 89.96, NNZs: 163, Bias: -69.000000, T: 1635840, Avg. loss: 0.137137
Total training time: 0.71 seconds.
-- Epoch 7
Norm: 94.54, NNZs: 165, Bias: -66.000000, T: 1908480, Avg. loss: 0.139736
Total training time: 0.83 seconds.
Convergence after 7 epochs took 0.83 seconds
-- Epoch 1
Norm: 61.09, NNZs: 148, Bias: -69.000000, T: 272640, Avg. loss: 0.029810
Total training time: 0.14 seconds.
-- Epoch 2
Norm: 69.45, NNZs: 148, Bias: -70.000000, T: 545280, Avg. loss: 0.028391
Total training time: 0.25 seconds.
-- Epoch 3
Norm: 74.09, NNZs: 157, Bias: -71.000000, T: 817920, Avg. loss: 0.025827
Total training time: 0.38 seconds.
-- Epoch 4
Norm: 78.66, NNZs: 162, Bias: -71.000000, T: 1090560, Avg. loss: 0.025156
Total training time: 0.50 seconds.
-- Epoch 5
Norm: 85.40, NNZs: 163, Bias: -71.000000, T: 1363200, Avg. loss: 0.024645
Total training time: 0.61 seconds.
-- Epoch 6
Norm: 86.81, NNZs: 167, Bias: -71.000000, T: 1635840, Avg. loss: 0.024701
Total training time: 0.74 seconds.
-- Epoch 7
Norm: 93.31, NNZs: 164, Bias: -74.000000, T: 1908480, Avg. loss: 0.023552
Total training time: 0.85 seconds.
-- Epoch 8
Norm: 94.23, NNZs: 166, Bias: -70.000000, T: 2181120, Avg. loss: 0.025229
Total training time: 0.96 seconds.
-- Epoch 9
Norm: 95.97, NNZs: 166, Bias: -74.000000, T: 2453760, Avg. loss: 0.023290
Total training time: 1.08 seconds.
-- Epoch 10
Norm: 100.04, NNZs: 166, Bias: -74.000000, T: 2726400, Avg. loss: 0.023572
Total training time: 1.18 seconds.
-- Epoch 11
Norm: 102.60, NNZs: 163, Bias: -74.000000, T: 2999040, Avg. loss: 0.023608
Total training time: 1.31 seconds.
-- Epoch 12
Norm: 104.93, NNZs: 165, Bias: -74.000000, T: 3271680, Avg. loss: 0.023670
Total training time: 1.47 seconds.
Convergence after 12 epochs took 1.47 seconds
Accuracy using Perceptron: 0.9211707746478873, y_pred is [0 0 0 ... 0 2 0]
1
2
3
4
5
6
7
8
9
10
# Logistic-regression model (liblinear solver => one-vs-rest multiclass).
# NOTE(review): the label distribution is ~98.8% class 0, so plain accuracy
# is dominated by the majority class — per-class metrics or a confusion
# matrix would be more informative here.
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(multi_class='auto', solver='liblinear', max_iter=1000, verbose=1)
logreg.fit(X_train, y_train)
y_pred_lr = logreg.predict(X_test)
accuracy = accuracy_score(y_test, y_pred_lr)
print(f"Accuracy using logistic regression: {accuracy}, y_pred is {y_pred_lr}")
1
[LibLinear]Accuracy using logistic regression: 0.9883656103286385, y_pred is [0 0 0 ... 0 0 0]
1
2
3
4
5
6
7
8
9
10
# Random-forest model: 500 trees, all CPU cores (n_jobs=-1).
# NOTE(review): its accuracy equals logistic regression's exactly — both
# likely predict the majority class 0 almost everywhere (see class-imbalance
# note on the logistic-regression cell).
from sklearn.ensemble import RandomForestClassifier
rf_classifier = RandomForestClassifier(random_state=42, n_estimators=500, n_jobs = -1, verbose=1)
rf_classifier.fit(X_train, y_train)
y_pred_rf = rf_classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred_rf)
print(f"Accuracy using RandomForest: {accuracy}, y_pred is {y_pred_rf}")
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done 18 tasks | elapsed: 10.3s
[Parallel(n_jobs=-1)]: Done 168 tasks | elapsed: 1.1min
[Parallel(n_jobs=-1)]: Done 418 tasks | elapsed: 2.6min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed: 3.0min finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done 18 tasks | elapsed: 0.1s
[Parallel(n_jobs=16)]: Done 168 tasks | elapsed: 0.9s
[Parallel(n_jobs=16)]: Done 418 tasks | elapsed: 2.2s
Accuracy using RandomForest: 0.9883656103286385, y_pred is [0 0 0 ... 0 0 0]
[Parallel(n_jobs=16)]: Done 500 out of 500 | elapsed: 2.6s finished
1
2
3
4
5
6
7
8
9
10
# # SVM分类
# from sklearn.svm import SVC
# svm_classifier = SVC(decision_function_shape='ovr', random_state=42)
# svm_classifier.fit(X_train, y_train)
# y_pred = svm_classifier.predict(X_test)
# accuracy = accuracy_score(y_test, y_pred)
# print(f"Accuracy: {accuracy}")
This post is licensed under CC BY 4.0 by the author.
Comments powered by Disqus.