中信技术考察之基金推荐模型
1
2
3
4
5
6
import csv
import copy
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
1. 处理客户数据
1
2
3
4
5
6
7
8
9
10
11
12
# Read the customer-data CSV; the first column of every row is a running
# index that we drop, keeping the remaining 47 columns.
with open('./data/客户数据.csv', 'r', encoding='utf-8') as file:
    csv_reader = csv.reader(file)
    # First row is the header (index column stripped).
    customer_headers = next(csv_reader)[1:]
    # Remaining rows: strip the index column from each.
    customer_list = [row[1:] for row in csv_reader]
customer_df = pd.DataFrame(customer_list, columns=customer_headers)
customer_df
客户编号 | 客户公司名称 | 客户公司编号 | feature1 | feature2 | feature3 | feature4 | feature5 | feature6 | feature7 | ... | feature35 | feature36 | feature37 | feature38 | feature39 | feature40 | 第一选择基金 | 第二选择基金 | 第三选择基金 | 第四选择基金 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | C0001 | 公司20 | 4040100 | 9 | 9 | 15.44036767 | 193 | 0.166081948 | 1 | 11 | ... | 1 | 3 | 10 | 43 | 56 | 44.99447821 | ||||
1 | C0002 | 公司3 | 90601 | 10 | 69 | 24.19789929 | 86 | 0.327464351 | 1 | 17 | ... | 8 | 3 | 11 | 22 | 74 | 54.38582584 | ||||
2 | C0003 | 公司18 | 3272481 | 3 | 61 | 6.773532386 | 201 | 0.482634837 | 1 | 2 | ... | 5 | 6 | 10 | 44 | 19 | 9.35864196 | ||||
3 | C0004 | 公司9 | 817216 | 3 | 9 | 6.375025793 | 92 | 0.545685448 | 1 | 3 | ... | 10 | 5 | 3 | 71 | 82 | 55.20456675 | ||||
4 | C0005 | 公司6 | 363609 | 3 | 65 | 4.183100481 | 195 | 0.657033027 | 1 | 16 | ... | 3 | 7 | 11 | 3 | 31 | 41.24379255 | ||||
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
11995 | C11996 | 公司16 | 2585664 | 7 | 37 | 7.110256464 | 173 | 0.320702523 | 1 | 4 | ... | 6 | 6 | 12 | 75 | 6 | 86.15755992 | ||||
11996 | C11997 | 公司4 | 161604 | 5 | 60 | 10.05886805 | 74 | 0.986376089 | 1 | 6 | ... | 12 | 4 | 12 | 82 | 16 | 11.28849184 | ||||
11997 | C11998 | 公司19 | 3644281 | 4 | 37 | 25.60124899 | 66 | 0.990199579 | 1 | 14 | ... | 8 | 5 | 4 | 34 | 85 | 48.17353183 | ||||
11998 | C11999 | 公司7 | 494209 | 10 | 60 | 30.56264901 | 34 | 0.775651603 | 1 | 6 | ... | 12 | 7 | 10 | 18 | 6.35029667 | J0076 | ||||
11999 | C12000 | 公司14 | 1979649 | 1 | 30 | 12.6276525 | 86 | 0.233256336 | 1 | 12 | ... | 4 | 5 | 9 | 46 | 45 | 18.21188006 | J0139 | J0034 |
12000 rows × 47 columns
1
2
3
4
5
# Step 1: keep only rows usable for training — those whose first fund
# choice (4th-from-last column) is filled with a fund code ('J....').
customer_list = customer_df.values.tolist()
print(len(customer_list))
customer_list = [record for record in customer_list if record[-4].startswith('J')]
print(len(customer_list))
1
2
12000
4780
1
2
# Step 2: drop the customer/company identifier columns (first three),
# keeping only the 40 features plus the four choice columns.
customer_list = [record[3:] for record in customer_list]
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
# Step 3: data augmentation by replication.
# 4780 customers have at least a first-choice fund; pairing each with all
# 200 funds would give ~1,000,000 rows. To cut training cost, only
# customers with at least two chosen funds are expanded (1704 customers,
# 200 rows each).
new_customer_list = []
# All 200 fund codes: 'J0001' .. 'J0200'.
fund_codes = ['J{:04d}'.format(i) for i in range(1, 201)]

def _expand(row, n_choices):
    """Return the 200 training records for one customer.

    The customer's chosen funds (the first `n_choices` of the four choice
    columns) get labels 1..n_choices (preference rank); every other fund
    code gets label 0. The fund code is written into position -4 of a deep
    copy of the row and the label is appended at the end.
    """
    records = []
    start = len(row) - 4                 # index of the first choice column
    chosen = row[start:start + n_choices]
    # One record per chosen fund, labelled with its preference rank.
    for rank in range(1, n_choices + 1):
        tmp = copy.deepcopy(row)
        tmp[-4] = row[-5 + rank]
        tmp.append(rank)
        records.append(tmp)
    # One record per non-chosen fund, labelled 0.
    for code in fund_codes:
        if code not in chosen:
            tmp = copy.deepcopy(row)
            tmp[-4] = code
            tmp.append(0)
            records.append(tmp)
    return records

# First pass: customers with all four choices filled (17800 = 89 * 200).
for row in customer_list:
    if row[-1][0] == 'J':
        # e.g. ['J0116', 'J0029', 'J0133', 'J0156']
        new_customer_list.extend(_expand(row, 4))
# Remove a known bad record — source row 10753 of 客户数据.csv has choices
# [J0163 J0012 J0139 J0012] where the 2nd and 4th choices duplicate.
del new_customer_list[16403]
# Second pass: customers with exactly two or exactly three choices.
for row in customer_list:
    # Exactly two choices (256000 = 1280 * 200), e.g. ['J0132', 'J0081', ' ', ' '].
    if row[-3][0] == 'J' and row[-2][0] != 'J' and row[-1][0] != 'J':
        new_customer_list.extend(_expand(row, 2))
    # Exactly three choices (67000 = 335 * 200), e.g. ['J0188', 'J0191', 'J0068', ' '].
    if row[-2][0] == 'J' and row[-1][0] != 'J':
        new_customer_list.extend(_expand(row, 3))
print(f'一共生成{len(new_customer_list)}条数据')
1
一共生成340800条数据
1
2
3
4
5
6
7
8
# Rebuild the header row: prefix the 40 customer features with 'customer_',
# rename column 40 to '选择基金' (chosen fund), and append the target
# column '选择排序' (choice rank) at the end.
new_customer_headers = customer_headers[3:] + ['选择排序']
new_customer_headers[:40] = ['customer_' + name for name in new_customer_headers[:40]]
new_customer_headers[40] = '选择基金'
new_customer_df = pd.DataFrame(new_customer_list, columns=new_customer_headers)
# Columns -4..-2 are the leftover 2nd/3rd/4th-choice columns; drop them.
new_customer_df.drop(new_customer_df.columns[-4:-1], axis=1, inplace=True)
new_customer_df
customer_feature1 | customer_feature2 | customer_feature3 | customer_feature4 | customer_feature5 | customer_feature6 | customer_feature7 | customer_feature8 | customer_feature9 | customer_feature10 | ... | customer_feature33 | customer_feature34 | customer_feature35 | customer_feature36 | customer_feature37 | customer_feature38 | customer_feature39 | customer_feature40 | 选择基金 | 选择排序 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2 | 37 | 23.88129863 | 71 | 0.698777259 | 1 | 13 | 194 | 6.217688451 | 55 | ... | 4.810028598 | 1 | 7 | 2 | 10 | 80 | 40 | 46.45630119 | J0045 | 1 |
1 | 2 | 37 | 23.88129863 | 71 | 0.698777259 | 1 | 13 | 194 | 6.217688451 | 55 | ... | 4.810028598 | 1 | 7 | 2 | 10 | 80 | 40 | 46.45630119 | J0109 | 2 |
2 | 2 | 37 | 23.88129863 | 71 | 0.698777259 | 1 | 13 | 194 | 6.217688451 | 55 | ... | 4.810028598 | 1 | 7 | 2 | 10 | 80 | 40 | 46.45630119 | J0063 | 3 |
3 | 2 | 37 | 23.88129863 | 71 | 0.698777259 | 1 | 13 | 194 | 6.217688451 | 55 | ... | 4.810028598 | 1 | 7 | 2 | 10 | 80 | 40 | 46.45630119 | J0099 | 4 |
4 | 2 | 37 | 23.88129863 | 71 | 0.698777259 | 1 | 13 | 194 | 6.217688451 | 55 | ... | 4.810028598 | 1 | 7 | 2 | 10 | 80 | 40 | 46.45630119 | J0001 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
340795 | 1 | 30 | 12.6276525 | 86 | 0.233256336 | 1 | 12 | 85 | 3.740509451 | 55 | ... | 9.660924457 | 1 | 4 | 5 | 9 | 46 | 45 | 18.21188006 | J0196 | 0 |
340796 | 1 | 30 | 12.6276525 | 86 | 0.233256336 | 1 | 12 | 85 | 3.740509451 | 55 | ... | 9.660924457 | 1 | 4 | 5 | 9 | 46 | 45 | 18.21188006 | J0197 | 0 |
340797 | 1 | 30 | 12.6276525 | 86 | 0.233256336 | 1 | 12 | 85 | 3.740509451 | 55 | ... | 9.660924457 | 1 | 4 | 5 | 9 | 46 | 45 | 18.21188006 | J0198 | 0 |
340798 | 1 | 30 | 12.6276525 | 86 | 0.233256336 | 1 | 12 | 85 | 3.740509451 | 55 | ... | 9.660924457 | 1 | 4 | 5 | 9 | 46 | 45 | 18.21188006 | J0199 | 0 |
340799 | 1 | 30 | 12.6276525 | 86 | 0.233256336 | 1 | 12 | 85 | 3.740509451 | 55 | ... | 9.660924457 | 1 | 4 | 5 | 9 | 46 | 45 | 18.21188006 | J0200 | 0 |
340800 rows × 42 columns
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
# Convert every feature cell (all but the last two columns: fund code and
# rank) to float, mapping missing values ('') and extreme sentinels
# (e.g. 999999999 — anything outside [-1000, 1000]) to np.nan.
# The original code did this in two separate passes over the whole list;
# one pass is equivalent and halves the traversal work.
new_customer_list = new_customer_df.values.tolist()
for row in new_customer_list:
    for i in range(len(row) - 2):
        if row[i] == '':
            # Missing value.
            row[i] = np.nan
        else:
            value = float(row[i])
            # Sentinel extremes become NaN so they don't skew the column means.
            row[i] = np.nan if (value > 1000 or value < -1000) else value
1
2
3
4
# Compute the per-column mean of the 40 customer features (NaNs skipped);
# these means are used below to impute the missing values.
# Header list: drop the 3 removed choice columns, keep '选择排序' at the end.
new_customer_df = pd.DataFrame(new_customer_list, columns = new_customer_headers[:-4] +new_customer_headers[-1:])
column_means = new_customer_df.iloc[:,:-2].mean(skipna=True)
print(column_means)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
customer_feature1 6.360880
customer_feature2 36.540284
customer_feature3 17.201055
customer_feature4 105.000593
customer_feature5 0.498709
customer_feature6 1.000000
customer_feature7 9.405694
customer_feature8 100.980251
customer_feature9 16.913861
customer_feature10 34.972123
customer_feature11 0.498026
customer_feature12 6.518365
customer_feature13 35.883363
customer_feature14 10.118418
customer_feature15 16.726703
customer_feature16 47.693500
customer_feature17 16.657743
customer_feature18 48.414837
customer_feature19 46.688786
customer_feature20 9.528190
customer_feature21 10.009301
customer_feature22 14.929339
customer_feature23 49.011289
customer_feature24 35.924674
customer_feature25 9.861468
customer_feature26 0.000000
customer_feature27 45.171292
customer_feature28 9.976480
customer_feature29 45.946405
customer_feature30 2.985731
customer_feature31 1.000000
customer_feature32 49.080166
customer_feature33 10.037611
customer_feature34 1.000000
customer_feature35 6.550535
customer_feature36 4.053039
customer_feature37 6.507101
customer_feature38 49.062278
customer_feature39 48.582398
customer_feature40 46.878168
dtype: float64
1
2
3
4
5
# Impute: replace each remaining NaN feature with its column mean.
for row in new_customer_list:
    for i in range(0, len(row) - 2):
        if np.isnan(row[i]):
            # column_means is a Series labelled by column *name*, so
            # positional access must go through .iloc — plain integer
            # indexing (column_means[i]) is deprecated/removed in pandas 2.x.
            row[i] = column_means.iloc[i]
1
2
# Convert the imputed list back into a DataFrame (same reduced header set).
new_customer_df = pd.DataFrame(new_customer_list, columns = new_customer_headers[:-4] +new_customer_headers[-1:])
1
2
3
4
5
6
# Step 5: standardise the 40 customer features to zero mean / unit
# variance, then re-attach the last two columns (chosen fund, rank).
feature_part = new_customer_df.iloc[:, :-2]
scaled_values = StandardScaler().fit_transform(feature_part)
new_customer_df_scaled = pd.DataFrame(scaled_values, columns=feature_part.columns)
new_customer_df = pd.concat([new_customer_df_scaled, new_customer_df.iloc[:, -2:]], axis=1)
new_customer_df
customer_feature1 | customer_feature2 | customer_feature3 | customer_feature4 | customer_feature5 | customer_feature6 | customer_feature7 | customer_feature8 | customer_feature9 | customer_feature10 | ... | customer_feature33 | customer_feature34 | customer_feature35 | customer_feature36 | customer_feature37 | customer_feature38 | customer_feature39 | customer_feature40 | 选择基金 | 选择排序 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | -1.255141 | 0.022313 | 0.686722 | -0.580099 | 0.696518 | 0.0 | 0.701266 | 1.584568 | -1.094460 | 0.973191 | ... | -0.937251 | 0.0 | 0.130054 | -1.037672 | 1.008106 | 1.113990 | -0.309102 | -0.016149 | J0045 | 1 |
1 | -1.255141 | 0.022313 | 0.686722 | -0.580099 | 0.696518 | 0.0 | 0.701266 | 1.584568 | -1.094460 | 0.973191 | ... | -0.937251 | 0.0 | 0.130054 | -1.037672 | 1.008106 | 1.113990 | -0.309102 | -0.016149 | J0109 | 2 |
2 | -1.255141 | 0.022313 | 0.686722 | -0.580099 | 0.696518 | 0.0 | 0.701266 | 1.584568 | -1.094460 | 0.973191 | ... | -0.937251 | 0.0 | 0.130054 | -1.037672 | 1.008106 | 1.113990 | -0.309102 | -0.016149 | J0063 | 3 |
3 | -1.255141 | 0.022313 | 0.686722 | -0.580099 | 0.696518 | 0.0 | 0.701266 | 1.584568 | -1.094460 | 0.973191 | ... | -0.937251 | 0.0 | 0.130054 | -1.037672 | 1.008106 | 1.113990 | -0.309102 | -0.016149 | J0099 | 4 |
4 | -1.255141 | 0.022313 | 0.686722 | -0.580099 | 0.696518 | 0.0 | 0.701266 | 1.584568 | -1.094460 | 0.973191 | ... | -0.937251 | 0.0 | 0.130054 | -1.037672 | 1.008106 | 1.113990 | -0.309102 | -0.016149 | J0001 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
340795 | -1.542959 | -0.317437 | -0.470141 | -0.324177 | -0.924150 | 0.0 | 0.506162 | -0.272220 | -1.347931 | 0.973191 | ... | -0.067536 | 0.0 | -0.738007 | 0.478624 | 0.719490 | -0.110265 | -0.129023 | -1.097325 | J0196 | 0 |
340796 | -1.542959 | -0.317437 | -0.470141 | -0.324177 | -0.924150 | 0.0 | 0.506162 | -0.272220 | -1.347931 | 0.973191 | ... | -0.067536 | 0.0 | -0.738007 | 0.478624 | 0.719490 | -0.110265 | -0.129023 | -1.097325 | J0197 | 0 |
340797 | -1.542959 | -0.317437 | -0.470141 | -0.324177 | -0.924150 | 0.0 | 0.506162 | -0.272220 | -1.347931 | 0.973191 | ... | -0.067536 | 0.0 | -0.738007 | 0.478624 | 0.719490 | -0.110265 | -0.129023 | -1.097325 | J0198 | 0 |
340798 | -1.542959 | -0.317437 | -0.470141 | -0.324177 | -0.924150 | 0.0 | 0.506162 | -0.272220 | -1.347931 | 0.973191 | ... | -0.067536 | 0.0 | -0.738007 | 0.478624 | 0.719490 | -0.110265 | -0.129023 | -1.097325 | J0199 | 0 |
340799 | -1.542959 | -0.317437 | -0.470141 | -0.324177 | -0.924150 | 0.0 | 0.506162 | -0.272220 | -1.347931 | 0.973191 | ... | -0.067536 | 0.0 | -0.738007 | 0.478624 | 0.719490 | -0.110265 | -0.129023 | -1.097325 | J0200 | 0 |
340800 rows × 42 columns
2. 处理基金数据
1
2
3
4
5
6
7
8
9
10
11
12
# Read the fund-data CSV; as with the customer file, drop the leading
# index column from the header and from every row.
with open('./data/基金数据.csv', 'r', encoding='utf-8') as file:
    csv_reader = csv.reader(file)
    fund_headers = next(csv_reader)[1:]
    fund_list = [row[1:] for row in csv_reader]
fund_df = pd.DataFrame(fund_list, columns=fund_headers)
fund_df.head()
基金代码 | 基金公司 | 基金成立时间 | 净值 | feature1 | feature2 | feature3 | feature4 | feature5 | feature6 | ... | feature21 | feature22 | feature23 | feature24 | feature25 | feature26 | feature27 | feature28 | feature29 | feature30 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | J0001 | 基金公司32 | 2021/10/28 | -3.483643378 | 1 | 6 | 21 | 7 | 3 | 14.0063104 | ... | 1.096101546 | 1.245983443 | 2 | -2.562946447 | 4 | 6 | 12 | 12 | 2.24522696 | -1.74427907 |
1 | J0002 | 基金公司7 | 2023/4/7 | -2.01121916 | 2 | 9 | 19 | 4 | 10 | -3.581032341 | ... | -3.471400767 | 1.557628419 | 5 | -0.183853669 | 6 | 3 | 11 | 9 | 2.491505767 | 2.959278664 |
2 | J0003 | 基金公司32 | 2021/6/24 | 1.54954112 | 3 | 7 | 2 | 8 | 7 | 9.535030668 | ... | -0.223731795 | 2.230497658 | 19 | 8.76975046 | 7 | 12 | 21 | 6 | 2.787080936 | 5.513654932 |
3 | J0004 | 基金公司3 | 2022/11/22 | -3.000416018 | 3 | 20 | 3 | 9 | 4 | -6.844948334 | ... | -5.377407589 | 0.111784599 | 12 | 4.246656241 | 1 | 11 | 19 | 3 | 1.531418859 | 5.766965363 |
4 | J0005 | 基金公司4 | 2023/9/22 | -2.388125665 | 1 | 16 | 8 | 8 | 1 | 9.637912079 | ... | -1.487962684 | 2.326857149 | 6 | 7.025243136 | 2 | 9 | 5 | 2 | 2.884185111 | 2.918918908 |
5 rows × 34 columns
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
# One-hot-encode '基金成立时间' (inception date) as year/month/day features.
# .copy() is required: fund_df.iloc[:, 2:3] returns a view-like slice, and
# assigning new columns to it raises SettingWithCopyWarning / may silently
# fail under pandas copy-on-write.
date = fund_df.iloc[:, 2:3].copy()
# Parse the date string into a datetime column.
date['date'] = pd.to_datetime(date['基金成立时间'])
# Extract year / month / day as separate features.
date['year'] = date['date'].dt.year
date['month'] = date['date'].dt.month
date['day'] = date['date'].dt.day
date_encoded = pd.get_dummies(date, columns=['year', 'month', 'day'])  # 200 rows × 48 columns
# One-hot-encode '基金公司' (fund company).
company = fund_df.iloc[:, 1:2]
company_encoded = pd.get_dummies(company)  # 200 rows × 58 columns
# Merge the encoded features back and move '净值' (NAV) to the last column.
fund_df.drop(['基金公司', '基金成立时间'], axis=1, inplace=True)
fund_df = pd.concat([fund_df, company_encoded, date_encoded], axis=1)
fund_df['净值'] = fund_df.pop('净值')
fund_df
基金代码 | feature1 | feature2 | feature3 | feature4 | feature5 | feature6 | feature7 | feature8 | feature9 | ... | day_23 | day_24 | day_25 | day_26 | day_27 | day_28 | day_29 | day_30 | day_31 | 净值 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | J0001 | 1 | 6 | 21 | 7 | 3 | 14.0063104 | 3 | 3 | 8 | ... | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | -3.483643378 |
1 | J0002 | 2 | 9 | 19 | 4 | 10 | -3.581032341 | 1 | 5 | 10 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -2.01121916 |
2 | J0003 | 3 | 7 | 2 | 8 | 7 | 9.535030668 | 2 | 2 | 3 | ... | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1.54954112 |
3 | J0004 | 3 | 20 | 3 | 9 | 4 | -6.844948334 | 2 | 10 | 10 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -3.000416018 |
4 | J0005 | 1 | 16 | 8 | 8 | 1 | 9.637912079 | 1 | 5 | 3 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -2.388125665 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
195 | J0196 | 3 | 17 | 13 | 5 | 2 | -1.537449111 | 1 | 11 | 9 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 8.075655166 |
196 | J0197 | 3 | 16 | 13 | 9 | 11 | -3.686967574 | 3 | 1 | 3 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 7.724697415 |
197 | J0198 | 2 | 19 | 17 | 1 | 3 | 11.25684561 | 1 | 1 | 8 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -1.767100969 |
198 | J0199 | 2 | 18 | 21 | 3 | 13 | -1.866035653 | 3 | 3 | 7 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -2.661014584 |
199 | J0200 | 2 | 18 | 14 | 1 | 2 | 4.948720634 | 1 | 7 | 8 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4.231475045 |
200 rows × 138 columns
1
2
3
4
5
6
7
8
9
10
# Step: build one fund-information row per (customer, chosen fund) pair,
# aligned with the '选择基金' column of new_customer_df.
# A dict lookup keyed on the fund code replaces the original nested scan:
# O(choices + funds) instead of O(choices * funds) over 340k choices.
fund_list = fund_df.values.tolist()
fund_by_code = {fund_row[0]: fund_row for fund_row in fund_list}
fund_choice = new_customer_df.iloc[:, -2].tolist()
# Unknown codes are skipped, matching the original scan's behaviour.
concat_fund_list = [fund_by_code[code] for code in fund_choice if code in fund_by_code]
concat_fund_df = pd.DataFrame(concat_fund_list, columns=fund_df.columns)
1
2
3
# Standardise the 30 numeric fund features (columns 1..30; column 0 is the
# fund code and the trailing columns are one-hot / NAV).
numeric_part = concat_fund_df.iloc[:, 1:31]
concat_fund_df_scaled = pd.DataFrame(StandardScaler().fit_transform(numeric_part),
                                     columns=numeric_part.columns)
concat_fund_df_scaled
feature1 | feature2 | feature3 | feature4 | feature5 | feature6 | feature7 | feature8 | feature9 | feature10 | ... | feature21 | feature22 | feature23 | feature24 | feature25 | feature26 | feature27 | feature28 | feature29 | feature30 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1.200640 | 0.712252 | -1.076222 | -1.248437 | -0.334457 | 0.956536 | -1.278443 | -0.047607 | 0.958710 | -1.256648 | ... | 0.025146 | 0.877876 | 0.460757 | -0.839008 | 1.528116 | -1.419769 | 1.612192 | 0.510028 | 0.438328 | -0.922319 |
1 | 1.200640 | -0.500829 | -1.231746 | -0.938266 | -0.981792 | -0.654327 | -1.278443 | 0.269773 | 0.958710 | 1.051785 | ... | 0.098985 | -1.406773 | 0.797690 | 1.288207 | -1.517962 | -0.292968 | 0.374425 | -0.568255 | -1.662697 | 0.292734 |
2 | 1.200640 | -0.327532 | -0.765175 | -0.317925 | -0.981792 | 0.304544 | 1.203971 | 0.587154 | 1.619890 | 0.681958 | ... | 0.868716 | -0.049282 | -1.055444 | -1.229350 | 0.512756 | 0.157752 | -0.553901 | 1.588311 | 0.999667 | 0.447293 |
3 | -0.012128 | -1.367315 | 0.634536 | 0.302416 | -0.766014 | -0.291953 | 1.203971 | 0.587154 | 0.958710 | -0.065375 | ... | -0.909839 | -0.816945 | -1.560845 | 0.579045 | 0.005077 | 0.157752 | 0.838587 | 0.078715 | 1.013894 | 0.352525 |
4 | -1.224895 | -1.194018 | 1.412153 | 0.302416 | -0.981792 | 1.769586 | 1.203971 | -0.999748 | 0.628121 | 0.535437 | ... | 1.513066 | -0.372056 | -1.560845 | -0.900752 | 0.005077 | -0.518328 | 0.219704 | 0.725685 | 0.831718 | -1.828897 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
340795 | 1.200640 | 0.712252 | 0.167965 | -0.317925 | -1.197571 | -0.769214 | -1.278443 | 1.539295 | 0.958710 | -0.344894 | ... | 0.909669 | 1.288355 | -1.223911 | -0.251674 | 1.020436 | -0.969049 | 0.838587 | 0.078715 | 1.035668 | 0.077653 |
340796 | 1.200640 | 0.538954 | 0.167965 | 0.922758 | 0.744436 | -1.120300 | 1.203971 | -1.634509 | -1.024828 | -1.678996 | ... | 0.649671 | 0.835020 | 1.303090 | 1.078315 | 0.512756 | 0.833833 | 1.302750 | -1.215225 | 0.154496 | 0.301785 |
340797 | -0.012128 | 1.058846 | 0.790059 | -1.558608 | -0.981792 | 1.320509 | -1.278443 | -1.634509 | 0.628121 | 0.141773 | ... | -0.219062 | 0.936870 | 0.629223 | -0.091423 | -1.517962 | 1.059193 | 0.529146 | -1.646538 | -0.543317 | 0.626878 |
340798 | -0.012128 | 0.885549 | 1.412153 | -0.938266 | 1.175993 | -0.822883 | 1.203971 | -0.999748 | 0.297531 | -0.051350 | ... | -0.315874 | -1.397704 | -1.223911 | 0.448527 | 1.528116 | -0.518328 | -1.327506 | 1.156998 | 0.509574 | -0.429399 |
340799 | -0.012128 | 0.885549 | 0.323489 | -1.558608 | -1.197571 | 0.290188 | -1.278443 | 0.269773 | 0.628121 | -0.465884 | ... | -1.931259 | 1.438801 | -0.044644 | -0.074317 | -1.010282 | -0.518328 | -1.327506 | 0.725685 | 1.391821 | -0.452211 |
340800 rows × 30 columns
1
2
# Re-assemble the fund table: fund code | scaled numeric features |
# remaining one-hot columns and NAV ('净值').
concat_fund_df = pd.concat([concat_fund_df.iloc[:, :1], concat_fund_df_scaled, concat_fund_df.iloc[:, 31:]], axis=1)
concat_fund_df
基金代码 | feature1 | feature2 | feature3 | feature4 | feature5 | feature6 | feature7 | feature8 | feature9 | ... | day_23 | day_24 | day_25 | day_26 | day_27 | day_28 | day_29 | day_30 | day_31 | 净值 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | J0045 | 1.200640 | 0.712252 | -1.076222 | -1.248437 | -0.334457 | 0.956536 | -1.278443 | -0.047607 | 0.958710 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -4.897256487 |
1 | J0109 | 1.200640 | -0.500829 | -1.231746 | -0.938266 | -0.981792 | -0.654327 | -1.278443 | 0.269773 | 0.958710 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -4.547777652 |
2 | J0063 | 1.200640 | -0.327532 | -0.765175 | -0.317925 | -0.981792 | 0.304544 | 1.203971 | 0.587154 | 1.619890 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | -4.306791468 |
3 | J0099 | -0.012128 | -1.367315 | 0.634536 | 0.302416 | -0.766014 | -0.291953 | 1.203971 | 0.587154 | 0.958710 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 12.98199228 |
4 | J0001 | -1.224895 | -1.194018 | 1.412153 | 0.302416 | -0.981792 | 1.769586 | 1.203971 | -0.999748 | 0.628121 | ... | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | -3.483643378 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
340795 | J0196 | 1.200640 | 0.712252 | 0.167965 | -0.317925 | -1.197571 | -0.769214 | -1.278443 | 1.539295 | 0.958710 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 8.075655166 |
340796 | J0197 | 1.200640 | 0.538954 | 0.167965 | 0.922758 | 0.744436 | -1.120300 | 1.203971 | -1.634509 | -1.024828 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 7.724697415 |
340797 | J0198 | -0.012128 | 1.058846 | 0.790059 | -1.558608 | -0.981792 | 1.320509 | -1.278443 | -1.634509 | 0.628121 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -1.767100969 |
340798 | J0199 | -0.012128 | 0.885549 | 1.412153 | -0.938266 | 1.175993 | -0.822883 | 1.203971 | -0.999748 | 0.297531 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -2.661014584 |
340799 | J0200 | -0.012128 | 0.885549 | 0.323489 | -1.558608 | -1.197571 | 0.290188 | -1.278443 | 0.269773 | 0.628121 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4.231475045 |
340800 rows × 138 columns
3.合并客户数据和基金数据
1
2
# Step 6: join the customer table and the per-choice fund table
# column-wise (both are aligned row-for-row by construction).
df = pd.concat([new_customer_df, concat_fund_df], axis=1)
1
2
3
4
5
6
7
# Drop identifier / leftover columns (axis=1 means columns).
df = df.drop(['选择基金', '基金代码', '基金成立时间', 'date'], axis=1)
# Move the target column '选择排序' to the end.
df['选择排序'] = df.pop('选择排序')
# Prefix the 30 fund feature columns (positions 40..69) with 'fund_';
# a single rename mapping is equivalent to renaming them one by one.
df = df.rename(columns={name: 'fund_' + name for name in df.columns[40:70]})
1
2
# 输出csv
# df.to_csv('output.csv', encoding='utf-8')
4.预测
1
2
3
4
5
6
7
8
9
from sklearn.model_selection import train_test_split
# Separate the features from the target column '选择排序' (choice rank),
# then hold out 20% of the rows as a test set.
y = df['选择排序']
X = df.drop('选择排序', axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
1
2
3
4
5
6
7
8
9
# Perceptron model — linear online baseline for the 0-4 rank target.
from sklearn.linear_model import Perceptron
perceptron_classifier = Perceptron(random_state=42, verbose=True, max_iter=1000)
perceptron_classifier.fit(X_train, y_train)
y_pred_pc = perceptron_classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred_pc)
print(f"Accuracy using Perceptron: {accuracy}, y_pred is {y_pred_pc}")
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
-- Epoch 1
Norm: 92.46, NNZs: 167, Bias: 68.000000, T: 272640, Avg. loss: 1.252741
Total training time: 0.09 seconds.
-- Epoch 2
Norm: 106.06, NNZs: 164, Bias: 68.000000, T: 545280, Avg. loss: 1.228747
Total training time: 0.25 seconds.
-- Epoch 3
Norm: 121.13, NNZs: 166, Bias: 67.000000, T: 817920, Avg. loss: 1.230478
Total training time: 0.37 seconds.
-- Epoch 4
Norm: 126.41, NNZs: 165, Bias: 63.000000, T: 1090560, Avg. loss: 1.230669
Total training time: 0.53 seconds.
-- Epoch 5
Norm: 120.86, NNZs: 167, Bias: 62.000000, T: 1363200, Avg. loss: 1.225637
Total training time: 0.68 seconds.
-- Epoch 6
Norm: 128.14, NNZs: 170, Bias: 69.000000, T: 1635840, Avg. loss: 1.224242
Total training time: 0.79 seconds.
-- Epoch 7
Norm: 125.84, NNZs: 167, Bias: 61.000000, T: 1908480, Avg. loss: 1.222632
Total training time: 0.89 seconds.
-- Epoch 8
Norm: 123.26, NNZs: 163, Bias: 69.000000, T: 2181120, Avg. loss: 1.218295
Total training time: 1.00 seconds.
-- Epoch 9
Norm: 126.76, NNZs: 167, Bias: 64.000000, T: 2453760, Avg. loss: 1.227991
Total training time: 1.10 seconds.
-- Epoch 10
Norm: 131.06, NNZs: 169, Bias: 65.000000, T: 2726400, Avg. loss: 1.234568
Total training time: 1.23 seconds.
-- Epoch 11
Norm: 132.18, NNZs: 166, Bias: 65.000000, T: 2999040, Avg. loss: 1.217471
Total training time: 1.33 seconds.
-- Epoch 12
Norm: 131.57, NNZs: 165, Bias: 63.000000, T: 3271680, Avg. loss: 1.235366
Total training time: 1.44 seconds.
-- Epoch 13
Norm: 136.27, NNZs: 166, Bias: 63.000000, T: 3544320, Avg. loss: 1.225213
Total training time: 1.55 seconds.
Convergence after 13 epochs took 1.55 seconds
-- Epoch 1
Norm: 78.94, NNZs: 162, Bias: -60.000000, T: 272640, Avg. loss: 0.565198
Total training time: 0.13 seconds.
-- Epoch 2
Norm: 93.65, NNZs: 167, Bias: -60.000000, T: 545280, Avg. loss: 0.548994
Total training time: 0.28 seconds.
-- Epoch 3
Norm: 107.45, NNZs: 167, Bias: -63.000000, T: 817920, Avg. loss: 0.543448
Total training time: 0.38 seconds.
-- Epoch 4
Norm: 112.42, NNZs: 162, Bias: -66.000000, T: 1090560, Avg. loss: 0.542047
Total training time: 0.49 seconds.
-- Epoch 5
Norm: 115.04, NNZs: 164, Bias: -62.000000, T: 1363200, Avg. loss: 0.550648
Total training time: 0.60 seconds.
-- Epoch 6
Norm: 128.83, NNZs: 165, Bias: -66.000000, T: 1635840, Avg. loss: 0.540281
Total training time: 0.71 seconds.
-- Epoch 7
Norm: 132.78, NNZs: 169, Bias: -63.000000, T: 1908480, Avg. loss: 0.542232
Total training time: 0.82 seconds.
-- Epoch 8
Norm: 136.39, NNZs: 167, Bias: -64.000000, T: 2181120, Avg. loss: 0.546548
Total training time: 0.92 seconds.
-- Epoch 9
Norm: 137.78, NNZs: 166, Bias: -70.000000, T: 2453760, Avg. loss: 0.539798
Total training time: 1.04 seconds.
-- Epoch 10
Norm: 143.50, NNZs: 166, Bias: -69.000000, T: 2726400, Avg. loss: 0.544055
Total training time: 1.27 seconds.
-- Epoch 11
Norm: 143.70, NNZs: 167, Bias: -64.000000, T: 2999040, Avg. loss: 0.543328
Total training time: 1.36 seconds.
Convergence after 11 epochs took 1.36 seconds
-- Epoch 1
Norm: 87.99, NNZs: 162, Bias: -65.000000, T: 272640, Avg. loss: 0.567236
Total training time: 0.12 seconds.
-- Epoch 2
Norm: 93.60, NNZs: 166, Bias: -65.000000, T: 545280, Avg. loss: 0.552740
Total training time: 0.23 seconds.
-- Epoch 3
Norm: 105.11, NNZs: 162, Bias: -63.000000, T: 817920, Avg. loss: 0.552061
Total training time: 0.32 seconds.
-- Epoch 4
Norm: 110.40, NNZs: 164, Bias: -73.000000, T: 1090560, Avg. loss: 0.544508
Total training time: 0.48 seconds.
-- Epoch 5
Norm: 111.62, NNZs: 167, Bias: -67.000000, T: 1363200, Avg. loss: 0.551475
Total training time: 0.59 seconds.
-- Epoch 6
Norm: 112.71, NNZs: 169, Bias: -65.000000, T: 1635840, Avg. loss: 0.544028
Total training time: 0.68 seconds.
-- Epoch 7
Norm: 117.17, NNZs: 169, Bias: -64.000000, T: 1908480, Avg. loss: 0.547210
Total training time: 0.82 seconds.
-- Epoch 8
Norm: 117.80, NNZs: 168, Bias: -63.000000, T: 2181120, Avg. loss: 0.546045
Total training time: 0.92 seconds.
-- Epoch 9
Norm: 125.14, NNZs: 170, Bias: -69.000000, T: 2453760, Avg. loss: 0.545288
Total training time: 1.05 seconds.
Convergence after 9 epochs took 1.05 seconds
-- Epoch 1
Norm: 69.14, NNZs: 156, Bias: -65.000000, T: 272640, Avg. loss: 0.151736
Total training time: 0.12 seconds.
-- Epoch 2
Norm: 79.10, NNZs: 162, Bias: -72.000000, T: 545280, Avg. loss: 0.138197
Total training time: 0.26 seconds.
-- Epoch 3
Norm: 79.31, NNZs: 164, Bias: -70.000000, T: 817920, Avg. loss: 0.140124
Total training time: 0.37 seconds.
-- Epoch 4
Norm: 82.79, NNZs: 165, Bias: -68.000000, T: 1090560, Avg. loss: 0.137800
Total training time: 0.49 seconds.
-- Epoch 5
Norm: 90.31, NNZs: 166, Bias: -68.000000, T: 1363200, Avg. loss: 0.137556
Total training time: 0.59 seconds.
-- Epoch 6
Norm: 89.96, NNZs: 163, Bias: -69.000000, T: 1635840, Avg. loss: 0.137137
Total training time: 0.71 seconds.
-- Epoch 7
Norm: 94.54, NNZs: 165, Bias: -66.000000, T: 1908480, Avg. loss: 0.139736
Total training time: 0.83 seconds.
Convergence after 7 epochs took 0.83 seconds
-- Epoch 1
Norm: 61.09, NNZs: 148, Bias: -69.000000, T: 272640, Avg. loss: 0.029810
Total training time: 0.14 seconds.
-- Epoch 2
Norm: 69.45, NNZs: 148, Bias: -70.000000, T: 545280, Avg. loss: 0.028391
Total training time: 0.25 seconds.
-- Epoch 3
Norm: 74.09, NNZs: 157, Bias: -71.000000, T: 817920, Avg. loss: 0.025827
Total training time: 0.38 seconds.
-- Epoch 4
Norm: 78.66, NNZs: 162, Bias: -71.000000, T: 1090560, Avg. loss: 0.025156
Total training time: 0.50 seconds.
-- Epoch 5
Norm: 85.40, NNZs: 163, Bias: -71.000000, T: 1363200, Avg. loss: 0.024645
Total training time: 0.61 seconds.
-- Epoch 6
Norm: 86.81, NNZs: 167, Bias: -71.000000, T: 1635840, Avg. loss: 0.024701
Total training time: 0.74 seconds.
-- Epoch 7
Norm: 93.31, NNZs: 164, Bias: -74.000000, T: 1908480, Avg. loss: 0.023552
Total training time: 0.85 seconds.
-- Epoch 8
Norm: 94.23, NNZs: 166, Bias: -70.000000, T: 2181120, Avg. loss: 0.025229
Total training time: 0.96 seconds.
-- Epoch 9
Norm: 95.97, NNZs: 166, Bias: -74.000000, T: 2453760, Avg. loss: 0.023290
Total training time: 1.08 seconds.
-- Epoch 10
Norm: 100.04, NNZs: 166, Bias: -74.000000, T: 2726400, Avg. loss: 0.023572
Total training time: 1.18 seconds.
-- Epoch 11
Norm: 102.60, NNZs: 163, Bias: -74.000000, T: 2999040, Avg. loss: 0.023608
Total training time: 1.31 seconds.
-- Epoch 12
Norm: 104.93, NNZs: 165, Bias: -74.000000, T: 3271680, Avg. loss: 0.023670
Total training time: 1.47 seconds.
Convergence after 12 epochs took 1.47 seconds
Accuracy using Perceptron: 0.9211707746478873, y_pred is [0 0 0 ... 0 2 0]
1
2
3
4
5
6
7
8
9
10
# Logistic-regression model (liblinear solver => one-vs-rest multiclass).
# NOTE(review): the label distribution is ~98.8% class 0, so plain accuracy
# is dominated by the majority class — per-class metrics or a confusion
# matrix would be more informative here.
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(multi_class='auto', solver='liblinear', max_iter=1000, verbose=1)
logreg.fit(X_train, y_train)
y_pred_lr = logreg.predict(X_test)
accuracy = accuracy_score(y_test, y_pred_lr)
print(f"Accuracy using logistic regression: {accuracy}, y_pred is {y_pred_lr}")
1
[LibLinear]Accuracy using logistic regression: 0.9883656103286385, y_pred is [0 0 0 ... 0 0 0]
1
2
3
4
5
6
7
8
9
10
# Random-forest model: 500 trees, all CPU cores (n_jobs=-1).
# NOTE(review): its accuracy equals logistic regression's exactly — both
# likely predict the majority class 0 almost everywhere (see class-imbalance
# note on the logistic-regression cell).
from sklearn.ensemble import RandomForestClassifier
rf_classifier = RandomForestClassifier(random_state=42, n_estimators=500, n_jobs = -1, verbose=1)
rf_classifier.fit(X_train, y_train)
y_pred_rf = rf_classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred_rf)
print(f"Accuracy using RandomForest: {accuracy}, y_pred is {y_pred_rf}")
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done 18 tasks | elapsed: 10.3s
[Parallel(n_jobs=-1)]: Done 168 tasks | elapsed: 1.1min
[Parallel(n_jobs=-1)]: Done 418 tasks | elapsed: 2.6min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed: 3.0min finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done 18 tasks | elapsed: 0.1s
[Parallel(n_jobs=16)]: Done 168 tasks | elapsed: 0.9s
[Parallel(n_jobs=16)]: Done 418 tasks | elapsed: 2.2s
Accuracy using RandomForest: 0.9883656103286385, y_pred is [0 0 0 ... 0 0 0]
[Parallel(n_jobs=16)]: Done 500 out of 500 | elapsed: 2.6s finished
1
2
3
4
5
6
7
8
9
10
# # SVM分类
# from sklearn.svm import SVC
# svm_classifier = SVC(decision_function_shape='ovr', random_state=42)
# svm_classifier.fit(X_train, y_train)
# y_pred = svm_classifier.predict(X_test)
# accuracy = accuracy_score(y_test, y_pred)
# print(f"Accuracy: {accuracy}")
This post is licensed under CC BY 4.0 by the author.
Comments powered by Disqus.