
CITIC Technical Assessment: A Fund Recommendation Model

import csv
import copy
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

1. Processing Customer Data

with open('./data/客户数据.csv', 'r', encoding='utf-8') as file:
    # Create a CSV reader object
    csv_reader = csv.reader(file)
    # Read the header row, dropping the leading index column
    customer_headers = next(csv_reader)[1:]
    # print(f'Headers: {customer_headers}')
    customer_list = []
    # Collect every remaining row (again dropping the index column)
    for row in csv_reader:
        customer_list.append(row[1:])
customer_df = pd.DataFrame(customer_list, columns=customer_headers)
customer_df
[DataFrame preview omitted — 12000 rows × 47 columns: 客户编号, 客户公司名称, 客户公司编号, feature1…feature40, and 第一选择基金 through 第四选择基金]
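The csv.reader loop above works, but pandas can build the same frame in one call. A minimal sketch, assuming every cell should stay a raw string (so that '' still marks missing values for the cleaning step later):

# Sketch: one-call equivalent of the csv.reader loop; dtype=str and
# keep_default_na=False keep cells as raw strings, matching csv.reader,
# and .iloc[:, 1:] drops the leading index column.
customer_df = pd.read_csv('./data/客户数据.csv', encoding='utf-8',
                          dtype=str, keep_default_na=False).iloc[:, 1:]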

# Step 1: filter the rows usable for training (customers with a first-choice fund)
customer_list = customer_df.values.tolist()
print(len(customer_list))
customer_list = [row for row in customer_list if row[-4][0] == 'J']
print(len(customer_list))
12000
4780
# Step 2: drop the customer and company info, keeping only the features
customer_list = [row[3:] for row in customer_list]
# Step 3: replicate rows into (customer, fund) training pairs
# There are 4780 rows with a first-choice fund, and each row pairs with all 200
# funds, so the full training set would hold ~1,000,000 rows. To cut training
# cost, keep only customers who chose at least two funds: 1704 rows, each
# paired with 200 funds.
new_customer_list = []
strlist = ['J{:04d}'.format(i) for i in range(1, 201)]
for i in range(len(customer_list)):
    # 17800 = 89 * 200: customers with all four choices filled,
    # e.g. ['J0116', 'J0029', 'J0133', 'J0156']
    if customer_list[i][-1][0] == 'J':
        for j in range(1, 5):
            tmp = copy.deepcopy(customer_list[i])
            tmp[-4] = customer_list[i][-5+j]
            tmp.append(j)
            new_customer_list.append(tmp)
        for code in strlist:
            if code not in customer_list[i][-4:]:
                tmp = copy.deepcopy(customer_list[i])
                tmp[-4] = code
                tmp.append(0)
                new_customer_list.append(tmp)
# Drop one bad example: row 10753 of 客户数据.csv reads [J0163, J0012, J0139, J0012],
# repeating J0012 as both the second and the fourth choice.
del new_customer_list[16403]
for i in range(len(customer_list)):
    # 256000 = 1280 * 200: customers with exactly two choices filled,
    # e.g. ['J0132', 'J0081', '  ', '  ']
    if customer_list[i][-3][0] == 'J' and customer_list[i][-2][0] != 'J' and customer_list[i][-1][0] != 'J':
        for j in range(1, 3):
            tmp = copy.deepcopy(customer_list[i])
            tmp[-4] = customer_list[i][-5+j]
            tmp.append(j)
            new_customer_list.append(tmp)
        for code in strlist:
            if code not in customer_list[i][-4:-2]:
                tmp = copy.deepcopy(customer_list[i])
                tmp[-4] = code
                tmp.append(0)
                new_customer_list.append(tmp)
    # 67000 = 335 * 200: customers with exactly three choices filled,
    # e.g. ['J0188', 'J0191', 'J0068', '  ']
    if customer_list[i][-2][0] == 'J' and customer_list[i][-1][0] != 'J':
        for j in range(1, 4):
            tmp = copy.deepcopy(customer_list[i])
            tmp[-4] = customer_list[i][-5+j]
            tmp.append(j)
            new_customer_list.append(tmp)
        for code in strlist:
            if code not in customer_list[i][-4:-1]:
                tmp = copy.deepcopy(customer_list[i])
                tmp[-4] = code
                tmp.append(0)
                new_customer_list.append(tmp)
print(f'Generated {len(new_customer_list)} rows in total')
Generated 340800 rows in total
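For reference, the per-customer expansion above can be factored into a small helper. This is a sketch of the same sampling scheme, not the notebook's code, and expand_row is a hypothetical name:

# Sketch: ranked choices become positive samples (label = rank),
# all unchosen fund codes become negative samples (label = 0).
def expand_row(features, choices, all_codes):
    rows = [features + [code, rank] for rank, code in enumerate(choices, start=1)]
    chosen = set(choices)
    rows += [features + [code, 0] for code in all_codes if code not in chosen]
    return rows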
# Rebuild the headers: prefix the 40 customer features, rename the first choice
# column to '选择基金', and append the target '选择排序'
new_customer_headers = customer_headers[3:] + ['选择排序']
for i in range(0, 40):
    new_customer_headers[i] = 'customer_' + new_customer_headers[i]
new_customer_headers[40] = '选择基金'
new_customer_df = pd.DataFrame(new_customer_list, columns=new_customer_headers)
# Drop the three leftover choice columns (second through fourth)
new_customer_df.drop(new_customer_df.columns[-4:-1], axis=1, inplace=True)
new_customer_df
[DataFrame preview omitted — 340800 rows × 42 columns: customer_feature1…customer_feature40, 选择基金, 选择排序]

# Convert all missing values and extreme values in the DataFrame to np.nan
new_customer_list = new_customer_df.values.tolist()
for row in new_customer_list:
    # Parse the numeric feature columns (every column except the last two)
    for i in range(0, len(row) - 2):
        if row[i] != '':
            row[i] = float(row[i])
            # Treat the 999999999 sentinel (anything outside ±1000) as np.nan
            if row[i] > 1000 or row[i] < -1000:
                row[i] = np.nan
for row in new_customer_list:
    for i in range(0, len(row) - 2):
        if row[i] == '':
            # Empty strings mark missing values
            row[i] = np.nan
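The two row loops can also be expressed directly on the DataFrame. A hedged pandas-native sketch of the same cleaning:

# Sketch: pd.to_numeric with errors='coerce' turns '' into NaN, and
# mask() replaces the out-of-range sentinel values in one pass.
feat_cols = new_customer_df.columns[:-2]
num = new_customer_df[feat_cols].apply(pd.to_numeric, errors='coerce')
new_customer_df[feat_cols] = num.mask((num > 1000) | (num < -1000))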
# Compute the mean of every feature column, ignoring NaN
new_customer_df = pd.DataFrame(new_customer_list, columns=new_customer_headers[:-4] + new_customer_headers[-1:])
column_means = new_customer_df.iloc[:, :-2].mean(skipna=True)
print(column_means)
customer_feature1       6.360880
customer_feature2      36.540284
customer_feature3      17.201055
customer_feature4     105.000593
customer_feature5       0.498709
customer_feature6       1.000000
customer_feature7       9.405694
customer_feature8     100.980251
customer_feature9      16.913861
customer_feature10     34.972123
customer_feature11      0.498026
customer_feature12      6.518365
customer_feature13     35.883363
customer_feature14     10.118418
customer_feature15     16.726703
customer_feature16     47.693500
customer_feature17     16.657743
customer_feature18     48.414837
customer_feature19     46.688786
customer_feature20      9.528190
customer_feature21     10.009301
customer_feature22     14.929339
customer_feature23     49.011289
customer_feature24     35.924674
customer_feature25      9.861468
customer_feature26      0.000000
customer_feature27     45.171292
customer_feature28      9.976480
customer_feature29     45.946405
customer_feature30      2.985731
customer_feature31      1.000000
customer_feature32     49.080166
customer_feature33     10.037611
customer_feature34      1.000000
customer_feature35      6.550535
customer_feature36      4.053039
customer_feature37      6.507101
customer_feature38     49.062278
customer_feature39     48.582398
customer_feature40     46.878168
dtype: float64
for row in new_customer_list:
    for i in range(0, len(row) - 2):
        if np.isnan(row[i]):
            # Impute missing values with the column mean
            row[i] = column_means.iloc[i]
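Since column_means is indexed by column name, DataFrame.fillna can do the same imputation in one step (a sketch, not the notebook's code):

# Sketch: fillna aligns the means Series on column labels.
new_customer_df.iloc[:, :-2] = new_customer_df.iloc[:, :-2].fillna(column_means)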
# Convert the list back to a DataFrame
new_customer_df = pd.DataFrame(new_customer_list, columns=new_customer_headers[:-4] + new_customer_headers[-1:])
# Step 5: standardize the data by rescaling each feature to zero mean and unit variance
new_customer_df_scaled = new_customer_df.iloc[:, :-2]
new_customer_df_scaled = pd.DataFrame(StandardScaler().fit_transform(new_customer_df_scaled), columns=new_customer_df_scaled.columns)
# Concatenate the 40 standardized features with the choice and rank columns
new_customer_df = pd.concat([new_customer_df_scaled, new_customer_df.iloc[:, -2:]], axis=1)
new_customer_df
[DataFrame preview omitted — 340800 rows × 42 columns: the standardized customer features plus 选择基金 and 选择排序]
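One caveat worth flagging (my note, not part of the original walkthrough): the scaler is fit on all 340,800 rows before the train/test split in section 4, so test-set statistics leak into the scaling. A leakage-free sketch would split first and fit the scaler on the unscaled training portion only:

# Sketch: fit scaling statistics on training rows, reuse them for the test rows.
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

train_df, test_df = train_test_split(new_customer_df, test_size=0.2, random_state=42)
scaler = StandardScaler().fit(train_df.iloc[:, :-2])
train_df.iloc[:, :-2] = scaler.transform(train_df.iloc[:, :-2])
test_df.iloc[:, :-2] = scaler.transform(test_df.iloc[:, :-2])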

2. Processing Fund Data

with open('./data/基金数据.csv', 'r', encoding='utf-8') as file:
    # Create a CSV reader object
    csv_reader = csv.reader(file)
    # Read the header row, dropping the leading index column
    fund_headers = next(csv_reader)[1:]
    # print(f'Headers: {fund_headers}')
    fund_list = []
    # Collect every remaining row (again dropping the index column)
    for row in csv_reader:
        fund_list.append(row[1:])
fund_df = pd.DataFrame(fund_list, columns=fund_headers)
fund_df.head()
[DataFrame preview omitted — 5 rows × 34 columns: 基金代码, 基金公司, 基金成立时间, 净值, feature1…feature30]

# To use '基金成立时间' (fund inception date) as a training feature, one-hot encode it
date = fund_df.iloc[:, 2:3]
# Parse the date strings into datetime objects
date['date'] = pd.to_datetime(date['基金成立时间'])
# Extract year, month, and day as new features
date['year'] = date['date'].dt.year
date['month'] = date['date'].dt.month
date['day'] = date['date'].dt.day
# One-hot encode the year, month, and day features
date_encoded = pd.get_dummies(date, columns=['year', 'month', 'day'])  # 200 rows × 48 columns

# To use '基金公司' (fund company) as a training feature, one-hot encode it as well
company = fund_df.iloc[:, 1:2]
company_encoded = pd.get_dummies(company)  # 200 rows × 58 columns

# Combine the one-hot encodings with the fund table
fund_df.drop(['基金公司', '基金成立时间'], axis=1, inplace=True)
fund_df = pd.concat([fund_df, company_encoded, date_encoded], axis=1)
# Move '净值' (net asset value) to the last column
fund_df['净值'] = fund_df.pop('净值')
fund_df
[DataFrame preview omitted — 200 rows × 138 columns: 基金代码, feature1…feature30, the company and date dummies, 净值]

# Build the fund info table row-aligned with the 选择基金 column
fund_list = fund_df.values.tolist()
fund_choice = new_customer_df.iloc[:, -2].tolist()
concat_fund_list = []
for j in range(len(fund_choice)):
    for i in range(len(fund_list)):
        if fund_list[i][0] == fund_choice[j]:
            concat_fund_list.append(fund_list[i])
# fund_headers = fund_headers[:1] + fund_headers[4:] + fund_headers[3:4]
concat_fund_df = pd.DataFrame(concat_fund_list, columns=fund_df.columns)
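The nested loop above scans all 200 funds for each of the 340,800 choices. Assuming each fund code appears exactly once in fund_list, a dictionary lookup is an equivalent, much faster sketch:

# Sketch: index fund rows by code for O(1) lookup per customer choice.
fund_by_code = {row[0]: row for row in fund_list}
concat_fund_list = [fund_by_code[code] for code in fund_choice]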
# Standardize the fund features (columns 1-30)
concat_fund_df_scaled = pd.DataFrame(StandardScaler().fit_transform(concat_fund_df.iloc[:, 1:31]), columns=concat_fund_df.iloc[:, 1:31].columns)
concat_fund_df_scaled
[DataFrame preview omitted — 340800 rows × 30 columns of standardized fund features]

concat_fund_df = pd.concat([concat_fund_df.iloc[:, :1], concat_fund_df_scaled, concat_fund_df.iloc[:, 31:]], axis=1)
concat_fund_df
[DataFrame preview omitted — 340800 rows × 138 columns: 基金代码, the standardized fund features, the dummies, 净值]

3. Merging Customer and Fund Data

# Step 6: merge the two tables side by side
df = pd.concat([new_customer_df, concat_fund_df], axis=1)
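This side-by-side concat relies on both frames carrying the same fresh 0..N-1 RangeIndex, which they do here; a defensive sketch makes that alignment explicit:

# Sketch: reset both indexes before the positional concat.
df = pd.concat([new_customer_df.reset_index(drop=True),
                concat_fund_df.reset_index(drop=True)], axis=1)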
# Drop the columns no longer needed ('基金成立时间' and 'date' survive inside
# date_encoded because pd.get_dummies keeps the columns it was not asked to encode)
df = df.drop(['选择基金', '基金代码', '基金成立时间', 'date'], axis=1)
# Move the target '选择排序' to the last column
df['选择排序'] = df.pop('选择排序')
# Rename the fund feature columns from feature* to fund_feature*
for i in range(40, 70):
    df = df.rename(columns={df.columns[i]: 'fund_' + df.columns[i]})
# Export to CSV
# df.to_csv('output.csv', encoding='utf-8')

4. Prediction

from sklearn.model_selection import train_test_split

# Separate the features from the target '选择排序'
X = df.drop('选择排序', axis=1)
y = df['选择排序']

# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
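Because roughly 99% of rows carry label 0 by construction, a stratified split keeps the rare rank 1-4 labels proportionally represented in both splits. A suggested variant, not the original code:

# Sketch: stratify on y so every 选择排序 class keeps its proportion.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)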

# Perceptron model
from sklearn.linear_model import Perceptron

perceptron_classifier = Perceptron(random_state=42, verbose=True, max_iter=1000)
perceptron_classifier.fit(X_train, y_train)
y_pred_pc = perceptron_classifier.predict(X_test)

accuracy = accuracy_score(y_test, y_pred_pc)
print(f"Accuracy using Perceptron: {accuracy}, y_pred is {y_pred_pc}")
-- Epoch 1
Norm: 92.46, NNZs: 167, Bias: 68.000000, T: 272640, Avg. loss: 1.252741
Total training time: 0.09 seconds.
-- Epoch 2
Norm: 106.06, NNZs: 164, Bias: 68.000000, T: 545280, Avg. loss: 1.228747
Total training time: 0.25 seconds.
-- Epoch 3
Norm: 121.13, NNZs: 166, Bias: 67.000000, T: 817920, Avg. loss: 1.230478
Total training time: 0.37 seconds.
-- Epoch 4
Norm: 126.41, NNZs: 165, Bias: 63.000000, T: 1090560, Avg. loss: 1.230669
Total training time: 0.53 seconds.
-- Epoch 5
Norm: 120.86, NNZs: 167, Bias: 62.000000, T: 1363200, Avg. loss: 1.225637
Total training time: 0.68 seconds.
-- Epoch 6
Norm: 128.14, NNZs: 170, Bias: 69.000000, T: 1635840, Avg. loss: 1.224242
Total training time: 0.79 seconds.
-- Epoch 7
Norm: 125.84, NNZs: 167, Bias: 61.000000, T: 1908480, Avg. loss: 1.222632
Total training time: 0.89 seconds.
-- Epoch 8
Norm: 123.26, NNZs: 163, Bias: 69.000000, T: 2181120, Avg. loss: 1.218295
Total training time: 1.00 seconds.
-- Epoch 9
Norm: 126.76, NNZs: 167, Bias: 64.000000, T: 2453760, Avg. loss: 1.227991
Total training time: 1.10 seconds.
-- Epoch 10
Norm: 131.06, NNZs: 169, Bias: 65.000000, T: 2726400, Avg. loss: 1.234568
Total training time: 1.23 seconds.
-- Epoch 11
Norm: 132.18, NNZs: 166, Bias: 65.000000, T: 2999040, Avg. loss: 1.217471
Total training time: 1.33 seconds.
-- Epoch 12
Norm: 131.57, NNZs: 165, Bias: 63.000000, T: 3271680, Avg. loss: 1.235366
Total training time: 1.44 seconds.
-- Epoch 13
Norm: 136.27, NNZs: 166, Bias: 63.000000, T: 3544320, Avg. loss: 1.225213
Total training time: 1.55 seconds.
Convergence after 13 epochs took 1.55 seconds
-- Epoch 1
Norm: 78.94, NNZs: 162, Bias: -60.000000, T: 272640, Avg. loss: 0.565198
Total training time: 0.13 seconds.
-- Epoch 2
Norm: 93.65, NNZs: 167, Bias: -60.000000, T: 545280, Avg. loss: 0.548994
Total training time: 0.28 seconds.
-- Epoch 3
Norm: 107.45, NNZs: 167, Bias: -63.000000, T: 817920, Avg. loss: 0.543448
Total training time: 0.38 seconds.
-- Epoch 4
Norm: 112.42, NNZs: 162, Bias: -66.000000, T: 1090560, Avg. loss: 0.542047
Total training time: 0.49 seconds.
-- Epoch 5
Norm: 115.04, NNZs: 164, Bias: -62.000000, T: 1363200, Avg. loss: 0.550648
Total training time: 0.60 seconds.
-- Epoch 6
Norm: 128.83, NNZs: 165, Bias: -66.000000, T: 1635840, Avg. loss: 0.540281
Total training time: 0.71 seconds.
-- Epoch 7
Norm: 132.78, NNZs: 169, Bias: -63.000000, T: 1908480, Avg. loss: 0.542232
Total training time: 0.82 seconds.
-- Epoch 8
Norm: 136.39, NNZs: 167, Bias: -64.000000, T: 2181120, Avg. loss: 0.546548
Total training time: 0.92 seconds.
-- Epoch 9
Norm: 137.78, NNZs: 166, Bias: -70.000000, T: 2453760, Avg. loss: 0.539798
Total training time: 1.04 seconds.
-- Epoch 10
Norm: 143.50, NNZs: 166, Bias: -69.000000, T: 2726400, Avg. loss: 0.544055
Total training time: 1.27 seconds.
-- Epoch 11
Norm: 143.70, NNZs: 167, Bias: -64.000000, T: 2999040, Avg. loss: 0.543328
Total training time: 1.36 seconds.
Convergence after 11 epochs took 1.36 seconds
-- Epoch 1
Norm: 87.99, NNZs: 162, Bias: -65.000000, T: 272640, Avg. loss: 0.567236
Total training time: 0.12 seconds.
-- Epoch 2
Norm: 93.60, NNZs: 166, Bias: -65.000000, T: 545280, Avg. loss: 0.552740
Total training time: 0.23 seconds.
-- Epoch 3
Norm: 105.11, NNZs: 162, Bias: -63.000000, T: 817920, Avg. loss: 0.552061
Total training time: 0.32 seconds.
-- Epoch 4
Norm: 110.40, NNZs: 164, Bias: -73.000000, T: 1090560, Avg. loss: 0.544508
Total training time: 0.48 seconds.
-- Epoch 5
Norm: 111.62, NNZs: 167, Bias: -67.000000, T: 1363200, Avg. loss: 0.551475
Total training time: 0.59 seconds.
-- Epoch 6
Norm: 112.71, NNZs: 169, Bias: -65.000000, T: 1635840, Avg. loss: 0.544028
Total training time: 0.68 seconds.
-- Epoch 7
Norm: 117.17, NNZs: 169, Bias: -64.000000, T: 1908480, Avg. loss: 0.547210
Total training time: 0.82 seconds.
-- Epoch 8
Norm: 117.80, NNZs: 168, Bias: -63.000000, T: 2181120, Avg. loss: 0.546045
Total training time: 0.92 seconds.
-- Epoch 9
Norm: 125.14, NNZs: 170, Bias: -69.000000, T: 2453760, Avg. loss: 0.545288
Total training time: 1.05 seconds.
Convergence after 9 epochs took 1.05 seconds
-- Epoch 1
Norm: 69.14, NNZs: 156, Bias: -65.000000, T: 272640, Avg. loss: 0.151736
Total training time: 0.12 seconds.
-- Epoch 2
Norm: 79.10, NNZs: 162, Bias: -72.000000, T: 545280, Avg. loss: 0.138197
Total training time: 0.26 seconds.
-- Epoch 3
Norm: 79.31, NNZs: 164, Bias: -70.000000, T: 817920, Avg. loss: 0.140124
Total training time: 0.37 seconds.
-- Epoch 4
Norm: 82.79, NNZs: 165, Bias: -68.000000, T: 1090560, Avg. loss: 0.137800
Total training time: 0.49 seconds.
-- Epoch 5
Norm: 90.31, NNZs: 166, Bias: -68.000000, T: 1363200, Avg. loss: 0.137556
Total training time: 0.59 seconds.
-- Epoch 6
Norm: 89.96, NNZs: 163, Bias: -69.000000, T: 1635840, Avg. loss: 0.137137
Total training time: 0.71 seconds.
-- Epoch 7
Norm: 94.54, NNZs: 165, Bias: -66.000000, T: 1908480, Avg. loss: 0.139736
Total training time: 0.83 seconds.
Convergence after 7 epochs took 0.83 seconds
-- Epoch 1
Norm: 61.09, NNZs: 148, Bias: -69.000000, T: 272640, Avg. loss: 0.029810
Total training time: 0.14 seconds.
-- Epoch 2
Norm: 69.45, NNZs: 148, Bias: -70.000000, T: 545280, Avg. loss: 0.028391
Total training time: 0.25 seconds.
-- Epoch 3
Norm: 74.09, NNZs: 157, Bias: -71.000000, T: 817920, Avg. loss: 0.025827
Total training time: 0.38 seconds.
-- Epoch 4
Norm: 78.66, NNZs: 162, Bias: -71.000000, T: 1090560, Avg. loss: 0.025156
Total training time: 0.50 seconds.
-- Epoch 5
Norm: 85.40, NNZs: 163, Bias: -71.000000, T: 1363200, Avg. loss: 0.024645
Total training time: 0.61 seconds.
-- Epoch 6
Norm: 86.81, NNZs: 167, Bias: -71.000000, T: 1635840, Avg. loss: 0.024701
Total training time: 0.74 seconds.
-- Epoch 7
Norm: 93.31, NNZs: 164, Bias: -74.000000, T: 1908480, Avg. loss: 0.023552
Total training time: 0.85 seconds.
-- Epoch 8
Norm: 94.23, NNZs: 166, Bias: -70.000000, T: 2181120, Avg. loss: 0.025229
Total training time: 0.96 seconds.
-- Epoch 9
Norm: 95.97, NNZs: 166, Bias: -74.000000, T: 2453760, Avg. loss: 0.023290
Total training time: 1.08 seconds.
-- Epoch 10
Norm: 100.04, NNZs: 166, Bias: -74.000000, T: 2726400, Avg. loss: 0.023572
Total training time: 1.18 seconds.
-- Epoch 11
Norm: 102.60, NNZs: 163, Bias: -74.000000, T: 2999040, Avg. loss: 0.023608
Total training time: 1.31 seconds.
-- Epoch 12
Norm: 104.93, NNZs: 165, Bias: -74.000000, T: 3271680, Avg. loss: 0.023670
Total training time: 1.47 seconds.
Convergence after 12 epochs took 1.47 seconds
Accuracy using Perceptron: 0.9211707746478873, y_pred is [0 0 0 ... 0 2 0]
# Logistic regression model
from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression(multi_class='auto', solver='liblinear', max_iter=1000, verbose=1)
logreg.fit(X_train, y_train)

y_pred_lr = logreg.predict(X_test)

accuracy = accuracy_score(y_test, y_pred_lr)
print(f"Accuracy using logistic regression: {accuracy}, y_pred is {y_pred_lr}")
[LibLinear]Accuracy using logistic regression: 0.9883656103286385, y_pred is [0 0 0 ... 0 0 0]
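Note that about 98.8% of the rows are negative samples (选择排序 = 0) by construction, so an all-zero prediction already scores roughly 0.988, which is what y_pred shows here. Per-class metrics are more informative; a quick hedged check:

# Sketch: per-class precision/recall exposes whether the rare rank 1-4
# labels are ever predicted, which plain accuracy hides.
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred_lr, zero_division=0))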
# Random forest model
from sklearn.ensemble import RandomForestClassifier

rf_classifier = RandomForestClassifier(random_state=42, n_estimators=500, n_jobs=-1, verbose=1)
rf_classifier.fit(X_train, y_train)

y_pred_rf = rf_classifier.predict(X_test)

accuracy = accuracy_score(y_test, y_pred_rf)
print(f"Accuracy using RandomForest: {accuracy}, y_pred is {y_pred_rf}")
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:   10.3s
[Parallel(n_jobs=-1)]: Done 168 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 418 tasks      | elapsed:  2.6min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:  3.0min finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.1s
[Parallel(n_jobs=16)]: Done 168 tasks      | elapsed:    0.9s
[Parallel(n_jobs=16)]: Done 418 tasks      | elapsed:    2.2s


Accuracy using RandomForest: 0.9883656103286385, y_pred is [0 0 0 ... 0 0 0]


[Parallel(n_jobs=16)]: Done 500 out of 500 | elapsed:    2.6s finished
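The random forest matches the same accuracy and also predicts all zeros. A hedged follow-up is to inspect its impurity-based feature importances to see which customer or fund features carry any signal:

# Sketch: rank inputs by the forest's feature importances.
importances = pd.Series(rf_classifier.feature_importances_, index=X.columns)
print(importances.sort_values(ascending=False).head(10))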
# # SVM classifier
# from sklearn.svm import SVC

# svm_classifier = SVC(decision_function_shape='ovr', random_state=42)
# svm_classifier.fit(X_train, y_train)

# y_pred = svm_classifier.predict(X_test)

# accuracy = accuracy_score(y_test, y_pred)
# print(f"Accuracy: {accuracy}")
This post is licensed under CC BY 4.0 by the author.
