Разработка модели обнаружения компрометации банковских транзакций

Разработка комплексных подходов обнаружения скомпрометированных банковских транзакций на основании ряда их признаков с помощью методов машинного обучения и других аналитических моделей. Анализ банковских операций, включая все их виды и особенности.

Рубрика Программирование, компьютеры и кибернетика
Вид дипломная работа
Язык русский
Дата добавления 07.12.2019
Размер файла 4,0 M

Отправить свою хорошую работу в базу знаний просто. Используйте форму, расположенную ниже

Студенты, аспиранты, молодые ученые, использующие базу знаний в своей учебе и работе, будут вам очень благодарны.

for i, v in enumerate(pvs):

ax.text(i - 0.2, v + 0.01, str(round(v, 3)), color='black', fontsize=8)

plt.show()

return imp_cols

def get_train_test_dataset(df_array, test_percent, classname):
    """Randomly split every DataFrame in *df_array* into train/test parts,
    stratified by the binary class column *classname*.

    All frames are assumed to be row-aligned with df_array[0]: the same
    randomly sampled positions are used for every frame so the variants
    stay in sync. The sampled fraction (``test_percent``) goes into the
    first returned list, the remainder into the second.

    Returns:
        (train_frames, test_frames): two lists of DataFrames, one entry
        per input frame, each the concatenation of its class-0 and
        class-1 parts.
    """
    base = df_array[0]
    fraud_count = len(base.loc[base[classname] == 1])
    normal_count = len(base.loc[base[classname] == 0])

    def _partition(count, label):
        # Re-seed from system entropy, then sample positions within the
        # rows of the given class label; every frame uses the same sample.
        random.seed()
        picked = random.sample(range(count), int(round(count * test_percent)))
        selected, remainder = [], []
        for frame in df_array:
            subset = frame.loc[frame[classname] == label]
            chosen = subset.iloc[picked]
            selected.append(chosen)
            remainder.append(subset.drop(chosen.axes[0], axis=0))
        return selected, remainder

    # Keep the original sampling order: class 0 first, then class 1.
    nf_train, nf_test = _partition(normal_count, 0)
    f_train, f_test = _partition(fraud_count, 1)

    train_frames = [pd.concat([nf, f]) for nf, f in zip(nf_train, f_train)]
    test_frames = [pd.concat([nf, f]) for nf, f in zip(nf_test, f_test)]
    return train_frames, test_frames

def get_smote_over_sampling(x_df, y_df, fraud_percent, classname):
    """Oversample the minority (fraud) class with SMOTE.

    ``fraud_percent`` is the desired share of the minority class in the
    resampled data; it is converted to imblearn's minority/majority ratio
    as p / (1 - p).

    Returns:
        [x_resampled_df, y_resampled_df]: features and the single-column
        class frame (named *classname*), both as fresh DataFrames.
    """
    sampling_ratio = fraud_percent / (1 - fraud_percent)
    oversampler = SMOTE(sampling_strategy=sampling_ratio)
    x_resampled, y_resampled = oversampler.fit_resample(x_df, y_df)
    return [
        pd.DataFrame(data=x_resampled, columns=x_df.columns.values),
        pd.DataFrame(data=y_resampled, columns=[classname]),
    ]

def multiple_result(df, model_str, print_results=False):
    """Average the per-iteration metrics gathered for one model.

    *df* must contain the columns Precision, Recall, F1, Accuracy,
    TP, TN, FP, FN and HSS. When *print_results* is true, a 2x4 grid of
    box plots and a textual summary are emitted as well.

    Returns:
        (precision, recall, f1, accuracy, hss, tp, tn, fp, fn) means.
    """
    metric_names = ('Precision', 'Recall', 'F1', 'Accuracy',
                    'TP', 'TN', 'FP', 'FN', 'HSS')
    averages = {name: df[name].mean() for name in metric_names}

    if print_results:
        print('\n---' + model_str + '---\n')
        plt.suptitle(model_str)
        plt.subplots_adjust(left=None, bottom=None, right=None, top=None, wspace=0.6, hspace=0.35)
        # (column, panel title, y-label for the row's first panel,
        #  disable the offset formatter — needed for the large TN counts)
        panels = (
            ('Precision', 'Precision', 'Score', False),
            ('Recall', 'Recall', None, False),
            ('F1', 'F1', None, False),
            ('HSS', 'HSS', None, False),
            ('TP', 'True positive', 'Number of transactions', False),
            ('TN', 'True negative', None, True),
            ('FP', 'False positive', None, False),
            ('FN', 'False negative', None, False),
        )
        for position, (column, title, y_label, plain_axis) in enumerate(panels, start=1):
            axis = plt.subplot(2, 4, position)
            if y_label is not None:
                plt.ylabel(y_label)
            if plain_axis:
                axis.get_yaxis().get_major_formatter().set_useOffset(False)
            plt.boxplot(df[column].tolist())
            plt.title(title)
        plt.show()
        plt.subplots_adjust(left=None, bottom=None, right=None, top=None, wspace=None, hspace=None)
        print('Precision mean: ' + str(round(averages['Precision'], 4)))
        print('Recall mean: ' + str(round(averages['Recall'], 4)))
        print('F1 mean: ' + str(round(averages['F1'], 4)))
        print('HSS mean: ' + str(round(averages['HSS'], 4)))
        # The second number is the detection (resp. miss) rate among actual frauds.
        print('True positive mean: ' + str(round(averages['TP'], 4)) + ' | '
              + str(round(averages['TP'] / (averages['TP'] + averages['FN']), 2)))
        print('True negative mean: ' + str(round(averages['TN'], 4)))
        print('False positive mean: ' + str(round(averages['FP'], 4)))
        print('False negative mean: ' + str(round(averages['FN'], 4)) + ' | '
              + str(round(averages['FN'] / (averages['TP'] + averages['FN']), 2)))

    return (averages['Precision'], averages['Recall'], averages['F1'],
            averages['Accuracy'], averages['HSS'], averages['TP'],
            averages['TN'], averages['FP'], averages['FN'])

def test_model(model, model_name, x_train, y_train, x_test, y_test, show_reports, show_roc=False):
    """Fit *model* on the training data, score it on the test data and
    return the scores, timings and the fitted model.

    Args:
        model: an unfitted sklearn-style estimator with predict_proba.
        model_name: label used in reports and output file names.
        x_train, y_train, x_test, y_test: features and single-column
            class DataFrames (the y frames are flattened below).
        show_reports: print scores and plot the ROC curve.
        show_roc: accepted but never read in this function — reports are
            controlled solely by show_reports.  # NOTE(review): confirm.

    Returns:
        12-tuple (precision, recall, f1, accuracy, tn, tp, fp, fn, hss,
        duration_fit, duration_predict, fitted_model); callers index [11]
        to recover the fitted model.
    """
    start_time = time.time()
    # sklearn expects 1-D label arrays, not single-column frames.
    y_train = y_train.values.ravel()
    y_test = y_test.values.ravel()
    model.fit(x_train, y_train)
    duration_fit = time.time() - start_time
    start_time = time.time()
    y_res = model.predict(x_test)
    # Probability of the positive (fraud) class for the ROC/AUC.
    y_pred_proba = model.predict_proba(x_test)[::, 1]
    fpr, tpr, threshold = roc_curve(y_test, y_pred_proba)
    auc = skl.metrics.roc_auc_score(y_test, y_pred_proba)
    if show_reports:
        plt.plot(fpr, tpr, label="auc=" + str(auc))
        plt.title('ROC Curve ' + model_name, fontsize=12)
        plt.legend(loc=4)
        plt.xlabel('False Positive Rate', fontsize=12)
        plt.ylabel('True Positive Rate', fontsize=12)
        plt.show()
    # NOTE(review): when show_reports is on, the plotting (and the blocking
    # plt.show()) is included in duration_predict.
    duration_predict = time.time() - start_time
    precision_sc, recall_sc, f1, accuracy_sc = \
        get_model_scores(y_res, y_test, model_name,
                         show_report=show_reports)
    tn_sc, tp_sc, fp_sc, fn_sc, hss = get_conf_matrix_data(y_res, y_test, print_data=show_reports)
    if show_reports:
        print('AUC: ' + str(round(auc, 4)))
    if show_reports:
        print('HSS: ', hss)
    if 'Decision tree' in model_name:
        # Export the fitted tree for Graphviz visualisation.
        skl.tree.export_graphviz(model, out_file=save_to_path + model_name + '.dot', class_names=['No fraud', 'Fraud'],
                                 filled=True)
    return precision_sc, recall_sc, f1, accuracy_sc, tn_sc, tp_sc, fp_sc, fn_sc, hss, duration_fit,\
        duration_predict, model

def get_model_scores(y_pred, y_act, model_str, show_roc=False, show_report=False):
    """Compute precision, recall, F1 and accuracy for binary predictions.

    Args:
        y_pred: predicted labels.
        y_act: ground-truth labels.
        model_str: label used in the printed report and ROC title.
        show_roc: plot a ROC curve built from the hard predictions.
        show_report: print a classification report and the scores.

    Returns:
        (precision, recall, f1, accuracy).
    """
    precision_val = precision_score(y_act, y_pred)
    recall_val = recall_score(y_act, y_pred)
    accuracy_val = accuracy_score(y_act, y_pred)
    f1_val = f1_score(y_act, y_pred)

    if show_report:
        print('\n' + model_str + '\n')
        print(classification_report(y_act, y_pred))
        for label, value in (('Precision', precision_val),
                             ('Recall', recall_val),
                             ('F1', f1_val),
                             ('Accuracy', accuracy_val)):
            print(label + ': ' + str(round(value, 4)))
        get_conf_matrix_data(y_pred, y_act, False)

    if show_roc:
        fpr, tpr, _ = roc_curve(y_act, y_pred)
        plt.title('ROC Curve ' + model_str, fontsize=12)
        plt.plot(fpr, tpr, linewidth=2)
        plt.plot([0, 1], [0, 1], 'r--')
        plt.xlabel('False Positive Rate', fontsize=12)
        plt.ylabel('True Positive Rate', fontsize=12)
        plt.axis([-0.01, 1, 0, 1])
        plt.show()

    return precision_val, recall_val, f1_val, accuracy_val

def get_conf_matrix_data(y_pred, y_act, print_data=False):
    """Extract confusion-matrix cells and Heidke's Skill Score.

    Args:
        y_pred: predicted binary labels.
        y_act: ground-truth binary labels.
        print_data: echo the four cell counts to stdout.

    Returns:
        (tn, tp, fp, fn, hss).
    """
    matrix = confusion_matrix(y_act, y_pred)
    tn_val = matrix[0][0]
    tp_val = matrix[1][1]
    fp_val = matrix[0][1]
    fn_val = matrix[1][0]
    # Heidke's Skill Score: agreement beyond what chance would produce.
    numerator = 2 * (tp_val * tn_val - fp_val * fn_val)
    denominator = (tp_val + fn_val) * (fn_val + tn_val) + (tp_val + fp_val) * (fp_val + tn_val)
    hss = numerator / denominator
    if print_data:
        print('True Negative: ', str(tn_val))
        print('True Positive: ', str(tp_val))
        print('False Positive: ', str(fp_val))
        print('False Negative: ', str(fn_val))
    return tn_val, tp_val, fp_val, fn_val, hss

def split_variable_by_percentile(array, class_number, var_name):
    """Discretise *array* into *class_number* ordinal classes by percentile.

    Class 0 covers values up to the (100/class_number)-th percentile;
    class i covers the half-open interval (p_{i-1}, p_i]. The last
    percentile is 100, so every finite value falls into some class; a
    value matching no range (only possible for NaN) is skipped with a
    'SPLITTING ERROR' message, exactly as before.

    Args:
        array: numeric values to bucket.
        class_number: number of percentile classes.
        var_name: column name of the returned frame.

    Returns:
        Single-column DataFrame *var_name* with the class index per value.
    """
    percents = [round(100 / class_number * (i + 1), 1) for i in range(class_number)]
    percentile_ranges = np.percentile(a=array, q=percents)
    class_var = []
    upper_bound_count = len(percentile_ranges)
    for val in array:
        # First bucket whose upper bound is >= val — identical result to the
        # original linear scan (val <= p_0 -> 0, p_{i-1} < val <= p_i -> i)
        # but O(log k) per value instead of O(k).
        idx = int(np.searchsorted(percentile_ranges, val, side='left'))
        if idx < upper_bound_count:
            class_var.append(idx)
        else:
            print('SPLITTING ERROR')
    return pd.DataFrame(columns=[var_name], data=class_var)

def create_grouped_dataset(df, class_number):
    """Replace every feature of *df* with its percentile-class version.

    Each non-'Class' column is discretised into *class_number* percentile
    buckets (column renamed to '<name>_Class'); the original 'Class'
    column is appended unchanged. The result is cached as
    'Classed_<n>.csv' under save_to_path and reused on later calls.
    """
    cache_file = save_to_path + 'Classed_' + str(class_number) + '.csv'
    # Reuse the cached discretisation when it already exists.
    if os.path.isfile(cache_file):
        return pd.read_csv(cache_file, sep=';', index_col=0)

    print('Splitting started')
    class_column = df['Class']
    df = df.drop('Class', axis=1)
    var_columns = [
        split_variable_by_percentile(array=df[c], class_number=class_number, var_name=c + '_Class')
        for c in df.columns.values
    ]
    var_columns.append(class_column)
    print('Splitting finished')
    result_ds = pd.concat(var_columns, axis=1, sort=False)
    result_ds.to_csv(cache_file, sep=';')
    return result_ds

def major_voting(x_test, rate):
    """Majority-vote ensemble over rows of base-model predictions.

    Args:
        x_test: iterable of per-sample prediction vectors (0/1 votes),
            one entry per base model.
        rate: minimum number of positive votes needed to predict fraud.

    Returns:
        list[int]: 1 where the row's vote total reaches *rate*, else 0.
    """
    # Comprehension replaces the original append loop (whose enumerate
    # index was unused); behavior is identical.
    return [1 if np.sum(votes) >= rate else 0 for votes in x_test]

# ---Downloading data------------------
data = pd.read_csv(data_path)
# ---Iterative process------------------
# One result row per model per iteration.
lines_num = iterations_number * len(models_list)
model_score = pd.DataFrame(columns=['Model', 'Precision', 'Recall', 'F1', 'Accuracy', 'TN', 'TP', 'FP', 'FN',
                                    'HSS'],
                           index=range(lines_num))
ln = 0  # next free row index in model_score
# Discretised dataset variants: every feature bucketed into 20 / 30
# percentile classes (cached on disk by create_grouped_dataset).
classed_data_20 = create_grouped_dataset(df=data, class_number=20)
classed_data_30 = create_grouped_dataset(df=data, class_number=30)
# Feature selection via two-sample t-tests, once per dataset variant.
# NOTE(review): sample_t_tests is defined elsewhere in this file.
imp = sample_t_tests(data, class_name, to_print=False)
imp_classed_20 = sample_t_tests(classed_data_20, class_name, to_print=False)
imp_classed_30 = sample_t_tests(classed_data_30, class_name, to_print=False)
print("Columns to use:")
print(imp)
print(imp_classed_20)
print(imp_classed_30)

for i in range(iterations_number):
    print('\nIteration ' + str(i) + ' started\n')
    start_time = time.time()
    # Fresh random split applied to all three dataset variants at once so
    # they keep the same sampled rows (raw, 20-class, 30-class).
    [train_dt, train_20_dt, train_30_dt], [test_dt, test_20_dt, test_30_dt] =\
        get_train_test_dataset([data, classed_data_20, classed_data_30], train_rate, class_name)
    x_train_dt = train_dt[imp]
    x_train_20_dt = train_20_dt[imp_classed_20]
    x_train_30_dt = train_30_dt[imp_classed_30]
    y_train_dt = train_dt[class_name]
    if train_rate == 1 or test_at_all:
        # Evaluate on the complete dataset instead of the held-out part.
        x_test = data[imp]
        x_test_20 = classed_data_20[imp_classed_20]
        x_test_30 = classed_data_30[imp_classed_30]
        y_test = data[class_name]
    else:
        x_test = test_dt[imp]
        x_test_20 = test_20_dt[imp_classed_20]
        x_test_30 = test_30_dt[imp_classed_30]
        y_test = test_dt[class_name]
    # ---Data analysis---
    # Base-model predictions accumulated as tuples of columns; the
    # *_WOKNN_* variants hold the same predictions excluding KNN.
    x_train_array = ()
    x_test_array = ()
    x_train_WOKNN_array = ()
    x_test_WOKNN_array = ()
    # Logistic regression
    if logistic_regression_model:
        # SMOTE oversampling
        mod_start_time = time.time()
        print('Logistic regression start')
        x_train_smote, y_train_smote = get_smote_over_sampling(x_train_30_dt, y_train_dt, 0.01, class_name)
        log_reg = skl.linear_model.LogisticRegression(solver='liblinear', tol=0.001)
        # test_model returns a 12-tuple; index [11] is the fitted model.
        log_reg = test_model(model=log_reg, model_name='Logistic regression',
                             x_train=x_train_smote, y_train=y_train_smote, x_test=x_test_30, y_test=y_test,
                             show_reports=False)[11]
        x_train_log_reg = log_reg.predict(x_train_30_dt)
        x_test_log_reg = log_reg.predict(x_test_30)
        x_train_array = x_train_array + (x_train_log_reg,)
        x_test_array = x_test_array + (x_test_log_reg,)
        x_train_WOKNN_array = x_train_WOKNN_array + (x_train_log_reg,)
        x_test_WOKNN_array = x_test_WOKNN_array + (x_test_log_reg,)
        print('Finished - ' + str(time.time() - mod_start_time) + ' sec')
    # K nearest neighbours
    if k_nearest_model:
        # SMOTE oversampling
        print('KNN start')
        mod_start_time = time.time()
        x_train_smote, y_train_smote = get_smote_over_sampling(x_train_30_dt, y_train_dt, 0.01, class_name)
        knn = skl.neighbors.KNeighborsClassifier()
        knn = test_model(model=knn, model_name='KNN',
                         x_train=x_train_smote, y_train=y_train_smote,
                         x_test=x_test_30, y_test=y_test, show_reports=False)[11]
        x_train_knn = knn.predict(x_train_30_dt)
        x_test_knn = knn.predict(x_test_30)
        # KNN is intentionally not added to the *_WOKNN_* tuples.
        x_train_array = x_train_array + (x_train_knn,)
        x_test_array = x_test_array + (x_test_knn,)
        print('Finished - ' + str(time.time() - mod_start_time) + ' sec')
    # Naive Bayes
    if naive_bayes_model:
        print('NB start')
        mod_start_time = time.time()
        gnb = GaussianNB()
        gnb = test_model(model=gnb, model_name='Gaussian NB',
                         x_train=x_train_20_dt, y_train=y_train_dt, x_test=x_test_20, y_test=y_test,
                         show_reports=False)[11]
        x_train_nb = gnb.predict(x_train_20_dt)
        x_test_nb = gnb.predict(x_test_20)
        x_train_array = x_train_array + (x_train_nb,)
        x_test_array = x_test_array + (x_test_nb,)
        x_train_WOKNN_array = x_train_WOKNN_array + (x_train_nb,)
        x_test_WOKNN_array = x_test_WOKNN_array + (x_test_nb,)
        print('Finished - ' + str(time.time() - mod_start_time) + ' sec')
    # Support Vector Machine
    if svm_model:
        print('SVM start')
        mod_start_time = time.time()
        # Calibration wrapper gives LinearSVC the predict_proba needed by test_model.
        svm = CalibratedClassifierCV(base_estimator=LinearSVC(dual=False))
        x_train_smote, y_train_smote = get_smote_over_sampling(x_train_dt, y_train_dt, 0.005, class_name)
        svm = test_model(model=svm, model_name='SVM (Clear)',
                         x_train=x_train_smote, y_train=y_train_smote, x_test=x_test, y_test=y_test,
                         show_reports=False)[11]
        x_train_svm = svm.predict(x_train_dt)
        x_test_svm = svm.predict(x_test)
        x_train_array = x_train_array + (x_train_svm,)
        x_test_array = x_test_array + (x_test_svm,)
        x_train_WOKNN_array = x_train_WOKNN_array + (x_train_svm,)
        x_test_WOKNN_array = x_test_WOKNN_array + (x_test_svm,)
        print('Finished - ' + str(time.time() - mod_start_time) + ' sec')
    # Decision tree
    if decision_tree_model:
        print('Decision tree start')
        mod_start_time = time.time()
        dtc = skl.tree.DecisionTreeClassifier(max_depth=10, min_samples_split=6)
        dtc = test_model(model=dtc, model_name='Decision tree (Clear)',
                         x_train=x_train_dt, y_train=y_train_dt, x_test=x_test, y_test=y_test,
                         show_reports=False)[11]
        x_train_dtc = dtc.predict(x_train_dt)
        x_test_dtc = dtc.predict(x_test)
        x_train_array = x_train_array + (x_train_dtc,)
        x_test_array = x_test_array + (x_test_dtc,)
        x_train_WOKNN_array = x_train_WOKNN_array + (x_train_dtc,)
        x_test_WOKNN_array = x_test_WOKNN_array + (x_test_dtc,)
        print('Finished - ' + str(time.time() - mod_start_time) + ' sec')
    # AdaBoost
    if ada_boost_model:
        print('AdaBoost start')
        mod_start_time = time.time()
        dtc = skl.tree.DecisionTreeClassifier(max_depth=8, min_samples_split=2)
        adb = skl.ensemble.AdaBoostClassifier(n_estimators=80, base_estimator=dtc)
        adb = test_model(model=adb, model_name='AdaBoost',
                         x_train=x_train_dt, y_train=y_train_dt, x_test=x_test, y_test=y_test,
                         show_reports=False)[11]
        x_train_adb = adb.predict(x_train_dt)
        x_test_adb = adb.predict(x_test)
        x_train_array = x_train_array + (x_train_adb,)
        x_test_array = x_test_array + (x_test_adb,)
        x_train_WOKNN_array = x_train_WOKNN_array + (x_train_adb,)
        x_test_WOKNN_array = x_test_WOKNN_array + (x_test_adb,)
        print('Finished - ' + str(time.time() - mod_start_time) + ' sec')
    # Random forest
    if random_forest_model:
        print('RF start')
        mod_start_time = time.time()
        rfm = skl.ensemble.RandomForestClassifier(n_estimators=80, max_depth=15, min_samples_split=4)
        rfm = test_model(model=rfm, model_name='Random forest (Clear)',
                         x_train=x_train_dt, y_train=y_train_dt, x_test=x_test, y_test=y_test,
                         show_reports=False)[11]
        x_train_rfm = rfm.predict(x_train_dt)
        x_test_rfm = rfm.predict(x_test)
        x_train_array = x_train_array + (x_train_rfm,)
        x_test_array = x_test_array + (x_test_rfm,)
        x_train_WOKNN_array = x_train_WOKNN_array + (x_train_rfm,)
        x_test_WOKNN_array = x_test_WOKNN_array + (x_test_rfm,)
        print('Finished - ' + str(time.time() - mod_start_time) + ' sec')
    # Stack base-model prediction columns into meta-features: one row per
    # sample, one column per base model.
    x_train_final = np.array(x_train_array).T
    x_test_final = np.array(x_test_array).T
    x_train_WOKNN_final = np.array(x_train_WOKNN_array).T
    x_test_WOKNN_final = np.array(x_test_WOKNN_array).T
    np.savetxt(fname=save_to_path + 'Test_WOKNN.csv', X=x_test_WOKNN_final, delimiter=';')
    np.savetxt(fname=save_to_path + 'Test_WKNN.csv', X=x_test_final, delimiter=';')
    # final_ds = np.array(
    #     (x_test_log_reg, x_test_knn, x_test_nb, x_test_svm, x_test_dtc, x_test_adb, x_test_rfm, y_test.values)).T
    # np.savetxt(fname=save_to_path + 'FinalData.csv', X=final_ds, delimiter=';')
    # VOTING CLASSIFIER
    # NOTE(review): the threshold 4 presumably targets a majority of the
    # seven base models; with fewer models enabled it may be unreachable.
    y_predicted_final = major_voting(x_test_final, 4)
    final_scores = get_model_scores(y_pred=y_predicted_final, y_act=y_test, model_str='FINAL Voting',
                                    show_roc=False, show_report=False)
    conf_matrix = get_conf_matrix_data(y_predicted_final, y_test, print_data=False)
    # 4-tuple of scores + 5-tuple of confusion data = the 9 score columns.
    mv_results = final_scores + conf_matrix
    # Without KNN
    y_predicted_WOKNN_final = major_voting(x_test_WOKNN_final, 4)
    final_WOKNN_scores = get_model_scores(y_pred=y_predicted_WOKNN_final, y_act=y_test, model_str='FINAL Voting',
                                          show_roc=False, show_report=False)
    conf_matrix_WOKNN = get_conf_matrix_data(y_predicted_WOKNN_final, y_test, print_data=False)
    mv_results_WOKNN = final_WOKNN_scores + conf_matrix_WOKNN
    # NAIVE BAYES CLASSIFIER (meta-model over base predictions)
    nb_classifier = GaussianNB()
    final_nb_results = test_model(model=nb_classifier, model_name='Final model NB',
                                  x_train=x_train_final, y_train=y_train_dt, x_test=x_test_final, y_test=y_test,
                                  show_reports=False, show_roc=False)
    final_nb_WOKNN_results = test_model(model=nb_classifier, model_name='Final model NB',
                                        x_train=x_train_WOKNN_final, y_train=y_train_dt, x_test=x_test_WOKNN_final, y_test=y_test,
                                        show_reports=False, show_roc=False)
    # LOGISTIC REGRESSION CLASSIFIER
    final_lr = skl.linear_model.LogisticRegression()
    final_lr_results = test_model(model=final_lr, model_name='Final model logistic regression',
                                  x_train=x_train_final, y_train=y_train_dt, x_test=x_test_final, y_test=y_test,
                                  show_reports=False, show_roc=False)
    final_lr_WOKNN_results = test_model(model=final_lr, model_name='Final model logistic regression',
                                        x_train=x_train_WOKNN_final, y_train=y_train_dt, x_test=x_test_WOKNN_final, y_test=y_test,
                                        show_reports=False, show_roc=False)
    # DECISION TREE ('Decision tree' in the name also triggers the
    # Graphviz export inside test_model)
    final_dtc = skl.tree.DecisionTreeClassifier(max_depth=8, min_samples_leaf=2)
    final_dtc_results = test_model(model=final_dtc, model_name='Final model Decision tree',
                                   x_train=x_train_final, y_train=y_train_dt, x_test=x_test_final, y_test=y_test,
                                   show_reports=False, show_roc=False)
    final_dtc_WOKNN_results = test_model(model=final_dtc, model_name='Final model Decision tree',
                                         x_train=x_train_WOKNN_final, y_train=y_train_dt, x_test=x_test_WOKNN_final, y_test=y_test,
                                         show_reports=False, show_roc=False)
    # Record the first nine elements (precision..HSS) of each result row.
    model_score.loc[ln] = ('Major voting',) + mv_results[:9]
    ln += 1
    model_score.loc[ln] = ('Naive Bayes',) + final_nb_results[:9]
    ln += 1
    model_score.loc[ln] = ('Logistic regression',) + final_lr_results[:9]
    ln += 1
    model_score.loc[ln] = ('Decision tree',) + final_dtc_results[:9]
    ln += 1
    model_score.loc[ln] = ('Major voting W/O KNN',) + mv_results_WOKNN[:9]
    ln += 1
    model_score.loc[ln] = ('Naive Bayes W/O KNN',) + final_nb_WOKNN_results[:9]
    ln += 1
    model_score.loc[ln] = ('Logistic regression W/O KNN',) + final_lr_WOKNN_results[:9]
    ln += 1
    model_score.loc[ln] = ('Decision tree W/O KNN',) + final_dtc_WOKNN_results[:9]
    ln += 1
    print('Iteration finished\nDuration: ' + str(time.time() - start_time) + '\n')

# Aggregate per-iteration rows of model_score into one mean row per model,
# split into ensembles with KNN and without ('W/O' in the model name).
general_results = []
general_results_WOKNN = []
# print(model_score)
for mdl in models_list:
    if 'W/O' not in mdl:
        score_df = model_score.loc[(model_score['Model'] == mdl)]
        # Skip models that were disabled and produced no rows.
        if len(score_df.index) != 0:
            # multiple_result(df=score_df, model_str=(mdl), print_results=True)
            general_results.append(
                (mdl,) + multiple_result(df=score_df, model_str=mdl, print_results=True))
for mdl in models_list:
    if 'W/O' in mdl:
        score_df = model_score.loc[(model_score['Model'] == mdl)]
        if len(score_df.index) != 0:
            # multiple_result(df=score_df, model_str=(mdl), print_results=True)
            general_results_WOKNN.append(
                (mdl,) + multiple_result(df=score_df, model_str=mdl, print_results=True))

# Grouped bar charts of the aggregated results. Each general_results row is
# (name, precision, recall, f1, accuracy, hss, tp, tn, fp, fn).
fig = plt.figure(1)
bar_names = [k[0] for i, k in enumerate(general_results)]
index = np.arange(len(general_results))
bar_width = 0.2
opacity = 0.6
# Left panel: quality scores per model.
ax = fig.add_subplot(1, 2, 1)
f1 = [k[3] for i, k in enumerate(general_results)]
rec = [k[2] for i, k in enumerate(general_results)]
pr = [k[1] for i, k in enumerate(general_results)]
hss_vals = [k[5] for i, k in enumerate(general_results)]
rects1 = ax.bar(index, f1, bar_width, color='b', alpha=opacity,
                label='F1')
rects2 = ax.bar(index + 2 * bar_width, rec, bar_width, color='r', alpha=opacity,
                label='Recall')
rects3 = ax.bar(index + 3 * bar_width, pr, bar_width, color='g', alpha=opacity,
                label='Precision')
rects4 = ax.bar(index + 1 * bar_width, hss_vals, bar_width, color='m', alpha=opacity,
                label='HSS')
ax.set_xlabel('Model')
ax.set_ylabel('Scores')
ax.legend()
ax.set_xticks(index + bar_width)
ax.set_xticklabels(bar_names, rotation=30)
# Right panel: confusion-matrix counts.
# NOTE(review): TN (k[7]) is not plotted — presumably because it dwarfs
# the other counts; confirm.
ax2 = fig.add_subplot(1, 2, 2)
tp = [k[6] for i, k in enumerate(general_results)]
fp = [k[8] for i, k in enumerate(general_results)]
fn = [k[9] for i, k in enumerate(general_results)]
rects11 = ax2.bar(index, tp, bar_width, color='g', alpha=opacity,
                  label='TP')
rects31 = ax2.bar(index + bar_width, fp, bar_width, color='b', alpha=opacity,
                  label='FP')
rects41 = ax2.bar(index + 2 * bar_width, fn, bar_width, color='r', alpha=opacity,
                  label='FN')
ax2.set_xlabel('Model')
ax2.set_ylabel('Scores')
ax2.legend()
ax2.set_xticks(index + bar_width)
ax2.set_xticklabels(bar_names, rotation=30)
# WITHOUT KNN
# Same two panels for the ensembles that exclude KNN.
fig_woknn = plt.figure(2)
bar_names_woknn = [k[0] for i, k in enumerate(general_results_WOKNN)]
index_woknn = np.arange(len(general_results_WOKNN))
bar_width = 0.2
opacity = 0.6
ax_woknn = fig_woknn.add_subplot(1, 2, 1)
f1_woknn = [k[3] for i, k in enumerate(general_results_WOKNN)]
rec_woknn = [k[2] for i, k in enumerate(general_results_WOKNN)]
pr_woknn = [k[1] for i, k in enumerate(general_results_WOKNN)]
hss_vals_woknn = [k[5] for i, k in enumerate(general_results_WOKNN)]
rects1_woknn = ax_woknn.bar(index_woknn, f1_woknn, bar_width, color='b', alpha=opacity,
                            label='F1')
rects2_woknn = ax_woknn.bar(index_woknn + 2 * bar_width, rec_woknn, bar_width, color='r', alpha=opacity,
                            label='Recall')
rects3_woknn = ax_woknn.bar(index_woknn + 3 * bar_width, pr_woknn, bar_width, color='g', alpha=opacity,
                            label='Precision')
rects4_woknn = ax_woknn.bar(index_woknn + 1 * bar_width, hss_vals_woknn, bar_width, color='m', alpha=opacity,
                            label='HSS')
ax_woknn.set_xlabel('Model')
ax_woknn.set_ylabel('Scores')
ax_woknn.legend()
ax_woknn.set_xticks(index_woknn + bar_width)
ax_woknn.set_xticklabels(bar_names_woknn, rotation=30)
ax2_woknn = fig_woknn.add_subplot(1, 2, 2)
tp_woknn = [k[6] for i, k in enumerate(general_results_WOKNN)]
fp_woknn = [k[8] for i, k in enumerate(general_results_WOKNN)]
fn_woknn = [k[9] for i, k in enumerate(general_results_WOKNN)]
rects11_woknn = ax2_woknn.bar(index_woknn, tp_woknn, bar_width, color='g', alpha=opacity,
                              label='TP')
rects31_woknn = ax2_woknn.bar(index_woknn + bar_width, fp_woknn, bar_width, color='b', alpha=opacity,
                              label='FP')
rects41_woknn = ax2_woknn.bar(index_woknn + 2 * bar_width, fn_woknn, bar_width, color='r', alpha=opacity,
                              label='FN')
ax2_woknn.set_xlabel('Model')
ax2_woknn.set_ylabel('Scores')
ax2_woknn.legend()
ax2_woknn.set_xticks(index_woknn + bar_width)
ax2_woknn.set_xticklabels(bar_names_woknn, rotation=30)
fig_woknn.tight_layout()
plt.show()

Размещено на Allbest.ru


Подобные документы

Работы в архивах красиво оформлены согласно требованиям ВУЗов и содержат рисунки, диаграммы, формулы и т.д.
PPT, PPTX и PDF-файлы представлены только в архивах.
Рекомендуем скачать работу.