import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression as LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report,r2_score, mean_absolute_error,roc_curve, roc_auc_score
from sklearn.impute import KNNImputer
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.linear_model import LinearRegression
from scipy.stats import mode
from sklearn.impute import SimpleImputer
from statsmodels.stats.outliers_influence import variance_inflation_factor
import statsmodels.api as sm

billets = pd.read_csv('billets.csv', sep=';')

billets

billets.head() # afficher les premières lignes du tableau de données

billets.info() # afficher des informations sur les données
billets.describe(include='all') # afficher des statistiques descriptives sur les données

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1500 entries, 0 to 1499
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   is_genuine    1500 non-null   bool   
 1   diagonal      1500 non-null   float64
 2   height_left   1500 non-null   float64
 3   height_right  1500 non-null   float64
 4   margin_low    1463 non-null   float64
 5   margin_up     1500 non-null   float64
 6   length        1500 non-null   float64
dtypes: bool(1), float64(6)
memory usage: 71.9 KB

print("Il y a :\n\n\n", billets.isnull().sum() , '\n\n\nvaleurs nulls')

Il y a :


 is_genuine       0
diagonal         0
height_left      0
height_right     0
margin_low      37
margin_up        0
length           0
dtype: int64 


valeurs nulls

#nulls = billets.loc[billets['margin_low'].isnull()]
print(billets.loc[billets['margin_low'].isnull()])

      is_genuine  diagonal  height_left  height_right  margin_low  margin_up  \
72          True    171.94       103.89        103.45         NaN       3.25   
99          True    171.93       104.07        104.18         NaN       3.14   
151         True    172.07       103.80        104.38         NaN       3.02   
197         True    171.45       103.66        103.80         NaN       3.62   
241         True    171.83       104.14        104.06         NaN       3.02   
251         True    171.80       103.26        102.82         NaN       2.95   
284         True    171.92       103.83        103.76         NaN       3.23   
334         True    171.85       103.70        103.96         NaN       3.00   
410         True    172.56       103.72        103.51         NaN       3.12   
413         True    172.30       103.66        103.50         NaN       3.16   
445         True    172.34       104.42        103.22         NaN       3.01   
481         True    171.81       103.53        103.96         NaN       2.71   
505         True    172.01       103.97        104.05         NaN       2.98   
611         True    171.80       103.68        103.49         NaN       3.30   
654         True    171.97       103.69        103.54         NaN       2.70   
675         True    171.60       103.85        103.91         NaN       2.56   
710         True    172.03       103.97        103.86         NaN       3.07   
739         True    172.07       103.74        103.76         NaN       3.09   
742         True    172.14       104.06        103.96         NaN       3.24   
780         True    172.41       103.95        103.79         NaN       3.13   
798         True    171.96       103.84        103.62         NaN       3.01   
844         True    171.62       104.14        104.49         NaN       2.99   
845         True    172.02       104.21        104.05         NaN       2.90   
871         True    171.37       104.07        103.75         NaN       3.07   
895         True    171.81       103.68        103.80         NaN       2.98   
919         True    171.92       103.68        103.45         NaN       2.58   
945         True    172.09       103.74        103.52         NaN       3.02   
946         True    171.63       103.87        104.66         NaN       3.27   
981         True    172.02       104.23        103.72         NaN       2.99   
1076       False    171.57       104.27        104.44         NaN       3.21   
1121       False    171.40       104.38        104.19         NaN       3.17   
1176       False    171.59       104.05        103.94         NaN       3.02   
1303       False    172.17       104.49        103.76         NaN       2.93   
1315       False    172.08       104.15        104.17         NaN       3.40   
1347       False    171.72       104.46        104.12         NaN       3.61   
1435       False    172.66       104.33        104.41         NaN       3.56   
1438       False    171.90       104.28        104.29         NaN       3.24   

      length  
72    112.79  
99    113.08  
151   112.93  
197   113.27  
241   112.36  
251   113.22  
284   113.29  
334   113.36  
410   112.95  
413   112.95  
445   112.97  
481   113.99  
505   113.65  
611   112.84  
654   112.79  
675   113.27  
710   112.65  
739   112.41  
742   113.07  
780   113.41  
798   114.44  
844   113.35  
845   113.62  
871   113.27  
895   113.82  
919   113.68  
945   112.78  
946   112.68  
981   113.37  
1076  111.87  
1121  112.39  
1176  111.29  
1303  111.21  
1315  112.29  
1347  110.31  
1435  111.47  
1438  111.49

billets_avec_nulls = billets.loc[billets['margin_low'].isna()]
billets_tous=billets
billets = billets.dropna(subset=['margin_low'])
billets_avec_nulls

BC=billets.columns
BC=['diagonal', 'height_left', 'height_right', 'margin_low',
       'margin_up', 'length']

billets=billets.copy()
#D['is_genuine']=D['is_genuine'].astype(int)
billets.loc[:, 'is_genuine'] = billets['is_genuine'].astype(int)

for column in BC:
    plt.title(column, fontsize=14)
    for value in billets['is_genuine'].unique():
        billets.loc[billets['is_genuine']==value, column].dropna().hist(alpha=0.5, label=value)
    plt.xlabel("Valeur")
    plt.ylabel("Frequence")
    plt.legend(title='is_genuine')
    plt.show()

for column in BC:
    #plt.title(column, fontsize=14)
    billets.boxplot(column=column, by='is_genuine')
    plt.xlabel("is_genuine")
    plt.ylabel(column)
    plt.show()

billets.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1463 entries, 0 to 1499
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   is_genuine    1463 non-null   int64  
 1   diagonal      1463 non-null   float64
 2   height_left   1463 non-null   float64
 3   height_right  1463 non-null   float64
 4   margin_low    1463 non-null   float64
 5   margin_up     1463 non-null   float64
 6   length        1463 non-null   float64
dtypes: float64(6), int64(1)
memory usage: 91.4 KB

#scaler = StandardScaler()
scaler = MinMaxScaler()
X = pd.DataFrame(scaler.fit_transform(billets), columns=billets.columns)

XT = pd.DataFrame(scaler.fit_transform(billets_tous), columns=billets_tous.columns)

corr_matrix = billets.corr(method='pearson')
corr_matrix=round (corr_matrix, 3)
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(10, 10))
sns.heatmap(corr_matrix, annot     = True, cbar      = False,annot_kws = {"size": 10}, vmin      = -1, vmax      = 1,center    = 0, cmap      = sns.diverging_palette(20, 220, n=200),square    = True, ax        = ax ),ax.set_xticklabels(ax.get_xticklabels(), rotation = 45, horizontalalignment = 'right', ),ax.tick_params(labelsize = 10)
plt.show()

corr_matrix = billets[billets['is_genuine']==0].corr(method='pearson')
corr_matrix=round (corr_matrix, 3)
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(10, 10))
sns.heatmap(corr_matrix, annot     = True, cbar      = False,annot_kws = {"size": 10}, vmin      = -1, vmax      = 1,center    = 0, cmap      = sns.diverging_palette(20, 220, n=200),square    = True, ax        = ax ),ax.set_xticklabels(ax.get_xticklabels(), rotation = 45, horizontalalignment = 'right', ),ax.tick_params(labelsize = 10)
plt.show()

corr_matrix = billets[billets['is_genuine']==1].corr(method='pearson')
corr_matrix=round (corr_matrix, 3)
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(10, 10))
sns.heatmap(corr_matrix, annot     = True, cbar      = False,annot_kws = {"size": 10}, vmin      = -1, vmax      = 1,center    = 0, cmap      = sns.diverging_palette(20, 220, n=200),square    = True, ax        = ax ),ax.set_xticklabels(ax.get_xticklabels(), rotation = 45, horizontalalignment = 'right', ),ax.tick_params(labelsize = 10)
plt.show()

# VIF dataframe
vif = pd.DataFrame()
vif["valeur"] = pd.DataFrame(X[BC].columns)
  
# calculer VIF pour chaque valeur
vif["VIF"] = [variance_inflation_factor(X[BC].values, i)
                          for i in range(len(X[BC].columns))]
  
print(vif)

         valeur        VIF
0      diagonal   9.571588
1   height_left  10.693729
2  height_right  12.474296
3    margin_low   9.510445
4     margin_up  16.641133
5        length   8.715947

#DF X est normalise et is_genuine 1 true et 0 false
sns.pairplot(XT, hue='is_genuine', diag_kind="hist",kind="kde",palette="Set2",
             height=1.4)
plt.show()

billets_tous.iloc[billets_avec_nulls.index.to_list()]

imputer = KNNImputer(n_neighbors=5) 
billets_tous_knnimputer = pd.DataFrame(imputer.fit_transform(billets_tous), columns=billets_tous.columns)

# On separe
donnees_avec_margin_low = billets_tous[billets_tous['margin_low'].notnull()]
donnees_sans_margin_low = billets_tous[billets_tous['margin_low'].isnull()]

# Train test split
X_train_lin, X_test_lin, y_train_lin, y_test_lin = train_test_split(donnees_avec_margin_low.drop('margin_low', axis=1)
                                                                    , donnees_avec_margin_low['margin_low'], test_size=0.15, random_state=42)

X_test_lin.shape, X_train_lin.shape ,y_train_lin.shape, y_test_lin.shape

((220, 6), (1243, 6), (1243,), (220,))

lmodel=LinearRegression()

lmodel.fit(X_train_lin, y_train_lin)

LinearRegression()

y_pred=lmodel.predict(X_test_lin)

(y_pred-y_test_lin).hist()

<AxesSubplot:>

X = donnees_avec_margin_low.drop('margin_low', axis=1)
X= X.drop('is_genuine', axis=1)
y = donnees_avec_margin_low['margin_low']
X = sm.add_constant(X) 
model = sm.OLS(y,X).fit()
print(model.summary())

                            OLS Regression Results                            
==============================================================================
Dep. Variable:             margin_low   R-squared:                       0.477
Model:                            OLS   Adj. R-squared:                  0.476
Method:                 Least Squares   F-statistic:                     266.1
Date:                Tue, 16 May 2023   Prob (F-statistic):          2.60e-202
Time:                        12:18:22   Log-Likelihood:                -1001.3
No. Observations:                1463   AIC:                             2015.
Df Residuals:                    1457   BIC:                             2046.
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
================================================================================
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
const           22.9948      9.656      2.382      0.017       4.055      41.935
diagonal        -0.1111      0.041     -2.680      0.007      -0.192      -0.030
height_left      0.1841      0.045      4.113      0.000       0.096       0.272
height_right     0.2571      0.043      5.978      0.000       0.173       0.342
margin_up        0.2562      0.064      3.980      0.000       0.130       0.382
length          -0.4091      0.018    -22.627      0.000      -0.445      -0.374
==============================================================================
Omnibus:                       73.627   Durbin-Watson:                   1.893
Prob(Omnibus):                  0.000   Jarque-Bera (JB):               95.862
Skew:                           0.482   Prob(JB):                     1.53e-21
Kurtosis:                       3.801   Cond. No.                     1.94e+05
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.94e+05. This might indicate that there are
strong multicollinearity or other numerical problems.

r2 = r2_score(y, 
              lmodel.predict(donnees_avec_margin_low.drop('margin_low', axis=1)))
mae = mean_absolute_error(y, lmodel.predict(donnees_avec_margin_low.drop('margin_low', axis=1)))
print("R2:", r2)
print("MAE:", mae)

R2: 0.6166656453942228
MAE: 0.3160336084059514

# Entreiner model
X_train_margin_low = donnees_avec_margin_low.drop('margin_low', axis=1)
y_train_margin_low = donnees_avec_margin_low['margin_low']
model_margin_low = LinearRegression()
model_margin_low.fit(X_train_margin_low, y_train_margin_low)
# VALIDER MODEL
y_pred=model_margin_low.predict(X_test_lin)
#y_test-y_pred

# Entreiner model
X_train_margin_low = donnees_avec_margin_low.drop('margin_low', axis=1)
y_train_margin_low = donnees_avec_margin_low['margin_low']
model_margin_low = LinearRegression()
model_margin_low.fit(X_train_margin_low, y_train_margin_low)

# Prediction donnes sans margin
X_test_margin_low = donnees_sans_margin_low.drop('margin_low', axis=1)
donnees_sans_margin_low['margin_low'] = model_margin_low.predict(X_test_margin_low)

# merge/concat
billets_tous_linear = pd.concat([donnees_avec_margin_low, donnees_sans_margin_low])

/var/folders/pd/z7nn_cm91pg9fjmhgsnqvvd80000gn/T/ipykernel_24189/1793560917.py:9: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  donnees_sans_margin_low['margin_low'] = model_margin_low.predict(X_test_margin_low)

donnees_sans_margin_low.describe()

billets_tous_knnimputer.iloc[billets_avec_nulls.index.to_list()]-donnees_sans_margin_low

fig, ax = plt.subplots()

billets_tous_knnimputer[['margin_low']].iloc[billets_avec_nulls.index.to_list()].boxplot(ax=ax,showmeans=True,positions=[0])
donnees_sans_margin_low[['margin_low']].boxplot(ax=ax, showmeans=True, positions=[1])
ax.set_xticklabels(['Knnimputer', 'Regresion lineare'])
ax.set_ylabel('margin_low')
ax.set_title('Comparacion des distribuctons de margin_low avec differents methodes')

Text(0.5, 1.0, 'Comparacion des distribuctons de margin_low avec differents methodes')

scaler = StandardScaler()
X = pd.DataFrame(scaler.fit_transform(billets_tous_linear), columns=billets_tous_linear.columns)

X['is_genuine']=billets_tous_linear['is_genuine']

XT.info() # afficher des informations sur les données
XT.describe(include='all') # afficher des statistiques descriptives sur les données

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1500 entries, 0 to 1499
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   is_genuine    1500 non-null   float64
 1   diagonal      1500 non-null   float64
 2   height_left   1500 non-null   float64
 3   height_right  1500 non-null   float64
 4   margin_low    1463 non-null   float64
 5   margin_up     1500 non-null   float64
 6   length        1500 non-null   float64
dtypes: float64(7)
memory usage: 82.2 KB

#est-ce une valeur continue (un nombre) ou bien une valeur discrète (une catégorie) ? 
#Le premier cas est appelé une régression, le second une classification  
#Depuis openclassrooms, Initiez-vous au Machine Learning 

#alors je fais regression...

x = billets_tous_linear.drop('is_genuine', axis=1)
y = billets_tous_linear['is_genuine']

X_train_log, X_test_log, y_train_log, y_test_log = train_test_split(x, y, test_size=0.15, random_state=42)

# Entreiner le model de regresion logistique
Logistique_Regression_model = LogisticRegression()
Logistique_Regression_model.fit(X_train_log, y_train_log)

y_pred_log = Logistique_Regression_model.predict(X_test_log)

# prediction:
y_pred_log = Logistique_Regression_model.predict(X_test_log)

#y_pred_log est ma prediction et y_test est ce que j'avais separé
y_pred_log_tous= Logistique_Regression_model.predict (billets_tous_linear.drop('is_genuine', axis=1))
y_log_tous=billets_tous_linear['is_genuine']

#precision:
accuracy_log = accuracy_score(y_log_tous, y_pred_log_tous)

print("Precision:" ,accuracy_log)
#matriz de confusion también r score

Precision: 0.9913333333333333

print(classification_report(y_log_tous, y_pred_log_tous)
   
     )

              precision    recall  f1-score   support

       False       0.99      0.98      0.99       500
        True       0.99      1.00      0.99      1000

    accuracy                           0.99      1500
   macro avg       0.99      0.99      0.99      1500
weighted avg       0.99      0.99      0.99      1500

Z = sm.add_constant(x)
model = sm.Logit(y, Z).fit()
print(model.summary())

Optimization terminated successfully.
         Current function value: 0.026349
         Iterations 13
                           Logit Regression Results                           
==============================================================================
Dep. Variable:             is_genuine   No. Observations:                 1500
Model:                          Logit   Df Residuals:                     1493
Method:                           MLE   Df Model:                            6
Date:                Tue, 16 May 2023   Pseudo R-squ.:                  0.9586
Time:                        12:18:23   Log-Likelihood:                -39.524
converged:                       True   LL-Null:                       -954.77
Covariance Type:            nonrobust   LLR p-value:                     0.000
================================================================================
                   coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------
const         -230.4143    249.954     -0.922      0.357    -720.316     259.487
diagonal         0.2562      1.135      0.226      0.821      -1.969       2.481
height_left     -1.2845      1.117     -1.150      0.250      -3.474       0.905
height_right    -2.8941      1.143     -2.533      0.011      -5.134      -0.654
margin_low      -6.0235      0.988     -6.099      0.000      -7.959      -4.088
margin_up      -10.2986      2.206     -4.669      0.000     -14.622      -5.975
length           6.0561      0.897      6.751      0.000       4.298       7.814
================================================================================

Possibly complete quasi-separation: A fraction 0.53 of observations can be
perfectly predicted. This might indicate that there is complete
quasi-separation. In this case some parameters will not be identified.

y_test_log.shape, y_pred_log.shape

((225,), (225,))

# calculer courbe ROC
fpr, tpr, thresholds = roc_curve(y_log_tous, y_pred_log_tous)

# calculer  AUC
auc = roc_auc_score(y_log_tous, y_pred_log_tous)

# grafique courbe ROC
plt.plot(fpr, tpr, label='AUC = {:.2f}'.format(auc))
plt.plot([0, 1], [0, 1], linestyle='--', color='grey')
plt.xlabel('Taux de faux positif')
plt.ylabel('Taux de vrai positif')
plt.title('Courbe ROC')
plt.legend()
plt.show()

cm = confusion_matrix(y_pred_log_tous, y_log_tous)

plt.matshow(cm, cmap=plt.cm.Blues)
plt.xlabel('Prediction')
plt.ylabel('Vrai')
plt.plot([-0.5, cm.shape[0]-0.5], [-0.5, cm.shape[1]-0.5], 'g-')  # línea diagonal roja
#plt.plot([-0.5, cm.shape[0]-0.5], [cm.shape[1]-0.5, -0.5], 'r-')  # línea diagonal verde

for i in range(cm.shape[0]):
    for j in range(cm.shape[1]):
        plt.text(x=j, y=i, s=cm[i, j], va='center', ha='center', fontsize=14, weight='bold')
plt.show()

# SANS P value >0.5

x = billets_tous_linear.drop('is_genuine', axis=1).drop('diagonal', axis=1).drop('height_left', axis=1)
y = billets_tous_linear['is_genuine']

X_train_log, X_test_log, y_train_log, y_test_log = train_test_split(x, y, test_size=0.15, random_state=42)
# Entreiner le model de regresion logistique
Logistique_Regression_model = LogisticRegression()
Logistique_Regression_model.fit(X_train_log, y_train_log)

y_pred_log = Logistique_Regression_model.predict(X_test_log)
# prediction:
y_pred_log = Logistique_Regression_model.predict(X_test_log)

#y_pred_log est ma prediction et y_test est ce que j'avais separé
y_pred_log_tous= Logistique_Regression_model.predict (x)
y_log_tous=billets_tous_linear['is_genuine']

#------METRIQUES
#precision:
accuracy_log = accuracy_score(y_log_tous, y_pred_log_tous)

print("Precision:" ,accuracy_log)

Precision: 0.99

print(classification_report(y_log_tous, y_pred_log_tous)
   
     )

              precision    recall  f1-score   support

       False       0.99      0.98      0.98       500
        True       0.99      1.00      0.99      1000

    accuracy                           0.99      1500
   macro avg       0.99      0.99      0.99      1500
weighted avg       0.99      0.99      0.99      1500

Z = sm.add_constant(x)
model = sm.Logit(y, Z).fit()
print(model.summary())

Optimization terminated successfully.
         Current function value: 0.026815
         Iterations 13
                           Logit Regression Results                           
==============================================================================
Dep. Variable:             is_genuine   No. Observations:                 1500
Model:                          Logit   Df Residuals:                     1495
Method:                           MLE   Df Model:                            4
Date:                Tue, 16 May 2023   Pseudo R-squ.:                  0.9579
Time:                        12:18:23   Log-Likelihood:                -40.223
converged:                       True   LL-Null:                       -954.77
Covariance Type:            nonrobust   LLR p-value:                     0.000
================================================================================
                   coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------
const         -285.3339    139.160     -2.050      0.040    -558.083     -12.585
height_right    -3.3223      1.117     -2.975      0.003      -5.511      -1.134
margin_low      -6.3080      0.963     -6.547      0.000      -8.196      -4.420
margin_up      -10.4027      2.196     -4.737      0.000     -14.707      -6.098
length           6.1579      0.888      6.934      0.000       4.417       7.898
================================================================================

Possibly complete quasi-separation: A fraction 0.55 of observations can be
perfectly predicted. This might indicate that there is complete
quasi-separation. In this case some parameters will not be identified.

# calculer courbe ROC
fpr, tpr, thresholds = roc_curve(y_log_tous, y_pred_log_tous)

# calculer  AUC
auc = roc_auc_score(y_log_tous, y_pred_log_tous)

# grafique courbe ROC
plt.plot(fpr, tpr, label='AUC = {:.2f}'.format(auc))
plt.plot([0, 1], [0, 1], linestyle='--', color='grey')
plt.xlabel('Taux de faux positif')
plt.ylabel('Taux de vrai positif')
plt.title('Courbe ROC')
plt.legend()
plt.show()

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.15, random_state=42)

pca = PCA(n_components=2)
X_pca = pca.fit_transform(x)

plt.scatter(X_pca[y == 0][:, 0], X_pca[y == 0][:, 1], color='red', label='False (0)')
plt.scatter(X_pca[y == 1][:, 0], X_pca[y == 1][:, 1], color='blue', label='True (1)')

plt.xlabel('CP 1')
plt.ylabel('CP 2')
plt.title('Visualizacion 2D de 5 variables avec PCA')
          #avec DONNÉES AVEC imputer regression lineare
plt.legend(loc='upper left')
plt.show()

pca = PCA()
pca.fit(x)

explained_variance_ratio = pca.explained_variance_ratio_
cumulative_explained_variance = np.cumsum(explained_variance_ratio)

plt.figure()
plt.plot(range(1, len(explained_variance_ratio) + 1), cumulative_explained_variance, marker='o')
plt.xlabel('Nombre de composantes principales')
plt.ylabel('Taux de la variance expliquée')
plt.show()
print("Nombre CP qu'on a besoin : ", np.argmax(cumulative_explained_variance >= 0.70) + 1, " (plus que 70% d'explication de variabilité)")

Nombre CP qu'on a besoin :  1  (plus que 70% d'explication de variabilité)

scree = (pca.explained_variance_ratio_*100).round(2)
scree_cum = scree.cumsum().round()
x_list = range(1, 5)
list(x_list)
plt.bar(x_list, scree)
plt.plot(x_list, scree_cum,c="red",marker='o')
plt.xlabel("rang de l'axe d'inertie")
plt.ylabel("pourcentage d'inertie")
plt.title("Eboulis des valeurs propres")
plt.show(block=False)

t, r = 0,1
fig, ax = plt.subplots(figsize=(10, 9))
for i in range(0, pca.components_.shape[1]):
    ax.arrow(0,
             0,  # Start the arrow at the origin
             pca.components_[0, i],  #0 for PC1
             pca.components_[1, i],  #1 for PC2
             
             head_width=0.07,
             head_length=0.07, 
             width=0.02,              )

    plt.text(pca.components_[0, i] + 0.05,
             pca.components_[1, i] + 0.05,
             BC[i])
    
# affichage des lignes horizontales et verticales
plt.plot([-1, 1], [0, 0], color='grey', ls='--')
plt.plot([0, 0], [-1, 1], color='grey', ls='--')


# nom des axes, avec le pourcentage d'inertie expliqué
plt.xlabel('F{} ({}%)'.format(t+1, round(100*pca.explained_variance_ratio_[t],1)))
plt.ylabel('F{} ({}%)'.format(r+1, round(100*pca.explained_variance_ratio_[r],1)))

plt.title("Cercle des corrélations (CP{} et CP{})".format(t+1, r+1))


an = np.linspace(0, 2 * np.pi, 100)
plt.plot(np.cos(an), np.sin(an))  # Add a unit circle for scale
plt.axis('equal')
plt.show(block=False)

pca = PCA(n_components=2)
X_pca = pca.fit_transform(billets_tous_linear[BC])

df_X_pca=pd.DataFrame(X_pca, index=X.index)

df_X_pca['is_genuine']=X['is_genuine']

sum(df_X_pca['is_genuine']==X['is_genuine'])

1500

df_X_pca

x_pca = df_X_pca.drop('is_genuine', axis=1)
y = df_X_pca['is_genuine']
X_train_log, X_test_log, y_train_log, y_test_log = train_test_split(x_pca, y, test_size=0.20, random_state=42)

Logistique_Regression_model_acp = LogisticRegression()
Logistique_Regression_model_acp.fit(X_train_log, y_train_log)
y_pred_log = Logistique_Regression_model_acp.predict(X_test_log)

accuracy_score(y_test_log, y_pred_log)

0.9433333333333334

accuracy_score(df_X_pca['is_genuine'], Logistique_Regression_model_acp.predict(df_X_pca.drop('is_genuine', axis=1)))

0.9486666666666667

X_kmeans=billets_tous_linear
y = X['is_genuine']

y= billets_tous_linear['is_genuine']

X_train_kmeans, X_test_kmeans, y_train_kmeans, y_test_kmeans = train_test_split(x, y, test_size=0.15, random_state=72)

kmeans = KMeans(n_clusters=2) 
kmeans.fit(x)
y_pred_kmeans = kmeans.fit_predict(x)

x['is_genuine_kmeans']=kmeans.labels_

x['is_genuine']=y

x=x.replace({0:2})
x=x.replace({1:0})
x=x.replace({2:1})

y_pred_tous=kmeans.predict(x[[#'diagonal', 'height_left', 
                              'height_right', 'margin_low',
       'margin_up', 'length']])
X_kmeans['is_genuine_kmeans']=y_pred_tous

y_pred_kmeans

array([0, 0, 0, ..., 1, 1, 1], dtype=int32)

sum(X_kmeans['is_genuine']!=X_kmeans['is_genuine_kmeans'])

1474

accuracy_score(kmeans.labels_, y)

0.017333333333333333

accuracy_score(x['is_genuine_kmeans'], x['is_genuine'])

0.9826666666666667

(y_test_kmeans.shape, y_pred_kmeans.shape)

((225,), (1500,))

cm = confusion_matrix(x['is_genuine_kmeans'], x['is_genuine'])

print(cm)

[[476   2]
 [ 24 998]]

plt.matshow(cm, cmap=plt.cm.Blues)
plt.xlabel('Prediction')
plt.ylabel('Vrai')
plt.plot([-0.5, cm.shape[0]-0.5], [-0.5, cm.shape[1]-0.5], 'g-')  # línea diagonal roja
#plt.plot([-0.5, cm.shape[0]-0.5], [cm.shape[1]-0.5, -0.5], 'r-')  # línea diagonal verde

for i in range(cm.shape[0]):
    for j in range(cm.shape[1]):
        plt.text(x=j, y=i, s=cm[i, j], va='center', ha='center', fontsize=14, weight='bold')
plt.show()
#La diagonale vert représente les prédictions correctes.
#La croix (rouge) qui croise de la diagonale vert représente les prédictions incorrectes.
#Il y a 2 faux négatifs.
#Il y a 24 faux positifs.

#Il y a 998 vrai positifs
#Il y a 476 vrai négatifs

#[476 2]
#[24 998]

billetsP = pd.read_csv('billets_production.csv', sep=',')

billetsP

billetsP.head()

billetsP.info()
billetsP.describe(include='all')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   diagonal      5 non-null      float64
 1   height_left   5 non-null      float64
 2   height_right  5 non-null      float64
 3   margin_low    5 non-null      float64
 4   margin_up     5 non-null      float64
 5   length        5 non-null      float64
 6   id            5 non-null      object 
dtypes: float64(6), object(1)
memory usage: 408.0+ bytes

sns.pairplot(billetsP[['diagonal', 'height_left', 'height_right', 'margin_low', 'margin_up','length']])
#, hue='id')
plt.show()

scaler = StandardScaler()
XP = pd.DataFrame(scaler.fit_transform(billetsP[['diagonal', 'height_left', 'height_right', 'margin_low', 'margin_up',
       'length']]
                                      ), columns=['diagonal', 'height_left', 'height_right', 'margin_low', 'margin_up',
       'length'])

XP

sns.pairplot(XP[['diagonal', 'height_left', 'height_right', 'margin_low', 'margin_up',
       'length']])
plt.show()

billetsP['is_genuine']=Logistique_Regression_model.predict(XP.drop('diagonal', axis=1).drop('height_left', axis=1))

billetsP

kmeans.predict(XP.drop('diagonal', axis=1).drop('height_left', axis=1))

array([1, 1, 1, 1, 1], dtype=int32)

(XP.drop('diagonal', axis=1).drop('height_left', axis=1))

Logistique_Regression_model.predict(XP.drop('diagonal', axis=1).drop('height_left', axis=1))

array([False, False, False,  True,  True])

algoritme=pd.read_csv('billets_production.csv', sep=',')

scaler = StandardScaler()
TA = pd.DataFrame(scaler.fit_transform(algoritme[['diagonal', 'height_left', 'height_right', 'margin_low', 'margin_up',
       'length']]
                                      ), columns=['diagonal', 'height_left', 'height_right', 'margin_low', 'margin_up',
       'length'])

TA

Logistique_Regression_model.predict(TA.drop('diagonal', axis=1).drop('height_left', axis=1))

array([False, False, False,  True,  True])

	is_genuine	diagonal	height_left	height_right	margin_low	margin_up	length
count	1500	1500.000000	1500.000000	1500.000000	1463.000000	1500.000000	1500.00000
unique	2	NaN	NaN	NaN	NaN	NaN	NaN
top	True	NaN	NaN	NaN	NaN	NaN	NaN
freq	1000	NaN	NaN	NaN	NaN	NaN	NaN
mean	NaN	171.958440	104.029533	103.920307	4.485967	3.151473	112.67850
std	NaN	0.305195	0.299462	0.325627	0.663813	0.231813	0.87273
min	NaN	171.040000	103.140000	102.820000	2.980000	2.270000	109.49000
25%	NaN	171.750000	103.820000	103.710000	4.015000	2.990000	112.03000
50%	NaN	171.960000	104.040000	103.920000	4.310000	3.140000	112.96000
75%	NaN	172.170000	104.230000	104.150000	4.870000	3.310000	113.34000
max	NaN	173.010000	104.880000	104.950000	6.900000	3.910000	114.44000

	diagonal	height_left	height_right	margin_low	margin_up	length
count	37.000000	37.000000	37.000000	37.000000	37.000000	37.000000
mean	171.928649	103.958378	103.874054	4.359368	3.087838	112.826486
std	0.297145	0.288680	0.380631	0.470245	0.239782	0.851388
min	171.370000	103.260000	102.820000	3.993571	2.560000	110.310000
25%	171.800000	103.720000	103.620000	4.096960	2.990000	112.410000
50%	171.930000	103.950000	103.860000	4.125390	3.070000	112.950000
75%	172.070000	104.150000	104.120000	4.198423	3.230000	113.350000
max	172.660000	104.490000	104.660000	5.302069	3.620000	114.440000

	is_genuine	diagonal	height_left	height_right	margin_low	margin_up	length
count	1500.000000	1500.000000	1500.000000	1500.000000	1463.000000	1500.000000	1500.000000
mean	0.666667	0.466213	0.511226	0.516576	0.384175	0.537484	0.644141
std	0.471562	0.154921	0.172104	0.152877	0.169340	0.141349	0.176309
min	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000
25%	0.000000	0.360406	0.390805	0.417840	0.264031	0.439024	0.513131
50%	1.000000	0.467005	0.517241	0.516432	0.339286	0.530488	0.701010
75%	1.000000	0.573604	0.626437	0.624413	0.482143	0.634146	0.777778
max	1.000000	1.000000	1.000000	1.000000	1.000000	1.000000	1.000000

	0	1	is_genuine
0	0.113911	0.237710	True
1	-0.832936	-0.377681	True
2	-0.500740	0.167628	True
3	-1.146882	-0.216228	True
4	-0.118929	-0.460558	True
...	...	...	...
1495	1.624551	-0.148959	False
1496	0.779278	0.382925	False
1497	2.437424	-0.731457	False
1498	1.485415	-0.103298	False
1499	1.471789	-0.004096	False

	diagonal	height_left	height_right	margin_low	margin_up	length	id
0	171.76	104.01	103.54	5.21	3.30	111.42	A_1
1	171.87	104.17	104.13	6.00	3.31	112.09	A_2
2	172.00	104.58	104.29	4.99	3.39	111.57	A_3
3	172.49	104.55	104.34	4.44	3.03	113.20	A_4
4	171.65	103.63	103.56	3.77	3.16	113.33	A_5

IMPORT¶

HEAD, INFO, DESCRIBE¶

BILLETS AVEC NULLS¶

EDA¶

HISTOGRAMS¶

BOXPLOTS¶

CORRELATIONS¶

VIF¶

PAIRPLOT¶

IMPUTER¶

KNNImputer¶

REGRESION LINEARE¶

METRIQUES REGRESION LINEARE¶

KNNImputer vs REGRESION LINEARE:¶

DIVISION DES DONNÉES EN VALEURS (x) ET SI EST GENUINE OU PAS (y)¶

DIVISION DES DONNES EN DONNES POUR ENTREINER ET DE TEST¶

MODELE REGRESION LOGISTIQUE¶

METRIQUES¶

COURBE ROC¶

MATRICE DE CONFUSION¶

SUPRESSION VARIABLES PAS SIGNIFICATIVES¶

ACP¶

Nombre de ACP¶

K-Means¶

IMPORT Billets Production¶

DONNEES StandardScale¶

PREDICTION billets_production.csv¶

Kmeans billets_production¶

TEST DE L’ALGORITHME¶

	is_genuine	diagonal	height_left	height_right	margin_low	margin_up	length
0	True	171.81	104.86	104.95	4.52	2.89	112.83
1	True	171.46	103.36	103.66	3.77	2.99	113.09
2	True	172.69	104.48	103.50	4.40	2.94	113.16
3	True	171.36	103.91	103.94	3.62	3.01	113.51
4	True	171.73	104.28	103.46	4.04	3.48	112.54
...	...	...	...	...	...	...	...
1495	False	171.75	104.38	104.17	4.42	3.09	111.28
1496	False	172.19	104.63	104.44	5.27	3.37	110.97
1497	False	171.80	104.01	104.12	5.51	3.36	111.95
1498	False	172.06	104.28	104.06	5.17	3.46	112.25
1499	False	171.47	104.15	103.82	4.63	3.37	112.07

	is_genuine	diagonal	height_left	height_right	margin_low	margin_up	length
72	True	171.94	103.89	103.45	NaN	3.25	112.79
99	True	171.93	104.07	104.18	NaN	3.14	113.08
151	True	172.07	103.80	104.38	NaN	3.02	112.93
197	True	171.45	103.66	103.80	NaN	3.62	113.27
241	True	171.83	104.14	104.06	NaN	3.02	112.36
251	True	171.80	103.26	102.82	NaN	2.95	113.22
284	True	171.92	103.83	103.76	NaN	3.23	113.29
334	True	171.85	103.70	103.96	NaN	3.00	113.36
410	True	172.56	103.72	103.51	NaN	3.12	112.95
413	True	172.30	103.66	103.50	NaN	3.16	112.95
445	True	172.34	104.42	103.22	NaN	3.01	112.97
481	True	171.81	103.53	103.96	NaN	2.71	113.99
505	True	172.01	103.97	104.05	NaN	2.98	113.65
611	True	171.80	103.68	103.49	NaN	3.30	112.84
654	True	171.97	103.69	103.54	NaN	2.70	112.79
675	True	171.60	103.85	103.91	NaN	2.56	113.27
710	True	172.03	103.97	103.86	NaN	3.07	112.65
739	True	172.07	103.74	103.76	NaN	3.09	112.41
742	True	172.14	104.06	103.96	NaN	3.24	113.07
780	True	172.41	103.95	103.79	NaN	3.13	113.41
798	True	171.96	103.84	103.62	NaN	3.01	114.44
844	True	171.62	104.14	104.49	NaN	2.99	113.35
845	True	172.02	104.21	104.05	NaN	2.90	113.62
871	True	171.37	104.07	103.75	NaN	3.07	113.27
895	True	171.81	103.68	103.80	NaN	2.98	113.82
919	True	171.92	103.68	103.45	NaN	2.58	113.68
945	True	172.09	103.74	103.52	NaN	3.02	112.78
946	True	171.63	103.87	104.66	NaN	3.27	112.68
981	True	172.02	104.23	103.72	NaN	2.99	113.37
1076	False	171.57	104.27	104.44	NaN	3.21	111.87
1121	False	171.40	104.38	104.19	NaN	3.17	112.39
1176	False	171.59	104.05	103.94	NaN	3.02	111.29
1303	False	172.17	104.49	103.76	NaN	2.93	111.21
1315	False	172.08	104.15	104.17	NaN	3.40	112.29
1347	False	171.72	104.46	104.12	NaN	3.61	110.31
1435	False	172.66	104.33	104.41	NaN	3.56	111.47
1438	False	171.90	104.28	104.29	NaN	3.24	111.49

	is_genuine	diagonal	height_left	height_right	margin_low	margin_up	length
72	0.0	0.0	0.0	0.0	-0.018954	0.0	0.0
99	0.0	0.0	0.0	0.0	0.166010	0.0	0.0
151	0.0	0.0	0.0	0.0	0.079997	0.0	0.0
197	0.0	0.0	0.0	0.0	0.112429	0.0	0.0
241	0.0	0.0	0.0	0.0	0.095601	0.0	0.0
251	0.0	0.0	0.0	0.0	0.253716	0.0	0.0
284	0.0	0.0	0.0	0.0	0.003876	0.0	0.0
334	0.0	0.0	0.0	0.0	0.210610	0.0	0.0
410	0.0	0.0	0.0	0.0	0.091272	0.0	0.0
413	0.0	0.0	0.0	0.0	-0.057633	0.0	0.0
445	0.0	0.0	0.0	0.0	0.027027	0.0	0.0
481	0.0	0.0	0.0	0.0	0.113620	0.0	0.0
505	0.0	0.0	0.0	0.0	-0.118484	0.0	0.0
611	0.0	0.0	0.0	0.0	-0.069068	0.0	0.0
654	0.0	0.0	0.0	0.0	-0.180377	0.0	0.0
675	0.0	0.0	0.0	0.0	-0.073551	0.0	0.0
710	0.0	0.0	0.0	0.0	-0.131868	0.0	0.0
739	0.0	0.0	0.0	0.0	0.085159	0.0	0.0
742	0.0	0.0	0.0	0.0	-0.043843	0.0	0.0
780	0.0	0.0	0.0	0.0	0.191238	0.0	0.0
798	0.0	0.0	0.0	0.0	0.171498	0.0	0.0
844	0.0	0.0	0.0	0.0	0.160824	0.0	0.0
845	0.0	0.0	0.0	0.0	0.087712	0.0	0.0
871	0.0	0.0	0.0	0.0	-0.003938	0.0	0.0
895	0.0	0.0	0.0	0.0	0.126464	0.0	0.0
919	0.0	0.0	0.0	0.0	0.059577	0.0	0.0
945	0.0	0.0	0.0	0.0	0.016377	0.0	0.0
946	0.0	0.0	0.0	0.0	-0.082960	0.0	0.0
981	0.0	0.0	0.0	0.0	0.236159	0.0	0.0
1076	0.0	0.0	0.0	0.0	0.018315	0.0	0.0
1121	0.0	0.0	0.0	0.0	0.053183	0.0	0.0
1176	0.0	0.0	0.0	0.0	0.061481	0.0	0.0
1303	0.0	0.0	0.0	0.0	0.093931	0.0	0.0
1315	0.0	0.0	0.0	0.0	0.025642	0.0	0.0
1347	0.0	0.0	0.0	0.0	-0.099468	0.0	0.0
1435	0.0	0.0	0.0	0.0	-0.187450	0.0	0.0
1438	0.0	0.0	0.0	0.0	0.225249	0.0	0.0

	diagonal	height_left	height_right	margin_low	margin_up	length	id
count	5.000000	5.000000	5.000000	5.000000	5.000000	5.000000	5
unique	NaN	NaN	NaN	NaN	NaN	NaN	5
top	NaN	NaN	NaN	NaN	NaN	NaN	A_1
freq	NaN	NaN	NaN	NaN	NaN	NaN	1
mean	171.954000	104.188000	103.972000	4.882000	3.238000	112.322000	NaN
std	0.326542	0.396257	0.393027	0.836821	0.142724	0.897201	NaN
min	171.650000	103.630000	103.540000	3.770000	3.030000	111.420000	NaN
25%	171.760000	104.010000	103.560000	4.440000	3.160000	111.570000	NaN
50%	171.870000	104.170000	104.130000	4.990000	3.300000	112.090000	NaN
75%	172.000000	104.550000	104.290000	5.210000	3.310000	113.200000	NaN
max	172.490000	104.580000	104.340000	6.000000	3.390000	113.330000	NaN

	diagonal	height_left	height_right	margin_low	margin_up	length
0	-0.664228	-0.502224	-1.228900	0.438224	0.485681	-1.124014
1	-0.287604	-0.050787	0.449459	1.493702	0.564017	-0.289103
2	0.157497	1.106022	0.904607	0.144293	1.190702	-0.937094
3	1.835186	1.021377	1.046841	-0.590533	-1.629382	1.094107
4	-1.040852	-1.574388	-1.172007	-1.485686	-0.611018	1.256104