IMPORT¶

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression as LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report,r2_score, mean_absolute_error,roc_curve, roc_auc_score
from sklearn.impute import KNNImputer
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.linear_model import LinearRegression
from scipy.stats import mode
from sklearn.impute import SimpleImputer
from statsmodels.stats.outliers_influence import variance_inflation_factor
import statsmodels.api as sm
In [2]:
billets = pd.read_csv('billets.csv', sep=';')
In [3]:
billets
Out[3]:
is_genuine diagonal height_left height_right margin_low margin_up length
0 True 171.81 104.86 104.95 4.52 2.89 112.83
1 True 171.46 103.36 103.66 3.77 2.99 113.09
2 True 172.69 104.48 103.50 4.40 2.94 113.16
3 True 171.36 103.91 103.94 3.62 3.01 113.51
4 True 171.73 104.28 103.46 4.04 3.48 112.54
... ... ... ... ... ... ... ...
1495 False 171.75 104.38 104.17 4.42 3.09 111.28
1496 False 172.19 104.63 104.44 5.27 3.37 110.97
1497 False 171.80 104.01 104.12 5.51 3.36 111.95
1498 False 172.06 104.28 104.06 5.17 3.46 112.25
1499 False 171.47 104.15 103.82 4.63 3.37 112.07

1500 rows × 7 columns

HEAD, INFO, DESCRIBE¶

In [4]:
billets.head() # afficher les premières lignes du tableau de données
Out[4]:
is_genuine diagonal height_left height_right margin_low margin_up length
0 True 171.81 104.86 104.95 4.52 2.89 112.83
1 True 171.46 103.36 103.66 3.77 2.99 113.09
2 True 172.69 104.48 103.50 4.40 2.94 113.16
3 True 171.36 103.91 103.94 3.62 3.01 113.51
4 True 171.73 104.28 103.46 4.04 3.48 112.54
In [5]:
billets.info() # afficher des informations sur les données
billets.describe(include='all') # afficher des statistiques descriptives sur les données
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1500 entries, 0 to 1499
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   is_genuine    1500 non-null   bool   
 1   diagonal      1500 non-null   float64
 2   height_left   1500 non-null   float64
 3   height_right  1500 non-null   float64
 4   margin_low    1463 non-null   float64
 5   margin_up     1500 non-null   float64
 6   length        1500 non-null   float64
dtypes: bool(1), float64(6)
memory usage: 71.9 KB
Out[5]:
is_genuine diagonal height_left height_right margin_low margin_up length
count 1500 1500.000000 1500.000000 1500.000000 1463.000000 1500.000000 1500.00000
unique 2 NaN NaN NaN NaN NaN NaN
top True NaN NaN NaN NaN NaN NaN
freq 1000 NaN NaN NaN NaN NaN NaN
mean NaN 171.958440 104.029533 103.920307 4.485967 3.151473 112.67850
std NaN 0.305195 0.299462 0.325627 0.663813 0.231813 0.87273
min NaN 171.040000 103.140000 102.820000 2.980000 2.270000 109.49000
25% NaN 171.750000 103.820000 103.710000 4.015000 2.990000 112.03000
50% NaN 171.960000 104.040000 103.920000 4.310000 3.140000 112.96000
75% NaN 172.170000 104.230000 104.150000 4.870000 3.310000 113.34000
max NaN 173.010000 104.880000 104.950000 6.900000 3.910000 114.44000
In [6]:
print("Il y a :\n\n\n", billets.isnull().sum() , '\n\n\nvaleurs nulls')
Il y a :


 is_genuine       0
diagonal         0
height_left      0
height_right     0
margin_low      37
margin_up        0
length           0
dtype: int64 


valeurs nulls
In [7]:
#nulls = billets.loc[billets['margin_low'].isnull()]
print(billets.loc[billets['margin_low'].isnull()])
      is_genuine  diagonal  height_left  height_right  margin_low  margin_up  \
72          True    171.94       103.89        103.45         NaN       3.25   
99          True    171.93       104.07        104.18         NaN       3.14   
151         True    172.07       103.80        104.38         NaN       3.02   
197         True    171.45       103.66        103.80         NaN       3.62   
241         True    171.83       104.14        104.06         NaN       3.02   
251         True    171.80       103.26        102.82         NaN       2.95   
284         True    171.92       103.83        103.76         NaN       3.23   
334         True    171.85       103.70        103.96         NaN       3.00   
410         True    172.56       103.72        103.51         NaN       3.12   
413         True    172.30       103.66        103.50         NaN       3.16   
445         True    172.34       104.42        103.22         NaN       3.01   
481         True    171.81       103.53        103.96         NaN       2.71   
505         True    172.01       103.97        104.05         NaN       2.98   
611         True    171.80       103.68        103.49         NaN       3.30   
654         True    171.97       103.69        103.54         NaN       2.70   
675         True    171.60       103.85        103.91         NaN       2.56   
710         True    172.03       103.97        103.86         NaN       3.07   
739         True    172.07       103.74        103.76         NaN       3.09   
742         True    172.14       104.06        103.96         NaN       3.24   
780         True    172.41       103.95        103.79         NaN       3.13   
798         True    171.96       103.84        103.62         NaN       3.01   
844         True    171.62       104.14        104.49         NaN       2.99   
845         True    172.02       104.21        104.05         NaN       2.90   
871         True    171.37       104.07        103.75         NaN       3.07   
895         True    171.81       103.68        103.80         NaN       2.98   
919         True    171.92       103.68        103.45         NaN       2.58   
945         True    172.09       103.74        103.52         NaN       3.02   
946         True    171.63       103.87        104.66         NaN       3.27   
981         True    172.02       104.23        103.72         NaN       2.99   
1076       False    171.57       104.27        104.44         NaN       3.21   
1121       False    171.40       104.38        104.19         NaN       3.17   
1176       False    171.59       104.05        103.94         NaN       3.02   
1303       False    172.17       104.49        103.76         NaN       2.93   
1315       False    172.08       104.15        104.17         NaN       3.40   
1347       False    171.72       104.46        104.12         NaN       3.61   
1435       False    172.66       104.33        104.41         NaN       3.56   
1438       False    171.90       104.28        104.29         NaN       3.24   

      length  
72    112.79  
99    113.08  
151   112.93  
197   113.27  
241   112.36  
251   113.22  
284   113.29  
334   113.36  
410   112.95  
413   112.95  
445   112.97  
481   113.99  
505   113.65  
611   112.84  
654   112.79  
675   113.27  
710   112.65  
739   112.41  
742   113.07  
780   113.41  
798   114.44  
844   113.35  
845   113.62  
871   113.27  
895   113.82  
919   113.68  
945   112.78  
946   112.68  
981   113.37  
1076  111.87  
1121  112.39  
1176  111.29  
1303  111.21  
1315  112.29  
1347  110.31  
1435  111.47  
1438  111.49  

BILLETS AVEC NULLS¶

In [8]:
billets_avec_nulls = billets.loc[billets['margin_low'].isna()]
billets_tous=billets
billets = billets.dropna(subset=['margin_low'])
billets_avec_nulls
Out[8]:
is_genuine diagonal height_left height_right margin_low margin_up length
72 True 171.94 103.89 103.45 NaN 3.25 112.79
99 True 171.93 104.07 104.18 NaN 3.14 113.08
151 True 172.07 103.80 104.38 NaN 3.02 112.93
197 True 171.45 103.66 103.80 NaN 3.62 113.27
241 True 171.83 104.14 104.06 NaN 3.02 112.36
251 True 171.80 103.26 102.82 NaN 2.95 113.22
284 True 171.92 103.83 103.76 NaN 3.23 113.29
334 True 171.85 103.70 103.96 NaN 3.00 113.36
410 True 172.56 103.72 103.51 NaN 3.12 112.95
413 True 172.30 103.66 103.50 NaN 3.16 112.95
445 True 172.34 104.42 103.22 NaN 3.01 112.97
481 True 171.81 103.53 103.96 NaN 2.71 113.99
505 True 172.01 103.97 104.05 NaN 2.98 113.65
611 True 171.80 103.68 103.49 NaN 3.30 112.84
654 True 171.97 103.69 103.54 NaN 2.70 112.79
675 True 171.60 103.85 103.91 NaN 2.56 113.27
710 True 172.03 103.97 103.86 NaN 3.07 112.65
739 True 172.07 103.74 103.76 NaN 3.09 112.41
742 True 172.14 104.06 103.96 NaN 3.24 113.07
780 True 172.41 103.95 103.79 NaN 3.13 113.41
798 True 171.96 103.84 103.62 NaN 3.01 114.44
844 True 171.62 104.14 104.49 NaN 2.99 113.35
845 True 172.02 104.21 104.05 NaN 2.90 113.62
871 True 171.37 104.07 103.75 NaN 3.07 113.27
895 True 171.81 103.68 103.80 NaN 2.98 113.82
919 True 171.92 103.68 103.45 NaN 2.58 113.68
945 True 172.09 103.74 103.52 NaN 3.02 112.78
946 True 171.63 103.87 104.66 NaN 3.27 112.68
981 True 172.02 104.23 103.72 NaN 2.99 113.37
1076 False 171.57 104.27 104.44 NaN 3.21 111.87
1121 False 171.40 104.38 104.19 NaN 3.17 112.39
1176 False 171.59 104.05 103.94 NaN 3.02 111.29
1303 False 172.17 104.49 103.76 NaN 2.93 111.21
1315 False 172.08 104.15 104.17 NaN 3.40 112.29
1347 False 171.72 104.46 104.12 NaN 3.61 110.31
1435 False 172.66 104.33 104.41 NaN 3.56 111.47
1438 False 171.90 104.28 104.29 NaN 3.24 111.49
In [9]:
BC=billets.columns
BC=['diagonal', 'height_left', 'height_right', 'margin_low',
       'margin_up', 'length']
In [10]:
billets=billets.copy()
#D['is_genuine']=D['is_genuine'].astype(int)
billets.loc[:, 'is_genuine'] = billets['is_genuine'].astype(int)

EDA¶

HISTOGRAMS¶

In [11]:
for column in BC:
    plt.title(column, fontsize=14)
    for value in billets['is_genuine'].unique():
        billets.loc[billets['is_genuine']==value, column].dropna().hist(alpha=0.5, label=value)
    plt.xlabel("Valeur")
    plt.ylabel("Frequence")
    plt.legend(title='is_genuine')
    plt.show()
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image

BOXPLOTS¶

In [12]:
for column in BC:
    #plt.title(column, fontsize=14)
    billets.boxplot(column=column, by='is_genuine')
    plt.xlabel("is_genuine")
    plt.ylabel(column)
    plt.show()
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
In [13]:
billets.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1463 entries, 0 to 1499
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   is_genuine    1463 non-null   int64  
 1   diagonal      1463 non-null   float64
 2   height_left   1463 non-null   float64
 3   height_right  1463 non-null   float64
 4   margin_low    1463 non-null   float64
 5   margin_up     1463 non-null   float64
 6   length        1463 non-null   float64
dtypes: float64(6), int64(1)
memory usage: 91.4 KB
In [14]:
#scaler = StandardScaler()
scaler = MinMaxScaler()
X = pd.DataFrame(scaler.fit_transform(billets), columns=billets.columns)
In [15]:
XT = pd.DataFrame(scaler.fit_transform(billets_tous), columns=billets_tous.columns)

CORRELATIONS¶

In [16]:
corr_matrix = billets.corr(method='pearson')
corr_matrix=round (corr_matrix, 3)
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(10, 10))
sns.heatmap(corr_matrix, annot     = True, cbar      = False,annot_kws = {"size": 10}, vmin      = -1, vmax      = 1,center    = 0, cmap      = sns.diverging_palette(20, 220, n=200),square    = True, ax        = ax ),ax.set_xticklabels(ax.get_xticklabels(), rotation = 45, horizontalalignment = 'right', ),ax.tick_params(labelsize = 10)
plt.show()
No description has been provided for this image
In [17]:
corr_matrix = billets[billets['is_genuine']==0].corr(method='pearson')
corr_matrix=round (corr_matrix, 3)
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(10, 10))
sns.heatmap(corr_matrix, annot     = True, cbar      = False,annot_kws = {"size": 10}, vmin      = -1, vmax      = 1,center    = 0, cmap      = sns.diverging_palette(20, 220, n=200),square    = True, ax        = ax ),ax.set_xticklabels(ax.get_xticklabels(), rotation = 45, horizontalalignment = 'right', ),ax.tick_params(labelsize = 10)
plt.show()
No description has been provided for this image
In [18]:
corr_matrix = billets[billets['is_genuine']==1].corr(method='pearson')
corr_matrix=round (corr_matrix, 3)
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(10, 10))
sns.heatmap(corr_matrix, annot     = True, cbar      = False,annot_kws = {"size": 10}, vmin      = -1, vmax      = 1,center    = 0, cmap      = sns.diverging_palette(20, 220, n=200),square    = True, ax        = ax ),ax.set_xticklabels(ax.get_xticklabels(), rotation = 45, horizontalalignment = 'right', ),ax.tick_params(labelsize = 10)
plt.show()
No description has been provided for this image

VIF¶

In [19]:
# VIF dataframe
vif = pd.DataFrame()
vif["valeur"] = pd.DataFrame(X[BC].columns)
  
# calculer VIF pour chaque valeur
vif["VIF"] = [variance_inflation_factor(X[BC].values, i)
                          for i in range(len(X[BC].columns))]
  
print(vif)
         valeur        VIF
0      diagonal   9.571588
1   height_left  10.693729
2  height_right  12.474296
3    margin_low   9.510445
4     margin_up  16.641133
5        length   8.715947

PAIRPLOT¶

In [20]:
#DF X est normalise et is_genuine 1 true et 0 false
sns.pairplot(XT, hue='is_genuine', diag_kind="hist",kind="kde",palette="Set2",
             height=1.4)
plt.show()
No description has been provided for this image

IMPUTER¶

In [21]:
billets_tous.iloc[billets_avec_nulls.index.to_list()]
Out[21]:
is_genuine diagonal height_left height_right margin_low margin_up length
72 True 171.94 103.89 103.45 NaN 3.25 112.79
99 True 171.93 104.07 104.18 NaN 3.14 113.08
151 True 172.07 103.80 104.38 NaN 3.02 112.93
197 True 171.45 103.66 103.80 NaN 3.62 113.27
241 True 171.83 104.14 104.06 NaN 3.02 112.36
251 True 171.80 103.26 102.82 NaN 2.95 113.22
284 True 171.92 103.83 103.76 NaN 3.23 113.29
334 True 171.85 103.70 103.96 NaN 3.00 113.36
410 True 172.56 103.72 103.51 NaN 3.12 112.95
413 True 172.30 103.66 103.50 NaN 3.16 112.95
445 True 172.34 104.42 103.22 NaN 3.01 112.97
481 True 171.81 103.53 103.96 NaN 2.71 113.99
505 True 172.01 103.97 104.05 NaN 2.98 113.65
611 True 171.80 103.68 103.49 NaN 3.30 112.84
654 True 171.97 103.69 103.54 NaN 2.70 112.79
675 True 171.60 103.85 103.91 NaN 2.56 113.27
710 True 172.03 103.97 103.86 NaN 3.07 112.65
739 True 172.07 103.74 103.76 NaN 3.09 112.41
742 True 172.14 104.06 103.96 NaN 3.24 113.07
780 True 172.41 103.95 103.79 NaN 3.13 113.41
798 True 171.96 103.84 103.62 NaN 3.01 114.44
844 True 171.62 104.14 104.49 NaN 2.99 113.35
845 True 172.02 104.21 104.05 NaN 2.90 113.62
871 True 171.37 104.07 103.75 NaN 3.07 113.27
895 True 171.81 103.68 103.80 NaN 2.98 113.82
919 True 171.92 103.68 103.45 NaN 2.58 113.68
945 True 172.09 103.74 103.52 NaN 3.02 112.78
946 True 171.63 103.87 104.66 NaN 3.27 112.68
981 True 172.02 104.23 103.72 NaN 2.99 113.37
1076 False 171.57 104.27 104.44 NaN 3.21 111.87
1121 False 171.40 104.38 104.19 NaN 3.17 112.39
1176 False 171.59 104.05 103.94 NaN 3.02 111.29
1303 False 172.17 104.49 103.76 NaN 2.93 111.21
1315 False 172.08 104.15 104.17 NaN 3.40 112.29
1347 False 171.72 104.46 104.12 NaN 3.61 110.31
1435 False 172.66 104.33 104.41 NaN 3.56 111.47
1438 False 171.90 104.28 104.29 NaN 3.24 111.49

KNNImputer¶

In [22]:
imputer = KNNImputer(n_neighbors=5) 
billets_tous_knnimputer = pd.DataFrame(imputer.fit_transform(billets_tous), columns=billets_tous.columns)

REGRESION LINEARE¶

In [23]:
# On separe
donnees_avec_margin_low = billets_tous[billets_tous['margin_low'].notnull()]
donnees_sans_margin_low = billets_tous[billets_tous['margin_low'].isnull()]

# Train test split
X_train_lin, X_test_lin, y_train_lin, y_test_lin = train_test_split(donnees_avec_margin_low.drop('margin_low', axis=1)
                                                                    , donnees_avec_margin_low['margin_low'], test_size=0.15, random_state=42)
In [24]:
X_test_lin.shape, X_train_lin.shape ,y_train_lin.shape, y_test_lin.shape
Out[24]:
((220, 6), (1243, 6), (1243,), (220,))
In [25]:
lmodel=LinearRegression()
In [26]:
lmodel.fit(X_train_lin, y_train_lin)
Out[26]:
LinearRegression()
In [27]:
y_pred=lmodel.predict(X_test_lin)
In [28]:
(y_pred-y_test_lin).hist()
Out[28]:
<AxesSubplot:>
No description has been provided for this image

METRIQUES REGRESION LINEARE¶

In [29]:
X = donnees_avec_margin_low.drop('margin_low', axis=1)
X= X.drop('is_genuine', axis=1)
y = donnees_avec_margin_low['margin_low']
X = sm.add_constant(X) 
model = sm.OLS(y,X).fit()
print(model.summary())
                            OLS Regression Results                            
==============================================================================
Dep. Variable:             margin_low   R-squared:                       0.477
Model:                            OLS   Adj. R-squared:                  0.476
Method:                 Least Squares   F-statistic:                     266.1
Date:                Tue, 16 May 2023   Prob (F-statistic):          2.60e-202
Time:                        12:18:22   Log-Likelihood:                -1001.3
No. Observations:                1463   AIC:                             2015.
Df Residuals:                    1457   BIC:                             2046.
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
================================================================================
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
const           22.9948      9.656      2.382      0.017       4.055      41.935
diagonal        -0.1111      0.041     -2.680      0.007      -0.192      -0.030
height_left      0.1841      0.045      4.113      0.000       0.096       0.272
height_right     0.2571      0.043      5.978      0.000       0.173       0.342
margin_up        0.2562      0.064      3.980      0.000       0.130       0.382
length          -0.4091      0.018    -22.627      0.000      -0.445      -0.374
==============================================================================
Omnibus:                       73.627   Durbin-Watson:                   1.893
Prob(Omnibus):                  0.000   Jarque-Bera (JB):               95.862
Skew:                           0.482   Prob(JB):                     1.53e-21
Kurtosis:                       3.801   Cond. No.                     1.94e+05
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.94e+05. This might indicate that there are
strong multicollinearity or other numerical problems.
In [30]:
r2 = r2_score(y, 
              lmodel.predict(donnees_avec_margin_low.drop('margin_low', axis=1)))
mae = mean_absolute_error(y, lmodel.predict(donnees_avec_margin_low.drop('margin_low', axis=1)))
print("R2:", r2)
print("MAE:", mae)
R2: 0.6166656453942228
MAE: 0.3160336084059514
In [31]:
# Entreiner model
X_train_margin_low = donnees_avec_margin_low.drop('margin_low', axis=1)
y_train_margin_low = donnees_avec_margin_low['margin_low']
model_margin_low = LinearRegression()
model_margin_low.fit(X_train_margin_low, y_train_margin_low)
# VALIDER MODEL
y_pred=model_margin_low.predict(X_test_lin)
#y_test-y_pred
In [32]:
# Entreiner model
X_train_margin_low = donnees_avec_margin_low.drop('margin_low', axis=1)
y_train_margin_low = donnees_avec_margin_low['margin_low']
model_margin_low = LinearRegression()
model_margin_low.fit(X_train_margin_low, y_train_margin_low)

# Prediction donnes sans margin
X_test_margin_low = donnees_sans_margin_low.drop('margin_low', axis=1)
donnees_sans_margin_low['margin_low'] = model_margin_low.predict(X_test_margin_low)

# merge/concat
billets_tous_linear = pd.concat([donnees_avec_margin_low, donnees_sans_margin_low])
/var/folders/pd/z7nn_cm91pg9fjmhgsnqvvd80000gn/T/ipykernel_24189/1793560917.py:9: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  donnees_sans_margin_low['margin_low'] = model_margin_low.predict(X_test_margin_low)
In [33]:
donnees_sans_margin_low.describe()
Out[33]:
diagonal height_left height_right margin_low margin_up length
count 37.000000 37.000000 37.000000 37.000000 37.000000 37.000000
mean 171.928649 103.958378 103.874054 4.359368 3.087838 112.826486
std 0.297145 0.288680 0.380631 0.470245 0.239782 0.851388
min 171.370000 103.260000 102.820000 3.993571 2.560000 110.310000
25% 171.800000 103.720000 103.620000 4.096960 2.990000 112.410000
50% 171.930000 103.950000 103.860000 4.125390 3.070000 112.950000
75% 172.070000 104.150000 104.120000 4.198423 3.230000 113.350000
max 172.660000 104.490000 104.660000 5.302069 3.620000 114.440000

KNNImputer vs REGRESION LINEARE:¶

In [34]:
billets_tous_knnimputer.iloc[billets_avec_nulls.index.to_list()]-donnees_sans_margin_low
Out[34]:
is_genuine diagonal height_left height_right margin_low margin_up length
72 0.0 0.0 0.0 0.0 -0.018954 0.0 0.0
99 0.0 0.0 0.0 0.0 0.166010 0.0 0.0
151 0.0 0.0 0.0 0.0 0.079997 0.0 0.0
197 0.0 0.0 0.0 0.0 0.112429 0.0 0.0
241 0.0 0.0 0.0 0.0 0.095601 0.0 0.0
251 0.0 0.0 0.0 0.0 0.253716 0.0 0.0
284 0.0 0.0 0.0 0.0 0.003876 0.0 0.0
334 0.0 0.0 0.0 0.0 0.210610 0.0 0.0
410 0.0 0.0 0.0 0.0 0.091272 0.0 0.0
413 0.0 0.0 0.0 0.0 -0.057633 0.0 0.0
445 0.0 0.0 0.0 0.0 0.027027 0.0 0.0
481 0.0 0.0 0.0 0.0 0.113620 0.0 0.0
505 0.0 0.0 0.0 0.0 -0.118484 0.0 0.0
611 0.0 0.0 0.0 0.0 -0.069068 0.0 0.0
654 0.0 0.0 0.0 0.0 -0.180377 0.0 0.0
675 0.0 0.0 0.0 0.0 -0.073551 0.0 0.0
710 0.0 0.0 0.0 0.0 -0.131868 0.0 0.0
739 0.0 0.0 0.0 0.0 0.085159 0.0 0.0
742 0.0 0.0 0.0 0.0 -0.043843 0.0 0.0
780 0.0 0.0 0.0 0.0 0.191238 0.0 0.0
798 0.0 0.0 0.0 0.0 0.171498 0.0 0.0
844 0.0 0.0 0.0 0.0 0.160824 0.0 0.0
845 0.0 0.0 0.0 0.0 0.087712 0.0 0.0
871 0.0 0.0 0.0 0.0 -0.003938 0.0 0.0
895 0.0 0.0 0.0 0.0 0.126464 0.0 0.0
919 0.0 0.0 0.0 0.0 0.059577 0.0 0.0
945 0.0 0.0 0.0 0.0 0.016377 0.0 0.0
946 0.0 0.0 0.0 0.0 -0.082960 0.0 0.0
981 0.0 0.0 0.0 0.0 0.236159 0.0 0.0
1076 0.0 0.0 0.0 0.0 0.018315 0.0 0.0
1121 0.0 0.0 0.0 0.0 0.053183 0.0 0.0
1176 0.0 0.0 0.0 0.0 0.061481 0.0 0.0
1303 0.0 0.0 0.0 0.0 0.093931 0.0 0.0
1315 0.0 0.0 0.0 0.0 0.025642 0.0 0.0
1347 0.0 0.0 0.0 0.0 -0.099468 0.0 0.0
1435 0.0 0.0 0.0 0.0 -0.187450 0.0 0.0
1438 0.0 0.0 0.0 0.0 0.225249 0.0 0.0
In [35]:
fig, ax = plt.subplots()

billets_tous_knnimputer[['margin_low']].iloc[billets_avec_nulls.index.to_list()].boxplot(ax=ax,showmeans=True,positions=[0])
donnees_sans_margin_low[['margin_low']].boxplot(ax=ax, showmeans=True, positions=[1])
ax.set_xticklabels(['Knnimputer', 'Regresion lineare'])
ax.set_ylabel('margin_low')
ax.set_title('Comparacion des distribuctons de margin_low avec differents methodes')
Out[35]:
Text(0.5, 1.0, 'Comparacion des distribuctons de margin_low avec differents methodes')
No description has been provided for this image
In [36]:
scaler = StandardScaler()
X = pd.DataFrame(scaler.fit_transform(billets_tous_linear), columns=billets_tous_linear.columns)
In [37]:
X['is_genuine']=billets_tous_linear['is_genuine']
In [38]:
XT.info() # afficher des informations sur les données
XT.describe(include='all') # afficher des statistiques descriptives sur les données
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1500 entries, 0 to 1499
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   is_genuine    1500 non-null   float64
 1   diagonal      1500 non-null   float64
 2   height_left   1500 non-null   float64
 3   height_right  1500 non-null   float64
 4   margin_low    1463 non-null   float64
 5   margin_up     1500 non-null   float64
 6   length        1500 non-null   float64
dtypes: float64(7)
memory usage: 82.2 KB
Out[38]:
is_genuine diagonal height_left height_right margin_low margin_up length
count 1500.000000 1500.000000 1500.000000 1500.000000 1463.000000 1500.000000 1500.000000
mean 0.666667 0.466213 0.511226 0.516576 0.384175 0.537484 0.644141
std 0.471562 0.154921 0.172104 0.152877 0.169340 0.141349 0.176309
min 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
25% 0.000000 0.360406 0.390805 0.417840 0.264031 0.439024 0.513131
50% 1.000000 0.467005 0.517241 0.516432 0.339286 0.530488 0.701010
75% 1.000000 0.573604 0.626437 0.624413 0.482143 0.634146 0.777778
max 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000

DIVISION DES DONNÉES EN VALEURS (x) ET SI EST GENUINE OU PAS (y)¶

In [39]:
#est-ce une valeur continue (un nombre) ou bien une valeur discrète (une catégorie) ? 
#Le premier cas est appelé une régression, le second une classification  
#Depuis openclassrooms, Initiez-vous au Machine Learning 

#alors je fais regression...
In [40]:
x = billets_tous_linear.drop('is_genuine', axis=1)
y = billets_tous_linear['is_genuine']

DIVISION DES DONNES EN DONNES POUR ENTREINER ET DE TEST¶

In [41]:
X_train_log, X_test_log, y_train_log, y_test_log = train_test_split(x, y, test_size=0.15, random_state=42)

MODELE REGRESION LOGISTIQUE¶

In [42]:
# Entreiner le model de regresion logistique
Logistique_Regression_model = LogisticRegression()
Logistique_Regression_model.fit(X_train_log, y_train_log)

y_pred_log = Logistique_Regression_model.predict(X_test_log)
In [43]:
# prediction:
y_pred_log = Logistique_Regression_model.predict(X_test_log)

#y_pred_log est ma prediction et y_test est ce que j'avais separé
y_pred_log_tous= Logistique_Regression_model.predict (billets_tous_linear.drop('is_genuine', axis=1))
y_log_tous=billets_tous_linear['is_genuine']

METRIQUES¶

In [44]:
#precision:
accuracy_log = accuracy_score(y_log_tous, y_pred_log_tous)
In [45]:
print("Precision:" ,accuracy_log)
#matriz de confusion también r score
Precision: 0.9913333333333333
In [46]:
print(classification_report(y_log_tous, y_pred_log_tous)
   
     )
              precision    recall  f1-score   support

       False       0.99      0.98      0.99       500
        True       0.99      1.00      0.99      1000

    accuracy                           0.99      1500
   macro avg       0.99      0.99      0.99      1500
weighted avg       0.99      0.99      0.99      1500

In [47]:
Z = sm.add_constant(x)
model = sm.Logit(y, Z).fit()
print(model.summary())
Optimization terminated successfully.
         Current function value: 0.026349
         Iterations 13
                           Logit Regression Results                           
==============================================================================
Dep. Variable:             is_genuine   No. Observations:                 1500
Model:                          Logit   Df Residuals:                     1493
Method:                           MLE   Df Model:                            6
Date:                Tue, 16 May 2023   Pseudo R-squ.:                  0.9586
Time:                        12:18:23   Log-Likelihood:                -39.524
converged:                       True   LL-Null:                       -954.77
Covariance Type:            nonrobust   LLR p-value:                     0.000
================================================================================
                   coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------
const         -230.4143    249.954     -0.922      0.357    -720.316     259.487
diagonal         0.2562      1.135      0.226      0.821      -1.969       2.481
height_left     -1.2845      1.117     -1.150      0.250      -3.474       0.905
height_right    -2.8941      1.143     -2.533      0.011      -5.134      -0.654
margin_low      -6.0235      0.988     -6.099      0.000      -7.959      -4.088
margin_up      -10.2986      2.206     -4.669      0.000     -14.622      -5.975
length           6.0561      0.897      6.751      0.000       4.298       7.814
================================================================================

Possibly complete quasi-separation: A fraction 0.53 of observations can be
perfectly predicted. This might indicate that there is complete
quasi-separation. In this case some parameters will not be identified.
In [48]:
y_test_log.shape, y_pred_log.shape
Out[48]:
((225,), (225,))

COURBE ROC¶

In [49]:
# calculer courbe ROC
fpr, tpr, thresholds = roc_curve(y_log_tous, y_pred_log_tous)

# calculer  AUC
auc = roc_auc_score(y_log_tous, y_pred_log_tous)

# grafique courbe ROC
plt.plot(fpr, tpr, label='AUC = {:.2f}'.format(auc))
plt.plot([0, 1], [0, 1], linestyle='--', color='grey')
plt.xlabel('Taux de faux positif')
plt.ylabel('Taux de vrai positif')
plt.title('Courbe ROC')
plt.legend()
plt.show()
No description has been provided for this image

MATRICE DE CONFUSION¶

In [50]:
cm = confusion_matrix(y_pred_log_tous, y_log_tous)
In [51]:
plt.matshow(cm, cmap=plt.cm.Blues)
plt.xlabel('Prediction')
plt.ylabel('Vrai')
plt.plot([-0.5, cm.shape[0]-0.5], [-0.5, cm.shape[1]-0.5], 'g-')  # línea diagonal roja
#plt.plot([-0.5, cm.shape[0]-0.5], [cm.shape[1]-0.5, -0.5], 'r-')  # línea diagonal verde

for i in range(cm.shape[0]):
    for j in range(cm.shape[1]):
        plt.text(x=j, y=i, s=cm[i, j], va='center', ha='center', fontsize=14, weight='bold')
plt.show()
No description has been provided for this image

SUPRESSION VARIABLES PAS SIGNIFICATIVES¶

In [52]:
# SANS P value >0.5
In [53]:
x = billets_tous_linear.drop('is_genuine', axis=1).drop('diagonal', axis=1).drop('height_left', axis=1)
y = billets_tous_linear['is_genuine']
In [54]:
X_train_log, X_test_log, y_train_log, y_test_log = train_test_split(x, y, test_size=0.15, random_state=42)
# Entreiner le model de regresion logistique
Logistique_Regression_model = LogisticRegression()
Logistique_Regression_model.fit(X_train_log, y_train_log)

y_pred_log = Logistique_Regression_model.predict(X_test_log)
# prediction:
y_pred_log = Logistique_Regression_model.predict(X_test_log)

#y_pred_log est ma prediction et y_test est ce que j'avais separé
y_pred_log_tous= Logistique_Regression_model.predict (x)
y_log_tous=billets_tous_linear['is_genuine']

#------METRIQUES
#precision:
accuracy_log = accuracy_score(y_log_tous, y_pred_log_tous)
In [55]:
print("Precision:" ,accuracy_log)
Precision: 0.99
In [56]:
print(classification_report(y_log_tous, y_pred_log_tous)
   
     )
              precision    recall  f1-score   support

       False       0.99      0.98      0.98       500
        True       0.99      1.00      0.99      1000

    accuracy                           0.99      1500
   macro avg       0.99      0.99      0.99      1500
weighted avg       0.99      0.99      0.99      1500

In [57]:
Z = sm.add_constant(x)
model = sm.Logit(y, Z).fit()
print(model.summary())
Optimization terminated successfully.
         Current function value: 0.026815
         Iterations 13
                           Logit Regression Results                           
==============================================================================
Dep. Variable:             is_genuine   No. Observations:                 1500
Model:                          Logit   Df Residuals:                     1495
Method:                           MLE   Df Model:                            4
Date:                Tue, 16 May 2023   Pseudo R-squ.:                  0.9579
Time:                        12:18:23   Log-Likelihood:                -40.223
converged:                       True   LL-Null:                       -954.77
Covariance Type:            nonrobust   LLR p-value:                     0.000
================================================================================
                   coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------
const         -285.3339    139.160     -2.050      0.040    -558.083     -12.585
height_right    -3.3223      1.117     -2.975      0.003      -5.511      -1.134
margin_low      -6.3080      0.963     -6.547      0.000      -8.196      -4.420
margin_up      -10.4027      2.196     -4.737      0.000     -14.707      -6.098
length           6.1579      0.888      6.934      0.000       4.417       7.898
================================================================================

Possibly complete quasi-separation: A fraction 0.55 of observations can be
perfectly predicted. This might indicate that there is complete
quasi-separation. In this case some parameters will not be identified.
In [58]:
# calculer courbe ROC
fpr, tpr, thresholds = roc_curve(y_log_tous, y_pred_log_tous)

# calculer  AUC
auc = roc_auc_score(y_log_tous, y_pred_log_tous)

# grafique courbe ROC
plt.plot(fpr, tpr, label='AUC = {:.2f}'.format(auc))
plt.plot([0, 1], [0, 1], linestyle='--', color='grey')
plt.xlabel('Taux de faux positif')
plt.ylabel('Taux de vrai positif')
plt.title('Courbe ROC')
plt.legend()
plt.show()
No description has been provided for this image

ACP¶

In [59]:
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.15, random_state=42)
In [60]:
pca = PCA(n_components=2)
X_pca = pca.fit_transform(x)
In [61]:
plt.scatter(X_pca[y == 0][:, 0], X_pca[y == 0][:, 1], color='red', label='False (0)')
plt.scatter(X_pca[y == 1][:, 0], X_pca[y == 1][:, 1], color='blue', label='True (1)')

plt.xlabel('CP 1')
plt.ylabel('CP 2')
plt.title('Visualizacion 2D de 5 variables avec PCA')
          #avec DONNÉES AVEC imputer regression lineare
plt.legend(loc='upper left')
plt.show()
No description has been provided for this image
In [62]:
pca = PCA()
pca.fit(x)

explained_variance_ratio = pca.explained_variance_ratio_
cumulative_explained_variance = np.cumsum(explained_variance_ratio)

plt.figure()
plt.plot(range(1, len(explained_variance_ratio) + 1), cumulative_explained_variance, marker='o')
plt.xlabel('Nombre de composantes principales')
plt.ylabel('Taux de la variance expliquée')
plt.show()
print("Nombre CP qu'on a besoin : ", np.argmax(cumulative_explained_variance >= 0.70) + 1, " (plus que 70% d'explication de variabilité)")
No description has been provided for this image
Nombre CP qu'on a besoin :  1  (plus que 70% d'explication de variabilité)
In [63]:
scree = (pca.explained_variance_ratio_*100).round(2)
scree_cum = scree.cumsum().round()
x_list = range(1, 5)
list(x_list)
plt.bar(x_list, scree)
plt.plot(x_list, scree_cum,c="red",marker='o')
plt.xlabel("rang de l'axe d'inertie")
plt.ylabel("pourcentage d'inertie")
plt.title("Eboulis des valeurs propres")
plt.show(block=False)
No description has been provided for this image
In [64]:
t, r = 0,1
fig, ax = plt.subplots(figsize=(10, 9))
for i in range(0, pca.components_.shape[1]):
    ax.arrow(0,
             0,  # Start the arrow at the origin
             pca.components_[0, i],  #0 for PC1
             pca.components_[1, i],  #1 for PC2
             
             head_width=0.07,
             head_length=0.07, 
             width=0.02,              )

    plt.text(pca.components_[0, i] + 0.05,
             pca.components_[1, i] + 0.05,
             BC[i])
    
# affichage des lignes horizontales et verticales
plt.plot([-1, 1], [0, 0], color='grey', ls='--')
plt.plot([0, 0], [-1, 1], color='grey', ls='--')


# nom des axes, avec le pourcentage d'inertie expliqué
plt.xlabel('F{} ({}%)'.format(t+1, round(100*pca.explained_variance_ratio_[t],1)))
plt.ylabel('F{} ({}%)'.format(r+1, round(100*pca.explained_variance_ratio_[r],1)))

plt.title("Cercle des corrélations (CP{} et CP{})".format(t+1, r+1))


an = np.linspace(0, 2 * np.pi, 100)
plt.plot(np.cos(an), np.sin(an))  # Add a unit circle for scale
plt.axis('equal')
plt.show(block=False)
No description has been provided for this image

Nombre de ACP¶

In [65]:
pca = PCA(n_components=2)
X_pca = pca.fit_transform(billets_tous_linear[BC])
In [66]:
df_X_pca=pd.DataFrame(X_pca, index=X.index)
In [67]:
df_X_pca['is_genuine']=X['is_genuine']
In [68]:
sum(df_X_pca['is_genuine']==X['is_genuine'])
Out[68]:
1500
In [69]:
df_X_pca
Out[69]:
0 1 is_genuine
0 0.113911 0.237710 True
1 -0.832936 -0.377681 True
2 -0.500740 0.167628 True
3 -1.146882 -0.216228 True
4 -0.118929 -0.460558 True
... ... ... ...
1495 1.624551 -0.148959 False
1496 0.779278 0.382925 False
1497 2.437424 -0.731457 False
1498 1.485415 -0.103298 False
1499 1.471789 -0.004096 False

1500 rows × 3 columns

In [70]:
x_pca = df_X_pca.drop('is_genuine', axis=1)
y = df_X_pca['is_genuine']
X_train_log, X_test_log, y_train_log, y_test_log = train_test_split(x_pca, y, test_size=0.20, random_state=42)
In [71]:
Logistique_Regression_model_acp = LogisticRegression()
Logistique_Regression_model_acp.fit(X_train_log, y_train_log)
y_pred_log = Logistique_Regression_model_acp.predict(X_test_log)
In [72]:
accuracy_score(y_test_log, y_pred_log)
Out[72]:
0.9433333333333334
In [73]:
accuracy_score(df_X_pca['is_genuine'], Logistique_Regression_model_acp.predict(df_X_pca.drop('is_genuine', axis=1)))
Out[73]:
0.9486666666666667

K-Means¶

In [74]:
X_kmeans=billets_tous_linear
y = X['is_genuine']
In [75]:
y= billets_tous_linear['is_genuine']
In [ ]:
 
In [76]:
X_train_kmeans, X_test_kmeans, y_train_kmeans, y_test_kmeans = train_test_split(x, y, test_size=0.15, random_state=72)
In [77]:
kmeans = KMeans(n_clusters=2) 
kmeans.fit(x)
y_pred_kmeans = kmeans.fit_predict(x)
In [78]:
x['is_genuine_kmeans']=kmeans.labels_
In [79]:
x['is_genuine']=y
In [80]:
x=x.replace({0:2})
x=x.replace({1:0})
x=x.replace({2:1})
In [81]:
y_pred_tous=kmeans.predict(x[[#'diagonal', 'height_left', 
                              'height_right', 'margin_low',
       'margin_up', 'length']])
X_kmeans['is_genuine_kmeans']=y_pred_tous
In [82]:
y_pred_kmeans
Out[82]:
array([0, 0, 0, ..., 1, 1, 1], dtype=int32)
In [83]:
sum(X_kmeans['is_genuine']!=X_kmeans['is_genuine_kmeans'])
Out[83]:
1474
In [84]:
accuracy_score(kmeans.labels_, y)
Out[84]:
0.017333333333333333
In [85]:
accuracy_score(x['is_genuine_kmeans'], x['is_genuine'])
Out[85]:
0.9826666666666667
In [86]:
(y_test_kmeans.shape, y_pred_kmeans.shape)
Out[86]:
((225,), (1500,))
In [87]:
cm = confusion_matrix(x['is_genuine_kmeans'], x['is_genuine'])
In [88]:
print(cm)
[[476   2]
 [ 24 998]]
In [89]:
plt.matshow(cm, cmap=plt.cm.Blues)
plt.xlabel('Prediction')
plt.ylabel('Vrai')
plt.plot([-0.5, cm.shape[0]-0.5], [-0.5, cm.shape[1]-0.5], 'g-')  # línea diagonal roja
#plt.plot([-0.5, cm.shape[0]-0.5], [cm.shape[1]-0.5, -0.5], 'r-')  # línea diagonal verde

for i in range(cm.shape[0]):
    for j in range(cm.shape[1]):
        plt.text(x=j, y=i, s=cm[i, j], va='center', ha='center', fontsize=14, weight='bold')
plt.show()
#La diagonale vert représente les prédictions correctes.
#La croix (rouge) qui croise de la diagonale vert représente les prédictions incorrectes.
#Il y a 2 faux négatifs.
#Il y a 24 faux positifs.

#Il y a 998 vrai positifs
#Il y a 476 vrai négatifs

#[476 2]
#[24 998]
No description has been provided for this image

IMPORT Billets Production¶

In [90]:
billetsP = pd.read_csv('billets_production.csv', sep=',')
In [91]:
billetsP
Out[91]:
diagonal height_left height_right margin_low margin_up length id
0 171.76 104.01 103.54 5.21 3.30 111.42 A_1
1 171.87 104.17 104.13 6.00 3.31 112.09 A_2
2 172.00 104.58 104.29 4.99 3.39 111.57 A_3
3 172.49 104.55 104.34 4.44 3.03 113.20 A_4
4 171.65 103.63 103.56 3.77 3.16 113.33 A_5
In [92]:
billetsP.head()
Out[92]:
diagonal height_left height_right margin_low margin_up length id
0 171.76 104.01 103.54 5.21 3.30 111.42 A_1
1 171.87 104.17 104.13 6.00 3.31 112.09 A_2
2 172.00 104.58 104.29 4.99 3.39 111.57 A_3
3 172.49 104.55 104.34 4.44 3.03 113.20 A_4
4 171.65 103.63 103.56 3.77 3.16 113.33 A_5
In [93]:
billetsP.info()
billetsP.describe(include='all')
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   diagonal      5 non-null      float64
 1   height_left   5 non-null      float64
 2   height_right  5 non-null      float64
 3   margin_low    5 non-null      float64
 4   margin_up     5 non-null      float64
 5   length        5 non-null      float64
 6   id            5 non-null      object 
dtypes: float64(6), object(1)
memory usage: 408.0+ bytes
Out[93]:
diagonal height_left height_right margin_low margin_up length id
count 5.000000 5.000000 5.000000 5.000000 5.000000 5.000000 5
unique NaN NaN NaN NaN NaN NaN 5
top NaN NaN NaN NaN NaN NaN A_1
freq NaN NaN NaN NaN NaN NaN 1
mean 171.954000 104.188000 103.972000 4.882000 3.238000 112.322000 NaN
std 0.326542 0.396257 0.393027 0.836821 0.142724 0.897201 NaN
min 171.650000 103.630000 103.540000 3.770000 3.030000 111.420000 NaN
25% 171.760000 104.010000 103.560000 4.440000 3.160000 111.570000 NaN
50% 171.870000 104.170000 104.130000 4.990000 3.300000 112.090000 NaN
75% 172.000000 104.550000 104.290000 5.210000 3.310000 113.200000 NaN
max 172.490000 104.580000 104.340000 6.000000 3.390000 113.330000 NaN
In [94]:
sns.pairplot(billetsP[['diagonal', 'height_left', 'height_right', 'margin_low', 'margin_up','length']])
#, hue='id')
plt.show()
No description has been provided for this image
In [95]:
scaler = StandardScaler()
XP = pd.DataFrame(scaler.fit_transform(billetsP[['diagonal', 'height_left', 'height_right', 'margin_low', 'margin_up',
       'length']]
                                      ), columns=['diagonal', 'height_left', 'height_right', 'margin_low', 'margin_up',
       'length'])

DONNEES StandardScale¶

In [96]:
XP
Out[96]:
diagonal height_left height_right margin_low margin_up length
0 -0.664228 -0.502224 -1.228900 0.438224 0.485681 -1.124014
1 -0.287604 -0.050787 0.449459 1.493702 0.564017 -0.289103
2 0.157497 1.106022 0.904607 0.144293 1.190702 -0.937094
3 1.835186 1.021377 1.046841 -0.590533 -1.629382 1.094107
4 -1.040852 -1.574388 -1.172007 -1.485686 -0.611018 1.256104
In [97]:
sns.pairplot(XP[['diagonal', 'height_left', 'height_right', 'margin_low', 'margin_up',
       'length']])
plt.show()
No description has been provided for this image

PREDICTION billets_production.csv¶

In [98]:
billetsP['is_genuine']=Logistique_Regression_model.predict(XP.drop('diagonal', axis=1).drop('height_left', axis=1))
In [99]:
billetsP
Out[99]:
diagonal height_left height_right margin_low margin_up length id is_genuine
0 171.76 104.01 103.54 5.21 3.30 111.42 A_1 False
1 171.87 104.17 104.13 6.00 3.31 112.09 A_2 False
2 172.00 104.58 104.29 4.99 3.39 111.57 A_3 False
3 172.49 104.55 104.34 4.44 3.03 113.20 A_4 True
4 171.65 103.63 103.56 3.77 3.16 113.33 A_5 True

Kmeans billets_production¶

In [100]:
kmeans.predict(XP.drop('diagonal', axis=1).drop('height_left', axis=1))
Out[100]:
array([1, 1, 1, 1, 1], dtype=int32)
In [101]:
(XP.drop('diagonal', axis=1).drop('height_left', axis=1))
Out[101]:
height_right margin_low margin_up length
0 -1.228900 0.438224 0.485681 -1.124014
1 0.449459 1.493702 0.564017 -0.289103
2 0.904607 0.144293 1.190702 -0.937094
3 1.046841 -0.590533 -1.629382 1.094107
4 -1.172007 -1.485686 -0.611018 1.256104
In [102]:
Logistique_Regression_model.predict(XP.drop('diagonal', axis=1).drop('height_left', axis=1))
Out[102]:
array([False, False, False,  True,  True])

TEST DE L’ALGORITHME¶

In [103]:
algoritme=pd.read_csv('billets_production.csv', sep=',')
In [104]:
scaler = StandardScaler()
TA = pd.DataFrame(scaler.fit_transform(algoritme[['diagonal', 'height_left', 'height_right', 'margin_low', 'margin_up',
       'length']]
                                      ), columns=['diagonal', 'height_left', 'height_right', 'margin_low', 'margin_up',
       'length'])
In [105]:
TA
Out[105]:
diagonal height_left height_right margin_low margin_up length
0 -0.664228 -0.502224 -1.228900 0.438224 0.485681 -1.124014
1 -0.287604 -0.050787 0.449459 1.493702 0.564017 -0.289103
2 0.157497 1.106022 0.904607 0.144293 1.190702 -0.937094
3 1.835186 1.021377 1.046841 -0.590533 -1.629382 1.094107
4 -1.040852 -1.574388 -1.172007 -1.485686 -0.611018 1.256104
In [106]:
Logistique_Regression_model.predict(TA.drop('diagonal', axis=1).drop('height_left', axis=1))
Out[106]:
array([False, False, False,  True,  True])
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]: