IMPORT¶
In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression as LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report,r2_score, mean_absolute_error,roc_curve, roc_auc_score
from sklearn.impute import KNNImputer
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.linear_model import LinearRegression
from scipy.stats import mode
from sklearn.impute import SimpleImputer
from statsmodels.stats.outliers_influence import variance_inflation_factor
import statsmodels.api as sm
In [2]:
billets = pd.read_csv('billets.csv', sep=';')
In [3]:
billets
Out[3]:
is_genuine | diagonal | height_left | height_right | margin_low | margin_up | length | |
---|---|---|---|---|---|---|---|
0 | True | 171.81 | 104.86 | 104.95 | 4.52 | 2.89 | 112.83 |
1 | True | 171.46 | 103.36 | 103.66 | 3.77 | 2.99 | 113.09 |
2 | True | 172.69 | 104.48 | 103.50 | 4.40 | 2.94 | 113.16 |
3 | True | 171.36 | 103.91 | 103.94 | 3.62 | 3.01 | 113.51 |
4 | True | 171.73 | 104.28 | 103.46 | 4.04 | 3.48 | 112.54 |
... | ... | ... | ... | ... | ... | ... | ... |
1495 | False | 171.75 | 104.38 | 104.17 | 4.42 | 3.09 | 111.28 |
1496 | False | 172.19 | 104.63 | 104.44 | 5.27 | 3.37 | 110.97 |
1497 | False | 171.80 | 104.01 | 104.12 | 5.51 | 3.36 | 111.95 |
1498 | False | 172.06 | 104.28 | 104.06 | 5.17 | 3.46 | 112.25 |
1499 | False | 171.47 | 104.15 | 103.82 | 4.63 | 3.37 | 112.07 |
1500 rows × 7 columns
HEAD, INFO, DESCRIBE¶
In [4]:
billets.head() # afficher les premières lignes du tableau de données
Out[4]:
is_genuine | diagonal | height_left | height_right | margin_low | margin_up | length | |
---|---|---|---|---|---|---|---|
0 | True | 171.81 | 104.86 | 104.95 | 4.52 | 2.89 | 112.83 |
1 | True | 171.46 | 103.36 | 103.66 | 3.77 | 2.99 | 113.09 |
2 | True | 172.69 | 104.48 | 103.50 | 4.40 | 2.94 | 113.16 |
3 | True | 171.36 | 103.91 | 103.94 | 3.62 | 3.01 | 113.51 |
4 | True | 171.73 | 104.28 | 103.46 | 4.04 | 3.48 | 112.54 |
In [5]:
billets.info() # afficher des informations sur les données
billets.describe(include='all') # afficher des statistiques descriptives sur les données
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1500 entries, 0 to 1499 Data columns (total 7 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 is_genuine 1500 non-null bool 1 diagonal 1500 non-null float64 2 height_left 1500 non-null float64 3 height_right 1500 non-null float64 4 margin_low 1463 non-null float64 5 margin_up 1500 non-null float64 6 length 1500 non-null float64 dtypes: bool(1), float64(6) memory usage: 71.9 KB
Out[5]:
is_genuine | diagonal | height_left | height_right | margin_low | margin_up | length | |
---|---|---|---|---|---|---|---|
count | 1500 | 1500.000000 | 1500.000000 | 1500.000000 | 1463.000000 | 1500.000000 | 1500.00000 |
unique | 2 | NaN | NaN | NaN | NaN | NaN | NaN |
top | True | NaN | NaN | NaN | NaN | NaN | NaN |
freq | 1000 | NaN | NaN | NaN | NaN | NaN | NaN |
mean | NaN | 171.958440 | 104.029533 | 103.920307 | 4.485967 | 3.151473 | 112.67850 |
std | NaN | 0.305195 | 0.299462 | 0.325627 | 0.663813 | 0.231813 | 0.87273 |
min | NaN | 171.040000 | 103.140000 | 102.820000 | 2.980000 | 2.270000 | 109.49000 |
25% | NaN | 171.750000 | 103.820000 | 103.710000 | 4.015000 | 2.990000 | 112.03000 |
50% | NaN | 171.960000 | 104.040000 | 103.920000 | 4.310000 | 3.140000 | 112.96000 |
75% | NaN | 172.170000 | 104.230000 | 104.150000 | 4.870000 | 3.310000 | 113.34000 |
max | NaN | 173.010000 | 104.880000 | 104.950000 | 6.900000 | 3.910000 | 114.44000 |
In [6]:
print("Il y a :\n\n\n", billets.isnull().sum() , '\n\n\nvaleurs nulls')
Il y a : is_genuine 0 diagonal 0 height_left 0 height_right 0 margin_low 37 margin_up 0 length 0 dtype: int64 valeurs nulls
In [7]:
#nulls = billets.loc[billets['margin_low'].isnull()]
print(billets.loc[billets['margin_low'].isnull()])
is_genuine diagonal height_left height_right margin_low margin_up \ 72 True 171.94 103.89 103.45 NaN 3.25 99 True 171.93 104.07 104.18 NaN 3.14 151 True 172.07 103.80 104.38 NaN 3.02 197 True 171.45 103.66 103.80 NaN 3.62 241 True 171.83 104.14 104.06 NaN 3.02 251 True 171.80 103.26 102.82 NaN 2.95 284 True 171.92 103.83 103.76 NaN 3.23 334 True 171.85 103.70 103.96 NaN 3.00 410 True 172.56 103.72 103.51 NaN 3.12 413 True 172.30 103.66 103.50 NaN 3.16 445 True 172.34 104.42 103.22 NaN 3.01 481 True 171.81 103.53 103.96 NaN 2.71 505 True 172.01 103.97 104.05 NaN 2.98 611 True 171.80 103.68 103.49 NaN 3.30 654 True 171.97 103.69 103.54 NaN 2.70 675 True 171.60 103.85 103.91 NaN 2.56 710 True 172.03 103.97 103.86 NaN 3.07 739 True 172.07 103.74 103.76 NaN 3.09 742 True 172.14 104.06 103.96 NaN 3.24 780 True 172.41 103.95 103.79 NaN 3.13 798 True 171.96 103.84 103.62 NaN 3.01 844 True 171.62 104.14 104.49 NaN 2.99 845 True 172.02 104.21 104.05 NaN 2.90 871 True 171.37 104.07 103.75 NaN 3.07 895 True 171.81 103.68 103.80 NaN 2.98 919 True 171.92 103.68 103.45 NaN 2.58 945 True 172.09 103.74 103.52 NaN 3.02 946 True 171.63 103.87 104.66 NaN 3.27 981 True 172.02 104.23 103.72 NaN 2.99 1076 False 171.57 104.27 104.44 NaN 3.21 1121 False 171.40 104.38 104.19 NaN 3.17 1176 False 171.59 104.05 103.94 NaN 3.02 1303 False 172.17 104.49 103.76 NaN 2.93 1315 False 172.08 104.15 104.17 NaN 3.40 1347 False 171.72 104.46 104.12 NaN 3.61 1435 False 172.66 104.33 104.41 NaN 3.56 1438 False 171.90 104.28 104.29 NaN 3.24 length 72 112.79 99 113.08 151 112.93 197 113.27 241 112.36 251 113.22 284 113.29 334 113.36 410 112.95 413 112.95 445 112.97 481 113.99 505 113.65 611 112.84 654 112.79 675 113.27 710 112.65 739 112.41 742 113.07 780 113.41 798 114.44 844 113.35 845 113.62 871 113.27 895 113.82 919 113.68 945 112.78 946 112.68 981 113.37 1076 111.87 1121 112.39 1176 111.29 1303 111.21 1315 112.29 1347 110.31 1435 111.47 1438 111.49
BILLETS AVEC NULLS¶
In [8]:
billets_avec_nulls = billets.loc[billets['margin_low'].isna()]
billets_tous=billets
billets = billets.dropna(subset=['margin_low'])
billets_avec_nulls
Out[8]:
is_genuine | diagonal | height_left | height_right | margin_low | margin_up | length | |
---|---|---|---|---|---|---|---|
72 | True | 171.94 | 103.89 | 103.45 | NaN | 3.25 | 112.79 |
99 | True | 171.93 | 104.07 | 104.18 | NaN | 3.14 | 113.08 |
151 | True | 172.07 | 103.80 | 104.38 | NaN | 3.02 | 112.93 |
197 | True | 171.45 | 103.66 | 103.80 | NaN | 3.62 | 113.27 |
241 | True | 171.83 | 104.14 | 104.06 | NaN | 3.02 | 112.36 |
251 | True | 171.80 | 103.26 | 102.82 | NaN | 2.95 | 113.22 |
284 | True | 171.92 | 103.83 | 103.76 | NaN | 3.23 | 113.29 |
334 | True | 171.85 | 103.70 | 103.96 | NaN | 3.00 | 113.36 |
410 | True | 172.56 | 103.72 | 103.51 | NaN | 3.12 | 112.95 |
413 | True | 172.30 | 103.66 | 103.50 | NaN | 3.16 | 112.95 |
445 | True | 172.34 | 104.42 | 103.22 | NaN | 3.01 | 112.97 |
481 | True | 171.81 | 103.53 | 103.96 | NaN | 2.71 | 113.99 |
505 | True | 172.01 | 103.97 | 104.05 | NaN | 2.98 | 113.65 |
611 | True | 171.80 | 103.68 | 103.49 | NaN | 3.30 | 112.84 |
654 | True | 171.97 | 103.69 | 103.54 | NaN | 2.70 | 112.79 |
675 | True | 171.60 | 103.85 | 103.91 | NaN | 2.56 | 113.27 |
710 | True | 172.03 | 103.97 | 103.86 | NaN | 3.07 | 112.65 |
739 | True | 172.07 | 103.74 | 103.76 | NaN | 3.09 | 112.41 |
742 | True | 172.14 | 104.06 | 103.96 | NaN | 3.24 | 113.07 |
780 | True | 172.41 | 103.95 | 103.79 | NaN | 3.13 | 113.41 |
798 | True | 171.96 | 103.84 | 103.62 | NaN | 3.01 | 114.44 |
844 | True | 171.62 | 104.14 | 104.49 | NaN | 2.99 | 113.35 |
845 | True | 172.02 | 104.21 | 104.05 | NaN | 2.90 | 113.62 |
871 | True | 171.37 | 104.07 | 103.75 | NaN | 3.07 | 113.27 |
895 | True | 171.81 | 103.68 | 103.80 | NaN | 2.98 | 113.82 |
919 | True | 171.92 | 103.68 | 103.45 | NaN | 2.58 | 113.68 |
945 | True | 172.09 | 103.74 | 103.52 | NaN | 3.02 | 112.78 |
946 | True | 171.63 | 103.87 | 104.66 | NaN | 3.27 | 112.68 |
981 | True | 172.02 | 104.23 | 103.72 | NaN | 2.99 | 113.37 |
1076 | False | 171.57 | 104.27 | 104.44 | NaN | 3.21 | 111.87 |
1121 | False | 171.40 | 104.38 | 104.19 | NaN | 3.17 | 112.39 |
1176 | False | 171.59 | 104.05 | 103.94 | NaN | 3.02 | 111.29 |
1303 | False | 172.17 | 104.49 | 103.76 | NaN | 2.93 | 111.21 |
1315 | False | 172.08 | 104.15 | 104.17 | NaN | 3.40 | 112.29 |
1347 | False | 171.72 | 104.46 | 104.12 | NaN | 3.61 | 110.31 |
1435 | False | 172.66 | 104.33 | 104.41 | NaN | 3.56 | 111.47 |
1438 | False | 171.90 | 104.28 | 104.29 | NaN | 3.24 | 111.49 |
In [9]:
BC=billets.columns
BC=['diagonal', 'height_left', 'height_right', 'margin_low',
'margin_up', 'length']
In [10]:
billets=billets.copy()
#D['is_genuine']=D['is_genuine'].astype(int)
billets.loc[:, 'is_genuine'] = billets['is_genuine'].astype(int)
EDA¶
HISTOGRAMS¶
In [11]:
for column in BC:
plt.title(column, fontsize=14)
for value in billets['is_genuine'].unique():
billets.loc[billets['is_genuine']==value, column].dropna().hist(alpha=0.5, label=value)
plt.xlabel("Valeur")
plt.ylabel("Frequence")
plt.legend(title='is_genuine')
plt.show()
BOXPLOTS¶
In [12]:
for column in BC:
#plt.title(column, fontsize=14)
billets.boxplot(column=column, by='is_genuine')
plt.xlabel("is_genuine")
plt.ylabel(column)
plt.show()
In [13]:
billets.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 1463 entries, 0 to 1499 Data columns (total 7 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 is_genuine 1463 non-null int64 1 diagonal 1463 non-null float64 2 height_left 1463 non-null float64 3 height_right 1463 non-null float64 4 margin_low 1463 non-null float64 5 margin_up 1463 non-null float64 6 length 1463 non-null float64 dtypes: float64(6), int64(1) memory usage: 91.4 KB
In [14]:
#scaler = StandardScaler()
scaler = MinMaxScaler()
X = pd.DataFrame(scaler.fit_transform(billets), columns=billets.columns)
In [15]:
XT = pd.DataFrame(scaler.fit_transform(billets_tous), columns=billets_tous.columns)
CORRELATIONS¶
In [16]:
corr_matrix = billets.corr(method='pearson')
corr_matrix=round (corr_matrix, 3)
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(10, 10))
sns.heatmap(corr_matrix, annot = True, cbar = False,annot_kws = {"size": 10}, vmin = -1, vmax = 1,center = 0, cmap = sns.diverging_palette(20, 220, n=200),square = True, ax = ax ),ax.set_xticklabels(ax.get_xticklabels(), rotation = 45, horizontalalignment = 'right', ),ax.tick_params(labelsize = 10)
plt.show()
In [17]:
corr_matrix = billets[billets['is_genuine']==0].corr(method='pearson')
corr_matrix=round (corr_matrix, 3)
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(10, 10))
sns.heatmap(corr_matrix, annot = True, cbar = False,annot_kws = {"size": 10}, vmin = -1, vmax = 1,center = 0, cmap = sns.diverging_palette(20, 220, n=200),square = True, ax = ax ),ax.set_xticklabels(ax.get_xticklabels(), rotation = 45, horizontalalignment = 'right', ),ax.tick_params(labelsize = 10)
plt.show()
In [18]:
corr_matrix = billets[billets['is_genuine']==1].corr(method='pearson')
corr_matrix=round (corr_matrix, 3)
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(10, 10))
sns.heatmap(corr_matrix, annot = True, cbar = False,annot_kws = {"size": 10}, vmin = -1, vmax = 1,center = 0, cmap = sns.diverging_palette(20, 220, n=200),square = True, ax = ax ),ax.set_xticklabels(ax.get_xticklabels(), rotation = 45, horizontalalignment = 'right', ),ax.tick_params(labelsize = 10)
plt.show()
VIF¶
In [19]:
# VIF dataframe
vif = pd.DataFrame()
vif["valeur"] = pd.DataFrame(X[BC].columns)
# calculer VIF pour chaque valeur
vif["VIF"] = [variance_inflation_factor(X[BC].values, i)
for i in range(len(X[BC].columns))]
print(vif)
valeur VIF 0 diagonal 9.571588 1 height_left 10.693729 2 height_right 12.474296 3 margin_low 9.510445 4 margin_up 16.641133 5 length 8.715947
PAIRPLOT¶
In [20]:
#DF X est normalise et is_genuine 1 true et 0 false
sns.pairplot(XT, hue='is_genuine', diag_kind="hist",kind="kde",palette="Set2",
height=1.4)
plt.show()
IMPUTER¶
In [21]:
billets_tous.iloc[billets_avec_nulls.index.to_list()]
Out[21]:
is_genuine | diagonal | height_left | height_right | margin_low | margin_up | length | |
---|---|---|---|---|---|---|---|
72 | True | 171.94 | 103.89 | 103.45 | NaN | 3.25 | 112.79 |
99 | True | 171.93 | 104.07 | 104.18 | NaN | 3.14 | 113.08 |
151 | True | 172.07 | 103.80 | 104.38 | NaN | 3.02 | 112.93 |
197 | True | 171.45 | 103.66 | 103.80 | NaN | 3.62 | 113.27 |
241 | True | 171.83 | 104.14 | 104.06 | NaN | 3.02 | 112.36 |
251 | True | 171.80 | 103.26 | 102.82 | NaN | 2.95 | 113.22 |
284 | True | 171.92 | 103.83 | 103.76 | NaN | 3.23 | 113.29 |
334 | True | 171.85 | 103.70 | 103.96 | NaN | 3.00 | 113.36 |
410 | True | 172.56 | 103.72 | 103.51 | NaN | 3.12 | 112.95 |
413 | True | 172.30 | 103.66 | 103.50 | NaN | 3.16 | 112.95 |
445 | True | 172.34 | 104.42 | 103.22 | NaN | 3.01 | 112.97 |
481 | True | 171.81 | 103.53 | 103.96 | NaN | 2.71 | 113.99 |
505 | True | 172.01 | 103.97 | 104.05 | NaN | 2.98 | 113.65 |
611 | True | 171.80 | 103.68 | 103.49 | NaN | 3.30 | 112.84 |
654 | True | 171.97 | 103.69 | 103.54 | NaN | 2.70 | 112.79 |
675 | True | 171.60 | 103.85 | 103.91 | NaN | 2.56 | 113.27 |
710 | True | 172.03 | 103.97 | 103.86 | NaN | 3.07 | 112.65 |
739 | True | 172.07 | 103.74 | 103.76 | NaN | 3.09 | 112.41 |
742 | True | 172.14 | 104.06 | 103.96 | NaN | 3.24 | 113.07 |
780 | True | 172.41 | 103.95 | 103.79 | NaN | 3.13 | 113.41 |
798 | True | 171.96 | 103.84 | 103.62 | NaN | 3.01 | 114.44 |
844 | True | 171.62 | 104.14 | 104.49 | NaN | 2.99 | 113.35 |
845 | True | 172.02 | 104.21 | 104.05 | NaN | 2.90 | 113.62 |
871 | True | 171.37 | 104.07 | 103.75 | NaN | 3.07 | 113.27 |
895 | True | 171.81 | 103.68 | 103.80 | NaN | 2.98 | 113.82 |
919 | True | 171.92 | 103.68 | 103.45 | NaN | 2.58 | 113.68 |
945 | True | 172.09 | 103.74 | 103.52 | NaN | 3.02 | 112.78 |
946 | True | 171.63 | 103.87 | 104.66 | NaN | 3.27 | 112.68 |
981 | True | 172.02 | 104.23 | 103.72 | NaN | 2.99 | 113.37 |
1076 | False | 171.57 | 104.27 | 104.44 | NaN | 3.21 | 111.87 |
1121 | False | 171.40 | 104.38 | 104.19 | NaN | 3.17 | 112.39 |
1176 | False | 171.59 | 104.05 | 103.94 | NaN | 3.02 | 111.29 |
1303 | False | 172.17 | 104.49 | 103.76 | NaN | 2.93 | 111.21 |
1315 | False | 172.08 | 104.15 | 104.17 | NaN | 3.40 | 112.29 |
1347 | False | 171.72 | 104.46 | 104.12 | NaN | 3.61 | 110.31 |
1435 | False | 172.66 | 104.33 | 104.41 | NaN | 3.56 | 111.47 |
1438 | False | 171.90 | 104.28 | 104.29 | NaN | 3.24 | 111.49 |
KNNImputer¶
In [22]:
imputer = KNNImputer(n_neighbors=5)
billets_tous_knnimputer = pd.DataFrame(imputer.fit_transform(billets_tous), columns=billets_tous.columns)
REGRESION LINEARE¶
In [23]:
# On separe
donnees_avec_margin_low = billets_tous[billets_tous['margin_low'].notnull()]
donnees_sans_margin_low = billets_tous[billets_tous['margin_low'].isnull()]
# Train test split
X_train_lin, X_test_lin, y_train_lin, y_test_lin = train_test_split(donnees_avec_margin_low.drop('margin_low', axis=1)
, donnees_avec_margin_low['margin_low'], test_size=0.15, random_state=42)
In [24]:
X_test_lin.shape, X_train_lin.shape ,y_train_lin.shape, y_test_lin.shape
Out[24]:
((220, 6), (1243, 6), (1243,), (220,))
In [25]:
lmodel=LinearRegression()
In [26]:
lmodel.fit(X_train_lin, y_train_lin)
Out[26]:
LinearRegression()
In [27]:
y_pred=lmodel.predict(X_test_lin)
In [28]:
(y_pred-y_test_lin).hist()
Out[28]:
<AxesSubplot:>
METRIQUES REGRESION LINEARE¶
In [29]:
X = donnees_avec_margin_low.drop('margin_low', axis=1)
X= X.drop('is_genuine', axis=1)
y = donnees_avec_margin_low['margin_low']
X = sm.add_constant(X)
model = sm.OLS(y,X).fit()
print(model.summary())
OLS Regression Results ============================================================================== Dep. Variable: margin_low R-squared: 0.477 Model: OLS Adj. R-squared: 0.476 Method: Least Squares F-statistic: 266.1 Date: Tue, 16 May 2023 Prob (F-statistic): 2.60e-202 Time: 12:18:22 Log-Likelihood: -1001.3 No. Observations: 1463 AIC: 2015. Df Residuals: 1457 BIC: 2046. Df Model: 5 Covariance Type: nonrobust ================================================================================ coef std err t P>|t| [0.025 0.975] -------------------------------------------------------------------------------- const 22.9948 9.656 2.382 0.017 4.055 41.935 diagonal -0.1111 0.041 -2.680 0.007 -0.192 -0.030 height_left 0.1841 0.045 4.113 0.000 0.096 0.272 height_right 0.2571 0.043 5.978 0.000 0.173 0.342 margin_up 0.2562 0.064 3.980 0.000 0.130 0.382 length -0.4091 0.018 -22.627 0.000 -0.445 -0.374 ============================================================================== Omnibus: 73.627 Durbin-Watson: 1.893 Prob(Omnibus): 0.000 Jarque-Bera (JB): 95.862 Skew: 0.482 Prob(JB): 1.53e-21 Kurtosis: 3.801 Cond. No. 1.94e+05 ============================================================================== Notes: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified. [2] The condition number is large, 1.94e+05. This might indicate that there are strong multicollinearity or other numerical problems.
In [30]:
r2 = r2_score(y,
lmodel.predict(donnees_avec_margin_low.drop('margin_low', axis=1)))
mae = mean_absolute_error(y, lmodel.predict(donnees_avec_margin_low.drop('margin_low', axis=1)))
print("R2:", r2)
print("MAE:", mae)
R2: 0.6166656453942228 MAE: 0.3160336084059514
In [31]:
# Entreiner model
X_train_margin_low = donnees_avec_margin_low.drop('margin_low', axis=1)
y_train_margin_low = donnees_avec_margin_low['margin_low']
model_margin_low = LinearRegression()
model_margin_low.fit(X_train_margin_low, y_train_margin_low)
# VALIDER MODEL
y_pred=model_margin_low.predict(X_test_lin)
#y_test-y_pred
In [32]:
# Entreiner model
X_train_margin_low = donnees_avec_margin_low.drop('margin_low', axis=1)
y_train_margin_low = donnees_avec_margin_low['margin_low']
model_margin_low = LinearRegression()
model_margin_low.fit(X_train_margin_low, y_train_margin_low)
# Prediction donnes sans margin
X_test_margin_low = donnees_sans_margin_low.drop('margin_low', axis=1)
donnees_sans_margin_low['margin_low'] = model_margin_low.predict(X_test_margin_low)
# merge/concat
billets_tous_linear = pd.concat([donnees_avec_margin_low, donnees_sans_margin_low])
/var/folders/pd/z7nn_cm91pg9fjmhgsnqvvd80000gn/T/ipykernel_24189/1793560917.py:9: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy donnees_sans_margin_low['margin_low'] = model_margin_low.predict(X_test_margin_low)
In [33]:
donnees_sans_margin_low.describe()
Out[33]:
diagonal | height_left | height_right | margin_low | margin_up | length | |
---|---|---|---|---|---|---|
count | 37.000000 | 37.000000 | 37.000000 | 37.000000 | 37.000000 | 37.000000 |
mean | 171.928649 | 103.958378 | 103.874054 | 4.359368 | 3.087838 | 112.826486 |
std | 0.297145 | 0.288680 | 0.380631 | 0.470245 | 0.239782 | 0.851388 |
min | 171.370000 | 103.260000 | 102.820000 | 3.993571 | 2.560000 | 110.310000 |
25% | 171.800000 | 103.720000 | 103.620000 | 4.096960 | 2.990000 | 112.410000 |
50% | 171.930000 | 103.950000 | 103.860000 | 4.125390 | 3.070000 | 112.950000 |
75% | 172.070000 | 104.150000 | 104.120000 | 4.198423 | 3.230000 | 113.350000 |
max | 172.660000 | 104.490000 | 104.660000 | 5.302069 | 3.620000 | 114.440000 |
KNNImputer vs REGRESION LINEARE:¶
In [34]:
billets_tous_knnimputer.iloc[billets_avec_nulls.index.to_list()]-donnees_sans_margin_low
Out[34]:
is_genuine | diagonal | height_left | height_right | margin_low | margin_up | length | |
---|---|---|---|---|---|---|---|
72 | 0.0 | 0.0 | 0.0 | 0.0 | -0.018954 | 0.0 | 0.0 |
99 | 0.0 | 0.0 | 0.0 | 0.0 | 0.166010 | 0.0 | 0.0 |
151 | 0.0 | 0.0 | 0.0 | 0.0 | 0.079997 | 0.0 | 0.0 |
197 | 0.0 | 0.0 | 0.0 | 0.0 | 0.112429 | 0.0 | 0.0 |
241 | 0.0 | 0.0 | 0.0 | 0.0 | 0.095601 | 0.0 | 0.0 |
251 | 0.0 | 0.0 | 0.0 | 0.0 | 0.253716 | 0.0 | 0.0 |
284 | 0.0 | 0.0 | 0.0 | 0.0 | 0.003876 | 0.0 | 0.0 |
334 | 0.0 | 0.0 | 0.0 | 0.0 | 0.210610 | 0.0 | 0.0 |
410 | 0.0 | 0.0 | 0.0 | 0.0 | 0.091272 | 0.0 | 0.0 |
413 | 0.0 | 0.0 | 0.0 | 0.0 | -0.057633 | 0.0 | 0.0 |
445 | 0.0 | 0.0 | 0.0 | 0.0 | 0.027027 | 0.0 | 0.0 |
481 | 0.0 | 0.0 | 0.0 | 0.0 | 0.113620 | 0.0 | 0.0 |
505 | 0.0 | 0.0 | 0.0 | 0.0 | -0.118484 | 0.0 | 0.0 |
611 | 0.0 | 0.0 | 0.0 | 0.0 | -0.069068 | 0.0 | 0.0 |
654 | 0.0 | 0.0 | 0.0 | 0.0 | -0.180377 | 0.0 | 0.0 |
675 | 0.0 | 0.0 | 0.0 | 0.0 | -0.073551 | 0.0 | 0.0 |
710 | 0.0 | 0.0 | 0.0 | 0.0 | -0.131868 | 0.0 | 0.0 |
739 | 0.0 | 0.0 | 0.0 | 0.0 | 0.085159 | 0.0 | 0.0 |
742 | 0.0 | 0.0 | 0.0 | 0.0 | -0.043843 | 0.0 | 0.0 |
780 | 0.0 | 0.0 | 0.0 | 0.0 | 0.191238 | 0.0 | 0.0 |
798 | 0.0 | 0.0 | 0.0 | 0.0 | 0.171498 | 0.0 | 0.0 |
844 | 0.0 | 0.0 | 0.0 | 0.0 | 0.160824 | 0.0 | 0.0 |
845 | 0.0 | 0.0 | 0.0 | 0.0 | 0.087712 | 0.0 | 0.0 |
871 | 0.0 | 0.0 | 0.0 | 0.0 | -0.003938 | 0.0 | 0.0 |
895 | 0.0 | 0.0 | 0.0 | 0.0 | 0.126464 | 0.0 | 0.0 |
919 | 0.0 | 0.0 | 0.0 | 0.0 | 0.059577 | 0.0 | 0.0 |
945 | 0.0 | 0.0 | 0.0 | 0.0 | 0.016377 | 0.0 | 0.0 |
946 | 0.0 | 0.0 | 0.0 | 0.0 | -0.082960 | 0.0 | 0.0 |
981 | 0.0 | 0.0 | 0.0 | 0.0 | 0.236159 | 0.0 | 0.0 |
1076 | 0.0 | 0.0 | 0.0 | 0.0 | 0.018315 | 0.0 | 0.0 |
1121 | 0.0 | 0.0 | 0.0 | 0.0 | 0.053183 | 0.0 | 0.0 |
1176 | 0.0 | 0.0 | 0.0 | 0.0 | 0.061481 | 0.0 | 0.0 |
1303 | 0.0 | 0.0 | 0.0 | 0.0 | 0.093931 | 0.0 | 0.0 |
1315 | 0.0 | 0.0 | 0.0 | 0.0 | 0.025642 | 0.0 | 0.0 |
1347 | 0.0 | 0.0 | 0.0 | 0.0 | -0.099468 | 0.0 | 0.0 |
1435 | 0.0 | 0.0 | 0.0 | 0.0 | -0.187450 | 0.0 | 0.0 |
1438 | 0.0 | 0.0 | 0.0 | 0.0 | 0.225249 | 0.0 | 0.0 |
In [35]:
fig, ax = plt.subplots()
billets_tous_knnimputer[['margin_low']].iloc[billets_avec_nulls.index.to_list()].boxplot(ax=ax,showmeans=True,positions=[0])
donnees_sans_margin_low[['margin_low']].boxplot(ax=ax, showmeans=True, positions=[1])
ax.set_xticklabels(['Knnimputer', 'Regresion lineare'])
ax.set_ylabel('margin_low')
ax.set_title('Comparacion des distribuctons de margin_low avec differents methodes')
Out[35]:
Text(0.5, 1.0, 'Comparacion des distribuctons de margin_low avec differents methodes')
In [36]:
scaler = StandardScaler()
X = pd.DataFrame(scaler.fit_transform(billets_tous_linear), columns=billets_tous_linear.columns)
In [37]:
X['is_genuine']=billets_tous_linear['is_genuine']
In [38]:
XT.info() # afficher des informations sur les données
XT.describe(include='all') # afficher des statistiques descriptives sur les données
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1500 entries, 0 to 1499 Data columns (total 7 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 is_genuine 1500 non-null float64 1 diagonal 1500 non-null float64 2 height_left 1500 non-null float64 3 height_right 1500 non-null float64 4 margin_low 1463 non-null float64 5 margin_up 1500 non-null float64 6 length 1500 non-null float64 dtypes: float64(7) memory usage: 82.2 KB
Out[38]:
is_genuine | diagonal | height_left | height_right | margin_low | margin_up | length | |
---|---|---|---|---|---|---|---|
count | 1500.000000 | 1500.000000 | 1500.000000 | 1500.000000 | 1463.000000 | 1500.000000 | 1500.000000 |
mean | 0.666667 | 0.466213 | 0.511226 | 0.516576 | 0.384175 | 0.537484 | 0.644141 |
std | 0.471562 | 0.154921 | 0.172104 | 0.152877 | 0.169340 | 0.141349 | 0.176309 |
min | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
25% | 0.000000 | 0.360406 | 0.390805 | 0.417840 | 0.264031 | 0.439024 | 0.513131 |
50% | 1.000000 | 0.467005 | 0.517241 | 0.516432 | 0.339286 | 0.530488 | 0.701010 |
75% | 1.000000 | 0.573604 | 0.626437 | 0.624413 | 0.482143 | 0.634146 | 0.777778 |
max | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 |
DIVISION DES DONNÉES EN VALEURS (x) ET SI EST GENUINE OU PAS (y)¶
In [39]:
#est-ce une valeur continue (un nombre) ou bien une valeur discrète (une catégorie) ?
#Le premier cas est appelé une régression, le second une classification
#Depuis openclassrooms, Initiez-vous au Machine Learning
#alors je fais regression...
In [40]:
x = billets_tous_linear.drop('is_genuine', axis=1)
y = billets_tous_linear['is_genuine']
DIVISION DES DONNES EN DONNES POUR ENTREINER ET DE TEST¶
In [41]:
X_train_log, X_test_log, y_train_log, y_test_log = train_test_split(x, y, test_size=0.15, random_state=42)
MODELE REGRESION LOGISTIQUE¶
In [42]:
# Entreiner le model de regresion logistique
Logistique_Regression_model = LogisticRegression()
Logistique_Regression_model.fit(X_train_log, y_train_log)
y_pred_log = Logistique_Regression_model.predict(X_test_log)
In [43]:
# prediction:
y_pred_log = Logistique_Regression_model.predict(X_test_log)
#y_pred_log est ma prediction et y_test est ce que j'avais separé
y_pred_log_tous= Logistique_Regression_model.predict (billets_tous_linear.drop('is_genuine', axis=1))
y_log_tous=billets_tous_linear['is_genuine']
METRIQUES¶
In [44]:
#precision:
accuracy_log = accuracy_score(y_log_tous, y_pred_log_tous)
In [45]:
print("Precision:" ,accuracy_log)
#matriz de confusion también r score
Precision: 0.9913333333333333
In [46]:
print(classification_report(y_log_tous, y_pred_log_tous)
)
precision recall f1-score support False 0.99 0.98 0.99 500 True 0.99 1.00 0.99 1000 accuracy 0.99 1500 macro avg 0.99 0.99 0.99 1500 weighted avg 0.99 0.99 0.99 1500
In [47]:
Z = sm.add_constant(x)
model = sm.Logit(y, Z).fit()
print(model.summary())
Optimization terminated successfully. Current function value: 0.026349 Iterations 13 Logit Regression Results ============================================================================== Dep. Variable: is_genuine No. Observations: 1500 Model: Logit Df Residuals: 1493 Method: MLE Df Model: 6 Date: Tue, 16 May 2023 Pseudo R-squ.: 0.9586 Time: 12:18:23 Log-Likelihood: -39.524 converged: True LL-Null: -954.77 Covariance Type: nonrobust LLR p-value: 0.000 ================================================================================ coef std err z P>|z| [0.025 0.975] -------------------------------------------------------------------------------- const -230.4143 249.954 -0.922 0.357 -720.316 259.487 diagonal 0.2562 1.135 0.226 0.821 -1.969 2.481 height_left -1.2845 1.117 -1.150 0.250 -3.474 0.905 height_right -2.8941 1.143 -2.533 0.011 -5.134 -0.654 margin_low -6.0235 0.988 -6.099 0.000 -7.959 -4.088 margin_up -10.2986 2.206 -4.669 0.000 -14.622 -5.975 length 6.0561 0.897 6.751 0.000 4.298 7.814 ================================================================================ Possibly complete quasi-separation: A fraction 0.53 of observations can be perfectly predicted. This might indicate that there is complete quasi-separation. In this case some parameters will not be identified.
In [48]:
y_test_log.shape, y_pred_log.shape
Out[48]:
((225,), (225,))
COURBE ROC¶
In [49]:
# calculer courbe ROC
fpr, tpr, thresholds = roc_curve(y_log_tous, y_pred_log_tous)
# calculer AUC
auc = roc_auc_score(y_log_tous, y_pred_log_tous)
# grafique courbe ROC
plt.plot(fpr, tpr, label='AUC = {:.2f}'.format(auc))
plt.plot([0, 1], [0, 1], linestyle='--', color='grey')
plt.xlabel('Taux de faux positif')
plt.ylabel('Taux de vrai positif')
plt.title('Courbe ROC')
plt.legend()
plt.show()
MATRICE DE CONFUSION¶
In [50]:
cm = confusion_matrix(y_pred_log_tous, y_log_tous)
In [51]:
plt.matshow(cm, cmap=plt.cm.Blues)
plt.xlabel('Prediction')
plt.ylabel('Vrai')
plt.plot([-0.5, cm.shape[0]-0.5], [-0.5, cm.shape[1]-0.5], 'g-') # línea diagonal roja
#plt.plot([-0.5, cm.shape[0]-0.5], [cm.shape[1]-0.5, -0.5], 'r-') # línea diagonal verde
for i in range(cm.shape[0]):
for j in range(cm.shape[1]):
plt.text(x=j, y=i, s=cm[i, j], va='center', ha='center', fontsize=14, weight='bold')
plt.show()
SUPRESSION VARIABLES PAS SIGNIFICATIVES¶
In [52]:
# SANS P value >0.5
In [53]:
x = billets_tous_linear.drop('is_genuine', axis=1).drop('diagonal', axis=1).drop('height_left', axis=1)
y = billets_tous_linear['is_genuine']
In [54]:
X_train_log, X_test_log, y_train_log, y_test_log = train_test_split(x, y, test_size=0.15, random_state=42)
# Entreiner le model de regresion logistique
Logistique_Regression_model = LogisticRegression()
Logistique_Regression_model.fit(X_train_log, y_train_log)
y_pred_log = Logistique_Regression_model.predict(X_test_log)
# prediction:
y_pred_log = Logistique_Regression_model.predict(X_test_log)
#y_pred_log est ma prediction et y_test est ce que j'avais separé
y_pred_log_tous= Logistique_Regression_model.predict (x)
y_log_tous=billets_tous_linear['is_genuine']
#------METRIQUES
#precision:
accuracy_log = accuracy_score(y_log_tous, y_pred_log_tous)
In [55]:
print("Precision:" ,accuracy_log)
Precision: 0.99
In [56]:
print(classification_report(y_log_tous, y_pred_log_tous)
)
precision recall f1-score support False 0.99 0.98 0.98 500 True 0.99 1.00 0.99 1000 accuracy 0.99 1500 macro avg 0.99 0.99 0.99 1500 weighted avg 0.99 0.99 0.99 1500
In [57]:
Z = sm.add_constant(x)
model = sm.Logit(y, Z).fit()
print(model.summary())
Optimization terminated successfully. Current function value: 0.026815 Iterations 13 Logit Regression Results ============================================================================== Dep. Variable: is_genuine No. Observations: 1500 Model: Logit Df Residuals: 1495 Method: MLE Df Model: 4 Date: Tue, 16 May 2023 Pseudo R-squ.: 0.9579 Time: 12:18:23 Log-Likelihood: -40.223 converged: True LL-Null: -954.77 Covariance Type: nonrobust LLR p-value: 0.000 ================================================================================ coef std err z P>|z| [0.025 0.975] -------------------------------------------------------------------------------- const -285.3339 139.160 -2.050 0.040 -558.083 -12.585 height_right -3.3223 1.117 -2.975 0.003 -5.511 -1.134 margin_low -6.3080 0.963 -6.547 0.000 -8.196 -4.420 margin_up -10.4027 2.196 -4.737 0.000 -14.707 -6.098 length 6.1579 0.888 6.934 0.000 4.417 7.898 ================================================================================ Possibly complete quasi-separation: A fraction 0.55 of observations can be perfectly predicted. This might indicate that there is complete quasi-separation. In this case some parameters will not be identified.
In [58]:
# calculer courbe ROC
fpr, tpr, thresholds = roc_curve(y_log_tous, y_pred_log_tous)
# calculer AUC
auc = roc_auc_score(y_log_tous, y_pred_log_tous)
# grafique courbe ROC
plt.plot(fpr, tpr, label='AUC = {:.2f}'.format(auc))
plt.plot([0, 1], [0, 1], linestyle='--', color='grey')
plt.xlabel('Taux de faux positif')
plt.ylabel('Taux de vrai positif')
plt.title('Courbe ROC')
plt.legend()
plt.show()
ACP¶
In [59]:
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.15, random_state=42)
In [60]:
pca = PCA(n_components=2)
X_pca = pca.fit_transform(x)
In [61]:
plt.scatter(X_pca[y == 0][:, 0], X_pca[y == 0][:, 1], color='red', label='False (0)')
plt.scatter(X_pca[y == 1][:, 0], X_pca[y == 1][:, 1], color='blue', label='True (1)')
plt.xlabel('CP 1')
plt.ylabel('CP 2')
plt.title('Visualizacion 2D de 5 variables avec PCA')
#avec DONNÉES AVEC imputer regression lineare
plt.legend(loc='upper left')
plt.show()
In [62]:
pca = PCA()
pca.fit(x)
explained_variance_ratio = pca.explained_variance_ratio_
cumulative_explained_variance = np.cumsum(explained_variance_ratio)
plt.figure()
plt.plot(range(1, len(explained_variance_ratio) + 1), cumulative_explained_variance, marker='o')
plt.xlabel('Nombre de composantes principales')
plt.ylabel('Taux de la variance expliquée')
plt.show()
print("Nombre CP qu'on a besoin : ", np.argmax(cumulative_explained_variance >= 0.70) + 1, " (plus que 70% d'explication de variabilité)")
Nombre CP qu'on a besoin : 1 (plus que 70% d'explication de variabilité)
In [63]:
scree = (pca.explained_variance_ratio_*100).round(2)
scree_cum = scree.cumsum().round()
x_list = range(1, 5)
list(x_list)
plt.bar(x_list, scree)
plt.plot(x_list, scree_cum,c="red",marker='o')
plt.xlabel("rang de l'axe d'inertie")
plt.ylabel("pourcentage d'inertie")
plt.title("Eboulis des valeurs propres")
plt.show(block=False)
In [64]:
t, r = 0,1
fig, ax = plt.subplots(figsize=(10, 9))
for i in range(0, pca.components_.shape[1]):
ax.arrow(0,
0, # Start the arrow at the origin
pca.components_[0, i], #0 for PC1
pca.components_[1, i], #1 for PC2
head_width=0.07,
head_length=0.07,
width=0.02, )
plt.text(pca.components_[0, i] + 0.05,
pca.components_[1, i] + 0.05,
BC[i])
# affichage des lignes horizontales et verticales
plt.plot([-1, 1], [0, 0], color='grey', ls='--')
plt.plot([0, 0], [-1, 1], color='grey', ls='--')
# nom des axes, avec le pourcentage d'inertie expliqué
plt.xlabel('F{} ({}%)'.format(t+1, round(100*pca.explained_variance_ratio_[t],1)))
plt.ylabel('F{} ({}%)'.format(r+1, round(100*pca.explained_variance_ratio_[r],1)))
plt.title("Cercle des corrélations (CP{} et CP{})".format(t+1, r+1))
an = np.linspace(0, 2 * np.pi, 100)
plt.plot(np.cos(an), np.sin(an)) # Add a unit circle for scale
plt.axis('equal')
plt.show(block=False)
Nombre de ACP¶
In [65]:
pca = PCA(n_components=2)
X_pca = pca.fit_transform(billets_tous_linear[BC])
In [66]:
df_X_pca=pd.DataFrame(X_pca, index=X.index)
In [67]:
df_X_pca['is_genuine']=X['is_genuine']
In [68]:
sum(df_X_pca['is_genuine']==X['is_genuine'])
Out[68]:
1500
In [69]:
df_X_pca
Out[69]:
0 | 1 | is_genuine | |
---|---|---|---|
0 | 0.113911 | 0.237710 | True |
1 | -0.832936 | -0.377681 | True |
2 | -0.500740 | 0.167628 | True |
3 | -1.146882 | -0.216228 | True |
4 | -0.118929 | -0.460558 | True |
... | ... | ... | ... |
1495 | 1.624551 | -0.148959 | False |
1496 | 0.779278 | 0.382925 | False |
1497 | 2.437424 | -0.731457 | False |
1498 | 1.485415 | -0.103298 | False |
1499 | 1.471789 | -0.004096 | False |
1500 rows × 3 columns
In [70]:
x_pca = df_X_pca.drop('is_genuine', axis=1)
y = df_X_pca['is_genuine']
X_train_log, X_test_log, y_train_log, y_test_log = train_test_split(x_pca, y, test_size=0.20, random_state=42)
In [71]:
Logistique_Regression_model_acp = LogisticRegression()
Logistique_Regression_model_acp.fit(X_train_log, y_train_log)
y_pred_log = Logistique_Regression_model_acp.predict(X_test_log)
In [72]:
accuracy_score(y_test_log, y_pred_log)
Out[72]:
0.9433333333333334
In [73]:
accuracy_score(df_X_pca['is_genuine'], Logistique_Regression_model_acp.predict(df_X_pca.drop('is_genuine', axis=1)))
Out[73]:
0.9486666666666667
K-Means¶
In [74]:
X_kmeans=billets_tous_linear
y = X['is_genuine']
In [75]:
y= billets_tous_linear['is_genuine']
In [ ]:
In [76]:
X_train_kmeans, X_test_kmeans, y_train_kmeans, y_test_kmeans = train_test_split(x, y, test_size=0.15, random_state=72)
In [77]:
kmeans = KMeans(n_clusters=2)
kmeans.fit(x)
y_pred_kmeans = kmeans.fit_predict(x)
In [78]:
x['is_genuine_kmeans']=kmeans.labels_
In [79]:
x['is_genuine']=y
In [80]:
x=x.replace({0:2})
x=x.replace({1:0})
x=x.replace({2:1})
In [81]:
y_pred_tous=kmeans.predict(x[[#'diagonal', 'height_left',
'height_right', 'margin_low',
'margin_up', 'length']])
X_kmeans['is_genuine_kmeans']=y_pred_tous
In [82]:
y_pred_kmeans
Out[82]:
array([0, 0, 0, ..., 1, 1, 1], dtype=int32)
In [83]:
sum(X_kmeans['is_genuine']!=X_kmeans['is_genuine_kmeans'])
Out[83]:
1474
In [84]:
accuracy_score(kmeans.labels_, y)
Out[84]:
0.017333333333333333
In [85]:
accuracy_score(x['is_genuine_kmeans'], x['is_genuine'])
Out[85]:
0.9826666666666667
In [86]:
(y_test_kmeans.shape, y_pred_kmeans.shape)
Out[86]:
((225,), (1500,))
In [87]:
cm = confusion_matrix(x['is_genuine_kmeans'], x['is_genuine'])
In [88]:
print(cm)
[[476 2] [ 24 998]]
In [89]:
plt.matshow(cm, cmap=plt.cm.Blues)
plt.xlabel('Prediction')
plt.ylabel('Vrai')
plt.plot([-0.5, cm.shape[0]-0.5], [-0.5, cm.shape[1]-0.5], 'g-') # línea diagonal roja
#plt.plot([-0.5, cm.shape[0]-0.5], [cm.shape[1]-0.5, -0.5], 'r-') # línea diagonal verde
for i in range(cm.shape[0]):
for j in range(cm.shape[1]):
plt.text(x=j, y=i, s=cm[i, j], va='center', ha='center', fontsize=14, weight='bold')
plt.show()
#La diagonale vert représente les prédictions correctes.
#La croix (rouge) qui croise de la diagonale vert représente les prédictions incorrectes.
#Il y a 2 faux négatifs.
#Il y a 24 faux positifs.
#Il y a 998 vrai positifs
#Il y a 476 vrai négatifs
#[476 2]
#[24 998]
IMPORT Billets Production¶
In [90]:
billetsP = pd.read_csv('billets_production.csv', sep=',')
In [91]:
billetsP
Out[91]:
diagonal | height_left | height_right | margin_low | margin_up | length | id | |
---|---|---|---|---|---|---|---|
0 | 171.76 | 104.01 | 103.54 | 5.21 | 3.30 | 111.42 | A_1 |
1 | 171.87 | 104.17 | 104.13 | 6.00 | 3.31 | 112.09 | A_2 |
2 | 172.00 | 104.58 | 104.29 | 4.99 | 3.39 | 111.57 | A_3 |
3 | 172.49 | 104.55 | 104.34 | 4.44 | 3.03 | 113.20 | A_4 |
4 | 171.65 | 103.63 | 103.56 | 3.77 | 3.16 | 113.33 | A_5 |
In [92]:
billetsP.head()
Out[92]:
diagonal | height_left | height_right | margin_low | margin_up | length | id | |
---|---|---|---|---|---|---|---|
0 | 171.76 | 104.01 | 103.54 | 5.21 | 3.30 | 111.42 | A_1 |
1 | 171.87 | 104.17 | 104.13 | 6.00 | 3.31 | 112.09 | A_2 |
2 | 172.00 | 104.58 | 104.29 | 4.99 | 3.39 | 111.57 | A_3 |
3 | 172.49 | 104.55 | 104.34 | 4.44 | 3.03 | 113.20 | A_4 |
4 | 171.65 | 103.63 | 103.56 | 3.77 | 3.16 | 113.33 | A_5 |
In [93]:
billetsP.info()
billetsP.describe(include='all')
<class 'pandas.core.frame.DataFrame'> RangeIndex: 5 entries, 0 to 4 Data columns (total 7 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 diagonal 5 non-null float64 1 height_left 5 non-null float64 2 height_right 5 non-null float64 3 margin_low 5 non-null float64 4 margin_up 5 non-null float64 5 length 5 non-null float64 6 id 5 non-null object dtypes: float64(6), object(1) memory usage: 408.0+ bytes
Out[93]:
diagonal | height_left | height_right | margin_low | margin_up | length | id | |
---|---|---|---|---|---|---|---|
count | 5.000000 | 5.000000 | 5.000000 | 5.000000 | 5.000000 | 5.000000 | 5 |
unique | NaN | NaN | NaN | NaN | NaN | NaN | 5 |
top | NaN | NaN | NaN | NaN | NaN | NaN | A_1 |
freq | NaN | NaN | NaN | NaN | NaN | NaN | 1 |
mean | 171.954000 | 104.188000 | 103.972000 | 4.882000 | 3.238000 | 112.322000 | NaN |
std | 0.326542 | 0.396257 | 0.393027 | 0.836821 | 0.142724 | 0.897201 | NaN |
min | 171.650000 | 103.630000 | 103.540000 | 3.770000 | 3.030000 | 111.420000 | NaN |
25% | 171.760000 | 104.010000 | 103.560000 | 4.440000 | 3.160000 | 111.570000 | NaN |
50% | 171.870000 | 104.170000 | 104.130000 | 4.990000 | 3.300000 | 112.090000 | NaN |
75% | 172.000000 | 104.550000 | 104.290000 | 5.210000 | 3.310000 | 113.200000 | NaN |
max | 172.490000 | 104.580000 | 104.340000 | 6.000000 | 3.390000 | 113.330000 | NaN |
In [94]:
sns.pairplot(billetsP[['diagonal', 'height_left', 'height_right', 'margin_low', 'margin_up','length']])
#, hue='id')
plt.show()
In [95]:
scaler = StandardScaler()
XP = pd.DataFrame(scaler.fit_transform(billetsP[['diagonal', 'height_left', 'height_right', 'margin_low', 'margin_up',
'length']]
), columns=['diagonal', 'height_left', 'height_right', 'margin_low', 'margin_up',
'length'])
DONNEES StandardScale¶
In [96]:
XP
Out[96]:
diagonal | height_left | height_right | margin_low | margin_up | length | |
---|---|---|---|---|---|---|
0 | -0.664228 | -0.502224 | -1.228900 | 0.438224 | 0.485681 | -1.124014 |
1 | -0.287604 | -0.050787 | 0.449459 | 1.493702 | 0.564017 | -0.289103 |
2 | 0.157497 | 1.106022 | 0.904607 | 0.144293 | 1.190702 | -0.937094 |
3 | 1.835186 | 1.021377 | 1.046841 | -0.590533 | -1.629382 | 1.094107 |
4 | -1.040852 | -1.574388 | -1.172007 | -1.485686 | -0.611018 | 1.256104 |
In [97]:
sns.pairplot(XP[['diagonal', 'height_left', 'height_right', 'margin_low', 'margin_up',
'length']])
plt.show()
PREDICTION billets_production.csv¶
In [98]:
billetsP['is_genuine']=Logistique_Regression_model.predict(XP.drop('diagonal', axis=1).drop('height_left', axis=1))
In [99]:
billetsP
Out[99]:
diagonal | height_left | height_right | margin_low | margin_up | length | id | is_genuine | |
---|---|---|---|---|---|---|---|---|
0 | 171.76 | 104.01 | 103.54 | 5.21 | 3.30 | 111.42 | A_1 | False |
1 | 171.87 | 104.17 | 104.13 | 6.00 | 3.31 | 112.09 | A_2 | False |
2 | 172.00 | 104.58 | 104.29 | 4.99 | 3.39 | 111.57 | A_3 | False |
3 | 172.49 | 104.55 | 104.34 | 4.44 | 3.03 | 113.20 | A_4 | True |
4 | 171.65 | 103.63 | 103.56 | 3.77 | 3.16 | 113.33 | A_5 | True |
Kmeans billets_production¶
In [100]:
kmeans.predict(XP.drop('diagonal', axis=1).drop('height_left', axis=1))
Out[100]:
array([1, 1, 1, 1, 1], dtype=int32)
In [101]:
(XP.drop('diagonal', axis=1).drop('height_left', axis=1))
Out[101]:
height_right | margin_low | margin_up | length | |
---|---|---|---|---|
0 | -1.228900 | 0.438224 | 0.485681 | -1.124014 |
1 | 0.449459 | 1.493702 | 0.564017 | -0.289103 |
2 | 0.904607 | 0.144293 | 1.190702 | -0.937094 |
3 | 1.046841 | -0.590533 | -1.629382 | 1.094107 |
4 | -1.172007 | -1.485686 | -0.611018 | 1.256104 |
In [102]:
Logistique_Regression_model.predict(XP.drop('diagonal', axis=1).drop('height_left', axis=1))
Out[102]:
array([False, False, False, True, True])
TEST DE L’ALGORITHME¶
In [103]:
algoritme=pd.read_csv('billets_production.csv', sep=',')
In [104]:
scaler = StandardScaler()
TA = pd.DataFrame(scaler.fit_transform(algoritme[['diagonal', 'height_left', 'height_right', 'margin_low', 'margin_up',
'length']]
), columns=['diagonal', 'height_left', 'height_right', 'margin_low', 'margin_up',
'length'])
In [105]:
TA
Out[105]:
diagonal | height_left | height_right | margin_low | margin_up | length | |
---|---|---|---|---|---|---|
0 | -0.664228 | -0.502224 | -1.228900 | 0.438224 | 0.485681 | -1.124014 |
1 | -0.287604 | -0.050787 | 0.449459 | 1.493702 | 0.564017 | -0.289103 |
2 | 0.157497 | 1.106022 | 0.904607 | 0.144293 | 1.190702 | -0.937094 |
3 | 1.835186 | 1.021377 | 1.046841 | -0.590533 | -1.629382 | 1.094107 |
4 | -1.040852 | -1.574388 | -1.172007 | -1.485686 | -0.611018 | 1.256104 |
In [106]:
Logistique_Regression_model.predict(TA.drop('diagonal', axis=1).drop('height_left', axis=1))
Out[106]:
array([False, False, False, True, True])
In [ ]:
In [ ]:
In [ ]:
In [ ]: