-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdiagnostics.py
More file actions
115 lines (87 loc) · 2.96 KB
/
diagnostics.py
File metadata and controls
115 lines (87 loc) · 2.96 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
"""
Python script meant to measure model and data diagnostics.
"""
import pandas as pd
import timeit
import os
import json
from joblib import load
from common_functions import preprocess_data
import subprocess
import logging
import sys
logging.basicConfig(stream=sys.stdout, level=logging.INFO)

# Load config.json and read the deployment and test-data directories from it.
# The file is decoded as UTF-8 explicitly so that parsing does not depend on
# the platform's locale encoding.
with open('config.json', 'r', encoding='utf-8') as f:
    config = json.load(f)

# Directory holding the deployed model artifacts.
model_path = os.path.join(config['prod_deployment_path'])
# Directory holding the test dataset(s).
test_data_path = os.path.join(config['test_data_path'])
def model_predictions(dataset_path=None):
    """
    Score a dataset with the deployed model.

    Loads "trainedmodel.pkl" and "encoder.pkl" from ``model_path``, reads the
    requested CSV from ``test_data_path``, runs it through
    ``preprocess_data`` and predicts on the resulting features.

    Args:
        dataset_path: CSV file name relative to ``test_data_path``. ``None``
            (the default) selects "testdata.csv".

    Returns:
        A ``(y_pred, df_y)`` tuple: the model's predictions and the second
        value returned by ``preprocess_data`` (presumably the true labels;
        confirm against common_functions).
    """
    # read the deployed model and its encoder
    model = load(os.path.join(model_path, "trainedmodel.pkl"))
    encoder = load(os.path.join(model_path, "encoder.pkl"))

    if dataset_path is None:
        dataset_path = "testdata.csv"

    df = pd.read_csv(os.path.join(test_data_path, dataset_path))
    df_x, df_y, _ = preprocess_data(df, encoder)

    y_pred = model.predict(df_x)
    return y_pred, df_y
def dataframe_summary():
    """
    Compute summary statistics for the numeric columns of testdata.csv.

    Returns:
        list: ``[column, statistic_name, value]`` entries, three per column
        in the order mean, median, standard deviation.
    """
    frame = pd.read_csv(os.path.join(test_data_path, "testdata.csv"))

    summary = []
    for name in (
        "lastmonth_activity",
        "lastyear_activity",
        "number_of_employees",
    ):
        series = frame[name]
        summary.extend([
            [name, "mean", series.mean()],
            [name, "median", series.median()],
            [name, "standard deviation", series.std()],
        ])
    return summary
def missing_data():
    """
    Report the share of missing values in each column of testdata.csv.

    Returns:
        str: string form of a list of ``[column, "<percent>%"]`` pairs, where
        the percentage is truncated to a whole number. A dataset with no rows
        reports "0%" for every column.
    """
    df = pd.read_csv(os.path.join(test_data_path, "testdata.csv"))
    # Missing plus non-missing values always add up to the row count, so it
    # is computed once instead of per column.
    total_rows = len(df)

    result = []
    for column in df.columns:
        count_na = int(df[column].isna().sum())
        # Guard the empty dataset: 0/0 would yield NaN, which int() rejects.
        percent = int(count_na / total_rows * 100) if total_rows else 0
        result.append([column, str(percent) + "%"])
    return str(result)
def execution_time():
    """
    Time how long training.py and ingestion.py take to run.

    Each script is launched with ``python3`` through ``os.system`` and timed
    with ``timeit.default_timer``. A non-zero exit status is logged as a
    warning, because the measured time then does not describe a successful
    run; the timing is still recorded.

    Returns:
        str: string form of a list of ``[script_name, seconds]`` pairs, in the
        order training.py, ingestion.py.
    """
    result = []
    for procedure in ("training.py", "ingestion.py"):
        starttime = timeit.default_timer()
        status = os.system('python3 %s' % procedure)
        timing = timeit.default_timer() - starttime

        if status != 0:
            logging.warning(
                "%s exited with status %s; its timing may not reflect a "
                "complete run",
                procedure,
                status,
            )
        result.append([procedure, timing])
    return str(result)
def outdated_packages_list():
    """
    List installed packages that have a newer version available.

    Runs ``pip list --outdated`` and returns its output as text.

    Returns:
        str: the decoded standard output of the pip command.

    Raises:
        subprocess.CalledProcessError: if pip exits with a non-zero status.
    """
    # sys.stdout can be replaced by an object without a usable ``encoding``
    # (or be None); fall back to UTF-8 rather than failing in decode().
    encoding = getattr(sys.stdout, "encoding", None) or "utf-8"
    raw_output = subprocess.check_output(['pip', 'list', '--outdated'])
    return raw_output.decode(encoding)
if __name__ == '__main__':
    # Run every diagnostic once, in order, when executed as a script.
    logging.info("Running diagnostics!")
    model_predictions(None)
    for diagnostic in (
        execution_time,
        dataframe_summary,
        missing_data,
        outdated_packages_list,
    ):
        diagnostic()