from __future__ import print_function
import matplotlib.pyplot as plt

import numpy as np
import os
import pickle as pickle
import time
import sys
import h2o

# Config the matplotlib backend as plotting inline in IPython
%matplotlib inline

from __future__ import print_function
import matplotlib.pyplot as plt

import numpy as np
import os
import pickle as pickle
import time
import sys
import h2o

# Config the matplotlib backend as plotting inline in IPython
%matplotlib inline

# Work from the directory that holds the pickled notMNIST splits.
os.chdir('d:/Data/Gdeeplearning-Udacity')
pickle_file = 'notMNIST.pickle'

# NOTE: pickle.load can execute arbitrary code; only open trusted files.
with open(pickle_file, 'rb') as f:
  splits = pickle.load(f)

# Unpack the three splits (data + labels) from the loaded dictionary.
train_dataset, train_labels = splits['train_dataset'], splits['train_labels']
valid_dataset, valid_labels = splits['valid_dataset'], splits['valid_labels']
test_dataset, test_labels = splits['test_dataset'], splits['test_labels']
del splits  # drop the container so only the individual arrays stay referenced

# Report the shape of every split as a quick sanity check.
print('Training set', train_dataset.shape, train_labels.shape)
print('Validation set', valid_dataset.shape, valid_labels.shape)
print('Test set', test_dataset.shape, test_labels.shape)

# Work from the directory that holds the pickled notMNIST splits.
os.chdir('d:/Data/Gdeeplearning-Udacity')
pickle_file = 'notMNIST.pickle'

# NOTE: pickle.load can execute arbitrary code; only open trusted files.
with open(pickle_file, 'rb') as f:
  splits = pickle.load(f)

# Unpack the three splits (data + labels) from the loaded dictionary.
train_dataset, train_labels = splits['train_dataset'], splits['train_labels']
valid_dataset, valid_labels = splits['valid_dataset'], splits['valid_labels']
test_dataset, test_labels = splits['test_dataset'], splits['test_labels']
del splits  # drop the container so only the individual arrays stay referenced

# Report the shape of every split as a quick sanity check.
print('Training set', train_dataset.shape, train_labels.shape)
print('Validation set', valid_dataset.shape, valid_labels.shape)
print('Test set', test_dataset.shape, test_labels.shape)

GLM on H2O.ai

# Connect to an H2O server; the captured log shows a local one being started
# when no running instance is found at localhost:54321.
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321..... not found.
Attempting to start a local H2O server...
; OpenJDK 64-Bit Server VM (Zulu 8.17.0.3-win64) (build 25.102-b14, mixed mode)
  Starting server from D:\Anaconda3\h2o_jar\h2o.jar
  Ice root: C:\Users\Public\Documents\Wondershare\CreatorTemp\tmp73wyrjxi
  JVM stdout: C:\Users\Public\Documents\Wondershare\CreatorTemp\tmp73wyrjxi\h2o_e2its_started_from_python.out
  JVM stderr: C:\Users\Public\Documents\Wondershare\CreatorTemp\tmp73wyrjxi\h2o_e2its_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321... successful.
Warning: Your H2O cluster version is too old (3 months and 9 days)! Please download and install the latest version from http://h2o.ai/download/

# Connect to an H2O server; the captured log shows a local one being started
# when no running instance is found at localhost:54321.
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321..... not found.
Attempting to start a local H2O server...
; OpenJDK 64-Bit Server VM (Zulu 8.17.0.3-win64) (build 25.102-b14, mixed mode)
  Starting server from D:\Anaconda3\h2o_jar\h2o.jar
  Ice root: C:\Users\Public\Documents\Wondershare\CreatorTemp\tmp73wyrjxi
  JVM stdout: C:\Users\Public\Documents\Wondershare\CreatorTemp\tmp73wyrjxi\h2o_e2its_started_from_python.out
  JVM stderr: C:\Users\Public\Documents\Wondershare\CreatorTemp\tmp73wyrjxi\h2o_e2its_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321... successful.
Warning: Your H2O cluster version is too old (3 months and 9 days)! Please download and install the latest version from http://h2o.ai/download/

def reformat(dataset, labels):
  """Flatten every sample into one row and turn labels into a string column.

  Args:
    dataset: NumPy array whose first axis indexes samples; all remaining
      axes are collapsed into a single feature axis.
    labels: NumPy array holding one label per sample.

  Returns:
    A ``(dataset, labels)`` tuple where ``dataset`` is float32 with shape
    ``(n_samples, n_features)`` and ``labels`` is a string array with shape
    ``(n_samples, 1)``.
  """
  # Take the feature count from the data itself instead of a module-level
  # `image_size`, which is not defined anywhere in this file.
  n_samples = dataset.shape[0]
  n_features = int(np.prod(dataset.shape[1:]))
  dataset = dataset.reshape((n_samples, n_features)).astype(np.float32)
  # Labels stay a single column of strings (no one-hot encoding) —
  # presumably so H2O treats them as categorical; TODO confirm.
  labels = labels.reshape((-1, 1)).astype(str)
  return dataset, labels

# Run reformat() on each split; the tf_* names receive the reshaped float32
# data and the string label column for train, validation and test.
tf_train_dataset, tf_train_labels = reformat(train_dataset, train_labels)
tf_valid_dataset, tf_valid_labels = reformat(valid_dataset, valid_labels)
tf_test_dataset, tf_test_labels = reformat(test_dataset, test_labels)

def reformat(dataset, labels):
  """Flatten every sample into one row and turn labels into a string column.

  Args:
    dataset: NumPy array whose first axis indexes samples; all remaining
      axes are collapsed into a single feature axis.
    labels: NumPy array holding one label per sample.

  Returns:
    A ``(dataset, labels)`` tuple where ``dataset`` is float32 with shape
    ``(n_samples, n_features)`` and ``labels`` is a string array with shape
    ``(n_samples, 1)``.
  """
  # Take the feature count from the data itself instead of a module-level
  # `image_size`, which is not defined anywhere in this file.
  n_samples = dataset.shape[0]
  n_features = int(np.prod(dataset.shape[1:]))
  dataset = dataset.reshape((n_samples, n_features)).astype(np.float32)
  # Labels stay a single column of strings (no one-hot encoding) —
  # presumably so H2O treats them as categorical; TODO confirm.
  labels = labels.reshape((-1, 1)).astype(str)
  return dataset, labels

# Run reformat() on each split; the tf_* names receive the reshaped float32
# data and the string label column for train, validation and test.
tf_train_dataset, tf_train_labels = reformat(train_dataset, train_labels)
tf_valid_dataset, tf_valid_labels = reformat(valid_dataset, valid_labels)
tf_test_dataset, tf_test_labels = reformat(test_dataset, test_labels)

# Build an H2OFrame from the reformatted training array.
# NOTE(review): only the data array is passed here, yet later code uses the
# frame's last column as the response. The two parse-progress bars in the
# captured output suggest a second frame (labels?) was also built in the
# original cell — confirm that the labels end up in this frame.
train_dataset_h2o = h2o.H2OFrame(python_obj=tf_train_dataset)

Parse progress: |█████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%

# Build an H2OFrame from the reformatted training array.
# NOTE(review): only the data array is passed here, yet later code uses the
# frame's last column as the response. The two parse-progress bars in the
# captured output suggest a second frame (labels?) was also built in the
# original cell — confirm that the labels end up in this frame.
train_dataset_h2o = h2o.H2OFrame(python_obj=tf_train_dataset)

Parse progress: |█████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%

# Build an H2OFrame from the reformatted validation array.
# NOTE(review): the captured output shows four parse-progress bars for this
# cell, so the original cell presumably created more frames than the one
# visible here (validation labels, test data/labels?) — confirm.
valid_dataset_h2o = h2o.H2OFrame(python_obj=tf_valid_dataset)

Parse progress: |█████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%

# Build an H2OFrame from the reformatted validation array.
# NOTE(review): the captured output shows four parse-progress bars for this
# cell, so the original cell presumably created more frames than the one
# visible here (validation labels, test data/labels?) — confirm.
valid_dataset_h2o = h2o.H2OFrame(python_obj=tf_valid_dataset)

Parse progress: |█████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%

# Drop this module's references to the original splits by rebinding each
# name to its own fresh empty list.
train_dataset, train_labels = [], []
test_dataset, test_labels = [], []
valid_dataset, valid_labels = [], []

# Drop this module's references to the original splits by rebinding each
# name to its own fresh empty list.
train_dataset, train_labels = [], []
test_dataset, test_labels = [], []
valid_dataset, valid_labels = [], []

from h2o.estimators.glm import H2OGeneralizedLinearEstimator

from h2o.estimators.glm import H2OGeneralizedLinearEstimator

# Multinomial GLM configured with alpha=0.0 and lambda_=0.001 (reported as
# Ridge in the captured model summary), scored with 4-fold cross-validation.
glm_multi_v1 = H2OGeneralizedLinearEstimator(
    model_id='glm_v1',
    family='multinomial',
    solver='AUTO',
    nfolds=4,
    alpha=0.0,
    lambda_=0.001,
    score_each_iteration=True,
    early_stopping=True,
    seed=1234,
)

# Multinomial GLM configured with alpha=0.0 and lambda_=0.001 (reported as
# Ridge in the captured model summary), scored with 4-fold cross-validation.
glm_multi_v1 = H2OGeneralizedLinearEstimator(
    model_id='glm_v1',
    family='multinomial',
    solver='AUTO',
    nfolds=4,
    alpha=0.0,
    lambda_=0.001,
    score_each_iteration=True,
    early_stopping=True,
    seed=1234,
)

# Every column but the last is a predictor; the last column is the response.
# NOTE(review): this assumes the label column is the final column of
# train_dataset_h2o — confirm against how the frame is built.
predictors_X, results_y = (
    train_dataset_h2o.col_names[:-1],
    train_dataset_h2o.col_names[-1],
)

# Every column but the last is a predictor; the last column is the response.
# NOTE(review): this assumes the label column is the final column of
# train_dataset_h2o — confirm against how the frame is built.
predictors_X, results_y = (
    train_dataset_h2o.col_names[:-1],
    train_dataset_h2o.col_names[-1],
)

# Train the GLM and report the wall-clock time it took.
start = time.time()
# NOTE(review): the original statement was cut off after `y=results_y,`
# (dangling line continuation). The frame arguments are reconstructed from
# the frame names used in this file and from the captured model output,
# which reports both training and validation metrics — confirm.
glm_multi_v1.train(
    x=predictors_X,
    y=results_y,
    training_frame=train_dataset_h2o,
    validation_frame=valid_dataset_h2o,
)
# The captured output shows a line of the form "time <seconds>".
print('time', time.time() - start)

glm Model Build progress: |███████████████████████████████████████████████| 100%
time 853.3969919681549

# Train the GLM and report the wall-clock time it took.
start = time.time()
# NOTE(review): the original statement was cut off after `y=results_y,`
# (dangling line continuation). The frame arguments are reconstructed from
# the frame names used in this file and from the captured model output,
# which reports both training and validation metrics — confirm.
glm_multi_v1.train(
    x=predictors_X,
    y=results_y,
    training_frame=train_dataset_h2o,
    validation_frame=valid_dataset_h2o,
)
# The captured output shows a line of the form "time <seconds>".
print('time', time.time() - start)

glm Model Build progress: |███████████████████████████████████████████████| 100%
time 853.3969919681549

glm_multi_v1

Model Details
=============
H2OGeneralizedLinearEstimator :  Generalized Linear Modeling
Model Key:  glm_v1
GLM Model: summary

ModelMetricsMultinomialGLM: glm
** Reported on train data. **

MSE: 0.16822519190326915
RMSE: 0.4101526446376631

ModelMetricsMultinomialGLM: glm
** Reported on validation data. **

MSE: 0.17262832959326135
RMSE: 0.41548565509926016

ModelMetricsMultinomialGLM: glm
** Reported on cross-validation data. **

MSE: 0.1718545212776373
RMSE: 0.41455339979022887
Cross-Validation Metrics Summary:

Scoring History:

See the whole table with table.as_data_frame()

glm_multi_v1

Model Details
=============
H2OGeneralizedLinearEstimator :  Generalized Linear Modeling
Model Key:  glm_v1
GLM Model: summary

ModelMetricsMultinomialGLM: glm
** Reported on train data. **

MSE: 0.16822519190326915
RMSE: 0.4101526446376631

ModelMetricsMultinomialGLM: glm
** Reported on validation data. **

MSE: 0.17262832959326135
RMSE: 0.41548565509926016

ModelMetricsMultinomialGLM: glm
** Reported on cross-validation data. **

MSE: 0.1718545212776373
RMSE: 0.41455339979022887
Cross-Validation Metrics Summary:

Scoring History:

See the whole table with table.as_data_frame()

glm prediction progress: |████████████████████████████████████████████████| 100%
Train Accuracy: 0.836965
glm prediction progress: |████████████████████████████████████████████████| 100%
Valid Accuracy: 0.8332
glm prediction progress: |████████████████████████████████████████████████| 100%
Test Accuracy: 0.8986

glm prediction progress: |████████████████████████████████████████████████| 100%
Train Accuracy: 0.836965
glm prediction progress: |████████████████████████████████████████████████| 100%
Valid Accuracy: 0.8332
glm prediction progress: |████████████████████████████████████████████████| 100%
Test Accuracy: 0.8986


El algoritmo, desde el punto de vista de recursos, presenta un comportamiento mucho mejor que el de las pruebas realizadas anteriormente. El consumo de memoria es muy bueno (2,6 GB) y consigue aprovechar el 100 % del rendimiento de la CPU de la máquina.


El algoritmo, desde el punto de vista de recursos, presenta un comportamiento mucho mejor que el de las pruebas realizadas anteriormente. El consumo de memoria es muy bueno (2,6 GB) y consigue aprovechar el 100 % del rendimiento de la CPU de la máquina.

H2O cluster uptime:	06 secs
H2O cluster version:	3.10.0.9
H2O cluster version age:	3 months and 9 days !!!
H2O cluster name:	H2O_from_python_e2its_gbwxm1
H2O cluster total nodes:	1
H2O cluster free memory:	3.530 Gb
H2O cluster total cores:	8
H2O cluster allowed cores:	8
H2O cluster status:	accepting new members, healthy
H2O connection url:	http://127.0.0.1:54321
H2O connection proxy:	None
Python version:	3.5.2 final

	family	link	regularization	number_of_predictors_total	number_of_active_predictors	number_of_iterations	training_frame
	multinomial	multinomial	Ridge ( lambda = 0.001 )	7850	7840	50	py_1_sid_a4c1

	mean	sd	cv_1_valid	cv_2_valid	cv_3_valid	cv_4_valid
accuracy	0.8328426	0.0012030	0.8338638	0.8344597	0.8330132	0.8300337
err	0.1671574	0.0012030	0.1661362	0.1655403	0.1669868	0.1699663
err_count	8357.75	52.34113	8299.0	8304.0	8346.0	8482.0
logloss	0.6341491	0.0043168	0.6347954	0.6262802	0.6322534	0.6432675
max_per_class_error	0.2134856	0.0013493	0.2154245	0.2122186	0.2110312	0.2152681
mean_per_class_accuracy	0.8328531	0.0012293	0.8338676	0.8345117	0.8330554	0.8299777
mean_per_class_error	0.1671469	0.0012293	0.1661324	0.1654883	0.1669446	0.1700222
mse	0.1718562	0.0007704	0.1714213	0.1706311	0.17177	0.1736027
null_deviance	230261.66	318.8281	230044.92	231011.97	230169.4	229820.34
r2	0.9791685	0.0001137	0.9792296	0.979303	0.9792477	0.9788939
residual_deviance	63413.84	354.82706	63419.875	62832.19	63200.05	64203.246
rmse	0.4145534	0.0009282	0.4140305	0.4130752	0.4144514	0.4166565

	timestamp	duration	iteration	negative_log_likelihood	objective
	2017-02-12 21:29:41	0.000 sec	0	460517.0185988	2.3025851
	2017-02-12 21:29:43	2.443 sec	1	230975.2526532	1.1548763
	2017-02-12 21:29:46	5.043 sec	2	182336.1420963	0.9116807
	2017-02-12 21:29:49	8.408 sec	3	162076.0761888	0.8103804
	2017-02-12 21:29:52	11.771 sec	4	149946.0303992	0.7497302
---	---	---	---	---	---
	2017-02-12 21:32:11	2 min 30.693 sec	46	123698.3410792	0.6184917
	2017-02-12 21:32:15	2 min 34.064 sec	47	123610.0715447	0.6180504
	2017-02-12 21:32:18	2 min 37.420 sec	48	123565.0645532	0.6178253
	2017-02-12 21:32:21	2 min 40.786 sec	49	123486.6772247	0.6174334
	2017-02-12 21:32:25	2 min 44.153 sec	50	123384.3931645	0.6169220

martes, 11 de abril de 2017

Clasificador MultiClase - H2O.ai (General Linear Model)

H2O.ai 3.10

Inicialización

Lo primero que vamos a hacer es cargar las librerías y módulos necesarios.

GLM on H2O.ai

Experimentos

Desde el punto de vista de rendimiento

El algoritmo, desde el punto de vista de recursos, presenta un comportamiento mucho mejor que el de las pruebas realizadas anteriormente. El consumo de memoria es muy bueno (2,6 GB) y consigue aprovechar el 100 % del rendimiento de la CPU de la máquina.

Conclusión