import numpy as np
from sklearn import preprocessing
from sklearn.datasets import load_svmlight_file
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.neural_network import BernoulliRBM
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

# train_test_split moved to sklearn.model_selection in scikit-learn 0.18 and
# sklearn.cross_validation was removed in 0.20; fall back for old installs.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# Load the dataset, stored in svmlight / libsvm text format.
X, y = load_svmlight_file("dataset1.txt")

# load_svmlight_file returns X as a scipy.sparse CSR matrix; convert it to a
# dense numpy array because MinMaxScaler does not accept sparse input.
X = X.toarray()

# Split BEFORE any fitting (30% held out for testing) so that no statistic of
# the test set can influence preprocessing or training. The split is seeded
# so that the reported numbers are reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0)

# Model: scaling to [0, 1] -> RBM feature extraction -> linear SVM.
# The scaler is part of the pipeline so its min/max are learned from the
# training data only and then re-applied unchanged to the test data.
min_max_scaler = preprocessing.MinMaxScaler()
rbm = BernoulliRBM(random_state=0, verbose=True, n_components=128,
                   learning_rate=0.01)
svm = SVC(kernel='linear')
classifier = Pipeline(steps=[('scale', min_max_scaler),
                             ('rbm', rbm),
                             ('svm', svm)])

# Train scaler, RBM and SVM in sequence on the training portion.
classifier.fit(X_train, y_train)

# Predict labels for the held-out test data.
y_predict = classifier.predict(X_test)

# Report classification results on the test data (performance).
print(classification_report(y_test, y_predict))

# Report the confusion matrix.
print(confusion_matrix(y_test, y_predict))
