Here I am again with Machine Learning! This time I’ve achieved a great result though (for me at least!). By using another great dataset from UCI I was able to write a decent ML script which scored 95% in the testing part! I am really satisfied with the result.
Here is a sample of what the script should be able to read (in the example the number 9):
Some numbers, like the one above, were clear; others were not so clear, since they were handwritten and then somehow (I do not know how) converted into digital images.
I had a hard time figuring out how the attributes in the dataset were encoded, but in the end I managed to work it out! I guess putting together such a dataset was really long and boring work.
Anyway here is my script and below you can find the result of the test on the last 50 numbers or so.
import matplotlib.pyplot as plt | |
import numpy as np | |
import os | |
# Import of support vector machine (svm) | |
from sklearn import svm | |
"""--------------------------------SETTINGS---------------------------------""" | |
# Load data from .txt file
os.chdir("C:\\")
# The with-block closes the file even when read() raises, so the handle
# can no longer leak on an I/O error
with open("semeion.txt","r") as file:
    data = file.read()
# Get index max (number of rows in the data)
def getIndexMax(data):
    """Return the number of newline-separated rows contained in `data`.

    Every newline character opens one more row, so the result is always at
    least 1; text that ends with a newline therefore counts one final,
    empty row.
    """
    newlineCount = data.count("\n")
    return newlineCount + 1
# Prepare data for fitting function
# answers:  if True, returns answers (labels)
# training: if True, returns training samples (with no answers, only samples)
# last:     if True, returns only last training sample/answer. Useful for testing
def returnDataToUse(data,index,answers=False,training=False,last=False):
    """Extract samples or labels from the first `index` rows of `data`.

    Each row is a whitespace-separated list of numbers: the first 256 values
    are read as the sample, the remaining values as a one-hot label whose
    position of the 1 is the answer.

    Parameters:
        data:     the whole dataset as one string, rows separated by newlines.
        index:    how many rows (counted from the first) to use. Row 0 is
                  always used, so any value below 1 behaves like 1.
        answers:  select the labels (set exactly one of answers/training).
        training: select the samples (set exactly one of answers/training).
        last:     return only the final selected row, i.e. row `index - 1`.

    Returns:
        answers:  1-D numpy array of labels, or a single label if `last`.
        training: 2-D numpy array with one 256-value row per sample, or a
                  single 1-D row if `last`.
        0 (after printing a message) when `index` exceeds the number of rows.
        None when neither or both of answers/training are set.
    """
    dataSplitted = data.split("\n")
    # Check that the index is not bigger than our dataset
    if index > len(dataSplitted):
        print("Index out of bounds, index max:",len(dataSplitted))
        return 0
    # Exactly one of the two modes has to be selected
    if bool(answers) == bool(training):
        return None
    # Row 0 is always part of the result
    rowCount = max(index, 1)
    # When only the last row is wanted there is no need to parse the others
    if last:
        wantedRows = dataSplitted[rowCount-1:rowCount]
    else:
        wantedRows = dataSplitted[:rowCount]
    # split() without argument ignores the blank at the end of a row and
    # also copes with rows that do not have one
    fields = [line.split() for line in wantedRows]
    if answers:
        # The label is the position of the 1 in the one-hot tail of the row
        result = np.array([[int(k) for k in f[256:]].index(1) for f in fields])
    else:
        # Build the matrix in one go instead of stacking it row by row
        result = np.array([[float(k) for k in f[:256]] for f in fields])
    if last:
        return result[-1]
    return result
# This function displays the image of the number (sample at row x)
# and prints the answer the predictor should give us back
def displayImage(data,row):
    """Print the label of row `row` and plot its sample as a 16x16 image.

    Parameters:
        data: the whole dataset as one string, rows separated by newlines.
        row:  zero-based position of the row to show.

    The first 256 values of the row are the pixels, the remaining values
    are the one-hot label. The one-hot list and the digit it encodes are
    printed, then the image is shown with matplotlib.
    """
    # Split each row and get the 'rowth' one
    dataSplitted = data.split("\n")
    strings = dataSplitted[row]
    # Split row into numbers(string); split() ignores the blank at the end
    stringsSplitted = strings.split()
    # Get target and convert it into numbers
    rispInt = [int(i) for i in stringsSplitted[256:]]
    # Print original data and number to guess in readable format
    print(rispInt)
    print("Number to guess:",rispInt.index(1),"\n")
    # Training array converted into float numbers
    trainingFloat = [float(i) for i in stringsSplitted[:256]]
    # Building 16x16 (image) array: consecutive runs of 16 values become the
    # image rows. Reshaping avoids stacking the first run twice, which would
    # give a 17x16 image with a duplicated top row.
    img = np.array(trainingFloat).reshape(16,16)
    # Plot image
    plt.imshow(img,cmap=plt.cm.gray_r,interpolation="nearest")
    plt.show()
"""------------------------------TRAINING------------------------------------""" | |
# SET THE NUMBER OF TRAINING SAMPLES (the first trainingSamples rows are used)
trainingSamples = 1500
# gamma: coefficient of the RBF kernel (SVC's default kernel)
# C: regularization parameter
clf = svm.SVC(gamma=0.01, C=100)
# Index max
print("Maximum index:",getIndexMax(data),"\n")
answerArray = returnDataToUse(data,trainingSamples,answers=True)
trainingAr = returnDataToUse(data,trainingSamples,training=True)
x,y = trainingAr,answerArray
#Fit the data
print("Training...")
clf.fit(x,y)
"""------------------------------Sample prediction--------------------------"""
# CHOOSE AN EXAMPLE TO PREDICT
# With last=True, returnDataToUse gives back the last of the first `example`
# rows, i.e. the row at zero-based position example-1
example = 1555
predictQ = returnDataToUse(data,example,training=True,last=True)
predictA = returnDataToUse(data,example,answers=True,last=True)
# predict() wants a 2-D array with one row per sample, so the single
# sample is reshaped into a 1 x 256 matrix
print("Prediction:",clf.predict(predictQ.reshape(1,-1)))
print("Actual answer",predictA,"\n")
# Display the actual image: the same row that was just predicted
displayImage(data,example-1)
"""------------------------------Testing Function----------------------------"""
# Actual testing on residual samples: the rows from position `example` on,
# none of which was used for training
print("Testing...")
# Parse the rows once and slice off the test part, instead of parsing the
# whole file again for every single test sample.
# NOTE: the very last row of the file is left out of the evaluation, exactly
# as before, so the statistics stay comparable with earlier runs.
lastIndex = getIndexMax(data)-1
testQ = returnDataToUse(data,lastIndex,training=True)[example:]
testA = returnDataToUse(data,lastIndex,answers=True)[example:]
correct = 0
wrong = 0
if len(testA) > 0:
    predicted = clf.predict(testQ)
    correct = int((predicted == testA).sum())
    wrong = len(testA)-correct
    print("Statistics, correct answers:",correct/(correct+wrong))
else:
    # Nothing to divide by: avoid a ZeroDivisionError
    print("No samples left for testing")
This time I got an 89% success rate! Pretty good, I guess! I wonder whether I could train Python to recognize other things, maybe faces or something else! Well, first of all I have to figure out how to convert a picture into readable numpy arrays. Readable for Python, of course!! If you have any suggestions, please do leave a comment!
# >>> ================================ RESTART ================================ | |
# >>> | |
# Maximum index: 1593 | |
# | |
# Training... | |
# Prediction: [9] | |
# Actual answer 9 | |
# | |
# [0, 0, 0, 0, 0, 0, 0, 0, 0, 1] | |
# Number to guess: 9 | |
# | |
# Testing... | |
# Statistics, correct answers: 0.8918918918918919 | |
# >>> |
Here below is the citation of the source where I found the dataset “Semeion Handwritten Digits Data Set”:
Bache, K. & Lichman, M. (2013). UCI Machine Learning Repository [http://archive.ics.uci.edu/ml]. Irvine, CA: University of California, School of Information and Computer Science.
and
Semeion Research Center of Sciences of Communication, via Sersale 117, 00128 Rome, Italy
Tattile, Via Gaetano Donizetti 1-3-5, 25030 Mairano (Brescia), Italy.
Hope this was interesting!