School/Bigdata
[빅데이터처리]필기체 인식 오류율
0lynny
2022. 2. 8. 20:44
import sys
from os import listdir
import numpy as np
import operator
def createDataSet(dirname):
    """Load every sample file in ``dirname`` into a matrix plus a label list.

    Each file name is expected to start with an integer label followed by an
    underscore (the text before the first ``'_'`` is parsed with ``int``).
    Each file is converted to a 1024-value row by ``imgVector``.

    Args:
        dirname: Directory holding the sample files.

    Returns:
        A ``(matrix, labels)`` tuple: ``matrix`` is a float array with one
        1024-column row per file, and ``labels`` is the list of integer
        labels in the same order as the rows (the order ``listdir`` returns).
    """
    fileNames = listdir(dirname)
    matrix = np.zeros((len(fileNames), 1024))
    labels = []
    for row, name in enumerate(fileNames):
        # The label is parsed before the file is read, so a badly named
        # file fails before any I/O happens for it.
        labels.append(int(name.split('_')[0]))
        matrix[row, :] = imgVector(dirname + '/' + name)
    return matrix, labels
def classify0(inX, dataSet, labels, k):
    """Classify ``inX`` by majority vote among its nearest training rows.

    The Euclidean distance from ``inX`` to every row of ``dataSet`` is
    computed, the rows are ranked by distance, and the labels of the ``k``
    closest rows are counted. The most frequent label wins.

    Args:
        inX: Query vector; either 1-D of length ``d`` or shaped ``(1, d)``.
        dataSet: 2-D array with one training sample per row (``(n, d)``).
        labels: Sequence of ``n`` labels, ``labels[i]`` belonging to row ``i``.
        k: Number of neighbours that vote. Values larger than ``n`` are
            clamped to ``n`` so that every training row votes once.

    Returns:
        The label with the highest vote count. On a tie the label that was
        encountered first while walking the neighbours in ranked order is
        returned (``sorted`` is stable, also with ``reverse=True``).

    Raises:
        IndexError: If ``k`` is smaller than 1 or ``dataSet`` has no rows,
            because there is then no vote to return.
    """
    dataSetSize = dataSet.shape[0]

    # Broadcasting subtracts every training row from the query without
    # materialising the tiled copy of inX that np.tile would allocate.
    diffMat = np.asarray(inX) - dataSet
    sqDistances = (diffMat ** 2).sum(axis=1)
    distances = sqDistances ** 0.5
    sortedDistIndicies = distances.argsort()

    # Never ask for more neighbours than there are training rows; indexing
    # past the end of the ranking used to raise IndexError.
    neighbourCount = min(k, dataSetSize)

    classCount = {}
    for i in range(neighbourCount):
        voteIlabel = labels[sortedDistIndicies[i]]
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1

    sortedClassCount = sorted(
        classCount.items(), key=operator.itemgetter(1), reverse=True
    )
    return sortedClassCount[0][0]
def imgVector(filename):
    """Read a 32x32 text image of digits into a ``(1, 1024)`` float vector.

    The first 32 lines of the file are read and the first 32 characters of
    each line are converted with ``int``. The character at line ``i``,
    column ``j`` is stored at index ``32 * i + j`` (row-major order).

    Args:
        filename: Path of the text file to read.

    Returns:
        A NumPy array of shape ``(1, 1024)`` holding the parsed values.

    Raises:
        IndexError: If a line has fewer than 32 characters.
        ValueError: If one of the first 32 characters of a line is not a
            digit (this includes a newline reached on a short line).
    """
    vector = np.zeros((1, 1024))
    # The context manager closes the file even when parsing fails; the
    # handle was previously left open.
    with open(filename) as f:
        for i in range(32):
            line = f.readline()
            for j in range(32):
                vector[0, 32 * i + j] = int(line[j])
    return vector
# Script entry: evaluate the kNN classifier for k = 1..20.
# Usage: python <script> <training_dir> <test_dir>
# For each k, one line is printed: the integer (truncated) percentage of
# test files whose predicted label differs from the label in the file name.
if len(sys.argv) < 3:
    sys.exit("usage: " + sys.argv[0] + " <training_dir> <test_dir>")

trainingFileName = sys.argv[1]
testFileName = sys.argv[2]
testFileList = listdir(testFileName)
length = len(testFileList)
if length == 0:
    # Without test files the error rate would divide by zero.
    sys.exit("no test files found in " + testFileName)

matrix, labels = createDataSet(trainingFileName)

# Parse every test file exactly once; the labels and vectors do not depend
# on k, so re-reading them on each of the 20 iterations was wasted I/O.
testAnswers = [int(name.split('_')[0]) for name in testFileList]
testVectors = [imgVector(testFileName + '/' + name) for name in testFileList]

for k in range(1, 21):
    dataerrorCount = 0
    for answer, testData in zip(testAnswers, testVectors):
        classifiedResult = classify0(testData, matrix, labels, k)
        if answer != classifiedResult:
            dataerrorCount += 1
    print(int(dataerrorCount / length * 100))