School/Bigdata

[빅데이터처리]필기체 인식 오류율

0lynny 2022. 2. 8. 20:44
import sys
from os import listdir
import numpy as np
import operator

def createDataSet(dirname):
    """Load every file in *dirname* into a feature matrix plus a label list.

    The label of each sample is the integer that precedes the first '_' in
    its file name; the features are whatever ``imgVector`` returns for the
    file (one 1024-value row per file).

    Returns:
        A ``(matrix, labels)`` pair where ``matrix`` has one row per file in
        ``listdir`` order and ``labels[i]`` belongs to ``matrix[i]``.
    """
    entries = listdir(dirname)
    features = np.zeros((len(entries), 1024))
    answers = []

    for row, entry in enumerate(entries):
        # The label is parsed before the file is read, so a badly named
        # file fails here without being opened.
        prefix = entry.split('_')[0]
        answers.append(int(prefix))
        features[row, :] = imgVector(dirname + '/' + entry)

    return features, answers

def classify0(inX, dataSet, labels, k):
    """Classify *inX* by majority vote among its k nearest rows of *dataSet*.

    Distance is Euclidean. ``labels[i]`` is taken to be the label of
    ``dataSet[i]``. Presumably *inX* is a single sample with as many values
    as *dataSet* has columns -- confirm against callers.

    Returns:
        The label with the most votes among the k closest rows. On a tie the
        label that received its first vote from the nearer neighbour wins.
    """
    rowCount = dataSet.shape[0]
    # Repeat the query once per stored row so the subtraction is row-by-row.
    repeatedQuery = np.tile(inX, (rowCount, 1))
    squaredGaps = (repeatedQuery - dataSet) ** 2
    euclidean = squaredGaps.sum(axis=1) ** 0.5
    nearestFirst = euclidean.argsort()

    votes = {}
    for rank in range(k):
        neighbourLabel = labels[nearestFirst[rank]]
        if neighbourLabel in votes:
            votes[neighbourLabel] += 1
        else:
            votes[neighbourLabel] = 1

    # sorted() is stable, so labels with equal counts keep the order in
    # which they were first voted for (nearest neighbour first).
    ranking = sorted(votes.items(), key=lambda pair: pair[1], reverse=True)
    winner, _ = ranking[0]
    return winner

def imgVector(filename):
    """Read a 32x32 grid of digit characters from *filename* into a row vector.

    The first 32 lines of the file are read, and the first 32 characters of
    each line are converted with ``int``. Row ``i``, column ``j`` of the grid
    is stored at index ``32 * i + j``.

    Returns:
        A NumPy array of shape ``(1, 1024)``.

    Raises:
        IndexError: if one of the first 32 lines has fewer than 32 characters.
        ValueError: if a character is not a valid integer literal.
    """
    vector = np.zeros((1, 1024))
    # The context manager closes the file even if parsing fails; this
    # function is called once per sample, so an unclosed handle would
    # otherwise leak on every call.
    with open(filename) as f:
        for i in range(32):
            line = f.readline()
            for j in range(32):
                vector[0, 32 * i + j] = int(line[j])
    return vector

# Command-line arguments: argv[1] is the training directory and argv[2] is
# the test directory.
trainingFileName = sys.argv[1]
testFileName = sys.argv[2]

testFileList = listdir(testFileName)
length = len(testFileList)
if length == 0:
    # The error rate divides by the number of test files; stop with a clear
    # message instead of failing later with ZeroDivisionError.
    sys.exit('no test files found in ' + testFileName)

matrix, labels = createDataSet(trainingFileName)

# Parse every test file exactly once. The expected label and the feature
# vector of a file do not depend on k, so they are prepared outside the
# k loop instead of being re-read from disk on each of its 20 iterations.
testAnswers = [int(name.split('_')[0]) for name in testFileList]
testVectors = [imgVector(testFileName + '/' + name) for name in testFileList]

# Print the error rate (integer percent, truncated) for k = 1 .. 20.
for k in range(1, 21):
    dataerrorCount = 0

    for answer, testData in zip(testAnswers, testVectors):
        classifiedResult = classify0(testData, matrix, labels, k)
        if answer != classifiedResult:
            dataerrorCount += 1

    print(int(dataerrorCount / length * 100))