ABOUT ME

-

Today
-
Yesterday
-
Total
-
  • [빅데이터처리]필기체 인식 오류율
    School/Bigdata 2022. 2. 8. 20:44
    import sys
    from os import listdir
    import numpy as np
    import operator
    
    def createDataSet(dirname):
        """Load every sample file in *dirname* into a feature matrix.

        The label of a sample is the integer that precedes the first '_'
        in its file name; the features come from imgVector().

        Returns a tuple (matrix, labels): matrix has one 1024-wide row per
        directory entry, and labels[i] is the label of row i.
        """
        fileNames = listdir(dirname)
        matrix = np.zeros((len(fileNames), 1024))
        labels = []

        for row, name in enumerate(fileNames):
            # Text before the first underscore is the digit label.
            prefix = name.partition('_')[0]
            labels.append(int(prefix))
            matrix[row, :] = imgVector(dirname + '/' + name)

        return matrix, labels
    
    def classify0(inX, dataSet, labels, k):
        """Classify *inX* by majority vote among its k nearest rows of *dataSet*.

        Distance is Euclidean. labels[i] is the label of dataSet row i.
        When several labels share the highest vote count, the one that was
        encountered first while walking the neighbours (nearest first) wins.
        """
        rowCount = dataSet.shape[0]
        # Repeat the query once per training row so it can be subtracted row-wise.
        repeated = np.tile(inX, (rowCount, 1))
        distances = ((repeated - dataSet) ** 2).sum(axis=1) ** 0.5
        order = np.argsort(distances)

        votes = {}
        for rank in range(k):
            label = labels[order[rank]]
            votes.setdefault(label, 0)
            votes[label] += 1

        # sorted() is stable, so equal counts keep their insertion order.
        ranked = sorted(votes.items(), key=lambda pair: pair[1], reverse=True)
        winner = ranked[0]
        return winner[0]
    
    def imgVector(filename):
        """Read a 32x32 text image of digits into a flat row vector.

        The first 32 characters of each of the first 32 lines of *filename*
        are converted with int() and stored row by row.

        Returns a NumPy array of shape (1, 1024).

        Raises IndexError if a line has fewer than 32 characters (including
        a missing line) and ValueError if a character is not a digit.
        """
        vector = np.zeros((1, 1024))
        # This function is called once per sample, so use a context manager
        # to close the file promptly instead of leaking the handle.
        with open(filename) as f:
            for i in range(32):
                line = f.readline()
                for j in range(32):
                    vector[0, 32 * i + j] = int(line[j])
        return vector
    
    # Usage: <script> <training_dir> <test_dir>
    # Prints, for each k from 1 to 20, the k-NN error rate on the test
    # directory as a whole-number percentage (truncated).
    trainingFileName = sys.argv[1]
    testFileName = sys.argv[2]

    testFileList = listdir(testFileName)
    length = len(testFileList)

    # An empty test directory would otherwise end in a bare ZeroDivisionError
    # when the error rate is computed.
    if length == 0:
        raise SystemExit('No test files found in ' + testFileName)

    matrix, labels = createDataSet(trainingFileName)

    # The expected labels and the test vectors do not depend on k, so parse
    # the test files once here rather than on every one of the 20 passes.
    testAnswers = [int(name.split('_')[0]) for name in testFileList]
    testVectors = [imgVector(testFileName + '/' + name) for name in testFileList]

    for k in range(1, 21):
        datacount = 0
        dataerrorCount = 0

        for answer, testData in zip(testAnswers, testVectors):
            classifiedResult = classify0(testData, matrix, labels, k)

            datacount += 1
            if answer != classifiedResult:
                dataerrorCount += 1

        # int() truncates, e.g. 1.9% is printed as 1.
        print(int(dataerrorCount / datacount * 100))
Designed by Tistory.