recom sys

Recommendation SYSTEM 

Recommendation system is quite popular recent years. A variety of the Internet companies use recommendation system to recommend similar product to users. For example, Netflix used recommendation system to recommend movies to users. Amazon also used it to pop up similar item to online customers. The following is a simple recommendation system I wrote for my data mining class. The input is user-movie list; and output is top three movies recommended for each user. The logic for it is simple: to find similar users using Jaccard similarity first, and then find top three movies similar users have watched but original user didn't. This program was wrote in Python. Let me know if you have better solution! 

import json
import sys
import ast

hashTable = []
numOfUsers = 0
numOfMovies = 0
originalTable = []
userList = []


# hash function
def hash(x, i):
    return (3 * x + i) % 100


# calculate the hashTable
def computeHashTable(row, iteration):
    for i in range(1, iteration + 1):
        temp = []
        for x in range(row):
            temp.append(hash(x, i))
        hashTable.append(temp)
    return hashTable


# compare similarity
def compare(list1, list2):
    for i in range(4):
        if list1[i] != list2[i]:
            return False
    return True


# main method
if __name__ == '__main__':
    inputData = open(sys.argv[1])
    #inputData = open("input4.json")
    # parsing data
    for line in inputData:
        line = ast.literal_eval(line.replace("\r\n", ""))
        originalTable.append(line[1])
        userList.append(line[0])
        # Compare to get the number of movies we need to compare
        # No need to calculate a table of 100 rows
        if (line[1][-1] > numOfMovies):
            numOfMovies = line[1][-1]
        numOfMovies = numOfMovies+1

    numOfUsers = len(originalTable)
    computeHashTable(numOfMovies, 20)


    finalComputedTable = [[0 for x in range(numOfUsers)] for x in range(20)]
    finalInvertedComputedTable = [[0 for x in range(20)] for x in range(numOfUsers)]

    for i in range(numOfUsers):
        for j in range(20):
            min = float("inf")
            for item in originalTable[i]:
                if min > hashTable[j][item]:
                    min = hashTable[j][item]

            finalInvertedComputedTable[i][j] = min



    result = []
    for i in range(0, numOfUsers):
        for j in range(i + 1, numOfUsers):
            for k in range(0, 5):
                if compare(finalInvertedComputedTable[i][k * 4:(k + 1) * 4],
                           finalInvertedComputedTable[j][k * 4:(k + 1) * 4]):
                    result.append([userList[i], userList[j]])
                    break

    for i in result:
        i.sort()
        print i

        
from math import *
import json
import sys
import ast
import heapq
import operator

originalTable = []
userList = []
numOfUsers = 0
numOfMovies = 0


# Calculate jaccard similarity
def jaccard_similarity(x, y):
    intersection = len(set.intersection(*[set(x), set(y)]))
    union = len(set.union(*[set(x), set(y)]))
    return intersection / float(union)


if __name__ == '__main__':
    inputData = open(sys.argv[1])
    inputSimilarUser = open(sys.argv[2])

    for line in inputData:
        line = ast.literal_eval(line.replace("\r\n", ""))
        originalTable.append(line[1])
        userList.append(line[0])

        if (line[1][-1] > numOfMovies):
            numOfMovies = line[1][-1]
        numOfMovies = numOfMovies + 1



    # get user list from question 1
    UserList_Lsh = []
    input2_list = []

    for line in inputSimilarUser:
        line = ast.literal_eval(line.replace("\r\n", ""))
        input2_list.append(line)
        for user in line:
            if user not in UserList_Lsh:
                UserList_Lsh.append(user)



    # Jaccard Similarity
    userSimilarList = {}
    for user in UserList_Lsh:
        similarUserList = {}
        similar = []
        for pair in input2_list:
            if user in pair:
                similarUserList[[x for x in pair if x != user][0]] = jaccard_similarity(originalTable[userList.index(pair[0])], originalTable[userList.index(pair[1])])

        temp1= sorted(similarUserList.items(), key = operator.itemgetter(1), reverse=True)

        for i in temp1:
            if len(temp1)>5:
                if i[1]>= temp1[4][1]:
                    similar.append(i[0])
            else:
                similar.append(i[0])
            if similar:
                userSimilarList[user] = similar


    # choose the 3 movies
    result =[]
    finalresult =[]
    for i in range(len(UserList_Lsh)):
        tempList = []
        recom_movie = []
        for j in userSimilarList[UserList_Lsh[i]]:

            tempList+=originalTable[userList.index(j)]

        for x in set(tempList):
            if tempList.count(x) >= 3 and x not in originalTable[userList.index(UserList_Lsh[i])]:
                recom_movie.append(x)
                recom_movie.sort()
        if recom_movie:
            result.append([UserList_Lsh[i],recom_movie])



    for i in result:
         print i