Recommendation SYSTEM
Recommendation system is quite popular recent years. A variety of the Internet companies use recommendation system to recommend similar product to users. For example, Netflix used recommendation system to recommend movies to users. Amazon also used it to pop up similar item to online customers. The following is a simple recommendation system I wrote for my data mining class. The input is user-movie list; and output is top three movies recommended for each user. The logic for it is simple: to find similar users using Jaccard similarity first, and then find top three movies similar users have watched but original user didn't. This program was wrote in Python. Let me know if you have better solution!
import json import sys import ast hashTable = [] numOfUsers = 0 numOfMovies = 0 originalTable = [] userList = [] # hash function def hash(x, i): return (3 * x + i) % 100 # calculate the hashTable def computeHashTable(row, iteration): for i in range(1, iteration + 1): temp = [] for x in range(row): temp.append(hash(x, i)) hashTable.append(temp) return hashTable # compare similarity def compare(list1, list2): for i in range(4): if list1[i] != list2[i]: return False return True # main method if __name__ == '__main__': inputData = open(sys.argv[1]) #inputData = open("input4.json") # parsing data for line in inputData: line = ast.literal_eval(line.replace("\r\n", "")) originalTable.append(line[1]) userList.append(line[0]) # Compare to get the number of movies we need to compare # No need to calculate a table of 100 rows if (line[1][-1] > numOfMovies): numOfMovies = line[1][-1] numOfMovies = numOfMovies+1 numOfUsers = len(originalTable) computeHashTable(numOfMovies, 20) finalComputedTable = [[0 for x in range(numOfUsers)] for x in range(20)] finalInvertedComputedTable = [[0 for x in range(20)] for x in range(numOfUsers)] for i in range(numOfUsers): for j in range(20): min = float("inf") for item in originalTable[i]: if min > hashTable[j][item]: min = hashTable[j][item] finalInvertedComputedTable[i][j] = min result = [] for i in range(0, numOfUsers): for j in range(i + 1, numOfUsers): for k in range(0, 5): if compare(finalInvertedComputedTable[i][k * 4:(k + 1) * 4], finalInvertedComputedTable[j][k * 4:(k + 1) * 4]): result.append([userList[i], userList[j]]) break for i in result: i.sort() print i from math import * import json import sys import ast import heapq import operator originalTable = [] userList = [] numOfUsers = 0 numOfMovies = 0 # Calculate jaccard similarity def jaccard_similarity(x, y): intersection = len(set.intersection(*[set(x), set(y)])) union = len(set.union(*[set(x), set(y)])) return intersection / float(union) if __name__ == '__main__': inputData = open(sys.argv[1]) inputSimilarUser = open(sys.argv[2]) for line in inputData: line = ast.literal_eval(line.replace("\r\n", "")) originalTable.append(line[1]) userList.append(line[0]) if (line[1][-1] > numOfMovies): numOfMovies = line[1][-1] numOfMovies = numOfMovies + 1 # get user list from question 1 UserList_Lsh = [] input2_list = [] for line in inputSimilarUser: line = ast.literal_eval(line.replace("\r\n", "")) input2_list.append(line) for user in line: if user not in UserList_Lsh: UserList_Lsh.append(user) # Jaccard Similarity userSimilarList = {} for user in UserList_Lsh: similarUserList = {} similar = [] for pair in input2_list: if user in pair: similarUserList[[x for x in pair if x != user][0]] = jaccard_similarity(originalTable[userList.index(pair[0])], originalTable[userList.index(pair[1])]) temp1= sorted(similarUserList.items(), key = operator.itemgetter(1), reverse=True) for i in temp1: if len(temp1)>5: if i[1]>= temp1[4][1]: similar.append(i[0]) else: similar.append(i[0]) if similar: userSimilarList[user] = similar # choose the 3 movies result =[] finalresult =[] for i in range(len(UserList_Lsh)): tempList = [] recom_movie = [] for j in userSimilarList[UserList_Lsh[i]]: tempList+=originalTable[userList.index(j)] for x in set(tempList): if tempList.count(x) >= 3 and x not in originalTable[userList.index(UserList_Lsh[i])]: recom_movie.append(x) recom_movie.sort() if recom_movie: result.append([UserList_Lsh[i],recom_movie]) for i in result: print i