# Infinite thanks to Talcos from the mtgsalvation forums, who among
# many, many other things wrote the original version of this code.
# I have merely ported it to fit my needs.

import re
import sys
import subprocess
import os
import struct
import math
import multiprocessing

import utils
import cardlib
import transforms
import namediff

libdir = os.path.dirname(os.path.realpath(__file__))
datadir = os.path.realpath(os.path.join(libdir, '../data'))

# multithreading control parameters
cores = multiprocessing.cpu_count()

# max length of vocabulary entries
max_w = 50

#### snip! ####

def read_vector_file(fname):
    # Read binary vector data: a word count and a vector size (each stored as
    # 4 ASCII characters), followed by one vocabulary word and its float
    # vector per entry. Each vector is normalized to unit length on read.
    with open(fname, 'rb') as f:
        words = int(f.read(4))
        size = int(f.read(4))
        vocab = [' '] * (words * max_w)
        M = []
        for b in range(0, words):
            # read the vocabulary word, one character at a time
            a = 0
            while True:
                c = f.read(1)
                vocab[b * max_w + a] = c
                if len(c) == 0 or c == ' ':
                    break
                if (a < max_w) and vocab[b * max_w + a] != '\n':
                    a += 1
            # read the raw vector and normalize it to unit length
            tmp = list(struct.unpack('f' * size, f.read(4 * size)))
            length = math.sqrt(sum([tmp[i] * tmp[i] for i in range(0, len(tmp))]))
            for i in range(0, len(tmp)):
                tmp[i] /= length
            M.append(tmp)
    return ((''.join(vocab)).split(), M)

def makevector(vocabulary, vecs, sequence):
    # Sum the vectors of every known word in the sequence, then normalize the
    # result to unit length; unknown words are skipped.
    words = sequence.split()
    indices = []
    for word in words:
        if word not in vocabulary:
            #print('Missing word in vocabulary: ' + word)
            continue #return [0.0]*len(vecs[0])
        indices.append(vocabulary.index(word))
    #res = map(sum,[vecs[i] for i in indices])
    res = None
    for v in [vecs[i] for i in indices]:
        if res is None:
            res = v
        else:
            res = [x + y for x, y in zip(res, v)]
    # bad things happen if we have a vector of only unknown words
    if res is None:
        return [0.0] * len(vecs[0])
    length = math.sqrt(sum([res[i] * res[i] for i in range(0, len(res))]))
    for i in range(0, len(res)):
        res[i] /= length
    return res

#### !snip ####

try:
    import numpy

    def cosine_similarity(v1, v2):
        A = numpy.array([v1, v2])
        # from http://stackoverflow.com/questions/17627219/whats-the-fastest-way-in-python-to-calculate-cosine-similarity-given-sparse-mat
        # base similarity matrix (all dot products)
        # replace this with A.dot(A.T).todense() for sparse representation
        similarity = numpy.dot(A, A.T)
        # squared magnitude of preference vectors (number of occurrences)
        square_mag = numpy.diag(similarity)
        # inverse squared magnitude
        inv_square_mag = 1 / square_mag
        # if it doesn't occur, set its inverse magnitude to zero (instead of inf)
        inv_square_mag[numpy.isinf(inv_square_mag)] = 0
        # inverse of the magnitude
        inv_mag = numpy.sqrt(inv_square_mag)
        # cosine similarity (elementwise multiply by inverse magnitudes)
        cosine = similarity * inv_mag
        cosine = cosine.T * inv_mag
        return cosine[0][1]

except ImportError:
    def cosine_similarity(v1, v2):
        # compute cosine similarity of v1 to v2: (v1 dot v2) / (||v1|| * ||v2||)
        sumxx, sumxy, sumyy = 0, 0, 0
        for i in range(len(v1)):
            x = v1[i]
            y = v2[i]
            sumxx += x * x
            sumyy += y * y
            sumxy += x * y
        return sumxy / math.sqrt(sumxx * sumyy)

def cosine_similarity_name(cardvec, v, name):
    return (cosine_similarity(cardvec, v), name)

# we need to put the logic in a regular function (as opposed to a method of an
# object) so that we can pass the function to multiprocessing
def f_nearest(card, vocab, vecs, cardvecs, n):
    if isinstance(card, cardlib.Card):
        words = card.vectorize().split('\n\n')[0]
    else:
        # assume it's a string that's already in vectorized form
        words = card

    if not words:
        return []

    cardvec = makevector(vocab, vecs, words)
    comparisons = [cosine_similarity_name(cardvec, v, name) for (name, v) in cardvecs]
    comparisons.sort(reverse = True)
    comp_n = comparisons[:n]

    # a card's b-side contributes its own, separate list of nearest neighbors
    if isinstance(card, cardlib.Card) and card.bside:
        comp_n += f_nearest(card.bside, vocab, vecs, cardvecs, n=n)

    return comp_n
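
# An illustrative sketch of the scoring path above (toy values, not from the
# real card data): makevector builds a unit vector from the known words, and
# f_nearest ranks every (name, vector) pair in cardvecs by cosine similarity.
# With a hypothetical 3-word vocabulary and 2-dimensional unit vectors:
#
#   vocab = ['flying', 'haste', 'trample']
#   vecs = [[1.0, 0.0], [0.0, 1.0], [0.6, 0.8]]
#   cardvecs = [('A', [1.0, 0.0]), ('B', [0.0, 1.0])]
#   f_nearest('flying', vocab, vecs, cardvecs, n=1)
#   # => [(1.0, 'A')], since 'flying' points exactly along card A's vector
#
# Scores fall in [-1, 1], with 1.0 meaning identical direction.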

def f_nearest_per_thread(workitem):
    (workcards, vocab, vecs, cardvecs, n) = workitem
    return map(lambda card: f_nearest(card, vocab, vecs, cardvecs, n), workcards)

class CBOW:
    def __init__(self, verbose = True,
                 vector_fname = os.path.join(datadir, 'cbow.bin'),
                 card_fname = os.path.join(datadir, 'output.txt')):
        self.verbose = verbose
        self.cardvecs = []

        if self.verbose:
            print 'Building a cbow model...'

        if self.verbose:
            print '  Reading binary vector data from: ' + vector_fname
        (vocab, vecs) = read_vector_file(vector_fname)
        self.vocab = vocab
        self.vecs = vecs

        if self.verbose:
            print '  Reading encoded cards from: ' + card_fname
            print '  They\'d better be in the same order as the file used to build the vector model!'
        with open(card_fname, 'rt') as f:
            text = f.read()
        for card_src in text.split(utils.cardsep):
            if card_src:
                card = cardlib.Card(card_src)
                name = card.name
                self.cardvecs += [(name, makevector(self.vocab,
                                                    self.vecs,
                                                    card.vectorize()))]

        if self.verbose:
            print '... Done.'
            print '  vocab size: ' + str(len(self.vocab))
            print '  raw vecs:   ' + str(len(self.vecs))
            print '  card vecs:  ' + str(len(self.cardvecs))

    def nearest(self, card, n=5):
        return f_nearest(card, self.vocab, self.vecs, self.cardvecs, n)

    def nearest_par(self, cards, n=5, threads=cores):
        workpool = multiprocessing.Pool(threads)
        proto_worklist = namediff.list_split(cards, threads)
        worklist = map(lambda x: (x, self.vocab, self.vecs, self.cardvecs, n), proto_worklist)
        donelist = workpool.map(f_nearest_per_thread, worklist)
        return namediff.list_flatten(donelist)
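
# A minimal usage sketch, not part of the original interface: it assumes the
# default data files (data/cbow.bin and data/output.txt) exist. The query is
# taken from the command line as raw words; anything outside the model's
# vocabulary is silently skipped by makevector.
if __name__ == '__main__':
    cbow = CBOW()
    query = ' '.join(sys.argv[1:])
    if query:
        for (score, name) in cbow.nearest(query, n=5):
            print name + ': ' + str(score)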