# Infinite thanks to Talcos from the mtgsalvation forums, who among # many, many other things wrote the original version of this code. # I have merely ported it to fit my needs. import re import sys import subprocess import os import struct import math import utils import cardlib import transforms # # this would be nice, but doing it naively makes things worse # from joblib import Parallel, delayed # import multiprocessing libdir = os.path.dirname(os.path.realpath(__file__)) datadir = os.path.realpath(os.path.join(libdir, '../data')) # # multithreading control parameters # cores = multiprocessing.cpu_count() # segments = cores / 2 if cores / 2 > 0 else 1 # max length of vocabulary entries max_w = 50 #### snip! #### def read_vector_file(fname): with open(fname, 'rb') as f: words = int(f.read(4)) size = int(f.read(4)) vocab = [' '] * (words * max_w) M = [] for b in range(0,words): a = 0 while True: c = f.read(1) vocab[b * max_w + a] = c; if len(c) == 0 or c == ' ': break if (a < max_w) and vocab[b * max_w + a] != '\n': a += 1 tmp = list(struct.unpack('f'*size,f.read(4 * size))) length = math.sqrt(sum([tmp[i] * tmp[i] for i in range(0,len(tmp))])) for i in range(0,len(tmp)): tmp[i] /= length M.append(tmp) return ((''.join(vocab)).split(),M) def makevector(vocabulary,vecs,sequence): words = sequence.split() indices = [] for word in words: if word not in vocabulary: #print("Missing word in vocabulary: " + word) continue #return [0.0]*len(vecs[0]) indices.append(vocabulary.index(word)) #res = map(sum,[vecs[i] for i in indices]) res = None for v in [vecs[i] for i in indices]: if res == None: res = v else: res = [x + y for x, y in zip(res,v)] length = math.sqrt(sum([res[i] * res[i] for i in range(0,len(res))])) for i in range(0,len(res)): res[i] /= length return res #### !snip #### try: import numpy def cosine_similarity(v1,v2): A = numpy.array([v1,v2]) # from http://stackoverflow.com/questions/17627219/whats-the-fastest-way-in-python-to-calculate-cosine-similarity-given-sparse-mat # base similarity matrix (all dot products) # replace this with A.dot(A.T).todense() for sparse representation similarity = numpy.dot(A, A.T) # squared magnitude of preference vectors (number of occurrences) square_mag = numpy.diag(similarity) # inverse squared magnitude inv_square_mag = 1 / square_mag # if it doesn't occur, set it's inverse magnitude to zero (instead of inf) inv_square_mag[numpy.isinf(inv_square_mag)] = 0 # inverse of the magnitude inv_mag = numpy.sqrt(inv_square_mag) # cosine similarity (elementwise multiply by inverse magnitudes) cosine = similarity * inv_mag cosine = cosine.T * inv_mag return cosine[0][1] except ImportError: def cosine_similarity(v1,v2): #compute cosine similarity of v1 to v2: (v1 dot v1)/{||v1||*||v2||) sumxx, sumxy, sumyy = 0, 0, 0 for i in range(len(v1)): x = v1[i]; y = v2[i] sumxx += x*x sumyy += y*y sumxy += x*y return sumxy/math.sqrt(sumxx*sumyy) def cosine_similarity_name(cardvec, v, name): return (cosine_similarity(cardvec, v), name) class CBOW: def __init__(self, verbose = True, vector_fname = os.path.join(datadir, 'cbow.bin'), card_fname = os.path.join(datadir, 'output.txt')): self.verbose = verbose self.cardvecs = [] if self.verbose: print 'Building a cbow model...' if self.verbose: print ' Reading binary vector data from: ' + vector_fname (vocab, vecs) = read_vector_file(vector_fname) self.vocab = vocab self.vecs = vecs if self.verbose: print ' Reading encoded cards from: ' + card_fname print ' They\'d better be in the same order as the file used to build the vector model!' with open(card_fname, 'rt') as f: text = f.read() for card_src in text.split(utils.cardsep): if card_src: card = cardlib.Card(card_src) name = card.name self.cardvecs += [(name, makevector(self.vocab, self.vecs, card.vectorize()))] # self.par = Parallel(n_jobs=segments) if self.verbose: print '... Done.' print ' vocab size: ' + str(len(self.vocab)) print ' raw vecs: ' + str(len(self.vecs)) print ' card vecs: ' + str(len(self.cardvecs)) def nearest(self, card, n=5): if isinstance(card, cardlib.Card): words = card.vectorize().split('\n\n')[0] else: # assume it's a string (that's already a vector) words = card if not words: return [] cardvec = makevector(self.vocab, self.vecs, words) comparisons = [cosine_similarity_name(cardvec, v, name) for (name, v) in self.cardvecs] # comparisons = self.par(delayed(cosine_similarity_name)(cardvec, v, name) # for (name, v) in self.cardvecs) comparisons.sort(reverse = True) comp_n = comparisons[:n] if isinstance(card, cardlib.Card) and card.bside: comp_n += self.nearest(card.bside) return comp_n