mtgencode/lib/cbow.py

# Infinite thanks to Talcos from the mtgsalvation forums, who among
# many, many other things wrote the original version of this code.
# I have merely ported it to fit my needs.

import re
import sys
import subprocess
import os
import struct
import math
import utils
import cardlib
import transforms

# # this would be nice, but doing it naively makes things worse
# from joblib import Parallel, delayed
# import multiprocessing

# Directory holding this module (symlinks resolved), and the 'data'
# directory that sits next to it, one level up.
libdir = os.path.split(os.path.realpath(__file__))[0]
datadir = os.path.realpath(os.path.join(libdir, os.pardir, 'data'))

# # multithreading control parameters
# cores = multiprocessing.cpu_count()
# segments = cores / 2 if cores / 2 > 0 else 1

# max length of vocabulary entries
max_w = 50


#### snip! ####

def read_vector_file(fname):
    """Load a binary word-vector file into parallel word / vector lists.

    Layout read by this function (presumably the binary output of the
    word2vec tool -- confirm against whatever produced the file):

      * one text header line with two whitespace-separated integers, the
        number of words and the number of floats per vector;
      * one record per word: the bytes of the word terminated by a single
        space, immediately followed by `size` native-format C floats.

    Newline bytes met while reading a word are discarded, so a newline
    written after each vector is tolerated.

    Each vector is scaled to unit Euclidean length.  An all-zero vector
    cannot be normalized and is returned unchanged.

    Returns a tuple (vocab, M): vocab is the list of words in file order
    and M[i] is the list of floats belonging to vocab[i].  The two lists
    always have the same length.

    Raises ValueError if the header line is malformed, and struct.error if
    the file ends before a complete vector has been read.
    """
    with open(fname, 'rb') as f:
        # Parse the header by tokens rather than by fixed byte counts, so
        # counts with any number of digits are accepted.
        header = f.readline().split()
        if len(header) < 2:
            raise ValueError('malformed vector file header in ' + str(fname))
        words = int(header[0])
        size = int(header[1])
        if words < 0 or size < 0:
            raise ValueError('negative count in vector file header in '
                             + str(fname))

        vector_format = '%df' % size
        vocab = []
        M = []
        for _ in range(words):
            # Collect the word byte by byte up to the terminating space
            # (or end of file), ignoring newlines.
            chars = []
            while True:
                c = f.read(1)
                if not c or c == b' ':
                    break
                if c != b'\n':
                    chars.append(c)
            word = b''.join(chars)
            # Python 2 reads str (nothing to do); Python 3 reads bytes,
            # which must be decoded to compare equal to text tokens.
            if not isinstance(word, str):
                word = word.decode('utf-8', 'replace')

            tmp = list(struct.unpack(vector_format, f.read(4 * size)))
            length = math.sqrt(sum(x * x for x in tmp))
            if length > 0:
                tmp = [x / length for x in tmp]

            # Append word and vector together so indices stay aligned.
            vocab.append(word)
            M.append(tmp)
        return (vocab, M)

def makevector(vocabulary,vecs,sequence):
    """Build one unit-length vector for a whitespace-separated word sequence.

    vocabulary -- list of words; vocabulary[i] names the vector vecs[i].
    vecs       -- list of equal-length lists of floats.
    sequence   -- string of words separated by whitespace.

    The vectors of every word of `sequence` found in `vocabulary` are added
    together (repeated words count each time; unknown words are skipped)
    and the sum is scaled to unit Euclidean length.

    Returns a new list of floats; the lists inside `vecs` are never
    modified or returned directly.  If no word of `sequence` is in the
    vocabulary, or the sum has zero length, a zero vector is returned
    (an empty list when `vecs` itself is empty).
    """
    # Map each word to its first position, matching list.index(), so the
    # vocabulary is scanned once instead of twice per word.
    first_index = {}
    for position, entry in enumerate(vocabulary):
        first_index.setdefault(entry, position)

    res = None
    for word in sequence.split():
        position = first_index.get(word)
        if position is None:
            continue
        v = vecs[position]
        if res is None:
            # Copy, so the caller's vector is never aliased or mutated.
            res = list(v)
        else:
            res = [x + y for x, y in zip(res, v)]

    if res is None:
        return [0.0] * len(vecs[0]) if len(vecs) > 0 else []

    length = math.sqrt(sum(x * x for x in res))
    if length == 0:
        # The vectors cancelled out; there is no direction to normalize.
        return res
    return [x / length for x in res]

#### !snip ####


# cosine_similarity(v1, v2) is defined with numpy when it is installed and
# in pure Python otherwise.  Both versions compute
#     (v1 dot v2) / (||v1|| * ||v2||)
# return 0.0 when either vector has zero magnitude, and raise ValueError
# when the vectors differ in length.
try:
    import numpy
    def cosine_similarity(v1,v2):
        """Return the cosine of the angle between v1 and v2 (numpy version).

        v1, v2 -- equal-length sequences of numbers.

        Returns 0.0 if either vector has zero magnitude.  numpy raises
        ValueError when the two lengths differ.
        """
        # Force float arrays so integer input is never integer-divided.
        a = numpy.asarray(v1, dtype=float)
        b = numpy.asarray(v2, dtype=float)
        denominator = math.sqrt(numpy.dot(a, a) * numpy.dot(b, b))
        if denominator == 0:
            return 0.0
        return numpy.dot(a, b) / denominator

except ImportError:
    def cosine_similarity(v1,v2):
        """Return the cosine of the angle between v1 and v2 (pure Python).

        v1, v2 -- equal-length sequences of numbers.

        Returns 0.0 if either vector has zero magnitude.  Raises ValueError
        when the two lengths differ.
        """
        if len(v1) != len(v2):
            raise ValueError('vectors must have the same length')
        sumxx, sumxy, sumyy = 0.0, 0.0, 0.0
        for x, y in zip(v1, v2):
            sumxx += x*x
            sumyy += y*y
            sumxy += x*y
        denominator = math.sqrt(sumxx*sumyy)
        if denominator == 0:
            return 0.0
        return sumxy/denominator

def cosine_similarity_name(cardvec, v, name):
    """Pair the cosine similarity of cardvec and v with a name.

    Returns the tuple (similarity, name); the score comes first so that a
    list of these tuples sorts by similarity.
    """
    score = cosine_similarity(cardvec, v)
    return (score, name)


class CBOW:
    """Nearest-card lookup built from word vectors and a file of encoded cards.

    Attributes set by __init__:
      verbose  -- whether progress messages are printed.
      vocab    -- list of words returned by read_vector_file.
      vecs     -- list of vectors returned by read_vector_file.
      cardvecs -- list of (card name, card vector) tuples, one per non-empty
                  card entry of the card file.
    """

    def __init__(self, verbose = True,
                 vector_fname = os.path.join(datadir, 'cbow.bin'),
                 card_fname = os.path.join(datadir, 'output.txt')):
        """Read the vector file and the card file and vectorize every card.

        verbose      -- print progress and summary counts when true.
        vector_fname -- path of the binary vector file given to
                        read_vector_file.
        card_fname   -- path of a text file of encoded cards separated by
                        utils.cardsep.
        """
        self.verbose = verbose
        self.cardvecs = []

        # print(...) with one argument behaves the same on Python 2 and 3.
        if self.verbose:
            print('Building a cbow model...')

        if self.verbose:
            print('  Reading binary vector data from: ' + vector_fname)
        (vocab, vecs) = read_vector_file(vector_fname)
        self.vocab = vocab
        self.vecs = vecs

        if self.verbose:
            print('  Reading encoded cards from: ' + card_fname)
            print('  They\'d better be in the same order as the file used to build the vector model!')
        with open(card_fname, 'rt') as f:
            text = f.read()
        for card_src in text.split(utils.cardsep):
            # Splitting leaves empty strings around leading/trailing
            # separators; those are not cards.
            if not card_src:
                continue
            card = cardlib.Card(card_src)
            cardvec = makevector(self.vocab, self.vecs, card.vectorize())
            self.cardvecs.append((card.name, cardvec))

        # self.par = Parallel(n_jobs=segments)

        if self.verbose:
            print('... Done.')
            print('  vocab size: ' + str(len(self.vocab)))
            print('  raw vecs:   ' + str(len(self.vecs)))
            print('  card vecs:  ' + str(len(self.cardvecs)))

    def nearest(self, card, n=5):
        """Return the n known cards most similar to `card`.

        card -- a cardlib.Card, or a string that is already in vectorized
                (whitespace-separated words) form.
        n    -- how many matches to return per card side.

        Returns a list of (similarity, name) tuples, best match first.
        For a Card with a truthy `bside`, the n best matches for the bside
        are appended after the card's own.  Returns [] when there is no
        text to compare.
        """
        if isinstance(card, cardlib.Card):
            # Keep only the text before the first blank line -- presumably
            # this card's own side; confirm against Card.vectorize().
            words = card.vectorize().split('\n\n')[0]
        else:
            # assume it's a string (that's already a vector)
            words = card

        if not words:
            return []

        cardvec = makevector(self.vocab, self.vecs, words)

        comparisons = [cosine_similarity_name(cardvec, v, name)
                       for (name, v) in self.cardvecs]
        # comparisons = self.par(delayed(cosine_similarity_name)(cardvec, v, name)
        #                        for (name, v) in self.cardvecs)

        # Tuples sort by similarity first, then by name on ties.
        comparisons.sort(reverse = True)
        comp_n = comparisons[:n]

        if isinstance(card, cardlib.Card) and card.bside:
            # Forward n so both sides honor the caller's requested count.
            comp_n += self.nearest(card.bside, n)

        return comp_n
various improvements, added cbow 2015-07-29 08:21:34 +00:00			`# Infinite thanks to Talcos from the mtgsalvation forums, who among`
			`# many, many other things wrote the original version of this code.`
			`# I have merely ported it to fit my needs.`

			`import re`
			`import sys`
			`import subprocess`
			`import os`
			`import struct`
			`import math`
			`import utils`
			`import cardlib`
			`import transforms`

			`# # this would be nice, but doing it naively makes things worse`
			`# from joblib import Parallel, delayed`
			`# import multiprocessing`

			`libdir = os.path.dirname(os.path.realpath(__file__))`
			`datadir = os.path.realpath(os.path.join(libdir, '../data'))`

			`# # multithreading control parameters`
			`# cores = multiprocessing.cpu_count()`
			`# segments = cores / 2 if cores / 2 > 0 else 1`

			`# max length of vocabulary entries`
			`max_w = 50`


			`#### snip! ####`

			`def read_vector_file(fname):`
			`with open(fname, 'rb') as f:`
			`words = int(f.read(4))`
			`size = int(f.read(4))`
			`vocab = [' '] * (words * max_w)`
			`M = []`
			`for b in range(0,words):`
			`a = 0`
			`while True:`
			`c = f.read(1)`
			`vocab[b * max_w + a] = c;`
			`if len(c) == 0 or c == ' ':`
			`break`
			`if (a < max_w) and vocab[b * max_w + a] != '\n':`
			`a += 1`
			`tmp = list(struct.unpack('f'*size,f.read(4 * size)))`
			`length = math.sqrt(sum([tmp[i] * tmp[i] for i in range(0,len(tmp))]))`
			`for i in range(0,len(tmp)):`
			`tmp[i] /= length`
			`M.append(tmp)`
			`return ((''.join(vocab)).split(),M)`

			`def makevector(vocabulary,vecs,sequence):`
			`words = sequence.split()`
			`indices = []`
			`for word in words:`
			`if word not in vocabulary:`
			`#print("Missing word in vocabulary: " + word)`
			`continue`
			`#return [0.0]*len(vecs[0])`
			`indices.append(vocabulary.index(word))`
			`#res = map(sum,[vecs[i] for i in indices])`
			`res = None`
			`for v in [vecs[i] for i in indices]:`
			`if res == None:`
			`res = v`
			`else:`
			`res = [x + y for x, y in zip(res,v)]`
			`length = math.sqrt(sum([res[i] * res[i] for i in range(0,len(res))]))`
			`for i in range(0,len(res)):`
			`res[i] /= length`
			`return res`

			`#### !snip ####`


			`try:`
			`import numpy`
			`def cosine_similarity(v1,v2):`
			`A = numpy.array([v1,v2])`

			`# from http://stackoverflow.com/questions/17627219/whats-the-fastest-way-in-python-to-calculate-cosine-similarity-given-sparse-mat`

			`# base similarity matrix (all dot products)`
			`# replace this with A.dot(A.T).todense() for sparse representation`
			`similarity = numpy.dot(A, A.T)`

			`# squared magnitude of preference vectors (number of occurrences)`
			`square_mag = numpy.diag(similarity)`

			`# inverse squared magnitude`
			`inv_square_mag = 1 / square_mag`

			`# if it doesn't occur, set it's inverse magnitude to zero (instead of inf)`
			`inv_square_mag[numpy.isinf(inv_square_mag)] = 0`

			`# inverse of the magnitude`
			`inv_mag = numpy.sqrt(inv_square_mag)`

			`# cosine similarity (elementwise multiply by inverse magnitudes)`
			`cosine = similarity * inv_mag`
			`cosine = cosine.T * inv_mag`

			`return cosine[0][1]`

			`except ImportError:`
			`def cosine_similarity(v1,v2):`
			`#compute cosine similarity of v1 to v2: (v1 dot v1)/{\|\|v1\|\|*\|\|v2\|\|)`
			`sumxx, sumxy, sumyy = 0, 0, 0`
			`for i in range(len(v1)):`
			`x = v1[i]; y = v2[i]`
			`sumxx += x*x`
			`sumyy += y*y`
			`sumxy += x*y`
			`return sumxy/math.sqrt(sumxx*sumyy)`

			`def cosine_similarity_name(cardvec, v, name):`
			`return (cosine_similarity(cardvec, v), name)`


			`class CBOW:`
			`def __init__(self, verbose = True,`
			`vector_fname = os.path.join(datadir, 'cbow.bin'),`
			`card_fname = os.path.join(datadir, 'output.txt')):`
			`self.verbose = verbose`
			`self.cardvecs = []`

			`if self.verbose:`
			`print 'Building a cbow model...'`

			`if self.verbose:`
			`print ' Reading binary vector data from: ' + vector_fname`
			`(vocab, vecs) = read_vector_file(vector_fname)`
			`self.vocab = vocab`
			`self.vecs = vecs`

			`if self.verbose:`
			`print ' Reading encoded cards from: ' + card_fname`
			`print ' They\'d better be in the same order as the file used to build the vector model!'`
			`with open(card_fname, 'rt') as f:`
			`text = f.read()`
			`for card_src in text.split(utils.cardsep):`
			`if card_src:`
			`card = cardlib.Card(card_src)`
			`name = card.name`
			`self.cardvecs += [(name, makevector(self.vocab,`
			`self.vecs,`
			`card.vectorize()))]`

			`# self.par = Parallel(n_jobs=segments)`

			`if self.verbose:`
			`print '... Done.'`
			`print ' vocab size: ' + str(len(self.vocab))`
			`print ' raw vecs: ' + str(len(self.vecs))`
			`print ' card vecs: ' + str(len(self.cardvecs))`

			`def nearest(self, card, n=5):`
			`if isinstance(card, cardlib.Card):`
			`words = card.vectorize().split('\n\n')[0]`
			`else:`
			`# assume it's a string (that's already a vector)`
			`words = card`

			`if not words:`
			`return []`

			`cardvec = makevector(self.vocab, self.vecs, words)`

			`comparisons = [cosine_similarity_name(cardvec, v, name) for (name, v) in self.cardvecs]`
			`# comparisons = self.par(delayed(cosine_similarity_name)(cardvec, v, name)`
			`# for (name, v) in self.cardvecs)`

			`comparisons.sort(reverse = True)`
			`comp_n = comparisons[:n]`

			`if isinstance(card, cardlib.Card) and card.bside:`
			`comp_n += self.nearest(card.bside)`

			`return comp_n`