added parallel versions to cbow and namediff analyses, not used yet though
parent b3860eb924
commit 18e1a66f88

2 changed files with 89 additions and 41 deletions
lib/cbow.py (69 changed lines)
@@ -8,20 +8,18 @@ import subprocess
 import os
 import struct
 import math
+import multiprocessing
 
 import utils
 import cardlib
 import transforms
-# # this would be nice, but doing it naively makes things worse
-# from joblib import Parallel, delayed
-# import multiprocessing
+import namediff
 
 libdir = os.path.dirname(os.path.realpath(__file__))
 datadir = os.path.realpath(os.path.join(libdir, '../data'))
 
-# # multithreading control parameters
-# cores = multiprocessing.cpu_count()
-# segments = cores / 2 if cores / 2 > 0 else 1
+# multithreading control parameters
+cores = multiprocessing.cpu_count()
 
 # max length of vocabulary entries
 max_w = 50
@@ -118,6 +116,33 @@ except ImportError:
 def cosine_similarity_name(cardvec, v, name):
     return (cosine_similarity(cardvec, v), name)
 
+# we need to put the logic in a regular function (as opposed to a method of an object)
+# so that we can pass the function to multiprocessing
+def f_nearest(card, vocab, vecs, cardvecs, n):
+    if isinstance(card, cardlib.Card):
+        words = card.vectorize().split('\n\n')[0]
+    else:
+        # assume it's a string (that's already a vector)
+        words = card
+
+    if not words:
+        return []
+
+    cardvec = makevector(vocab, vecs, words)
+
+    comparisons = [cosine_similarity_name(cardvec, v, name) for (name, v) in cardvecs]
+
+    comparisons.sort(reverse = True)
+    comp_n = comparisons[:n]
+
+    if isinstance(card, cardlib.Card) and card.bside:
+        comp_n += f_nearest(card.bside, vocab, vecs, cardvecs, n=n)
+
+    return comp_n
+
+def f_nearest_per_thread(workitem):
+    (workcards, vocab, vecs, cardvecs, n) = workitem
+    return map(lambda card: f_nearest(card, vocab, vecs, cardvecs, n), workcards)
 
 class CBOW:
     def __init__(self, verbose = True,
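A note on the comment above f_nearest: multiprocessing.Pool hands work to worker
processes by pickling the callable and its arguments, and in Python 2 bound methods
are not picklable, which is why this logic has to move out of the CBOW class. A
minimal sketch of the constraint (the Toy class here is hypothetical, not part of
this repo):

    import multiprocessing

    # module-level function: picklable, safe to hand to Pool.map
    def square(x):
        return x * x

    class Toy(object):
        def square(self, x):
            return x * x

    if __name__ == '__main__':
        pool = multiprocessing.Pool(2)
        print pool.map(square, [1, 2, 3])        # works: [1, 4, 9]
        # pool.map(Toy().square, [1, 2, 3])      # PicklingError: can't pickle instancemethod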
@@ -147,8 +172,6 @@ class CBOW:
             self.cardvecs += [(name, makevector(self.vocab,
                                                 self.vecs,
                                                 card.vectorize()))]
 
-        # self.par = Parallel(n_jobs=segments)
-
         if self.verbose:
             print '... Done.'
@@ -157,25 +180,11 @@ class CBOW:
             print '  card vecs: ' + str(len(self.cardvecs))
 
     def nearest(self, card, n=5):
-        if isinstance(card, cardlib.Card):
-            words = card.vectorize().split('\n\n')[0]
-        else:
-            # assume it's a string (that's already a vector)
-            words = card
-
-        if not words:
-            return []
-
-        cardvec = makevector(self.vocab, self.vecs, words)
-
-        comparisons = [cosine_similarity_name(cardvec, v, name) for (name, v) in self.cardvecs]
-        # comparisons = self.par(delayed(cosine_similarity_name)(cardvec, v, name)
-        #                        for (name, v) in self.cardvecs)
-
-        comparisons.sort(reverse = True)
-        comp_n = comparisons[:n]
-
-        if isinstance(card, cardlib.Card) and card.bside:
-            comp_n += self.nearest(card.bside)
-
-        return comp_n
+        return f_nearest(card, self.vocab, self.vecs, self.cardvecs, n)
+
+    def nearest_par(self, cards, n=5, threads=cores):
+        workpool = multiprocessing.Pool(threads)
+        proto_worklist = namediff.list_split(cards, threads)
+        worklist = map(lambda x: (x, self.vocab, self.vecs, self.cardvecs, n), proto_worklist)
+        donelist = workpool.map(f_nearest_per_thread, worklist)
+        return namediff.list_flatten(donelist)
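Per the commit message, nothing calls these parallel versions yet. A sketch of how
CBOW.nearest_par could be driven once something does (the cards list and the
printing are illustrative, not from this commit):

    # assumes a built CBOW instance and a list of cardlib.Card objects
    cbow = CBOW()
    results = cbow.nearest_par(cards, n=5)    # one list of (similarity, name) per card, in order
    for card, comps in zip(cards, results):
        print card.name
        for (sim, name) in comps:
            print '  %.3f %s' % (sim, name)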
lib/namediff.py

@@ -1,11 +1,54 @@
+# This module is misleadingly named, as it has other utilities as well
+# that are generally necessary when trying to postprocess output by
+# comparing it against existing cards.
+
 import difflib
 import os
+import multiprocessing
 
 import jdecode
 import cardlib
 
 libdir = os.path.dirname(os.path.realpath(__file__))
 datadir = os.path.realpath(os.path.join(libdir, '../data'))
 
+# multithreading control parameters
+cores = multiprocessing.cpu_count()
+
+# split a list into n pieces; return a list of these lists
+# has slightly interesting behavior, in that if n is large, it can
+# run out of elements early and return less than n lists
+def list_split(l, n):
+    if n <= 0:
+        return l
+    split_size = len(l) / n
+    if len(l) % n > 0:
+        split_size += 1
+    return [l[i:i+split_size] for i in range(0, len(l), split_size)]
+
+# flatten a list of lists into a single list of all their contents, in order
+def list_flatten(l):
+    return [item for sublist in l for item in sublist]
+
+
+# isolated logic for multiprocessing
+def f_nearest(name, matchers, n):
+    for m in matchers:
+        m.set_seq1(name)
+    ratios = [(m.ratio(), m.b) for m in matchers]
+    ratios.sort(reverse = True)
+
+    if ratios[0][0] >= 1:
+        return ratios[:1]
+    else:
+        return ratios[:n]
+
+def f_nearest_per_thread(workitem):
+    (worknames, names, n) = workitem
+    # each thread (well, process) needs to generate its own matchers
+    matchers = [difflib.SequenceMatcher(b=name, autojunk=False) for name in names]
+    return map(lambda name: f_nearest(name, matchers, n), worknames)
+
 class Namediff:
     def __init__(self, verbose = True,
                  json_fname = os.path.join(datadir, 'AllSets.json')):
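The "slightly interesting behavior" called out above list_split is easy to see with
small inputs: split_size is rounded up, so a large n can exhaust the list before
producing n pieces, and flattening always round-trips the original order:

    >>> list_split([1, 2, 3, 4, 5], 2)    # ceil(5/2) = 3 per piece
    [[1, 2, 3], [4, 5]]
    >>> list_split([1, 2, 3, 4, 5], 4)    # ceil(5/4) = 2 per piece: only 3 pieces, not 4
    [[1, 2], [3, 4], [5]]
    >>> list_flatten(list_split([1, 2, 3, 4, 5], 4))
    [1, 2, 3, 4, 5]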
@@ -43,15 +86,11 @@ class Namediff:
             print '... Done.'
 
     def nearest(self, name, n=3):
-        for m in self.matchers:
-            m.set_seq1(name)
-        ratios = [(m.ratio(), m.b) for m in self.matchers]
-        ratios.sort(reverse = True)
-
-        if ratios[0][0] >= 1:
-            return ratios[:1]
-        else:
-            return ratios[:n]
-
-
-
+        return f_nearest(name, self.matchers, n)
+
+    def nearest_par(self, names, n=3, threads=cores):
+        workpool = multiprocessing.Pool(threads)
+        proto_worklist = list_split(names, threads)
+        worklist = map(lambda x: (x, self.names, n), proto_worklist)
+        donelist = workpool.map(f_nearest_per_thread, worklist)
+        return list_flatten(donelist)
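As in cbow.py, nearest_par has no callers yet. Note the design choice in
f_nearest_per_thread: each worker receives only the plain name list and rebuilds its
own SequenceMatcher objects, presumably because pickling thousands of prepared
matchers out to every worker would cost more than reconstructing them locally. A
sketch of the intended batch call (the query names are illustrative):

    # assumes a built Namediff instance
    nd = Namediff()
    queries = ['lightning bolt', 'counterspell']
    for name, matches in zip(queries, nd.nearest_par(queries, n=3)):
        print name + ':'
        for (ratio, match) in matches:
            print '  %.3f %s' % (ratio, match)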