mtgencode/lib/namediff.py

124 lines
4.2 KiB
Python
Raw Normal View History

# This module is misleadingly named, as it has other utilities as well
# that are generally necessary when trying to postprocess output by
# comparing it against existing cards.
2015-08-02 05:16:30 +00:00
import difflib
import os
import multiprocessing
import utils
2015-08-02 05:16:30 +00:00
import jdecode
import cardlib
# Filesystem locations, derived from where this module actually lives on disk
# (symlinks are resolved before the directory is taken).
_module_path = os.path.realpath(__file__)
libdir = os.path.split(_module_path)[0]
# The data directory is the 'data' sibling of the directory holding this file.
datadir = os.path.realpath(os.path.join(libdir, '../data'))

# multithreading control parameters:
# default worker count for the parallel lookups is one per reported CPU.
cores = multiprocessing.cpu_count()
# split a list into n pieces; return a list of these lists
# has slightly interesting behavior, in that if n is large, it can
# run out of elements early and return less than n lists
def list_split(l, n):
    """Split ``l`` into consecutive chunks, producing at most ``n`` of them.

    Every chunk except possibly the last has ``ceil(len(l) / n)`` elements,
    so when ``n`` does not divide the length evenly (or ``n`` exceeds the
    length) fewer than ``n`` chunks may be returned.

    Args:
        l: the sequence to split; it must support ``len()`` and slicing.
        n: the desired number of chunks.

    Returns:
        A list of slices of ``l``, in order. An empty ``l`` yields ``[]``.
        If ``n <= 0`` the input ``l`` itself is returned unchanged (NOT
        wrapped in an outer list); this historical quirk is preserved for
        existing callers.
    """
    if n <= 0:
        return l
    if not l:
        # A zero chunk size would make range() raise on its step argument.
        return []
    # Integer ceiling division: true division would give a float on
    # Python 3, which is not usable as a range step or a slice bound.
    split_size, remainder = divmod(len(l), n)
    if remainder > 0:
        split_size += 1
    return [l[i:i + split_size] for i in range(0, len(l), split_size)]
# flatten a list of lists into a single list of all their contents, in order
def list_flatten(l):
    """Concatenate the sub-iterables of ``l`` into one new flat list.

    Only one level of nesting is removed; element order is preserved.
    """
    flattened = []
    for sublist in l:
        flattened.extend(sublist)
    return flattened
# isolated logic for multiprocessing
def f_nearest(name, matchers, n):
    """Return the candidates most similar to ``name``.

    Args:
        name: the sequence (normally a string) to look up.
        matchers: a list of ``difflib.SequenceMatcher`` objects, each already
            holding one candidate as its second sequence (``b``). Their first
            sequence is overwritten with ``name`` by this call.
        n: maximum number of results when there is no exact match.

    Returns:
        A list of ``(ratio, candidate)`` tuples sorted from best to worst;
        equal ratios are ordered by candidate, descending. If the best
        ratio is at least 1 (an exact match) only that single entry is
        returned, otherwise the top ``n`` entries. An empty ``matchers``
        list yields ``[]``.
    """
    if not matchers:
        # Nothing to compare against; avoid indexing into an empty result.
        return []

    def score(matcher):
        # Only seq1 is swapped, so the matcher keeps its candidate in ``b``.
        matcher.set_seq1(name)
        return (matcher.ratio(), matcher.b)

    ratios = sorted((score(m) for m in matchers), reverse=True)
    if ratios[0][0] >= 1:
        return ratios[:1]
    return ratios[:n]
def f_nearest_per_thread(workitem):
    """Worker entry point: run ``f_nearest`` for one batch of queries.

    ``workitem`` is a 3-tuple ``(worknames, names, n)``: the queries for this
    batch, the candidates to compare against, and the result limit that is
    passed through to ``f_nearest``. Returns one ``f_nearest`` result per
    query, in query order.
    """
    queries, candidates, limit = workitem
    # each thread (well, process) needs to generate its own matchers
    local_matchers = [
        difflib.SequenceMatcher(b=candidate, autojunk=False)
        for candidate in candidates
    ]
    results = []
    for query in queries:
        results.append(f_nearest(query, local_matchers, limit))
    return results
2015-08-02 05:16:30 +00:00
class Namediff:
    """Index of known cards used to find the nearest existing name or card.

    Attributes (all dicts are keyed by ``cardlib.Card(...).name``):
        verbose: whether progress messages are printed during setup.
        names: maps the key to the ``'name'`` field of the source JSON entry.
        codes: maps the key to ``'<code>/<number>.jpg'``, or ``''`` when
            either the code or the number is missing/empty.
        cardstrings: maps the key to ``card.encode()``.
        matchers: one ``difflib.SequenceMatcher`` per key in ``names``.
        card_matchers: one ``difflib.SequenceMatcher`` per encoded card.
    """

    def __init__(self, verbose = True,
                 json_fname = os.path.join(datadir, 'AllSets.json')):
        """Load cards from ``json_fname`` and build the matcher tables.

        Args:
            verbose: print progress messages while loading; also forwarded
                to ``jdecode.mtg_open_json``.
            json_fname: path of the JSON card database to read.
        """
        self.verbose = verbose
        self.names = {}
        self.codes = {}
        self.cardstrings = {}

        if self.verbose:
            print('Setting up namediff...')
            print(' Reading names from: ' + json_fname)

        json_srcs = jdecode.mtg_open_json(json_fname, verbose)
        namecount = 0
        for json_cardname in sorted(json_srcs):
            jcards = json_srcs[json_cardname]
            if not jcards:
                continue

            # just use the first one
            jcard = jcards[0]
            card = cardlib.Card(jcard)
            name = card.name
            jname = jcard['name']
            jcode = jcard[utils.json_field_info_code]
            jnum = jcard['number'] if 'number' in jcard else ''

            if name in self.names:
                # This is a warning rather than progress output, so it is
                # reported regardless of the verbose setting.
                print(' Duplicate name ' + name + ', ignoring.')
                continue

            self.names[name] = jname
            self.cardstrings[name] = card.encode()
            if jcode and jnum:
                self.codes[name] = jcode + '/' + jnum + '.jpg'
            else:
                self.codes[name] = ''
            namecount += 1

        if self.verbose:
            print(' Read ' + str(namecount) + ' unique cardnames')
            print(' Building SequenceMatcher objects.')

        self.matchers = [difflib.SequenceMatcher(b=n, autojunk=False)
                         for n in self.names]
        self.card_matchers = [
            difflib.SequenceMatcher(b=self.cardstrings[n], autojunk=False)
            for n in self.cardstrings]

        if self.verbose:
            print('... Done.')

    @staticmethod
    def _map_in_pool(worklist, threads):
        """Run ``f_nearest_per_thread`` over ``worklist`` in a worker pool.

        The pool is always closed and joined, even if a worker raises, so
        repeated calls do not accumulate idle worker processes.
        """
        workpool = multiprocessing.Pool(threads)
        try:
            return workpool.map(f_nearest_per_thread, worklist)
        finally:
            workpool.close()
            workpool.join()

    def nearest(self, name, n=3):
        """Return up to ``n`` ``(ratio, name)`` matches for ``name``.

        See ``f_nearest`` for ordering and the exact-match shortcut.
        """
        return f_nearest(name, self.matchers, n)

    def nearest_par(self, names, n=3, threads=cores):
        """Parallel version of ``nearest`` for a list of names.

        Args:
            names: the names to look up.
            n: maximum matches per name.
            threads: number of worker processes to use.

        Returns:
            A list with one ``nearest``-style result per input name, in
            input order. Empty input yields ``[]`` without starting a pool.
        """
        if not names:
            return []
        proto_worklist = list_split(names, threads)
        # Workers only iterate the candidate names, so send just the keys
        # instead of pickling the whole dict for every work item.
        candidates = list(self.names)
        worklist = [(x, candidates, n) for x in proto_worklist]
        return list_flatten(self._map_in_pool(worklist, threads))

    def nearest_card(self, card, n=5):
        """Return up to ``n`` ``(ratio, encoded card)`` matches for ``card``.

        ``card`` must provide ``encode()``; its encoded form is compared
        against the encoded forms of all known cards.
        """
        return f_nearest(card.encode(), self.card_matchers, n)

    def nearest_card_par(self, cards, n=5, threads=cores):
        """Parallel version of ``nearest_card`` for a list of cards.

        Args:
            cards: objects providing ``encode()``.
            n: maximum matches per card.
            threads: number of worker processes to use.

        Returns:
            A list with one ``nearest_card``-style result per input card, in
            input order. Empty input yields ``[]`` without starting a pool.
        """
        if not cards:
            return []
        proto_worklist = list_split(cards, threads)
        candidates = list(self.cardstrings.values())
        worklist = [([c.encode() for c in x], candidates, n)
                    for x in proto_worklist]
        return list_flatten(self._map_in_pool(worklist, threads))