# This module is misleadingly named, as it has other utilities as well # that are generally necessary when trying to postprocess output by # comparing it against existing cards. import difflib import os import multiprocessing import utils import jdecode import cardlib libdir = os.path.dirname(os.path.realpath(__file__)) datadir = os.path.realpath(os.path.join(libdir, '../data')) # multithreading control parameters cores = multiprocessing.cpu_count() # split a list into n pieces; return a list of these lists # has slightly interesting behavior, in that if n is large, it can # run out of elements early and return less than n lists def list_split(l, n): if n <= 0: return l split_size = len(l) / n if len(l) % n > 0: split_size += 1 return [l[i:i+split_size] for i in range(0, len(l), split_size)] # flatten a list of lists into a single list of all their contents, in order def list_flatten(l): return [item for sublist in l for item in sublist] # isolated logic for multiprocessing def f_nearest(name, matchers, n): for m in matchers: m.set_seq1(name) ratios = [(m.ratio(), m.b) for m in matchers] ratios.sort(reverse = True) if ratios[0][0] >= 1: return ratios[:1] else: return ratios[:n] def f_nearest_per_thread(workitem): (worknames, names, n) = workitem # each thread (well, process) needs to generate its own matchers matchers = [difflib.SequenceMatcher(b=name, autojunk=False) for name in names] return map(lambda name: f_nearest(name, matchers, n), worknames) class Namediff: def __init__(self, verbose = True, json_fname = os.path.join(datadir, 'AllSets.json')): self.verbose = verbose self.names = {} self.codes = {} self.cardstrings = {} if self.verbose: print 'Setting up namediff...' if self.verbose: print ' Reading names from: ' + json_fname json_srcs = jdecode.mtg_open_json(json_fname, verbose) namecount = 0 for json_cardname in sorted(json_srcs): if len(json_srcs[json_cardname]) > 0: jcards = json_srcs[json_cardname] # just use the first one idx = 0 card = cardlib.Card(jcards[idx]) name = card.name jname = jcards[idx]['name'] jcode = jcards[idx][utils.json_field_info_code] if 'number' in jcards[idx]: jnum = jcards[idx]['number'] else: jnum = '' if name in self.names: print ' Duplicate name ' + name + ', ignoring.' else: self.names[name] = jname self.cardstrings[name] = card.encode() if jcode and jnum: self.codes[name] = jcode + '/' + jnum + '.jpg' else: self.codes[name] = '' namecount += 1 print ' Read ' + str(namecount) + ' unique cardnames' print ' Building SequenceMatcher objects.' self.matchers = [difflib.SequenceMatcher(b=n, autojunk=False) for n in self.names] self.card_matchers = [difflib.SequenceMatcher(b=self.cardstrings[n], autojunk=False) for n in self.cardstrings] print '... Done.' def nearest(self, name, n=3): return f_nearest(name, self.matchers, n) def nearest_par(self, names, n=3, threads=cores): workpool = multiprocessing.Pool(threads) proto_worklist = list_split(names, threads) worklist = map(lambda x: (x, self.names, n), proto_worklist) donelist = workpool.map(f_nearest_per_thread, worklist) return list_flatten(donelist) def nearest_card(self, card, n=5): return f_nearest(card.encode(), self.card_matchers, n) def nearest_card_par(self, cards, n=5, threads=cores): workpool = multiprocessing.Pool(threads) proto_worklist = list_split(cards, threads) worklist = map(lambda x: (map(lambda c: c.encode(), x), self.cardstrings.values(), n), proto_worklist) donelist = workpool.map(f_nearest_per_thread, worklist) return list_flatten(donelist)