From b22dcafc89bad76aa4349d83858296ff19df8c2d Mon Sep 17 00:00:00 2001
From: Bill Zorn
Date: Tue, 1 Dec 2015 13:36:47 -0800
Subject: [PATCH] collecting changes stranded on leveler

---
 lib/cardlib.py      |  4 +++
 lib/namediff.py     | 13 +++++++
 scripts/keydiff.py  | 81 ++++++++++++++++++++++++++++++++++++++++++
 scripts/ngrams.py   | 85 +++++++++++++++++++++++++++++++++++++++++++++
 scripts/validate.py | 63 +++++++++++++++++++++++++++++++++
 5 files changed, 246 insertions(+)
 create mode 100755 scripts/keydiff.py
 create mode 100755 scripts/ngrams.py

diff --git a/lib/cardlib.py b/lib/cardlib.py
index 2f2cf77..2aa89f3 100644
--- a/lib/cardlib.py
+++ b/lib/cardlib.py
@@ -419,6 +419,7 @@ class Card:
         self.__dict__[field_text] = Manatext('')
         self.__dict__[field_text + '_lines'] = []
         self.__dict__[field_text + '_words'] = []
+        self.__dict__[field_text + '_lines_words'] = []
         self.__dict__[field_other] = []
         self.bside = None
         # format-independent view of processed input
@@ -545,6 +546,9 @@ class Card:
             self.__dict__[field_text + '_words'] = re.sub(utils.unletters_regex,
                                                           ' ',
                                                           fulltext).split()
+            self.__dict__[field_text + '_lines_words'] = map(
+                lambda line: re.sub(utils.unletters_regex, ' ', line).split(),
+                fulltext.split(utils.newline))
         else:
             self.valid = False
             self.__dict__[field_other] += [(idx, ' ' + str(value))]
diff --git a/lib/namediff.py b/lib/namediff.py
index 0e23783..14e341f 100644
--- a/lib/namediff.py
+++ b/lib/namediff.py
@@ -56,6 +56,7 @@ class Namediff:
         self.verbose = verbose
         self.names = {}
         self.codes = {}
+        self.cardstrings = {}
 
         if self.verbose:
             print 'Setting up namediff...'
@@ -83,6 +84,7 @@ class Namediff:
                 print ' Duplicate name ' + name + ', ignoring.'
             else:
                 self.names[name] = jname
+                self.cardstrings[name] = card.encode()
                 if jcode and jnum:
                     self.codes[name] = jcode + '/' + jnum + '.jpg'
                 else:
@@ -93,6 +95,7 @@ class Namediff:
             print ' Building SequenceMatcher objects.'
 
         self.matchers = [difflib.SequenceMatcher(b=n, autojunk=False) for n in self.names]
+        self.card_matchers = [difflib.SequenceMatcher(b=self.cardstrings[n], autojunk=False) for n in self.cardstrings]
 
         print '... Done.'
@@ -105,3 +108,13 @@ class Namediff:
         worklist = map(lambda x: (x, self.names, n), proto_worklist)
         donelist = workpool.map(f_nearest_per_thread, worklist)
         return list_flatten(donelist)
+
+    def nearest_card(self, card, n=5):
+        return f_nearest(card.encode(), self.card_matchers, n)
+
+    def nearest_card_par(self, cards, n=5, threads=cores):
+        workpool = multiprocessing.Pool(threads)
+        proto_worklist = list_split(cards, threads)
+        worklist = map(lambda x: (map(lambda c: c.encode(), x), self.cardstrings.values(), n), proto_worklist)
+        donelist = workpool.map(f_nearest_per_thread, worklist)
+        return list_flatten(donelist)
diff --git a/scripts/keydiff.py b/scripts/keydiff.py
new file mode 100755
index 0000000..ab818f0
--- /dev/null
+++ b/scripts/keydiff.py
@@ -0,0 +1,81 @@
+#!/usr/bin/env python
+
+def parse_keyfile(f, d, constructor = lambda x: x):
+    for line in f:
+        kv = map(lambda s: s.strip(), line.split(':'))
+        if not len(kv) == 2:
+            continue
+        d[kv[0]] = constructor(kv[1])
+
+def merge_dicts(d1, d2):
+    d = {}
+    for k in d1:
+        d[k] = (d1[k], d2[k] if k in d2 else None)
+    for k in d2:
+        if not k in d:
+            d[k] = (None, d2[k])
+    return d
+
+def main(fname1, fname2, verbose = True):
+    if verbose:
+        print 'opening ' + fname1 + ' as base key/value store'
+        print 'opening ' + fname2 + ' as target key/value store'
+
+    d1 = {}
+    d2 = {}
+    with open(fname1, 'rt') as f1:
+        parse_keyfile(f1, d1, int)
+    with open(fname2, 'rt') as f2:
+        parse_keyfile(f2, d2, int)
+
+    tot1 = sum(d1.values())
+    tot2 = sum(d2.values())
+
+    if verbose:
+        print ' ' + fname1 + ': ' + str(len(d1)) + ', total ' + str(tot1)
+        print ' ' + fname2 + ': ' + str(len(d2)) + ', total ' + str(tot2)
+
+    d_merged = merge_dicts(d1, d2)
+
+    ratios = {}
+    only_1 = {}
+    only_2 = {}
+    for k in d_merged:
+        (v1, v2) = d_merged[k]
+        if v1 is None:
+            only_2[k] = v2
+        elif v2 is None:
+            only_1[k] = v1
+        else:
+            ratios[k] = float(v2 * tot1) / float(v1 * tot2)
+
+    print 'shared: ' + str(len(ratios))
+    for k in sorted(ratios, lambda x,y: cmp(d2[x], d2[y]), reverse=True):
+        print ' ' + k + ': ' + str(d2[k]) + '/' + str(d1[k]) + ' (' + str(ratios[k]) + ')'
+    print ''
+
+    print '1 only: ' + str(len(only_1))
+    for k in sorted(only_1, lambda x,y: cmp(d1[x], d1[y]), reverse=True):
+        print ' ' + k + ': ' + str(d1[k])
+    print ''
+
+    print '2 only: ' + str(len(only_2))
+    for k in sorted(only_2, lambda x,y: cmp(d2[x], d2[y]), reverse=True):
+        print ' ' + k + ': ' + str(d2[k])
+    print ''
+
+if __name__ == '__main__':
+
+    import argparse
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument('file1', #nargs='?'. default=None,
+                        help='base key file to diff against')
+    parser.add_argument('file2', nargs='?', default=None,
+                        help='other file to compare against the baseline')
+    parser.add_argument('-v', '--verbose', action='store_true',
+                        help='verbose output')
+
+    args = parser.parse_args()
+    main(args.file1, args.file2, verbose=args.verbose)
+    exit(0)
diff --git a/scripts/ngrams.py b/scripts/ngrams.py
new file mode 100755
index 0000000..8c64ae6
--- /dev/null
+++ b/scripts/ngrams.py
@@ -0,0 +1,85 @@
+#!/usr/bin/env python
+import sys
+import os
+
+libdir = os.path.join(os.path.dirname(os.path.realpath(__file__)), '../lib')
+sys.path.append(libdir)
+import jdecode
+
+
+def update_ngrams(lines, gramdict, grams):
+    for line in lines:
+        for i in range(0, len(line) - (grams - 1)):
+            ngram = ' '.join([line[i + j] for j in range(0, grams)])
+            if ngram in gramdict:
+                gramdict[ngram] += 1
+            else:
+                gramdict[ngram] = 1
+
+def describe_bins(gramdict, bins):
+    bins = sorted(bins)
+    counts = [0 for _ in range(0, len(bins) + 1)]
+
+    for ngram in gramdict:
+        for i in range(0, len(bins) + 1):
+            if i < len(bins):
+                if gramdict[ngram] <= bins[i]:
+                    counts[i] += 1
+                    break
+            else:
+                # didn't fit into any of the smaller bins, stick in on the end
+                counts[-1] += 1
+
+    for i in range(0, len(counts)):
+        if counts[i] > 0:
+            print (' ' + (str(bins[i]) if i < len(bins) else str(bins[-1]) + '+')
+                   + ': ' + str(counts[i]))
+
+def main(fname, oname, gmin = 2, gmax = 8, verbose = True):
+    gmin = int(gmin)
+    gmax = int(gmax)
+    bins = [1, 2, 3, 10, 30, 100, 300, 1000]
+    if gmin < 2 or gmax < gmin:
+        print 'invalid gram sizes: ' + str(gmin) + '-' + str(gmax)
+        exit(1)
+
+    # may need to set special arguments here
+    cards = jdecode.mtg_open_file(fname, verbose=verbose)
+
+    for grams in range(gmin, gmax+1):
+        if verbose:
+            print 'generating ' + str(grams) + '-grams...'
+        gramdict = {}
+        for card in cards:
+            update_ngrams(card.text_lines_words, gramdict, grams)
+
+        oname_full = oname + '.' + str(grams) + 'g'
+        if verbose:
+            print ' writing ' + str(len(gramdict)) + ' unique ' + str(grams) + '-grams to ' + oname_full
+            describe_bins(gramdict, bins)
+
+        with open(oname_full, 'wt') as f:
+            for ngram in sorted(gramdict,
+                                lambda x,y: cmp(gramdict[x], gramdict[y]),
+                                reverse = True):
+                f.write((ngram + ': ' + str(gramdict[ngram]) + '\n').encode('utf-8'))
+
+if __name__ == '__main__':
+
+    import argparse
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument('infile', #nargs='?'. default=None,
+                        help='encoded card file or json corpus to process')
+    parser.add_argument('outfile', #nargs='?', default=None,
+                        help='base name of output file, outputs ending in .2g, .3g etc. will be produced')
+    parser.add_argument('-min', '--min', action='store', default='2',
+                        help='minimum gram size to compute')
+    parser.add_argument('-max', '--max', action='store', default='8',
+                        help='maximum gram size to compute')
+    parser.add_argument('-v', '--verbose', action='store_true',
+                        help='verbose output')
+
+    args = parser.parse_args()
+    main(args.infile, args.outfile, gmin=args.min, gmax=args.max, verbose=args.verbose)
+    exit(0)
diff --git a/scripts/validate.py b/scripts/validate.py
index 3c7793a..0e7f799 100755
--- a/scripts/validate.py
+++ b/scripts/validate.py
@@ -9,6 +9,37 @@ sys.path.append(libdir)
 import utils
 import jdecode
 
+datadir = os.path.realpath(os.path.join(libdir, '../data'))
+gramdir = os.path.join(datadir, 'ngrams')
+compute_ngrams = False
+gramdicts = {}
+if os.path.isdir(gramdir):
+    import keydiff
+    compute_ngrams = True
+    for fname in os.listdir(gramdir):
+        suffixes = re.findall(r'\.[0-9]*g$', fname)
+        if suffixes:
+            grams = int(suffixes[0][1:-1])
+            d = {}
+            with open(os.path.join(gramdir, fname), 'rt') as f:
+                keydiff.parse_keyfile(f, d, int)
+            gramdicts[grams] = d
+
+def rare_grams(card, thresh = 2, grams = 2):
+    if not grams in gramdicts:
+        return None
+    rares = 0
+    gramdict = gramdicts[grams]
+    for line in card.text_lines_words:
+        for i in range(0, len(line) - (grams - 1)):
+            ngram = ' '.join([line[i + j] for j in range(0, grams)])
+            if ngram in gramdict:
+                if gramdict[ngram] < thresh:
+                    rares += 1
+            else:
+                rares += 1
+    return rares
+
 def list_only(l, items):
     for e in l:
         if not e in items:
@@ -130,6 +161,38 @@ values = OrderedDict([(k, (0,0,0)) for k in props])
 def main(fname, oname = None, verbose = True):
     # may need to set special arguments here
     cards = jdecode.mtg_open_file(fname, verbose=verbose)
+    rg = {}
+    for card in cards:
+        g = rare_grams(card, thresh=2, grams=2)
+        if len(card.text_words) > 0:
+            g = int(1.0 + (float(g) * 100.0 / float(len(card.text_words))))
+        if g in rg:
+            rg[g] += 1
+        else:
+            rg[g] = 1
+        if g >= 60:
+            print g
+            print card.format()
+
+    tot = 0
+    vmax = sum(rg.values())
+    pct90 = None
+    pct95 = None
+    pct99 = None
+    for i in sorted(rg):
+        print str(i) + ' rare ngrams: ' + str(rg[i])
+        tot += rg[i]
+        if pct90 is None and tot >= vmax * 0.90:
+            pct90 = i
+        if pct95 is None and tot >= vmax * 0.95:
+            pct95 = i
+        if pct99 is None and tot >= vmax * 0.99:
+            pct99 = i
+
+    print '90% - ' + str(pct90)
+    print '95% - ' + str(pct95)
+    print '99% - ' + str(pct99)
+    exit(0)
 
     for card in cards:
         for prop in props:
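
Note (not part of the commit above): keydiff.py is written to be importable as a module as well as runnable as a script, which is how the new code in validate.py consumes it. A minimal sketch of that library-style use follows; the two input file names are hypothetical, but the 'key: value' line format is exactly what ngrams.py writes out, and the snippet assumes it is run from the scripts directory so that 'import keydiff' resolves.

    import keydiff

    d1 = {}
    d2 = {}
    # parse_keyfile fills a dict from 'key: value' lines; int converts the counts
    with open('ngrams_a.2g', 'rt') as f:
        keydiff.parse_keyfile(f, d1, int)
    with open('ngrams_b.2g', 'rt') as f:
        keydiff.parse_keyfile(f, d2, int)

    # merge_dicts pairs values by key, substituting None where a key is missing
    for k, (v1, v2) in keydiff.merge_dicts(d1, d2).items():
        if v1 is None or v2 is None:
            print k + ' is only counted in one of the two files'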