collecting changes stranded on leveler

This commit is contained in:
Bill Zorn 2015-12-01 13:36:47 -08:00
parent 6606a5530e
commit b22dcafc89
5 changed files with 246 additions and 0 deletions

View file

@ -419,6 +419,7 @@ class Card:
self.__dict__[field_text] = Manatext('')
self.__dict__[field_text + '_lines'] = []
self.__dict__[field_text + '_words'] = []
self.__dict__[field_text + '_lines_words'] = []
self.__dict__[field_other] = []
self.bside = None
# format-independent view of processed input
@ -545,6 +546,9 @@ class Card:
self.__dict__[field_text + '_words'] = re.sub(utils.unletters_regex,
' ',
fulltext).split()
self.__dict__[field_text + '_lines_words'] = map(
lambda line: re.sub(utils.unletters_regex, ' ', line).split(),
fulltext.split(utils.newline))
else:
self.valid = False
self.__dict__[field_other] += [(idx, '<text> ' + str(value))]

View file

@ -56,6 +56,7 @@ class Namediff:
self.verbose = verbose
self.names = {}
self.codes = {}
self.cardstrings = {}
if self.verbose:
print 'Setting up namediff...'
@ -83,6 +84,7 @@ class Namediff:
print ' Duplicate name ' + name + ', ignoring.'
else:
self.names[name] = jname
self.cardstrings[name] = card.encode()
if jcode and jnum:
self.codes[name] = jcode + '/' + jnum + '.jpg'
else:
@ -93,6 +95,7 @@ class Namediff:
print ' Building SequenceMatcher objects.'
self.matchers = [difflib.SequenceMatcher(b=n, autojunk=False) for n in self.names]
self.card_matchers = [difflib.SequenceMatcher(b=self.cardstrings[n], autojunk=False) for n in self.cardstrings]
print '... Done.'
@ -105,3 +108,13 @@ class Namediff:
worklist = map(lambda x: (x, self.names, n), proto_worklist)
donelist = workpool.map(f_nearest_per_thread, worklist)
return list_flatten(donelist)
def nearest_card(self, card, n=5):
    # Find the n closest known cards to this card's encoded text, using
    # the SequenceMatcher objects prepared when the Namediff was built.
    encoded = card.encode()
    return f_nearest(encoded, self.card_matchers, n)
def nearest_card_par(self, cards, n=5, threads=cores):
    # Parallel version of nearest_card: fan the cards out over a process pool
    # and flatten the per-worker results back into one list.
    workpool = multiprocessing.Pool(threads)
    proto_worklist = list_split(cards, threads)
    # Each work item bundles (encoded card chunk, all known card strings, n).
    # NOTE(review): this hands the workers self.cardstrings.values() rather
    # than the prebuilt self.card_matchers -- presumably f_nearest_per_thread
    # constructs its own matchers; confirm against its definition.
    worklist = map(lambda x: (map(lambda c: c.encode(), x), self.cardstrings.values(), n), proto_worklist)
    donelist = workpool.map(f_nearest_per_thread, worklist)
    return list_flatten(donelist)

81
scripts/keydiff.py Executable file
View file

@ -0,0 +1,81 @@
#!/usr/bin/env python
def parse_keyfile(f, d, constructor = lambda x: x):
    """Parse 'key: value' lines from f into dict d.

    f: any iterable of lines (open file, list of strings).
    d: dict updated in place; later duplicate keys overwrite earlier ones.
    constructor: applied to each raw value string (e.g. int); default identity.

    Lines that do not split into exactly one key and one value are skipped.
    """
    for line in f:
        # List comprehension instead of map(): calling len() on a map object
        # only works in Python 2, where map returns a list.
        fields = [field.strip() for field in line.split(':')]
        if len(fields) != 2:
            continue
        d[fields[0]] = constructor(fields[1])
def merge_dicts(d1, d2):
    """Combine two dicts into one mapping key -> (d1 value, d2 value).

    A key missing from either input gets None in that slot of the pair.
    """
    merged = {}
    for key in d1:
        # d2.get(key) yields None when the key is absent from d2.
        merged[key] = (d1[key], d2.get(key))
    for key in d2:
        if key not in merged:
            merged[key] = (None, d2[key])
    return merged
def main(fname1, fname2, verbose = True):
    """Diff two 'key: int' count files and report shared/unique keys.

    Prints shared keys with their counts and a total-normalized frequency
    ratio, then the keys unique to each file, each section sorted by
    descending count.

    NOTE(review): fname2 arrives as None when the CLI's optional second
    argument is omitted, which makes open() fail below -- confirm intent.
    """
    if verbose:
        print 'opening ' + fname1 + ' as base key/value store'
        print 'opening ' + fname2 + ' as target key/value store'
    # Parse both files into key -> int dicts.
    d1 = {}
    d2 = {}
    with open(fname1, 'rt') as f1:
        parse_keyfile(f1, d1, int)
    with open(fname2, 'rt') as f2:
        parse_keyfile(f2, d2, int)
    # Grand totals, used to normalize the per-key ratios below.
    tot1 = sum(d1.values())
    tot2 = sum(d2.values())
    if verbose:
        print ' ' + fname1 + ': ' + str(len(d1)) + ', total ' + str(tot1)
        print ' ' + fname2 + ': ' + str(len(d2)) + ', total ' + str(tot2)
    d_merged = merge_dicts(d1, d2)
    # Partition merged keys: shared (with ratio), only in file 1, only in file 2.
    ratios = {}
    only_1 = {}
    only_2 = {}
    for k in d_merged:
        (v1, v2) = d_merged[k]
        if v1 is None:
            only_2[k] = v2
        elif v2 is None:
            only_1[k] = v1
        else:
            # Relative frequency of k in file 2 vs file 1, normalized by totals.
            ratios[k] = float(v2 * tot1) / float(v1 * tot2)
    print 'shared: ' + str(len(ratios))
    # Python 2 cmp-style sort: descending by raw count in the target file.
    for k in sorted(ratios, lambda x,y: cmp(d2[x], d2[y]), reverse=True):
        print ' ' + k + ': ' + str(d2[k]) + '/' + str(d1[k]) + ' (' + str(ratios[k]) + ')'
    print ''
    print '1 only: ' + str(len(only_1))
    for k in sorted(only_1, lambda x,y: cmp(d1[x], d1[y]), reverse=True):
        print ' ' + k + ': ' + str(d1[k])
    print ''
    print '2 only: ' + str(len(only_2))
    for k in sorted(only_2, lambda x,y: cmp(d2[x], d2[y]), reverse=True):
        print ' ' + k + ': ' + str(d2[k])
    print ''
if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser()
    # NOTE(review): file2 is optional (defaults to None) but main() opens it
    # unconditionally and will crash on None -- confirm intended usage.
    parser.add_argument('file1', #nargs='?', default=None,
                        help='base key file to diff against')
    parser.add_argument('file2', nargs='?', default=None,
                        help='other file to compare against the baseline')
    parser.add_argument('-v', '--verbose', action='store_true',
                        help='verbose output')
    args = parser.parse_args()
    main(args.file1, args.file2, verbose=args.verbose)
    exit(0)

85
scripts/ngrams.py Executable file
View file

@ -0,0 +1,85 @@
#!/usr/bin/env python
import sys
import os
libdir = os.path.join(os.path.dirname(os.path.realpath(__file__)), '../lib')
sys.path.append(libdir)
import jdecode
def update_ngrams(lines, gramdict, grams):
    """Count every contiguous run of `grams` words into gramdict.

    lines: iterable of word lists (one list per text line).
    gramdict: dict mapping ngram string -> occurrence count, updated in place.
    grams: ngram size; lines shorter than `grams` words contribute nothing.
    """
    for line in lines:
        for i in range(0, len(line) - (grams - 1)):
            # Slice-and-join is equivalent to the indexed comprehension
            # but clearer; dict.get folds the in/not-in branches into one.
            ngram = ' '.join(line[i:i + grams])
            gramdict[ngram] = gramdict.get(ngram, 0) + 1
def describe_bins(gramdict, bins):
    """Print a histogram of ngram counts bucketed by the given thresholds.

    Each count lands in the first bin whose threshold it does not exceed;
    counts above every threshold go in a trailing overflow bucket.
    """
    bins = sorted(bins)
    counts = [0] * (len(bins) + 1)
    for count in gramdict.values():
        for slot, limit in enumerate(bins):
            if count <= limit:
                counts[slot] += 1
                break
        else:
            # Larger than every threshold: accumulate in the overflow bucket.
            counts[-1] += 1
    for slot, tally in enumerate(counts):
        if tally > 0:
            label = str(bins[slot]) if slot < len(bins) else str(bins[-1]) + '+'
            print (' ' + label + ': ' + str(tally))
def main(fname, oname, gmin = 2, gmax = 8, verbose = True):
    """Compute ngram frequency tables from a card corpus.

    For each gram size from gmin to gmax, writes 'ngram: count' lines
    (descending by count) to oname.<size>g.

    gmin/gmax arrive as strings from argparse, hence the int() coercion.
    """
    gmin = int(gmin)
    gmax = int(gmax)
    # Histogram thresholds reported by describe_bins in verbose mode.
    bins = [1, 2, 3, 10, 30, 100, 300, 1000]
    if gmin < 2 or gmax < gmin:
        print 'invalid gram sizes: ' + str(gmin) + '-' + str(gmax)
        exit(1)
    # may need to set special arguments here
    cards = jdecode.mtg_open_file(fname, verbose=verbose)
    for grams in range(gmin, gmax+1):
        if verbose:
            print 'generating ' + str(grams) + '-grams...'
        gramdict = {}
        for card in cards:
            # text_lines_words: per-line word lists produced by Card parsing.
            update_ngrams(card.text_lines_words, gramdict, grams)
        oname_full = oname + '.' + str(grams) + 'g'
        if verbose:
            print ' writing ' + str(len(gramdict)) + ' unique ' + str(grams) + '-grams to ' + oname_full
            describe_bins(gramdict, bins)
        with open(oname_full, 'wt') as f:
            # Python 2 cmp-style sort: descending by occurrence count.
            for ngram in sorted(gramdict,
                                lambda x,y: cmp(gramdict[x], gramdict[y]),
                                reverse = True):
                # .encode('utf-8') on write: Python 2 byte-string output --
                # presumably ngrams may contain non-ASCII text; confirm.
                f.write((ngram + ': ' + str(gramdict[ngram]) + '\n').encode('utf-8'))
if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('infile', #nargs='?', default=None,
                        help='encoded card file or json corpus to process')
    parser.add_argument('outfile', #nargs='?', default=None,
                        help='base name of output file, outputs ending in .2g, .3g etc. will be produced')
    # Sizes stay as strings here; main() coerces them with int().
    parser.add_argument('-min', '--min', action='store', default='2',
                        help='minimum gram size to compute')
    parser.add_argument('-max', '--max', action='store', default='8',
                        help='maximum gram size to compute')
    parser.add_argument('-v', '--verbose', action='store_true',
                        help='verbose output')
    args = parser.parse_args()
    main(args.infile, args.outfile, gmin=args.min, gmax=args.max, verbose=args.verbose)
    exit(0)

View file

@ -9,6 +9,37 @@ sys.path.append(libdir)
import utils
import jdecode
datadir = os.path.realpath(os.path.join(libdir, '../data'))
# Load precomputed ngram count tables (written by scripts/ngrams.py) if the
# data/ngrams directory exists; otherwise ngram-based checks stay disabled.
gramdir = os.path.join(datadir, 'ngrams')
compute_ngrams = False
# Maps gram size (e.g. 2, 3) -> dict of ngram string -> count.
gramdicts = {}
if os.path.isdir(gramdir):
    import keydiff
    compute_ngrams = True
    for fname in os.listdir(gramdir):
        # Files are named <base>.<N>g, e.g. 'corpus.2g'.
        suffixes = re.findall(r'\.[0-9]*g$', fname)
        if suffixes:
            # Strip the leading '.' and trailing 'g' to get the gram size.
            grams = int(suffixes[0][1:-1])
            d = {}
            with open(os.path.join(gramdir, fname), 'rt') as f:
                keydiff.parse_keyfile(f, d, int)
            gramdicts[grams] = d
def rare_grams(card, thresh = 2, grams = 2):
    """Count ngrams in the card's text that are rare in the reference corpus.

    An ngram is rare if its corpus count is below thresh, or if it never
    appears in the corpus at all. Returns None when no table was loaded
    for the requested gram size.
    """
    if grams not in gramdicts:
        return None
    gramdict = gramdicts[grams]
    rares = 0
    for line in card.text_lines_words:
        for start in range(0, len(line) - (grams - 1)):
            ngram = ' '.join(line[start:start + grams])
            if ngram not in gramdict:
                # Never seen in the reference corpus: always rare.
                rares += 1
            elif gramdict[ngram] < thresh:
                rares += 1
    return rares
def list_only(l, items):
for e in l:
if not e in items:
@ -130,6 +161,38 @@ values = OrderedDict([(k, (0,0,0)) for k in props])
def main(fname, oname = None, verbose = True):
# may need to set special arguments here
cards = jdecode.mtg_open_file(fname, verbose=verbose)
rg = {}
for card in cards:
g = rare_grams(card, thresh=2, grams=2)
if len(card.text_words) > 0:
g = int(1.0 + (float(g) * 100.0 / float(len(card.text_words))))
if g in rg:
rg[g] += 1
else:
rg[g] = 1
if g >= 60:
print g
print card.format()
tot = 0
vmax = sum(rg.values())
pct90 = None
pct95 = None
pct99 = None
for i in sorted(rg):
print str(i) + ' rare ngrams: ' + str(rg[i])
tot += rg[i]
if pct90 is None and tot >= vmax * 0.90:
pct90 = i
if pct95 is None and tot >= vmax * 0.95:
pct95 = i
if pct99 is None and tot >= vmax * 0.99:
pct99 = i
print '90% - ' + str(pct90)
print '95% - ' + str(pct95)
print '99% - ' + str(pct99)
exit(0)
for card in cards:
for prop in props: