collecting changes stranded on leveler

parent 6606a5530e
commit b22dcafc89

5 changed files with 246 additions and 0 deletions
@@ -419,6 +419,7 @@ class Card:
         self.__dict__[field_text] = Manatext('')
         self.__dict__[field_text + '_lines'] = []
         self.__dict__[field_text + '_words'] = []
+        self.__dict__[field_text + '_lines_words'] = []
         self.__dict__[field_other] = []
         self.bside = None
         # format-independent view of processed input
@@ -545,6 +546,9 @@ class Card:
                 self.__dict__[field_text + '_words'] = re.sub(utils.unletters_regex,
                                                               ' ',
                                                               fulltext).split()
+                self.__dict__[field_text + '_lines_words'] = map(
+                    lambda line: re.sub(utils.unletters_regex, ' ', line).split(),
+                    fulltext.split(utils.newline))
             else:
                 self.valid = False
                 self.__dict__[field_other] += [(idx, '<text> ' + str(value))]
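For intuition, the new _lines_words field is a per-line view of the text words: each line of rules text becomes its own word list. A minimal sketch with a made-up two-line text (skipping the unletters_regex cleanup the real code applies):

    lines = 'flying\ndestroy target creature'.split('\n')
    lines_words = map(lambda line: line.split(), lines)
    # [['flying'], ['destroy', 'target', 'creature']]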
@@ -56,6 +56,7 @@ class Namediff:
         self.verbose = verbose
         self.names = {}
         self.codes = {}
+        self.cardstrings = {}

         if self.verbose:
             print 'Setting up namediff...'
@@ -83,6 +84,7 @@ class Namediff:
                 print ' Duplicate name ' + name + ', ignoring.'
             else:
                 self.names[name] = jname
+                self.cardstrings[name] = card.encode()
                 if jcode and jnum:
                     self.codes[name] = jcode + '/' + jnum + '.jpg'
                 else:
@@ -93,6 +95,7 @@ class Namediff:
             print ' Building SequenceMatcher objects.'

         self.matchers = [difflib.SequenceMatcher(b=n, autojunk=False) for n in self.names]
+        self.card_matchers = [difflib.SequenceMatcher(b=self.cardstrings[n], autojunk=False) for n in self.cardstrings]

         print '... Done.'

@@ -105,3 +108,13 @@ class Namediff:
         worklist = map(lambda x: (x, self.names, n), proto_worklist)
         donelist = workpool.map(f_nearest_per_thread, worklist)
         return list_flatten(donelist)
+
+    def nearest_card(self, card, n=5):
+        return f_nearest(card.encode(), self.card_matchers, n)
+
+    def nearest_card_par(self, cards, n=5, threads=cores):
+        workpool = multiprocessing.Pool(threads)
+        proto_worklist = list_split(cards, threads)
+        worklist = map(lambda x: (map(lambda c: c.encode(), x), self.cardstrings.values(), n), proto_worklist)
+        donelist = workpool.map(f_nearest_per_thread, worklist)
+        return list_flatten(donelist)
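A minimal usage sketch for the new card-matching methods (hypothetical variable names; Namediff constructor arguments elided, card/cards as produced elsewhere in the codebase):

    nd = Namediff()
    print nd.nearest_card(card, n=3)           # closest cards by encoded text, via f_nearest
    results = nd.nearest_card_par(cards, n=3)  # same, split across 'cores' worker processes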
81  scripts/keydiff.py  Executable file

@@ -0,0 +1,81 @@
+#!/usr/bin/env python
+
+def parse_keyfile(f, d, constructor = lambda x: x):
+    for line in f:
+        kv = map(lambda s: s.strip(), line.split(':'))
+        if not len(kv) == 2:
+            continue
+        d[kv[0]] = constructor(kv[1])
+
+def merge_dicts(d1, d2):
+    d = {}
+    for k in d1:
+        d[k] = (d1[k], d2[k] if k in d2 else None)
+    for k in d2:
+        if not k in d:
+            d[k] = (None, d2[k])
+    return d
+
+def main(fname1, fname2, verbose = True):
+    if verbose:
+        print 'opening ' + fname1 + ' as base key/value store'
+        print 'opening ' + fname2 + ' as target key/value store'
+
+    d1 = {}
+    d2 = {}
+    with open(fname1, 'rt') as f1:
+        parse_keyfile(f1, d1, int)
+    with open(fname2, 'rt') as f2:
+        parse_keyfile(f2, d2, int)
+
+    tot1 = sum(d1.values())
+    tot2 = sum(d2.values())
+
+    if verbose:
+        print ' ' + fname1 + ': ' + str(len(d1)) + ', total ' + str(tot1)
+        print ' ' + fname2 + ': ' + str(len(d2)) + ', total ' + str(tot2)
+
+    d_merged = merge_dicts(d1, d2)
+
+    ratios = {}
+    only_1 = {}
+    only_2 = {}
+    for k in d_merged:
+        (v1, v2) = d_merged[k]
+        if v1 is None:
+            only_2[k] = v2
+        elif v2 is None:
+            only_1[k] = v1
+        else:
+            ratios[k] = float(v2 * tot1) / float(v1 * tot2)
+
+    print 'shared: ' + str(len(ratios))
+    for k in sorted(ratios, lambda x,y: cmp(d2[x], d2[y]), reverse=True):
+        print ' ' + k + ': ' + str(d2[k]) + '/' + str(d1[k]) + ' (' + str(ratios[k]) + ')'
+    print ''
+
+    print '1 only: ' + str(len(only_1))
+    for k in sorted(only_1, lambda x,y: cmp(d1[x], d1[y]), reverse=True):
+        print ' ' + k + ': ' + str(d1[k])
+    print ''
+
+    print '2 only: ' + str(len(only_2))
+    for k in sorted(only_2, lambda x,y: cmp(d2[x], d2[y]), reverse=True):
+        print ' ' + k + ': ' + str(d2[k])
+    print ''
+
+if __name__ == '__main__':
+
+    import argparse
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument('file1', #nargs='?', default=None,
+                        help='base key file to diff against')
+    parser.add_argument('file2', nargs='?', default=None,
+                        help='other file to compare against the baseline')
+    parser.add_argument('-v', '--verbose', action='store_true',
+                        help='verbose output')
+
+    args = parser.parse_args()
+    main(args.file1, args.file2, verbose=args.verbose)
+    exit(0)
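For reference, keydiff consumes the 'key: count' files that ngrams.py (below) writes, and the ratio float(v2 * tot1) / float(v1 * tot2) equals (v2/tot2) / (v1/tot1), so 1.0 means the key is equally frequent in both files relative to their totals. A sketch of parse_keyfile on such a file (filename and counts invented):

    import keydiff
    d = {}
    with open('counts_a.2g', 'rt') as f:  # lines like 'target creature: 1360'
        keydiff.parse_keyfile(f, d, int)
    # d == {'target creature': 1360, ...}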
85  scripts/ngrams.py  Executable file

@@ -0,0 +1,85 @@
+#!/usr/bin/env python
+import sys
+import os
+
+libdir = os.path.join(os.path.dirname(os.path.realpath(__file__)), '../lib')
+sys.path.append(libdir)
+import jdecode
+
+
+def update_ngrams(lines, gramdict, grams):
+    for line in lines:
+        for i in range(0, len(line) - (grams - 1)):
+            ngram = ' '.join([line[i + j] for j in range(0, grams)])
+            if ngram in gramdict:
+                gramdict[ngram] += 1
+            else:
+                gramdict[ngram] = 1
+
+def describe_bins(gramdict, bins):
+    bins = sorted(bins)
+    counts = [0 for _ in range(0, len(bins) + 1)]
+
+    for ngram in gramdict:
+        for i in range(0, len(bins) + 1):
+            if i < len(bins):
+                if gramdict[ngram] <= bins[i]:
+                    counts[i] += 1
+                    break
+            else:
+                # didn't fit into any of the smaller bins, stick it on the end
+                counts[-1] += 1
+
+    for i in range(0, len(counts)):
+        if counts[i] > 0:
+            print (' ' + (str(bins[i]) if i < len(bins) else str(bins[-1]) + '+')
+                   + ': ' + str(counts[i]))
+
+def main(fname, oname, gmin = 2, gmax = 8, verbose = True):
+    gmin = int(gmin)
+    gmax = int(gmax)
+    bins = [1, 2, 3, 10, 30, 100, 300, 1000]
+    if gmin < 2 or gmax < gmin:
+        print 'invalid gram sizes: ' + str(gmin) + '-' + str(gmax)
+        exit(1)
+
+    # may need to set special arguments here
+    cards = jdecode.mtg_open_file(fname, verbose=verbose)
+
+    for grams in range(gmin, gmax+1):
+        if verbose:
+            print 'generating ' + str(grams) + '-grams...'
+        gramdict = {}
+        for card in cards:
+            update_ngrams(card.text_lines_words, gramdict, grams)
+
+        oname_full = oname + '.' + str(grams) + 'g'
+        if verbose:
+            print ' writing ' + str(len(gramdict)) + ' unique ' + str(grams) + '-grams to ' + oname_full
+            describe_bins(gramdict, bins)
+
+        with open(oname_full, 'wt') as f:
+            for ngram in sorted(gramdict,
+                                lambda x,y: cmp(gramdict[x], gramdict[y]),
+                                reverse = True):
+                f.write((ngram + ': ' + str(gramdict[ngram]) + '\n').encode('utf-8'))
+
+if __name__ == '__main__':
+
+    import argparse
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument('infile', #nargs='?', default=None,
+                        help='encoded card file or json corpus to process')
+    parser.add_argument('outfile', #nargs='?', default=None,
+                        help='base name of output file, outputs ending in .2g, .3g etc. will be produced')
+    parser.add_argument('-min', '--min', action='store', default='2',
+                        help='minimum gram size to compute')
+    parser.add_argument('-max', '--max', action='store', default='8',
+                        help='maximum gram size to compute')
+    parser.add_argument('-v', '--verbose', action='store_true',
+                        help='verbose output')
+
+    args = parser.parse_args()
+    main(args.infile, args.outfile, gmin=args.min, gmax=args.max, verbose=args.verbose)
+    exit(0)
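As a worked example of update_ngrams (input invented): with grams=2, a line of w words contributes w-1 bigrams, so

    gramdict = {}
    update_ngrams([['destroy', 'target', 'creature']], gramdict, 2)
    # gramdict == {'destroy target': 1, 'target creature': 1}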
@@ -9,6 +9,37 @@ sys.path.append(libdir)
 import utils
 import jdecode
+
+datadir = os.path.realpath(os.path.join(libdir, '../data'))
+gramdir = os.path.join(datadir, 'ngrams')
+compute_ngrams = False
+gramdicts = {}
+if os.path.isdir(gramdir):
+    import keydiff
+    compute_ngrams = True
+    for fname in os.listdir(gramdir):
+        suffixes = re.findall(r'\.[0-9]*g$', fname)
+        if suffixes:
+            grams = int(suffixes[0][1:-1])
+            d = {}
+            with open(os.path.join(gramdir, fname), 'rt') as f:
+                keydiff.parse_keyfile(f, d, int)
+            gramdicts[grams] = d
+
+def rare_grams(card, thresh = 2, grams = 2):
+    if not grams in gramdicts:
+        return None
+    rares = 0
+    gramdict = gramdicts[grams]
+    for line in card.text_lines_words:
+        for i in range(0, len(line) - (grams - 1)):
+            ngram = ' '.join([line[i + j] for j in range(0, grams)])
+            if ngram in gramdict:
+                if gramdict[ngram] < thresh:
+                    rares += 1
+            else:
+                rares += 1
+    return rares
+
 def list_only(l, items):
     for e in l:
         if not e in items:

@@ -130,6 +161,38 @@ values = OrderedDict([(k, (0,0,0)) for k in props])
 def main(fname, oname = None, verbose = True):
     # may need to set special arguments here
     cards = jdecode.mtg_open_file(fname, verbose=verbose)
+
+    rg = {}
+    for card in cards:
+        g = rare_grams(card, thresh=2, grams=2)
+        if len(card.text_words) > 0:
+            g = int(1.0 + (float(g) * 100.0 / float(len(card.text_words))))
+        if g in rg:
+            rg[g] += 1
+        else:
+            rg[g] = 1
+        if g >= 60:
+            print g
+            print card.format()
+
+    tot = 0
+    vmax = sum(rg.values())
+    pct90 = None
+    pct95 = None
+    pct99 = None
+    for i in sorted(rg):
+        print str(i) + ' rare ngrams: ' + str(rg[i])
+        tot += rg[i]
+        if pct90 is None and tot >= vmax * 0.90:
+            pct90 = i
+        if pct95 is None and tot >= vmax * 0.95:
+            pct95 = i
+        if pct99 is None and tot >= vmax * 0.99:
+            pct99 = i
+
+    print '90% - ' + str(pct90)
+    print '95% - ' + str(pct95)
+    print '99% - ' + str(pct99)
+    exit(0)
+
     for card in cards:
         for prop in props:
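To make the scaling in main concrete (numbers invented): rare_grams counts n-grams seen fewer than thresh times in the reference n-gram data, and the normalization turns that into roughly 'rare bigrams per 100 words of card text, plus one'; a card with 4 rare bigrams across 20 text words scores int(1.0 + 4 * 100.0 / 20) = 21. The loop that follows then reports the smallest scores at which the running total crosses 90%, 95%, and 99% of all cards.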