collecting changes stranded on leveler
parent 6606a5530e
commit b22dcafc89

5 changed files with 246 additions and 0 deletions
@@ -419,6 +419,7 @@ class Card:
        self.__dict__[field_text] = Manatext('')
        self.__dict__[field_text + '_lines'] = []
        self.__dict__[field_text + '_words'] = []
        self.__dict__[field_text + '_lines_words'] = []
        self.__dict__[field_other] = []
        self.bside = None
        # format-independent view of processed input
@@ -545,6 +546,9 @@ class Card:
                self.__dict__[field_text + '_words'] = re.sub(utils.unletters_regex,
                                                              ' ',
                                                              fulltext).split()
                self.__dict__[field_text + '_lines_words'] = map(
                    lambda line: re.sub(utils.unletters_regex, ' ', line).split(),
                    fulltext.split(utils.newline))
            else:
                self.valid = False
                self.__dict__[field_other] += [(idx, '<text> ' + str(value))]
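For reference, a minimal sketch of what the new *_lines_words view holds, with a plain regex and '\n' standing in for utils.unletters_regex and utils.newline (both stand-ins are assumptions, not taken from this diff):

import re
fulltext = 'flying\ndestroy target creature.'
print re.sub(r'[^a-z ]', ' ', fulltext).split()
# -> ['flying', 'destroy', 'target', 'creature']
print map(lambda line: re.sub(r'[^a-z ]', ' ', line).split(), fulltext.split('\n'))
# -> [['flying'], ['destroy', 'target', 'creature']]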
@@ -56,6 +56,7 @@ class Namediff:
        self.verbose = verbose
        self.names = {}
        self.codes = {}
        self.cardstrings = {}

        if self.verbose:
            print 'Setting up namediff...'

@@ -83,6 +84,7 @@ class Namediff:
                print ' Duplicate name ' + name + ', ignoring.'
            else:
                self.names[name] = jname
                self.cardstrings[name] = card.encode()
                if jcode and jnum:
                    self.codes[name] = jcode + '/' + jnum + '.jpg'
                else:

@@ -93,6 +95,7 @@ class Namediff:
            print ' Building SequenceMatcher objects.'

        self.matchers = [difflib.SequenceMatcher(b=n, autojunk=False) for n in self.names]
        self.card_matchers = [difflib.SequenceMatcher(b=self.cardstrings[n], autojunk=False) for n in self.cardstrings]

        print '... Done.'

@@ -105,3 +108,13 @@ class Namediff:
        worklist = map(lambda x: (x, self.names, n), proto_worklist)
        donelist = workpool.map(f_nearest_per_thread, worklist)
        return list_flatten(donelist)

    def nearest_card(self, card, n=5):
        return f_nearest(card.encode(), self.card_matchers, n)

    def nearest_card_par(self, cards, n=5, threads=cores):
        workpool = multiprocessing.Pool(threads)
        proto_worklist = list_split(cards, threads)
        worklist = map(lambda x: (map(lambda c: c.encode(), x), self.cardstrings.values(), n), proto_worklist)
        donelist = workpool.map(f_nearest_per_thread, worklist)
        return list_flatten(donelist)
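A hedged usage sketch for the new nearest_card lookups (the repo layout and file path are assumptions; f_nearest, f_nearest_per_thread, list_split, list_flatten and cores come from the rest of namediff.py and are not shown in this diff):

import sys
sys.path.append('lib')  # assumed repo layout
import jdecode, namediff

cards = jdecode.mtg_open_file('data/output.txt', verbose=True)  # hypothetical encoded card file
nd = namediff.Namediff(verbose=True)  # any other constructor arguments omitted here
print nd.nearest_card(cards[0], n=3)        # closest known cards by full encoded text
print nd.nearest_card_par(cards[:10], n=3)  # same lookup, fanned out over worker processes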
scripts/keydiff.py (new executable file, 81 lines)
@@ -0,0 +1,81 @@
#!/usr/bin/env python

def parse_keyfile(f, d, constructor = lambda x: x):
    for line in f:
        kv = map(lambda s: s.strip(), line.split(':'))
        if not len(kv) == 2:
            continue
        d[kv[0]] = constructor(kv[1])

def merge_dicts(d1, d2):
    d = {}
    for k in d1:
        d[k] = (d1[k], d2[k] if k in d2 else None)
    for k in d2:
        if not k in d:
            d[k] = (None, d2[k])
    return d

def main(fname1, fname2, verbose = True):
    if verbose:
        print 'opening ' + fname1 + ' as base key/value store'
        print 'opening ' + fname2 + ' as target key/value store'

    d1 = {}
    d2 = {}
    with open(fname1, 'rt') as f1:
        parse_keyfile(f1, d1, int)
    with open(fname2, 'rt') as f2:
        parse_keyfile(f2, d2, int)

    tot1 = sum(d1.values())
    tot2 = sum(d2.values())

    if verbose:
        print ' ' + fname1 + ': ' + str(len(d1)) + ', total ' + str(tot1)
        print ' ' + fname2 + ': ' + str(len(d2)) + ', total ' + str(tot2)

    d_merged = merge_dicts(d1, d2)

    ratios = {}
    only_1 = {}
    only_2 = {}
    for k in d_merged:
        (v1, v2) = d_merged[k]
        if v1 is None:
            only_2[k] = v2
        elif v2 is None:
            only_1[k] = v1
        else:
            ratios[k] = float(v2 * tot1) / float(v1 * tot2)

    print 'shared: ' + str(len(ratios))
    for k in sorted(ratios, lambda x,y: cmp(d2[x], d2[y]), reverse=True):
        print ' ' + k + ': ' + str(d2[k]) + '/' + str(d1[k]) + ' (' + str(ratios[k]) + ')'
    print ''

    print '1 only: ' + str(len(only_1))
    for k in sorted(only_1, lambda x,y: cmp(d1[x], d1[y]), reverse=True):
        print ' ' + k + ': ' + str(d1[k])
    print ''

    print '2 only: ' + str(len(only_2))
    for k in sorted(only_2, lambda x,y: cmp(d2[x], d2[y]), reverse=True):
        print ' ' + k + ': ' + str(d2[k])
    print ''

if __name__ == '__main__':

    import argparse
    parser = argparse.ArgumentParser()

    parser.add_argument('file1', #nargs='?'. default=None,
                        help='base key file to diff against')
    parser.add_argument('file2', nargs='?', default=None,
                        help='other file to compare against the baseline')
    parser.add_argument('-v', '--verbose', action='store_true',
                        help='verbose output')

    args = parser.parse_args()
    main(args.file1, args.file2, verbose=args.verbose)
    exit(0)
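keydiff reads plain 'key: count' lines; for a shared key the reported ratio is (count2/total2) / (count1/total1), i.e. how over- or under-represented the key is in the target file relative to the base. A small sketch with made-up data, assuming keydiff.py is on the import path:

import StringIO
import keydiff

d1, d2 = {}, {}
keydiff.parse_keyfile(StringIO.StringIO('flying: 300\ntrample: 100\n'), d1, int)
keydiff.parse_keyfile(StringIO.StringIO('flying: 30\nhaste: 5\n'), d2, int)
print keydiff.merge_dicts(d1, d2)
# -> {'flying': (300, 30), 'trample': (100, None), 'haste': (None, 5)} (key order may vary)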
scripts/ngrams.py (new executable file, 85 lines)
@@ -0,0 +1,85 @@
#!/usr/bin/env python
import sys
import os

libdir = os.path.join(os.path.dirname(os.path.realpath(__file__)), '../lib')
sys.path.append(libdir)
import jdecode


def update_ngrams(lines, gramdict, grams):
    for line in lines:
        for i in range(0, len(line) - (grams - 1)):
            ngram = ' '.join([line[i + j] for j in range(0, grams)])
            if ngram in gramdict:
                gramdict[ngram] += 1
            else:
                gramdict[ngram] = 1

def describe_bins(gramdict, bins):
    bins = sorted(bins)
    counts = [0 for _ in range(0, len(bins) + 1)]

    for ngram in gramdict:
        for i in range(0, len(bins) + 1):
            if i < len(bins):
                if gramdict[ngram] <= bins[i]:
                    counts[i] += 1
                    break
            else:
                # didn't fit into any of the smaller bins, stick in on the end
                counts[-1] += 1

    for i in range(0, len(counts)):
        if counts[i] > 0:
            print (' ' + (str(bins[i]) if i < len(bins) else str(bins[-1]) + '+')
                   + ': ' + str(counts[i]))

def main(fname, oname, gmin = 2, gmax = 8, verbose = True):
    gmin = int(gmin)
    gmax = int(gmax)
    bins = [1, 2, 3, 10, 30, 100, 300, 1000]
    if gmin < 2 or gmax < gmin:
        print 'invalid gram sizes: ' + str(gmin) + '-' + str(gmax)
        exit(1)

    # may need to set special arguments here
    cards = jdecode.mtg_open_file(fname, verbose=verbose)

    for grams in range(gmin, gmax+1):
        if verbose:
            print 'generating ' + str(grams) + '-grams...'
        gramdict = {}
        for card in cards:
            update_ngrams(card.text_lines_words, gramdict, grams)

        oname_full = oname + '.' + str(grams) + 'g'
        if verbose:
            print ' writing ' + str(len(gramdict)) + ' unique ' + str(grams) + '-grams to ' + oname_full
            describe_bins(gramdict, bins)

        with open(oname_full, 'wt') as f:
            for ngram in sorted(gramdict,
                                lambda x,y: cmp(gramdict[x], gramdict[y]),
                                reverse = True):
                f.write((ngram + ': ' + str(gramdict[ngram]) + '\n').encode('utf-8'))

if __name__ == '__main__':

    import argparse
    parser = argparse.ArgumentParser()

    parser.add_argument('infile', #nargs='?'. default=None,
                        help='encoded card file or json corpus to process')
    parser.add_argument('outfile', #nargs='?', default=None,
                        help='base name of output file, outputs ending in .2g, .3g etc. will be produced')
    parser.add_argument('-min', '--min', action='store', default='2',
                        help='minimum gram size to compute')
    parser.add_argument('-max', '--max', action='store', default='8',
                        help='maximum gram size to compute')
    parser.add_argument('-v', '--verbose', action='store_true',
                        help='verbose output')

    args = parser.parse_args()
    main(args.infile, args.outfile, gmin=args.min, gmax=args.max, verbose=args.verbose)
    exit(0)
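A hedged sketch of driving the new script from Python rather than the shell (file names are hypothetical, and scripts/ is assumed to be on the import path):

import ngrams
# writes data/ngrams/all.2g and data/ngrams/all.3g, one 'ngram: count' line per
# n-gram sorted by count, the same format keydiff.parse_keyfile reads back
ngrams.main('data/output.txt', 'data/ngrams/all', gmin=2, gmax=3, verbose=True)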
@@ -9,6 +9,37 @@ sys.path.append(libdir)
import utils
import jdecode

datadir = os.path.realpath(os.path.join(libdir, '../data'))
gramdir = os.path.join(datadir, 'ngrams')
compute_ngrams = False
gramdicts = {}
if os.path.isdir(gramdir):
    import keydiff
    compute_ngrams = True
    for fname in os.listdir(gramdir):
        suffixes = re.findall(r'\.[0-9]*g$', fname)
        if suffixes:
            grams = int(suffixes[0][1:-1])
            d = {}
            with open(os.path.join(gramdir, fname), 'rt') as f:
                keydiff.parse_keyfile(f, d, int)
            gramdicts[grams] = d

def rare_grams(card, thresh = 2, grams = 2):
    if not grams in gramdicts:
        return None
    rares = 0
    gramdict = gramdicts[grams]
    for line in card.text_lines_words:
        for i in range(0, len(line) - (grams - 1)):
            ngram = ' '.join([line[i + j] for j in range(0, grams)])
            if ngram in gramdict:
                if gramdict[ngram] < thresh:
                    rares += 1
            else:
                rares += 1
    return rares

def list_only(l, items):
    for e in l:
        if not e in items:
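A worked illustration of the counting rule in rare_grams, using a made-up 2-gram dictionary in place of the ones loaded from the .2g/.3g files above:

gramdict = {'target creature': 500, 'destroy target': 1}  # made-up counts
line = ['destroy', 'target', 'creature', 'now']
rares = 0
for i in range(0, len(line) - 1):
    ngram = ' '.join(line[i:i + 2])
    if gramdict.get(ngram, 0) < 2:  # unseen, or seen fewer than thresh=2 times
        rares += 1
print rares  # 2: 'destroy target' (count 1) and 'creature now' (unseen)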
@@ -130,6 +161,38 @@ values = OrderedDict([(k, (0,0,0)) for k in props])
def main(fname, oname = None, verbose = True):
    # may need to set special arguments here
    cards = jdecode.mtg_open_file(fname, verbose=verbose)
    rg = {}
    for card in cards:
        g = rare_grams(card, thresh=2, grams=2)
        if len(card.text_words) > 0:
            g = int(1.0 + (float(g) * 100.0 / float(len(card.text_words))))
        if g in rg:
            rg[g] += 1
        else:
            rg[g] = 1
        if g >= 60:
            print g
            print card.format()

    tot = 0
    vmax = sum(rg.values())
    pct90 = None
    pct95 = None
    pct99 = None
    for i in sorted(rg):
        print str(i) + ' rare ngrams: ' + str(rg[i])
        tot += rg[i]
        if pct90 is None and tot >= vmax * 0.90:
            pct90 = i
        if pct95 is None and tot >= vmax * 0.95:
            pct95 = i
        if pct99 is None and tot >= vmax * 0.99:
            pct99 = i

    print '90% - ' + str(pct90)
    print '95% - ' + str(pct95)
    print '99% - ' + str(pct99)
    exit(0)

    for card in cards:
        for prop in props:
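For the normalization in main above, a worked example with made-up numbers: a card whose text has 20 words and 3 rare bigrams is bucketed as int(1.0 + 3 * 100.0 / 20) = 16, roughly one plus the percentage of word positions that start a rare bigram; the pct90/pct95/pct99 loop then reports the smallest bucket by which 90%, 95% and 99% of all cards are covered.

print int(1.0 + (float(3) * 100.0 / float(20)))  # -> 16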