ipython and beginning of pairing

This commit is contained in:
Bill Zorn 2015-12-06 14:26:02 -08:00
parent 00159593bb
commit d047ded658
4 changed files with 384 additions and 9 deletions

333
mtg_sweep1.ipynb Normal file

File diff suppressed because one or more lines are too long

View file

@ -8,9 +8,14 @@ from collections import OrderedDict
import scipy import scipy
import scipy.stats import scipy.stats
import numpy as np import numpy as np
import math
def mean_nonan(l):
    """Return the arithmetic mean of the values in *l* that are not NaN.

    Args:
        l: iterable of numbers; NaN entries are dropped before averaging.

    Returns:
        The mean of the remaining values, or NaN when nothing remains
        (empty input or all-NaN input).
    """
    filtered = [x for x in l if not math.isnan(x)]
    if not filtered:
        # np.mean([]) also yields NaN but emits a RuntimeWarning; return the
        # same value explicitly so empty samples stay quiet.
        return float('nan')
    return np.mean(filtered)
def gmean_nonzero(l):
    """Return the geometric mean of the non-zero, non-NaN values in *l*.

    Args:
        l: iterable of numbers; zeros and NaN entries are dropped first.

    Returns:
        scipy.stats.gmean of the remaining values, or NaN when nothing
        remains after filtering.
    """
    filtered = [x for x in l if x != 0 and not math.isnan(x)]
    if not filtered:
        # Nothing left to average: report NaN explicitly instead of handing
        # scipy an empty sample.
        return float('nan')
    return scipy.stats.gmean(filtered)
libdir = os.path.join(os.path.dirname(os.path.realpath(__file__)), '../lib') libdir = os.path.join(os.path.dirname(os.path.realpath(__file__)), '../lib')
@ -18,7 +23,7 @@ sys.path.append(libdir)
datadir = os.path.join(os.path.dirname(os.path.realpath(__file__)), '../data') datadir = os.path.join(os.path.dirname(os.path.realpath(__file__)), '../data')
import jdecode import jdecode
import validate import mtg_validate
import ngrams import ngrams
def annotate_values(values): def annotate_values(values):
@ -66,7 +71,7 @@ def get_statistics(fname, lm = None, sep = False, verbose=False):
# validate # validate
((total_all, total_good, total_bad, total_uncovered), ((total_all, total_good, total_bad, total_uncovered),
values) = validate.process_props(cards) values) = mtg_validate.process_props(cards)
stats['props'] = annotate_values(values) stats['props'] = annotate_values(values)
stats['props']['overall'] = OrderedDict([('total', total_all), stats['props']['overall'] = OrderedDict([('total', total_all),
@ -97,8 +102,8 @@ def get_statistics(fname, lm = None, sep = False, verbose=False):
if cdist == 1.0: if cdist == 1.0:
card_dupes += 1 card_dupes += 1
dists['name_mean'] = np.mean(dists['name']) dists['name_mean'] = mean_nonan(dists['name'])
dists['cbow_mean'] = np.mean(dists['cbow']) dists['cbow_mean'] = mean_nonan(dists['cbow'])
dists['name_geomean'] = gmean_nonzero(dists['name']) dists['name_geomean'] = gmean_nonzero(dists['name'])
dists['cbow_geomean'] = gmean_nonzero(dists['cbow']) dists['cbow_geomean'] = gmean_nonzero(dists['cbow'])
stats['dists'] = dists stats['dists'] = dists
@ -125,19 +130,20 @@ def get_statistics(fname, lm = None, sep = False, verbose=False):
ngram['perp'] += [perp] ngram['perp'] += [perp]
ngram['perp_per'] += [perp_per] ngram['perp_per'] += [perp_per]
ngram['perp_mean'] = np.mean(ngram['perp']) ngram['perp_mean'] = mean_nonan(ngram['perp'])
ngram['perp_per_mean'] = np.mean(ngram['perp_per']) ngram['perp_per_mean'] = mean_nonan(ngram['perp_per'])
ngram['perp_geomean'] = gmean_nonzero(ngram['perp']) ngram['perp_geomean'] = gmean_nonzero(ngram['perp'])
ngram['perp_per_geomean'] = gmean_nonzero(ngram['perp_per']) ngram['perp_per_geomean'] = gmean_nonzero(ngram['perp_per'])
stats['ngram'] = ngram stats['ngram'] = ngram
print_statistics(stats) return stats
def main(infile, verbose = False):
    """Compute statistics for *infile* and print them.

    Args:
        infile: path handed to get_statistics as the file to analyze.
        verbose: forwarded to get_statistics.
    """
    # The language model is built from data/output.txt.
    # NOTE(review): verbose=True is hard-coded here and does not follow the
    # `verbose` argument -- confirm that this is intentional.
    lm = ngrams.build_ngram_model(jdecode.mtg_open_file(str(os.path.join(datadir, 'output.txt'))),
                                  3, separate_lines=True, verbose=True)
    # get_statistics returns the stats instead of printing them, so the
    # printing happens here.
    stats = get_statistics(infile, lm=lm, sep=True, verbose=verbose)
    print_statistics(stats)
if __name__ == '__main__': if __name__ == '__main__':

36
scripts/pairing.py Executable file
View file

@ -0,0 +1,36 @@
#!/usr/bin/env python
# Module setup: put the sibling ../lib directory on sys.path and import the
# helper modules used by main() below.
import sys
import os
# Both directories are resolved relative to this script's real location.
libdir = os.path.join(os.path.dirname(os.path.realpath(__file__)), '../lib')
# Must run before the project imports below so they can be found.
sys.path.append(libdir)
# main() reads output.txt from this directory.
datadir = os.path.join(os.path.dirname(os.path.realpath(__file__)), '../data')
# NOTE(review): jdecode and ngrams presumably live in ../lib, and analysis
# presumably sits next to this script -- confirm against the repository layout.
import jdecode
import ngrams
import analysis
# Passed both to ngrams.build_ngram_model (separate_lines=) and to
# analysis.get_statistics (sep=) in main().
separate_lines=True
def main(fname, n=20, verbose=False):
    """Load the reference corpus and compute statistics for *fname*.

    Work in progress: the pairing step itself is not implemented yet.

    Args:
        fname: path of the card file to analyze.
        n: number of cards to consider for each pairing (per the CLI help
            text); not used by this function yet.
        verbose: forwarded to the jdecode / ngrams / analysis helpers.
    """
    # Reference corpus and the language model built from it.
    # NOTE(review): the literal 3 is presumably the n-gram order -- confirm.
    realcards = jdecode.mtg_open_file(str(os.path.join(datadir, 'output.txt')), verbose=verbose)
    lm = ngrams.build_ngram_model(realcards, 3, separate_lines=separate_lines, verbose=verbose)

    # `cards` and `stats` are computed but not consumed yet.
    cards = jdecode.mtg_open_file(fname, verbose=verbose)
    stats = analysis.get_statistics(fname, lm=lm, sep=separate_lines, verbose=verbose)

    # Placeholder output; the call form prints the same text on Python 2 and 3.
    print('derp')
if __name__ == '__main__':
    # Command-line entry point: parse the arguments and hand them to main().
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('infile',
                        help='encoded card file or json corpus to process')
    # type/default keep args.n an int; without them it would be None when the
    # flag is omitted (overriding main's default of 20) and a str when given.
    parser.add_argument('-n', '--n', action='store', type=int, default=20,
                        help='number of cards to consider for each pairing')
    parser.add_argument('-v', '--verbose', action='store_true',
                        help='verbose output')
    args = parser.parse_args()

    main(args.infile, n=args.n, verbose=args.verbose)
    # sys.exit rather than the site-provided exit(), which is intended for
    # interactive use and is not guaranteed to exist.
    sys.exit(0)