From 6b80e49d2bc29080c87d54be2313c16031491db4 Mon Sep 17 00:00:00 2001
From: Bill Zorn
Date: Tue, 17 Nov 2015 00:47:18 -0800
Subject: [PATCH] added some data processing scripts

---
 scripts/distances.py | 81 ++++++++++++++++++++++++++++++++++++++++++++
 scripts/sum.py       | 58 +++++++++++++++++++++++++++++++
 2 files changed, 139 insertions(+)
 create mode 100755 scripts/distances.py
 create mode 100755 scripts/sum.py

diff --git a/scripts/distances.py b/scripts/distances.py
new file mode 100755
index 0000000..4c0fd7c
--- /dev/null
+++ b/scripts/distances.py
@@ -0,0 +1,81 @@
+#!/usr/bin/env python
+import sys
+import os
+
+libdir = os.path.join(os.path.dirname(os.path.realpath(__file__)), '../lib')
+sys.path.append(libdir)
+import utils
+import jdecode
+from namediff import Namediff
+from cbow import CBOW
+
+def main(fname, oname, verbose = True, parallel = True):
+    # may need to set special arguments here
+    cards = jdecode.mtg_open_file(fname, verbose=verbose)
+
+    # this could reasonably be some separate function
+    # might make sense to merge cbow and namediff and have this be the main interface
+    namediff = Namediff()
+    cbow = CBOW()
+
+    if verbose:
+        print 'Computing nearest names...'
+    if parallel:
+        nearest_names = namediff.nearest_par(map(lambda c: c.name, cards), n=1)
+    else:
+        nearest_names = [namediff.nearest(c.name, n=1) for c in cards]
+
+    if verbose:
+        print 'Computing nearest cards...'
+    if parallel:
+        nearest_cards = cbow.nearest_par(cards, n=1)
+    else:
+        nearest_cards = [cbow.nearest(c, n=1) for c in cards]
+
+    for i in range(0, len(cards)):
+        cards[i].nearest_names = nearest_names[i]
+        cards[i].nearest_cards = nearest_cards[i]
+
+    # # unfortunately this takes ~30 hours on 8 cores for a 10MB dump
+    # if verbose:
+    #     print 'Computing nearest encodings by text edit distance...'
+    # if parallel:
+    #     nearest_cards_text = namediff.nearest_card_par(cards, n=1)
+    # else:
+    #     nearest_cards_text = [namediff.nearest_card(c, n=1) for c in cards]
+
+    if verbose:
+        print '...Done.'
+
+    # write to a file to store the data, this is a terribly long computation
+    # we could also just store this same info in the cards themselves as more fields...
+    sep = '|'
+    with open(oname, 'w') as ofile:
+        for i in range(0, len(cards)):
+            card = cards[i]
+            ostr = str(i) + sep + card.name + sep
+            ndist, _ = card.nearest_names[0]
+            ostr += str(ndist) + sep
+            cdist, _ = card.nearest_cards[0]
+            ostr += str(cdist) + '\n'
+            # tdist, _ = nearest_cards_text[i][0]
+            # ostr += str(tdist) + '\n'
+            ofile.write(ostr.encode('utf-8'))
+
+if __name__ == '__main__':
+
+    import argparse
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument('infile', #nargs='?'. default=None,
+                        help='encoded card file or json corpus to process')
+    parser.add_argument('outfile', #nargs='?', default=None,
+                        help='name of output file, will be overwritten')
+    parser.add_argument('-v', '--verbose', action='store_true',
+                        help='verbose output')
+    parser.add_argument('-p', '--parallel', action='store_true',
+                        help='run in parallel on all cores')
+
+    args = parser.parse_args()
+    main(args.infile, args.outfile, verbose=args.verbose, parallel=args.parallel)
+    exit(0)
diff --git a/scripts/sum.py b/scripts/sum.py
new file mode 100755
index 0000000..947b39d
--- /dev/null
+++ b/scripts/sum.py
@@ -0,0 +1,58 @@
+#!/usr/bin/env python
+import sys
+import os
+
+libdir = os.path.join(os.path.dirname(os.path.realpath(__file__)), '../lib')
+sys.path.append(libdir)
+
+def main(fname):
+    with open(fname, 'rt') as f:
+        text = f.read()
+
+    cardstats = text.split('\n')
+    nonempty = 0
+    name_avg = 0
+    name_dupes = 0
+    card_avg = 0
+    card_dupes = 0
+
+    for c in cardstats:
+        fields = c.split('|')
+        if len(fields) < 4:
+            continue
+        nonempty += 1
+        idx = int(fields[0])
+        name = str(fields[1])
+        ndist = float(fields[2])
+        cdist = float(fields[3])
+
+        name_avg += ndist
+        if ndist == 1.0:
+            name_dupes += 1
+        card_avg += cdist
+        if cdist == 1.0:
+            card_dupes += 1
+
+    name_avg = name_avg / float(nonempty)
+    card_avg = card_avg / float(nonempty)
+
+    print str(nonempty) + ' cards'
+    print '-- names --'
+    print 'avg distance: ' + str(name_avg)
+    print 'num duplicates: ' + str(name_dupes)
+    print '-- cards --'
+    print 'avg distance: ' + str(card_avg)
+    print 'num duplicates: ' + str(card_dupes)
+    print '----'
+
+if __name__ == '__main__':
+
+    import argparse
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument('infile', #nargs='?'. default=None,
+                        help='data file to process')
+
+    args = parser.parse_args()
+    main(args.infile)
+    exit(0)