added some data processing scripts
This commit is contained in:
parent
2cd8b03249
commit
6b80e49d2b
2 changed files with 139 additions and 0 deletions
81
scripts/distances.py
Executable file
81
scripts/distances.py
Executable file
|
@ -0,0 +1,81 @@
|
||||||
|
#!/usr/bin/env python
|
||||||
|
import sys
|
||||||
|
import os
|
||||||
|
|
||||||
|
# Put the 'lib' directory that sits beside this script's own directory (with
# symlinks in the script path resolved) on the module search path.
# NOTE(review): the imports that follow are presumably served from there - confirm.
libdir = os.path.join(os.path.dirname(os.path.realpath(__file__)), '../lib')
|
||||||
|
sys.path.append(libdir)
|
||||||
|
import utils
|
||||||
|
import jdecode
|
||||||
|
from namediff import Namediff
|
||||||
|
from cbow import CBOW
|
||||||
|
|
||||||
|
def main(fname, oname, verbose = True, parallel = True):
    """Find the nearest name and nearest card for every card and dump the result.

    The cards are loaded from fname with jdecode.mtg_open_file.  Each card is
    then queried against a Namediff (by name) and a CBOW (by card) index, asking
    for a single nearest match (n=1).  The results are attached to the card as
    the attributes nearest_names and nearest_cards, and one line per card is
    written to oname in the form

        <index>|<card name>|<ndist>|<cdist>

    where ndist / cdist are the first element of the first nearest-name /
    nearest-card result (presumably a distance or similarity score - confirm
    against Namediff and CBOW).

    fname    -- path of the card file to load
    oname    -- path of the output file; it is overwritten
    verbose  -- print progress messages, and pass verbose through to the loader
    parallel -- use the nearest_par batch calls instead of one call per card
    """
    # may need to set special arguments here
    cards = jdecode.mtg_open_file(fname, verbose=verbose)

    # Building the two indexes could reasonably live in a separate function;
    # it might even make sense to merge cbow and namediff behind one interface.
    name_index = Namediff()
    card_index = CBOW()

    if verbose:
        print('Computing nearest names...')
    if parallel:
        card_names = [c.name for c in cards]
        nearest_names = name_index.nearest_par(card_names, n=1)
    else:
        nearest_names = [name_index.nearest(c.name, n=1) for c in cards]

    if verbose:
        print('Computing nearest cards...')
    if parallel:
        nearest_cards = card_index.nearest_par(cards, n=1)
    else:
        nearest_cards = [card_index.nearest(c, n=1) for c in cards]

    # Attach the results to the cards themselves, matched up by position.
    for pos, card in enumerate(cards):
        card.nearest_names = nearest_names[pos]
        card.nearest_cards = nearest_cards[pos]

    # # unfortunately this takes ~30 hours on 8 cores for a 10MB dump
    # if verbose:
    #     print 'Computing nearest encodings by text edit distance...'
    # if parallel:
    #     nearest_cards_text = namediff.nearest_card_par(cards, n=1)
    # else:
    #     nearest_cards_text = [namediff.nearest_card(c, n=1) for c in cards]

    if verbose:
        print('...Done.')

    # Store the data in a file, since this is a terribly long computation.
    # The same info could also just be kept in the cards as more fields...
    sep = '|'
    with open(oname, 'w') as ofile:
        for pos, card in enumerate(cards):
            ndist, _ = card.nearest_names[0]
            cdist, _ = card.nearest_cards[0]
            # tdist, _ = nearest_cards_text[pos][0]
            record = sep.join([str(pos), card.name, str(ndist), str(cdist)]) + '\n'
            ofile.write(record.encode('utf-8'))
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    # Command-line entry point: parse the arguments and forward them to main().
    import argparse

    parser = argparse.ArgumentParser()

    # Both positional arguments are required.
    parser.add_argument('infile',
                        help='encoded card file or json corpus to process')
    parser.add_argument('outfile',
                        help='name of output file, will be overwritten')
    # Both flags are off unless given, so main()'s own True defaults for
    # verbose and parallel never apply when the script is run from here.
    parser.add_argument('-v', '--verbose', action='store_true',
                        help='verbose output')
    parser.add_argument('-p', '--parallel', action='store_true',
                        help='run in parallel on all cores')

    args = parser.parse_args()
    main(args.infile, args.outfile, verbose=args.verbose, parallel=args.parallel)
    # sys.exit rather than the bare exit() helper: exit() is injected by the
    # site module for interactive use and is absent when site is not loaded.
    sys.exit(0)
|
58
scripts/sum.py
Executable file
58
scripts/sum.py
Executable file
|
@ -0,0 +1,58 @@
|
||||||
|
#!/usr/bin/env python
|
||||||
|
import sys
|
||||||
|
import os
|
||||||
|
|
||||||
|
# Put the 'lib' directory that sits beside this script's own directory (with
# symlinks in the script path resolved) on the module search path.
# NOTE(review): nothing visible in this script imports from it - confirm it is needed.
libdir = os.path.join(os.path.dirname(os.path.realpath(__file__)), '../lib')
|
||||||
|
sys.path.append(libdir)
|
||||||
|
|
||||||
|
def main(fname):
    """Print summary statistics for a '|'-separated distance file.

    Each line of fname is expected to look like

        <index>|<name>|<ndist>|<cdist>

    Lines with fewer than four '|'-separated fields (blank lines, for
    example) are skipped.  For the remaining records the function prints the
    record count and, separately for the name column (ndist) and the card
    column (cdist), the average value and how many records have a value of
    exactly 1.0 (reported as duplicates).

    A file without any valid record reports 0 cards and averages of 0.0
    instead of failing with a division by zero.

    fname -- path of the data file to read

    Raises ValueError if a record's index is not an integer or one of its
    distance fields is not a number.
    """
    with open(fname, 'rt') as f:
        records = f.read().split('\n')

    nonempty = 0
    name_total = 0.0
    name_dupes = 0
    card_total = 0.0
    card_dupes = 0

    for record in records:
        fields = record.split('|')
        if len(fields) < 4:
            continue
        nonempty += 1

        # The index and name are not used in the summary, but the index is
        # still parsed so that a malformed record is rejected loudly.
        int(fields[0])
        ndist = float(fields[2])
        cdist = float(fields[3])

        name_total += ndist
        if ndist == 1.0:
            name_dupes += 1
        card_total += cdist
        if cdist == 1.0:
            card_dupes += 1

    # Guard the empty case: there is nothing to average over.
    if nonempty:
        name_avg = name_total / nonempty
        card_avg = card_total / nonempty
    else:
        name_avg = 0.0
        card_avg = 0.0

    # Single-argument print(...) produces the same output on Python 2 and 3.
    print(str(nonempty) + ' cards')
    print('-- names --')
    print('avg distance: ' + str(name_avg))
    print('num duplicates: ' + str(name_dupes))
    print('-- cards --')
    print('avg distance: ' + str(card_avg))
    print('num duplicates: ' + str(card_dupes))
    print('----')
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    # Command-line entry point: summarize the data file named on the command line.
    import argparse

    parser = argparse.ArgumentParser()

    # The positional argument is required.
    parser.add_argument('infile',
                        help='data file to process')

    args = parser.parse_args()
    main(args.infile)
    # sys.exit rather than the bare exit() helper: exit() is injected by the
    # site module for interactive use and is absent when site is not loaded.
    sys.exit(0)
|
Loading…
Reference in a new issue