From 947e41ea80b9ceab89a157cec94f803762ec101f Mon Sep 17 00:00:00 2001 From: Bill Zorn Date: Fri, 4 Dec 2015 20:01:29 -0800 Subject: [PATCH] updated ngrams script so it can also use the nltk model --- scripts/ngrams.py | 84 ++++++++++++++++++++++++++++++++++------------- 1 file changed, 61 insertions(+), 23 deletions(-) diff --git a/scripts/ngrams.py b/scripts/ngrams.py index 8c64ae6..55b6503 100755 --- a/scripts/ngrams.py +++ b/scripts/ngrams.py @@ -1,11 +1,12 @@ #!/usr/bin/env python import sys import os +import pickle libdir = os.path.join(os.path.dirname(os.path.realpath(__file__)), '../lib') sys.path.append(libdir) import jdecode - +import nltk_model as model def update_ngrams(lines, gramdict, grams): for line in lines: @@ -35,34 +36,66 @@ def describe_bins(gramdict, bins): print (' ' + (str(bins[i]) if i < len(bins) else str(bins[-1]) + '+') + ': ' + str(counts[i])) -def main(fname, oname, gmin = 2, gmax = 8, verbose = True): - gmin = int(gmin) - gmax = int(gmax) - bins = [1, 2, 3, 10, 30, 100, 300, 1000] - if gmin < 2 or gmax < gmin: - print 'invalid gram sizes: ' + str(gmin) + '-' + str(gmax) - exit(1) +def extract_language(cards, separate_lines = True): + if separate_lines: + lang = [line.vectorize() for card in cards for line in card.text_lines] + else: + lang = [card.text.vectorize() for card in cards] + return map(lambda s: s.split(), lang) +def build_ngram_model(cards, n, separate_lines = True, verbose = False): + if verbose: + print('generating ' + str(n) + '-gram model') + lang = extract_language(cards, separate_lines=separate_lines) + if verbose: + print('found ' + str(len(lang)) + ' sentences') + lm = model.NgramModel(n, lang) + if verbose: + print(lm) + return lm + +def main(fname, oname, gmin = 2, gmax = 8, nltk = False, sep = False, verbose = False): # may need to set special arguments here cards = jdecode.mtg_open_file(fname, verbose=verbose) + gmin = int(gmin) + gmax = int(gmax) - for grams in range(gmin, gmax+1): + if nltk: + n = gmin + lm = build_ngram_model(cards, n, separate_lines=sep, verbose=verbose) if verbose: - print 'generating ' + str(grams) + '-grams...' - gramdict = {} - for card in cards: - update_ngrams(card.text_lines_words, gramdict, grams) - - oname_full = oname + '.' + str(grams) + 'g' + teststr = 'when @ enters the battlefield' + print('litmus test: perplexity of ' + repr(teststr)) + print(' ' + str(lm.perplexity(teststr.split()))) if verbose: - print ' writing ' + str(len(gramdict)) + ' unique ' + str(grams) + '-grams to ' + oname_full - describe_bins(gramdict, bins) + print('pickling module to ' + oname) + with open(oname, 'wb') as f: + pickle.dump(lm, f) - with open(oname_full, 'wt') as f: - for ngram in sorted(gramdict, - lambda x,y: cmp(gramdict[x], gramdict[y]), - reverse = True): - f.write((ngram + ': ' + str(gramdict[ngram]) + '\n').encode('utf-8')) + else: + bins = [1, 2, 3, 10, 30, 100, 300, 1000] + if gmin < 2 or gmax < gmin: + print 'invalid gram sizes: ' + str(gmin) + '-' + str(gmax) + exit(1) + + for grams in range(gmin, gmax+1): + if verbose: + print 'generating ' + str(grams) + '-grams...' + gramdict = {} + for card in cards: + update_ngrams(card.text_lines_words, gramdict, grams) + + oname_full = oname + '.' + str(grams) + 'g' + if verbose: + print(' writing ' + str(len(gramdict)) + ' unique ' + str(grams) + + '-grams to ' + oname_full) + describe_bins(gramdict, bins) + + with open(oname_full, 'wt') as f: + for ngram in sorted(gramdict, + lambda x,y: cmp(gramdict[x], gramdict[y]), + reverse = True): + f.write((ngram + ': ' + str(gramdict[ngram]) + '\n').encode('utf-8')) if __name__ == '__main__': @@ -77,9 +110,14 @@ if __name__ == '__main__': help='minimum gram size to compute') parser.add_argument('-max', '--max', action='store', default='8', help='maximum gram size to compute') + parser.add_argument('-nltk', '--nltk', action='store_true', + help='use nltk model.NgramModel, with n = min') + parser.add_argument('-s', '--separate', action='store_true', + help='separate card text into lines when constructing nltk model') parser.add_argument('-v', '--verbose', action='store_true', help='verbose output') args = parser.parse_args() - main(args.infile, args.outfile, gmin=args.min, gmax=args.max, verbose=args.verbose) + main(args.infile, args.outfile, gmin=args.min, gmax=args.max, nltk=args.nltk, + sep=args.separate, verbose=args.verbose) exit(0)