mtgencode/scripts/ngrams.py

#!/usr/bin/env python
import sys
import os
import pickle

# Append the '../lib' directory, located relative to this script's real
# (symlink-resolved) path, to the import search path before the imports
# below -- presumably that is where jdecode and nltk_model live.
libdir = os.path.join(os.path.dirname(os.path.realpath(__file__)), '../lib')
sys.path.append(libdir)
import jdecode
import nltk_model as model

def update_ngrams(lines, gramdict, grams):
    """Count every window of `grams` consecutive tokens found in `lines`.

    lines    -- iterable of indexable token sequences (one per line)
    gramdict -- dict mapping space-joined n-gram -> count; updated in place
    grams    -- window size (number of tokens per n-gram)

    Lines shorter than `grams` tokens contribute nothing. Returns None.
    """
    for tokens in lines:
        # one window per valid starting offset within this line
        for start in range(len(tokens) - grams + 1):
            key = ' '.join(tokens[start + offset] for offset in range(grams))
            gramdict[key] = gramdict.get(key, 0) + 1

def describe_bins(gramdict, bins):
    """Print a histogram of n-gram counts grouped by upper-bound thresholds.

    gramdict -- dict mapping n-gram -> occurrence count
    bins     -- iterable of thresholds; sorted ascending before use

    Each n-gram is tallied under the smallest threshold that is >= its
    count; counts above every threshold go into a final overflow bucket,
    labelled with the largest threshold followed by '+'. Only non-empty
    buckets are printed.
    """
    thresholds = sorted(bins)
    overflow = len(thresholds)
    tallies = [0] * (overflow + 1)

    for occurrences in gramdict.values():
        # default to the overflow bucket unless a threshold accommodates it
        slot = overflow
        for idx, limit in enumerate(thresholds):
            if occurrences <= limit:
                slot = idx
                break
        tallies[slot] += 1

    for idx, tally in enumerate(tallies):
        if tally <= 0:
            continue
        if idx < overflow:
            label = str(thresholds[idx])
        else:
            label = str(thresholds[-1]) + '+'
        print('  ' + label + ': ' + str(tally))

def extract_language(cards, separate_lines = True):
    """Return the vectorized card text as a list of token lists.

    cards          -- iterable of card objects
    separate_lines -- if true, every entry of each card's `text_lines`
                      becomes its own sentence; otherwise each card's whole
                      `text` is a single sentence

    Each sentence is produced by calling `.vectorize()` on the text object
    and splitting the result on whitespace.

    Always returns a real list (not a lazy `map` object), so callers can
    take its length and iterate it more than once on Python 3 as well as
    Python 2.
    """
    if separate_lines:
        texts = (line.vectorize() for card in cards for line in card.text_lines)
    else:
        texts = (card.text.vectorize() for card in cards)
    return [text.split() for text in texts]

def build_ngram_model(cards, n, separate_lines = True, verbose = False):
    """Build an n-gram language model over the text of `cards`.

    cards          -- iterable of card objects, passed to extract_language
    n              -- order of the model
    separate_lines -- forwarded to extract_language
    verbose        -- if true, print progress, the sentence count and the
                      resulting model

    Returns the model.NgramModel instance, constructed with left and right
    padding enabled.
    """
    if verbose:
        print('generating ' + str(n) + '-gram model')
    # Materialize the sentences: the result may be a one-shot iterator (a
    # `map` object on Python 3), which supports neither len() nor being
    # consumed again by the model constructor afterwards.
    lang = list(extract_language(cards, separate_lines=separate_lines))
    if verbose:
        print('found ' + str(len(lang)) + ' sentences')
    lm = model.NgramModel(n, lang, pad_left=True, pad_right=True)
    if verbose:
        print(lm)
    return lm

def main(fname, oname, gmin = 2, gmax = 8, nltk = False, sep = False, verbose = False):
    """Compute n-gram statistics, or build and pickle an n-gram model.

    fname   -- input file, opened with jdecode.mtg_open_file
    oname   -- output path; in count mode it is the base name, and one file
               named '<oname>.<N>g' is written per gram size N
    gmin    -- smallest gram size (int or numeric string); in nltk mode it
               is used as the model order
    gmax    -- largest gram size (int or numeric string); count mode only
    nltk    -- if true, build a model with build_ngram_model and pickle it
               to `oname` instead of counting n-grams
    sep     -- nltk mode only: treat each text line as a separate sentence
    verbose -- print progress information

    In count mode the process exits with status 1 when gmin < 2 or
    gmax < gmin. Output lines have the form '<ngram>: <count>', sorted by
    descending count and encoded as UTF-8.
    """
    # may need to set special arguments here
    cards = jdecode.mtg_open_file(fname, verbose=verbose)
    gmin = int(gmin)
    gmax = int(gmax)

    if nltk:
        lm = build_ngram_model(cards, gmin, separate_lines=sep, verbose=verbose)
        if verbose:
            teststr = 'when @ enters the battlefield'
            print('litmus test: perplexity of ' + repr(teststr))
            print('  ' + str(lm.perplexity(teststr.split())))
            print('pickling module to ' + oname)
        with open(oname, 'wb') as f:
            pickle.dump(lm, f)
        return

    bins = [1, 2, 3, 10, 30, 100, 300, 1000]
    if gmin < 2 or gmax < gmin:
        print('invalid gram sizes: ' + str(gmin) + '-' + str(gmax))
        sys.exit(1)

    for grams in range(gmin, gmax + 1):
        if verbose:
            print('generating ' + str(grams) + '-grams...')
        gramdict = {}
        for card in cards:
            update_ngrams(card.text_lines_words, gramdict, grams)

        oname_full = oname + '.' + str(grams) + 'g'
        if verbose:
            print('  writing ' + str(len(gramdict)) + ' unique ' + str(grams)
                  + '-grams to ' + oname_full)
            describe_bins(gramdict, bins)

        # Most frequent first. A key function replaces the cmp-style
        # comparator, which Python 3 no longer accepts; the sort is stable
        # either way, so ties keep the same relative order.
        ranked = sorted(gramdict, key=gramdict.get, reverse=True)

        # Binary mode: each line is explicitly encoded to UTF-8 bytes, and
        # bytes cannot be written to a text-mode file on Python 3. Line
        # endings are therefore '\n' on every platform.
        with open(oname_full, 'wb') as f:
            for ngram in ranked:
                f.write((ngram + ': ' + str(gramdict[ngram]) + '\n').encode('utf-8'))

if __name__ == '__main__':
    # Command-line entry point: parse the arguments and hand them to main().
    import argparse

    cli = argparse.ArgumentParser()

    # positional arguments (both required)
    cli.add_argument('infile',
                     help='encoded card file or json corpus to process')
    cli.add_argument('outfile',
                     help='base name of output file, outputs ending in .2g, .3g etc. will be produced')

    # gram size bounds; kept as strings here, main() converts them to int
    for short_opt, long_opt, fallback, text in (
            ('-min', '--min', '2', 'minimum gram size to compute'),
            ('-max', '--max', '8', 'maximum gram size to compute'),
    ):
        cli.add_argument(short_opt, long_opt, action='store', default=fallback,
                         help=text)

    # boolean switches
    for short_opt, long_opt, text in (
            ('-nltk', '--nltk', 'use nltk model.NgramModel, with n = min'),
            ('-s', '--separate',
             'separate card text into lines when constructing nltk model'),
            ('-v', '--verbose', 'verbose output'),
    ):
        cli.add_argument(short_opt, long_opt, action='store_true', help=text)

    opts = cli.parse_args()
    main(opts.infile, opts.outfile,
         gmin=opts.min, gmax=opts.max,
         nltk=opts.nltk, sep=opts.separate, verbose=opts.verbose)
    exit(0)