Updated the ngrams script so it can also use the NLTK model
This commit is contained in:
parent
a8c1303e7f
commit
947e41ea80
1 changed file with 61 additions and 23 deletions
|
@ -1,11 +1,12 @@
|
||||||
#!/usr/bin/env python
|
#!/usr/bin/env python
|
||||||
import sys
|
import sys
|
||||||
import os
|
import os
|
||||||
|
import pickle
|
||||||
|
|
||||||
libdir = os.path.join(os.path.dirname(os.path.realpath(__file__)), '../lib')
|
libdir = os.path.join(os.path.dirname(os.path.realpath(__file__)), '../lib')
|
||||||
sys.path.append(libdir)
|
sys.path.append(libdir)
|
||||||
import jdecode
|
import jdecode
|
||||||
|
import nltk_model as model
|
||||||
|
|
||||||
def update_ngrams(lines, gramdict, grams):
|
def update_ngrams(lines, gramdict, grams):
|
||||||
for line in lines:
|
for line in lines:
|
||||||
|
@ -35,17 +36,48 @@ def describe_bins(gramdict, bins):
|
||||||
print (' ' + (str(bins[i]) if i < len(bins) else str(bins[-1]) + '+')
|
print (' ' + (str(bins[i]) if i < len(bins) else str(bins[-1]) + '+')
|
||||||
+ ': ' + str(counts[i]))
|
+ ': ' + str(counts[i]))
|
||||||
|
|
||||||
def main(fname, oname, gmin = 2, gmax = 8, verbose = True):
|
def extract_language(cards, separate_lines = True):
    """Turn card text into a list of whitespace-tokenized sentences.

    cards: iterable of card objects. When separate_lines is true, every
        entry of card.text_lines becomes its own sentence; otherwise
        card.text is used as one sentence per card. In both cases the
        object's vectorize() result is what gets tokenized, so it must
        support split() (presumably a string -- confirm against the
        card text classes).
    separate_lines: choose per-line (True) or per-card (False) sentences.

    Returns a list of token lists, one per sentence.
    """
    if separate_lines:
        lang = [line.vectorize() for card in cards for line in card.text_lines]
    else:
        lang = [card.text.vectorize() for card in cards]
    # Build a real list instead of using map(): on Python 3 map() returns a
    # lazy iterator, which has no len() and can only be consumed once.
    return [s.split() for s in lang]
|
||||||
|
|
||||||
|
def build_ngram_model(cards, n, separate_lines = True, verbose = False):
    """Construct a model.NgramModel of order n from the text of cards.

    cards: card objects handed to extract_language() to obtain sentences.
    n: gram size passed as the first argument to model.NgramModel.
    separate_lines: forwarded to extract_language().
    verbose: when true, print progress messages and the finished model.

    Returns the constructed model object.
    """
    if verbose:
        banner = 'generating ' + str(n) + '-gram model'
        print(banner)

    sentences = extract_language(cards, separate_lines=separate_lines)
    if verbose:
        summary = 'found ' + str(len(sentences)) + ' sentences'
        print(summary)

    ngram_model = model.NgramModel(n, sentences)
    if verbose:
        print(ngram_model)

    return ngram_model
|
||||||
|
|
||||||
|
def main(fname, oname, gmin = 2, gmax = 8, nltk = False, sep = False, verbose = False):
|
||||||
|
# may need to set special arguments here
|
||||||
|
cards = jdecode.mtg_open_file(fname, verbose=verbose)
|
||||||
gmin = int(gmin)
|
gmin = int(gmin)
|
||||||
gmax = int(gmax)
|
gmax = int(gmax)
|
||||||
|
|
||||||
|
if nltk:
|
||||||
|
n = gmin
|
||||||
|
lm = build_ngram_model(cards, n, separate_lines=sep, verbose=verbose)
|
||||||
|
if verbose:
|
||||||
|
teststr = 'when @ enters the battlefield'
|
||||||
|
print('litmus test: perplexity of ' + repr(teststr))
|
||||||
|
print(' ' + str(lm.perplexity(teststr.split())))
|
||||||
|
if verbose:
|
||||||
|
print('pickling module to ' + oname)
|
||||||
|
with open(oname, 'wb') as f:
|
||||||
|
pickle.dump(lm, f)
|
||||||
|
|
||||||
|
else:
|
||||||
bins = [1, 2, 3, 10, 30, 100, 300, 1000]
|
bins = [1, 2, 3, 10, 30, 100, 300, 1000]
|
||||||
if gmin < 2 or gmax < gmin:
|
if gmin < 2 or gmax < gmin:
|
||||||
print 'invalid gram sizes: ' + str(gmin) + '-' + str(gmax)
|
print 'invalid gram sizes: ' + str(gmin) + '-' + str(gmax)
|
||||||
exit(1)
|
exit(1)
|
||||||
|
|
||||||
# may need to set special arguments here
|
|
||||||
cards = jdecode.mtg_open_file(fname, verbose=verbose)
|
|
||||||
|
|
||||||
for grams in range(gmin, gmax+1):
|
for grams in range(gmin, gmax+1):
|
||||||
if verbose:
|
if verbose:
|
||||||
print 'generating ' + str(grams) + '-grams...'
|
print 'generating ' + str(grams) + '-grams...'
|
||||||
|
@ -55,7 +87,8 @@ def main(fname, oname, gmin = 2, gmax = 8, verbose = True):
|
||||||
|
|
||||||
oname_full = oname + '.' + str(grams) + 'g'
|
oname_full = oname + '.' + str(grams) + 'g'
|
||||||
if verbose:
|
if verbose:
|
||||||
print ' writing ' + str(len(gramdict)) + ' unique ' + str(grams) + '-grams to ' + oname_full
|
print(' writing ' + str(len(gramdict)) + ' unique ' + str(grams)
|
||||||
|
+ '-grams to ' + oname_full)
|
||||||
describe_bins(gramdict, bins)
|
describe_bins(gramdict, bins)
|
||||||
|
|
||||||
with open(oname_full, 'wt') as f:
|
with open(oname_full, 'wt') as f:
|
||||||
|
@ -77,9 +110,14 @@ if __name__ == '__main__':
|
||||||
help='minimum gram size to compute')
|
help='minimum gram size to compute')
|
||||||
parser.add_argument('-max', '--max', action='store', default='8',
|
parser.add_argument('-max', '--max', action='store', default='8',
|
||||||
help='maximum gram size to compute')
|
help='maximum gram size to compute')
|
||||||
|
parser.add_argument('-nltk', '--nltk', action='store_true',
|
||||||
|
help='use nltk model.NgramModel, with n = min')
|
||||||
|
parser.add_argument('-s', '--separate', action='store_true',
|
||||||
|
help='separate card text into lines when constructing nltk model')
|
||||||
parser.add_argument('-v', '--verbose', action='store_true',
|
parser.add_argument('-v', '--verbose', action='store_true',
|
||||||
help='verbose output')
|
help='verbose output')
|
||||||
|
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
main(args.infile, args.outfile, gmin=args.min, gmax=args.max, verbose=args.verbose)
|
main(args.infile, args.outfile, gmin=args.min, gmax=args.max, nltk=args.nltk,
|
||||||
|
sep=args.separate, verbose=args.verbose)
|
||||||
exit(0)
|
exit(0)
|
||||||
|
|
Loading…
Reference in a new issue