mtgencode/scripts/sanity.py

173 lines
5.7 KiB
Python
Raw Normal View History

#!/usr/bin/env python
import sys
import os
import re
import json
libdir = os.path.join(os.path.dirname(os.path.realpath(__file__)), '../lib')
sys.path.append(libdir)
import utils
import jdecode
import cardlib
import transforms
# Line prefixes that can be collapsed into a single 'known' bucket when
# inspecting the output of line separation. Not used by default: pass this
# tuple as `known` inside check_lines() to enable the collapse.
_KNOWN_LINE_PREFIXES = (
    'enchant ', 'equip', 'countertype', 'multikicker', 'kicker',
    'suspend', 'echo', 'awaken', 'bestow', 'buyback',
    'cumulative', 'dash', 'entwine', 'evoke', 'fortify',
    'flashback', 'madness', 'morph', 'megamorph', 'miracle', 'ninjutsu',
    'overload', 'prowl', 'recover', 'reinforce', 'replicate', 'scavenge',
    'splice', 'surge', 'unearth', 'transfigure', 'transmute',
)


def _card_line_groups(card):
    """Return the line groups of `card` from transforms.separate_lines().

    When the card has a b-side, each of the b-side's groups is appended to
    the matching front-side group.
    """
    groups = list(transforms.separate_lines(card.text.encode(randomize=False)))
    if card.bside:
        bside_groups = transforms.separate_lines(
            card.bside.text.encode(randomize=False))
        for idx, extra in enumerate(bside_groups):
            groups[idx] += extra
    return groups


def _record_lines(card, lines, seen, known=()):
    """Add every line in `lines` to the set `seen`.

    A line that is empty after stripping is reported by printing the card's
    name and text (the line is still recorded). A line starting with any
    prefix in the tuple `known` is recorded as the literal 'known' instead.
    """
    for line in lines:
        if not line.strip():
            print(card.name, card.text.text)
        # str.startswith accepts a tuple of prefixes; an empty tuple never
        # matches, which is how the collapse is switched off.
        if line.startswith(known):
            line = 'known'
        seen.add(line)


def _print_section(title, lines):
    """Print `title` preceded by a blank line, then `lines` in sorted order."""
    print('\n' + title)
    for line in sorted(lines):
        print(line)


def check_lines(fname):
    """Show how transforms.separate_lines() splits the text of every card.

    Opens `fname` with jdecode.mtg_open_file(), gathers the distinct lines
    that land in each of the five groups (pre, key, main, cost, post) across
    all cards, then prints a one-line summary of the group sizes followed by
    the sorted contents of each group.

    Args:
        fname: path handed to jdecode.mtg_open_file().
    """
    cards = jdecode.mtg_open_file(fname, verbose=True, linetrans=True)

    prelines = set()
    keylines = set()
    mainlines = set()
    costlines = set()
    postlines = set()

    # Prefix collapsing is disabled; use _KNOWN_LINE_PREFIXES here to turn
    # it on for the pre, post and key groups.
    known = ()

    for card in cards:
        prel, keyl, mainl, costl, postl = _card_line_groups(card)
        _record_lines(card, prel, prelines, known)
        _record_lines(card, postl, postlines, known)
        _record_lines(card, keyl, keylines, known)
        # Main and cost lines are always recorded verbatim.
        _record_lines(card, mainl, mainlines)
        _record_lines(card, costl, costlines)

    # The cost-line count is appended so that every collected group is
    # represented in the summary; the leading part of the message is unchanged.
    print('prel: {:d}, keyl: {:d}, mainl: {:d}, postl {:d}, costl: {:d}'
          .format(len(prelines), len(keylines), len(mainlines),
                  len(postlines), len(costlines)))

    _print_section('prelines', prelines)
    _print_section('postlines', postlines)
    _print_section('costlines', costlines)
    _print_section('keylines', keylines)
    _print_section('mainlines', mainlines)
def _card_words(card):
    """Return the whitespace-split vectorized text of `card`.

    Words from the b-side, when the card has one, follow the front side's.
    """
    words = card.text.vectorize().split()
    if card.bside:
        words += card.bside.text.vectorize().split()
    return words


def check_vocab(fname, rare_threshold=3):
    """Print word frequencies for a corpus, then the cards using rare words.

    First prints every word of the vectorized card text with its number of
    occurrences, most frequent first. Then, for each card containing at
    least one word that occurs `rare_threshold` times or fewer, prints the
    first such word with its count, followed by the card's encoding.

    Args:
        fname: path handed to jdecode.mtg_open_file().
        rare_threshold: highest occurrence count still treated as rare.
    """
    # NOTE(review): `cards` is iterated twice below, so this assumes
    # mtg_open_file() returns a re-iterable collection -- confirm.
    cards = jdecode.mtg_open_file(fname, verbose=True, linetrans=True)

    vocab = {}
    for card in cards:
        for word in _card_words(card):
            vocab[word] = vocab.get(word, 0) + 1

    # A key function is used instead of a cmp-style comparator: Python 3's
    # sorted() accepts no positional comparator and has no cmp() builtin.
    # The resulting order (descending count, stable for ties) is the same.
    for word in sorted(vocab, key=vocab.get, reverse=True):
        print('{:8d} : {:s}'.format(vocab[word], word))

    for card in cards:
        for word in _card_words(card):
            if vocab[word] <= rare_threshold:
                print('\n{:8d} : {:s}'.format(vocab[word], word))
                print(card.encode())
                # Report each card once, at its first rare word.
                break
def check_characters(fname, vname):
    """Print the character vocabulary of a corpus and optionally save it.

    The vocabulary is every character produced by each card's encode(),
    together with the characters of utils.cardsep. Characters are numbered
    from 1 in sorted order and the resulting table is printed.

    Args:
        fname: path handed to jdecode.mtg_open_file().
        vname: if truthy, path of a JSON file that receives both the
            'token_to_idx' and 'idx_to_token' mappings.
    """
    cards = jdecode.mtg_open_file(fname, verbose=True, linetrans=True)

    symbols = set(utils.cardsep)
    for card in cards:
        symbols.update(card.encode())

    # Sort once and fill both directions of the mapping in a single pass.
    ordered = sorted(symbols)
    token_to_idx = {}
    idx_to_token = {}
    for position, symbol in enumerate(ordered, 1):
        token_to_idx[symbol] = position
        idx_to_token[position] = symbol

    print('Vocabulary: ({:d} symbols)'.format(len(token_to_idx)))
    for symbol in ordered:
        print('{:8s} : {:4d}'.format(repr(symbol), token_to_idx[symbol]))

    if not vname:
        return

    # compliant with torch-rnn
    payload = {'token_to_idx': token_to_idx, 'idx_to_token': idx_to_token}
    print('writing vocabulary to {:s}'.format(vname))
    with open(vname, 'w') as f:
        json.dump(payload, f)
# Command-line entry point: each flag selects one of the checks above; any
# combination may be given and the selected checks run in the order
# lines, vocab, chars.
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('infile', nargs='?', default=os.path.join(libdir, '../data/output.txt'),
                        help='encoded card file or json corpus to process')
    parser.add_argument('-lines', action='store_true',
                        help='show behavior of line separation')
    parser.add_argument('-vocab', action='store_true',
                        help='show vocabulary counts from encoded card text')
    parser.add_argument('-chars', action='store_true',
                        help='generate and display vocabulary of characters used in encoding')
    parser.add_argument('--vocab_name', default=None,
                        help='json file to write vocabulary to')
    args = parser.parse_args()

    if args.lines:
        check_lines(args.infile)
    if args.vocab:
        check_vocab(args.infile)
    if args.chars:
        # --vocab_name is only consulted by the character check.
        check_characters(args.infile, args.vocab_name)

    # sys.exit() rather than the bare exit() helper: the latter is injected
    # by the site module for interactive use and is not always available.
    sys.exit(0)