172 lines
5.7 KiB
Python
Executable file
172 lines
5.7 KiB
Python
Executable file
#!/usr/bin/env python
|
|
import sys
|
|
import os
|
|
import re
|
|
import json
|
|
|
|
libdir = os.path.join(os.path.dirname(os.path.realpath(__file__)), '../lib')
|
|
sys.path.append(libdir)
|
|
import utils
|
|
import jdecode
|
|
import cardlib
|
|
import transforms
|
|
|
|
def check_lines(fname):
    """Show how transforms.separate_lines splits the card text in *fname*.

    The file is opened with jdecode.mtg_open_file.  For every card the
    encoded text (plus the b-side text, when the card has one) is split
    into five groups -- pre, key, main, cost and post -- and the distinct
    lines of each group are accumulated across all cards.  Whenever a line
    is blank, the card's name and raw text are printed.  At the end a
    count summary is printed, followed by the sorted lines of each group.
    """
    cards = jdecode.mtg_open_file(fname, verbose=True, linetrans=True)

    prelines, keylines, mainlines = set(), set(), set()
    costlines, postlines = set(), set()

    known = ['enchant ', 'equip', 'countertype', 'multikicker', 'kicker',
             'suspend', 'echo', 'awaken', 'bestow', 'buyback',
             'cumulative', 'dash', 'entwine', 'evoke', 'fortify',
             'flashback', 'madness', 'morph', 'megamorph', 'miracle', 'ninjutsu',
             'overload', 'prowl', 'recover', 'reinforce', 'replicate', 'scavenge',
             'splice', 'surge', 'unearth', 'transfigure', 'transmute',
             ]
    # The list above is immediately replaced by an empty one, so no line is
    # currently collapsed to the placeholder 'known'.
    known = []
    prefixes = tuple(known)

    def collect(card, lines, bucket, collapse):
        # Add each line to bucket, reporting the card when a line is blank;
        # lines starting with one of `collapse` are stored as 'known'.
        for text in lines:
            if not text.strip():
                print(card.name, card.text.text)
            if text.startswith(collapse):
                text = 'known'
            bucket.add(text)

    for card in cards:
        pre, key, main, cost, post = transforms.separate_lines(
            card.text.encode(randomize=False))
        if card.bside:
            pre_b, key_b, main_b, cost_b, post_b = transforms.separate_lines(
                card.bside.text.encode(randomize=False))
            pre += pre_b
            key += key_b
            main += main_b
            cost += cost_b
            post += post_b

        # Same processing order as before: pre, post, key, main, cost.
        collect(card, pre, prelines, prefixes)
        collect(card, post, postlines, prefixes)
        collect(card, key, keylines, prefixes)
        # main and cost lines are never collapsed to 'known'
        collect(card, main, mainlines, ())
        collect(card, cost, costlines, ())

    summary = 'prel: {:d}, keyl: {:d}, mainl: {:d}, postl {:d}'
    print(summary.format(len(prelines), len(keylines),
                         len(mainlines), len(postlines)))

    sections = (
        ('\nprelines', prelines),
        ('\npostlines', postlines),
        ('\ncostlines', costlines),
        ('\nkeylines', keylines),
        ('\nmainlines', mainlines),
    )
    for heading, bucket in sections:
        print(heading)
        for entry in sorted(bucket):
            print(entry)
|
|
|
|
def check_vocab(fname):
    """Print word counts for the vectorized card text in *fname*.

    The file is opened with jdecode.mtg_open_file.  Each card's text (and
    its b-side text, when present) is vectorized and split on whitespace
    to build a word -> occurrence-count table, which is printed from the
    most frequent word to the least frequent.  Afterwards, for every card,
    the first word occurring at most ``n`` (3) times in the whole corpus is
    printed together with that card's encoding.
    """
    cards = jdecode.mtg_open_file(fname, verbose=True, linetrans=True)

    def card_words(card):
        # All whitespace-separated tokens of a card, b-side included.
        words = card.text.vectorize().split()
        if card.bside:
            words += card.bside.text.vectorize().split()
        return words

    vocab = {}
    for card in cards:
        for word in card_words(card):
            if word not in vocab:
                vocab[word] = 1
            else:
                vocab[word] += 1

    # A key function works on both Python 2 and Python 3; the previous
    # cmp-style positional argument only works on Python 2.
    for word in sorted(vocab, key=vocab.get, reverse=True):
        print('{:8d} : {:s}'.format(vocab[word], word))

    n = 3

    # NOTE(review): cards is iterated a second time here, so it is assumed
    # to be a re-iterable collection (e.g. a list) -- confirm against jdecode.
    for card in cards:
        for word in card_words(card):
            if vocab[word] <= n:
                print('\n{:8d} : {:s}'.format(vocab[word], word))
                print(card.encode())
                # report each card once, for its first rare word
                break
|
|
|
|
def check_characters(fname, vname):
    """Display the character vocabulary used by the encoded cards in *fname*.

    The symbol set is every character of utils.cardsep plus every character
    of each card's encoding.  Symbols are numbered from 1 in sorted order
    and printed with their indices.  When *vname* is truthy, the two
    mappings are also written to that path as JSON under the keys
    'token_to_idx' and 'idx_to_token'.
    """
    cards = jdecode.mtg_open_file(fname, verbose=True, linetrans=True)

    tokens = set(utils.cardsep)
    for card in cards:
        tokens.update(card.encode())

    ordered = sorted(tokens)
    token_to_idx = {}
    idx_to_token = {}
    for idx, tok in enumerate(ordered, 1):
        token_to_idx[tok] = idx
        idx_to_token[idx] = tok

    print('Vocabulary: ({:d} symbols)'.format(len(token_to_idx)))
    for token in ordered:
        print('{:8s} : {:4d}'.format(repr(token), token_to_idx[token]))

    if not vname:
        return

    # compliant with torch-rnn
    json_data = {'token_to_idx':token_to_idx, 'idx_to_token':idx_to_token}
    print('writing vocabulary to {:s}'.format(vname))
    with open(vname, 'w') as f:
        json.dump(json_data, f)
|
|
|
|
if __name__ == '__main__':
    # Command-line entry point: parse the options and run each requested
    # check (-lines, -vocab, -chars) against the input file, in that order.
    import argparse
    parser = argparse.ArgumentParser()

    parser.add_argument('infile', nargs='?', default=os.path.join(libdir, '../data/output.txt'),
                        help='encoded card file or json corpus to process')
    parser.add_argument('-lines', action='store_true',
                        help='show behavior of line separation')
    parser.add_argument('-vocab', action='store_true',
                        help='show vocabulary counts from encoded card text')
    parser.add_argument('-chars', action='store_true',
                        help='generate and display vocabulary of characters used in encoding')
    parser.add_argument('--vocab_name', default=None,
                        help='json file to write vocabulary to')
    args = parser.parse_args()

    if args.lines:
        check_lines(args.infile)
    if args.vocab:
        check_vocab(args.infile)
    if args.chars:
        check_characters(args.infile, args.vocab_name)

    # sys.exit instead of the bare exit(): the latter is injected by the
    # site module for interactive use and is not guaranteed to exist.
    sys.exit(0)
|