code changes for encoding updates and sanity checker script that found them
This commit is contained in:
parent
6421c02f60
commit
d4b5ef2104
3 changed files with 177 additions and 12 deletions
|
@ -257,8 +257,11 @@ def fields_from_json(src_json, linetrans = True):
|
||||||
parsed = False
|
parsed = False
|
||||||
|
|
||||||
if 'subtypes' in src_json:
|
if 'subtypes' in src_json:
|
||||||
fields[field_subtypes] = [(-1, map(lambda s: utils.to_ascii(s.lower()),
|
fields[field_subtypes] = [(-1, map(lambda s: utils.to_ascii(s.lower())
|
||||||
|
# urza's lands...
|
||||||
|
.replace('"', "'").replace('-', utils.dash_marker),
|
||||||
src_json['subtypes']))]
|
src_json['subtypes']))]
|
||||||
|
|
||||||
|
|
||||||
if 'rarity' in src_json:
|
if 'rarity' in src_json:
|
||||||
if src_json['rarity'] in utils.json_rarity_map:
|
if src_json['rarity'] in utils.json_rarity_map:
|
||||||
|
|
|
@ -94,6 +94,19 @@ def text_pass_2_cardname(s, name):
|
||||||
for override in overrides:
|
for override in overrides:
|
||||||
s = s.replace(override, this_marker)
|
s = s.replace(override, this_marker)
|
||||||
|
|
||||||
|
# stupid planeswalker abilities
|
||||||
|
s = s.replace('to him.', 'to ' + this_marker + '.')
|
||||||
|
s = s.replace('to him this', 'to ' + this_marker + ' this')
|
||||||
|
s = s.replace('to himself', 'to itself')
|
||||||
|
s = s.replace("he's", this_marker + ' is')
|
||||||
|
|
||||||
|
# sometimes we actually don't want to do this replacement
|
||||||
|
s = s.replace('named ' + this_marker, 'named ' + name)
|
||||||
|
s = s.replace('name is still ' + this_marker, 'name is still ' + name)
|
||||||
|
s = s.replace('named keeper of ' + this_marker, 'named keeper of ' + name)
|
||||||
|
s = s.replace('named kobolds of ' + this_marker, 'named kobolds of ' + name)
|
||||||
|
s = s.replace('named sword of kaldra, ' + this_marker, 'named sword of kaldra, ' + name)
|
||||||
|
|
||||||
return s
|
return s
|
||||||
|
|
||||||
|
|
||||||
|
@ -133,9 +146,12 @@ def text_pass_4b_x(s):
|
||||||
s = s.replace(u'x\u2014', x_marker + u'\u2014')
|
s = s.replace(u'x\u2014', x_marker + u'\u2014')
|
||||||
s = s.replace('x.', x_marker + '.')
|
s = s.replace('x.', x_marker + '.')
|
||||||
s = s.replace('x,', x_marker + ',')
|
s = s.replace('x,', x_marker + ',')
|
||||||
|
s = s.replace('x is', x_marker + ' is')
|
||||||
|
s = s.replace('x can\'t', x_marker + ' can\'t')
|
||||||
s = s.replace('x/x', x_marker + '/' + x_marker)
|
s = s.replace('x/x', x_marker + '/' + x_marker)
|
||||||
s = s.replace('x target', x_marker + ' target')
|
s = s.replace('x target', x_marker + ' target')
|
||||||
s = s.replace('si' + x_marker + ' target', 'six target')
|
s = s.replace('si' + x_marker + ' target', 'six target')
|
||||||
|
s = s.replace('avara' + x_marker, 'avarax')
|
||||||
# there's also some stupid ice age card that wants -x/-y
|
# there's also some stupid ice age card that wants -x/-y
|
||||||
s = s.replace('/~', '/-')
|
s = s.replace('/~', '/-')
|
||||||
return s
|
return s
|
||||||
|
@ -469,25 +485,27 @@ def text_pass_11_linetrans(s):
|
||||||
# randomize the order of the lines
|
# randomize the order of the lines
|
||||||
# not a text pass, intended to be invoked dynamically when encoding a card
|
# not a text pass, intended to be invoked dynamically when encoding a card
|
||||||
# call this on fully encoded text, with mana symbols expanded
|
# call this on fully encoded text, with mana symbols expanded
|
||||||
def randomize_lines(text):
|
def separate_lines(text):
|
||||||
# forget about level up, ignore empty text too while we're at it
|
# forget about level up, ignore empty text too while we're at it
|
||||||
if text == '' or 'level up' in text:
|
if text == '' or 'level up' in text:
|
||||||
return [],[],[],[]
|
return [],[],[],[],[]
|
||||||
|
|
||||||
preline_search = ['equip', 'fortify', 'enchant ', 'bestow']
|
preline_search = ['equip', 'fortify', 'enchant ', 'bestow']
|
||||||
postline_search = [
|
costline_search = [
|
||||||
'countertype', 'multikicker', 'kicker', 'suspend', 'echo', 'awaken',
|
'multikicker', 'kicker', 'suspend', 'echo', 'awaken',
|
||||||
'buyback', 'champion', 'dash', 'entwine', 'evoke', 'fading', 'flashback',
|
'buyback', 'dash', 'entwine', 'evoke', 'flashback',
|
||||||
'madness', 'megamorph', 'morph', 'miracle', 'ninjutsu', 'overload',
|
'madness', 'megamorph', 'morph', 'miracle', 'ninjutsu', 'overload',
|
||||||
'prowl', 'recover', 'reinforce', 'replicate', 'scavenge', 'splice',
|
'prowl', 'recover', 'reinforce', 'replicate', 'scavenge', 'splice',
|
||||||
'surge', 'unearth', 'transmute', 'transfigure', 'vanishing', 'tribute',
|
'surge', 'unearth', 'transmute', 'transfigure',
|
||||||
]
|
]
|
||||||
# cycling is a special case to handle the variants
|
# cycling is a special case to handle the variants
|
||||||
|
postline_search = ['countertype']
|
||||||
keyline_search = ['cumulative']
|
keyline_search = ['cumulative']
|
||||||
|
|
||||||
prelines = []
|
prelines = []
|
||||||
keylines = []
|
keylines = []
|
||||||
mainlines = []
|
mainlines = []
|
||||||
|
costlines = []
|
||||||
postlines = []
|
postlines = []
|
||||||
|
|
||||||
lines = text.split(utils.newline)
|
lines = text.split(utils.newline)
|
||||||
|
@ -496,26 +514,28 @@ def randomize_lines(text):
|
||||||
if not '.' in line:
|
if not '.' in line:
|
||||||
if any(line.startswith(s) for s in preline_search):
|
if any(line.startswith(s) for s in preline_search):
|
||||||
prelines.append(line)
|
prelines.append(line)
|
||||||
elif any(line.startswith(s) for s in postline_search) or 'cycling' in line:
|
elif any(line.startswith(s) for s in postline_search):
|
||||||
postlines.append(line)
|
postlines.append(line)
|
||||||
|
elif any(line.startswith(s) for s in costline_search) or 'cycling' in line:
|
||||||
|
costlines.append(line)
|
||||||
else:
|
else:
|
||||||
keylines.append(line)
|
keylines.append(line)
|
||||||
elif (utils.dash_marker in line and not
|
elif (utils.dash_marker in line and not
|
||||||
(' '+utils.dash_marker+' ' in line or 'non'+utils.dash_marker in line)):
|
(' '+utils.dash_marker+' ' in line or 'non'+utils.dash_marker in line)):
|
||||||
if any(line.startswith(s) for s in preline_search):
|
if any(line.startswith(s) for s in preline_search):
|
||||||
prelines.append(line)
|
prelines.append(line)
|
||||||
elif any(line.startswith(s) for s in postline_search) or 'cycling' in line:
|
elif any(line.startswith(s) for s in costline_search) or 'cycling' in line:
|
||||||
postlines.append(line)
|
costlines.append(line)
|
||||||
elif any(line.startswith(s) for s in keyline_search):
|
elif any(line.startswith(s) for s in keyline_search):
|
||||||
keylines.append(line)
|
keylines.append(line)
|
||||||
else:
|
else:
|
||||||
mainlines.append(line)
|
mainlines.append(line)
|
||||||
elif ': monstrosity' in line:
|
elif ': monstrosity' in line:
|
||||||
postlines.append(line)
|
costlines.append(line)
|
||||||
else:
|
else:
|
||||||
mainlines.append(line)
|
mainlines.append(line)
|
||||||
|
|
||||||
return prelines, keylines, mainlines, postlines
|
return prelines, keylines, mainlines, costlines, postlines
|
||||||
|
|
||||||
|
|
||||||
# Text unpasses, for decoding. All assume the text inside a Manatext, so don't do anything
|
# Text unpasses, for decoding. All assume the text inside a Manatext, so don't do anything
|
||||||
|
|
142
scripts/sanity.py
Executable file
142
scripts/sanity.py
Executable file
|
@ -0,0 +1,142 @@
|
||||||
|
#!/usr/bin/env python
|
||||||
|
import sys
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
|
||||||
|
libdir = os.path.join(os.path.dirname(os.path.realpath(__file__)), '../lib')
|
||||||
|
sys.path.append(libdir)
|
||||||
|
import utils
|
||||||
|
import jdecode
|
||||||
|
import transforms
|
||||||
|
|
||||||
|
def check_lines(fname):
    """Print the distinct lines that transforms.separate_lines assigns to each category.

    Every card opened from *fname* (and its b-side, when present) has its
    encoded text split by transforms.separate_lines into five lists:
    prelines, keylines, mainlines, costlines and postlines. The distinct
    lines seen in each category are gathered into sets; a one-line summary
    of the set sizes is printed, followed by the sorted contents of each set.

    While collecting, any line that is empty after stripping causes the
    owning card's name and raw text to be printed, so the offending card
    can be located.

    fname -- path handed unchanged to jdecode.mtg_open_file.
    """
    cards = jdecode.mtg_open_file(fname, verbose=True, linetrans=True)

    prelines = set()
    keylines = set()
    mainlines = set()
    costlines = set()
    postlines = set()

    known = ['enchant ', 'equip', 'countertype', 'multikicker', 'kicker',
             'suspend', 'echo', 'awaken', 'bestow', 'buyback',
             'cumulative', 'dash', 'entwine', 'evoke', 'fortify',
             'flashback', 'madness', 'morph', 'megamorph', 'miracle', 'ninjutsu',
             'overload', 'prowl', 'recover', 'reinforce', 'replicate', 'scavenge',
             'splice', 'surge', 'unearth', 'transfigure', 'transmute',
    ]
    # NOTE(review): the list above is discarded by this reassignment, so no
    # line is ever collapsed to 'known'. Presumably a debugging toggle --
    # remove this line to re-enable the collapsing. Behaviour kept as found.
    known = []

    def collect(card, lines, bucket, collapse_known):
        # Add each line to bucket, reporting the card when a line is blank.
        # With collapse_known set, lines starting with a known keyword are
        # recorded as the single placeholder 'known' instead of verbatim.
        for line in lines:
            if line.strip() == '':
                print(card.name, card.text.text)
            if collapse_known and any(line.startswith(s) for s in known):
                line = 'known'
            bucket.add(line)

    for card in cards:
        prel, keyl, mainl, costl, postl = transforms.separate_lines(
            card.text.encode(randomize=False))
        if card.bside:
            prel2, keyl2, mainl2, costl2, postl2 = transforms.separate_lines(
                card.bside.text.encode(randomize=False))
            prel += prel2
            keyl += keyl2
            mainl += mainl2
            costl += costl2
            postl += postl2

        # Same processing order as before so blank-line reports come out in
        # the same sequence; main and cost lines are never collapsed.
        collect(card, prel, prelines, True)
        collect(card, postl, postlines, True)
        collect(card, keyl, keylines, True)
        collect(card, mainl, mainlines, False)
        collect(card, costl, costlines, False)

    # The cost-line count was missing from this summary even though the
    # category is collected and listed below.
    print('prel: {:d}, keyl: {:d}, mainl: {:d}, costl: {:d}, postl: {:d}'
          .format(len(prelines), len(keylines), len(mainlines),
                  len(costlines), len(postlines)))

    sections = (
        ('prelines', prelines),
        ('postlines', postlines),
        ('costlines', costlines),
        ('keylines', keylines),
        ('mainlines', mainlines),
    )
    for title, bucket in sections:
        print('\n' + title)
        for line in sorted(bucket):
            print(line)
|
||||||
|
|
||||||
|
def _card_words(card):
    """Return the whitespace-split vectorized text of card, plus its b-side's if present."""
    words = card.text.vectorize().split()
    if card.bside:
        words += card.bside.text.vectorize().split()
    return words


def check_vocab(fname):
    """Print vocabulary counts for the vectorized card text, then cards using rare words.

    First prints every distinct word with its occurrence count, most
    frequent first. Then, for each card containing a word that occurs at
    most n (3) times overall, prints the first such word with its count
    followed by the card's encoding.

    fname -- path handed unchanged to jdecode.mtg_open_file.
    """
    cards = jdecode.mtg_open_file(fname, verbose=True, linetrans=True)

    vocab = {}
    for card in cards:
        for word in _card_words(card):
            vocab[word] = vocab.get(word, 0) + 1

    # key= replaces the positional cmp function and the cmp() builtin, both
    # of which exist only in Python 2; this form sorts identically on 2 and 3.
    for word in sorted(vocab, key=vocab.get, reverse=True):
        print('{:8d} : {:s}'.format(vocab[word], word))

    # Words seen this many times or fewer count as rare.
    n = 3

    for card in cards:
        for word in _card_words(card):
            if vocab[word] <= n:
                print('\n{:8d} : {:s}'.format(vocab[word], word))
                print(card.encode())
                # One report per card is enough.
                break
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    # Command-line entry point: run whichever checks were requested on infile.
    import argparse
    parser = argparse.ArgumentParser()

    parser.add_argument('infile', nargs='?', default=os.path.join(libdir, '../data/output.txt'),
                        help='encoded card file or json corpus to process')
    parser.add_argument('-lines', action='store_true',
                        help='show behavior of line separation')
    parser.add_argument('-vocab', action='store_true',
                        help='show vocabulary counts from encoded card text')
    args = parser.parse_args()

    if args.lines:
        check_lines(args.infile)
    if args.vocab:
        check_vocab(args.infile)

    # sys.exit rather than the bare exit() helper: the latter is installed by
    # the site module for interactive use and is absent under `python -S`.
    sys.exit(0)
|
Loading…
Reference in a new issue