added initial files
This commit is contained in:
commit
f0fd56d8ea
4 changed files with 618 additions and 0 deletions
4
.gitignore
vendored
Normal file
4
.gitignore
vendored
Normal file
|
@ -0,0 +1,4 @@
|
|||
*~
|
||||
*.pyc
|
||||
AllSets.json
|
||||
AllSets-x.json
|
16
README.md
Normal file
16
README.md
Normal file
|
@ -0,0 +1,16 @@
|
|||
mtgencode
|
||||
======
|
||||
|
||||
Python scripts for encoding MTG cards in a way that is hopefully nice for neural networks.
|
||||
|
||||
I apologize in advance for the quality of this code. Once I figure out the best way to do things, I might try to clean it up. Until then it's going to be a mess.
|
||||
|
||||
To use the script, you'll need to get the json corpus of magic cards from mtgjson.com. I usually encode from AllSets.json, but if you want to extend the code you can change it to use the other fields from AllSets-x.json.
|
||||
|
||||
Once you have the json corpus:
|
||||
```
|
||||
python encode.py AllSets.json output.txt
|
||||
```
|
||||
will read the corpus from AllSets.json and put the new encoding in output.txt.
|
||||
|
||||
Apparently I'm running Python 2.7.6.
|
532
encode.py
Normal file
532
encode.py
Normal file
|
@ -0,0 +1,532 @@
|
|||
import jdecode
|
||||
import re
|
||||
import codecs
|
||||
|
||||
#badwords = []
|
||||
|
||||
# Regex character class matching one character of the encoded alphabet.
# In the code visible here it is only referenced from the commented-out
# validation step at the end of encode().
valid_encoded_char = r'[abcdefghijklmnopqrstuvwxyz\'+\-*",.:;WUBRGPV/XTQ|\\&^\{\}@ \n=~#]'

# Stand-in for dashes: to_ascii() maps the unicode long dash to it, and
# sanitize_name() / fix_dashes() map ASCII hyphens to it.
dash_marker = '~'
# Stand-in for the unicode bullet character (see to_ascii()).
bullet_marker = '='
# Temporary placeholder used by fix_dashes() to protect '-' followed by the
# unary marker while every other '-' is rewritten.
reserved_indicator = '\r'
|
||||
|
||||
def to_ascii(s):
    """Swap the non-ASCII characters handled here for ASCII stand-ins.

    Each (character, replacement) pair below is applied to the whole string
    in turn; characters not listed are left untouched.

    s -- the text to convert
    Returns the converted text.
    """
    substitutions = (
        (u'\u2014', dash_marker),    # unicode long dash
        (u'\u2022', bullet_marker),  # unicode bullet
        (u'\u2019', '"'),            # single quote
        (u'\u2018', '"'),            # single quote
        (u'\u2212', '-'),            # minus sign
        (u'\xe6', 'ae'),             # ae symbol
        (u'\xfb', 'u'),              # u with caret
        (u'\xfa', 'u'),              # u with accent
        (u'\xe9', 'e'),              # e with accent
        (u'\xe1', 'a'),              # a with accent
        (u'\xe0', 'a'),              # a with accent going the other way
        (u'\xe2', 'a'),              # a with caret
        (u'\xf6', 'o'),              # o with umlaut
        (u'\xed', 'i'),              # i with accent
    )
    for fancy, plain in substitutions:
        s = s.replace(fancy, plain)
    return s
|
||||
|
||||
# This whole thing assumes the json format of mtgjson.com.
|
||||
|
||||
# Here's a brief list of relevant fields:
|
||||
# name - string
|
||||
# names - list (used for split, flip, and double-faced)
|
||||
# manaCost - string
|
||||
# cmc - number
|
||||
# colors - list
|
||||
# type - string (the whole big long damn thing)
|
||||
# supertypes - list
|
||||
# types - list
|
||||
# subtypes - list
|
||||
# text - string
|
||||
# power - string
|
||||
# toughness - string
|
||||
# loyalty - number
|
||||
|
||||
# And some less useful ones, in case they're wanted for something:
|
||||
# layout - string
|
||||
# rarity - string
|
||||
# flavor - string
|
||||
# artist - string
|
||||
# number - string
|
||||
# multiverseid - number
|
||||
# variations - list
|
||||
# imageName - string
|
||||
# watermark - string
|
||||
# border - string
|
||||
# timeshifted - boolean
|
||||
# hand - number
|
||||
# life - number
|
||||
# reserved - boolean
|
||||
# releaseDate - string
|
||||
# starter - boolean
|
||||
|
||||
# Separator written before, between and after the fields of an encoded card
# (see encode()).
fieldsep = '|'
# Single backslash used in place of line breaks in card text. Note that
# replace_newlines() hard-codes the same character rather than reading this.
newline = '\\'
# Numbers are written as unary_marker followed by one unary_counter per unit
# (see to_unary()); compress_mana() uses bare unary_counter runs for generic
# mana costs.
unary_marker = '&'
unary_counter = '^'
# Delimiters around a compressed mana cost. compress_mana() currently
# hard-codes the same two characters.
mana_open_delimiter = '{'
mana_close_delimiter = '}'
# Encoded forms of the variable X, the tap symbol and the untap symbol.
x_marker = 'X'
tap_marker = 'T'
untap_marker = 'Q'
# Replaces a card's own name inside its text (see replace_cardname()).
this_marker = '@'
# Replaces the specific kind of counter in card text (see replace_counters()).
counter_marker = '#'
# Written between the two halves of a card that has a 'bside' (see encode()).
bsidesep = '\n'

# Largest number to_unary() writes out in full; bigger values are clamped.
unary_max = 30
|
||||
|
||||
def to_unary(s):
    """Rewrite every run of decimal digits in s in the unary encoding.

    40, 50, 100 and 200 are spelled out as English words. Every other number
    becomes unary_marker followed by one unary_counter per unit. Values above
    unary_max are clamped to unary_max, and the string being processed is
    printed so the offending text can be inspected.

    s -- text possibly containing decimal numbers
    Returns the rewritten text.
    """
    spelled_out = {
        40: 'forty',
        50: 'fifty',
        100: 'one hundred',
        200: 'two hundred',
    }

    numbers = re.findall(r'[0123456789]+', s)
    # Largest values first: replacement is by substring, so '10' has to be
    # rewritten before '1' gets a chance to clobber it. key/reverse gives the
    # same (stable) order as the old Python-2-only cmp argument.
    for n in sorted(numbers, key=int, reverse=True):
        i = int(n)
        if i in spelled_out:
            s = s.replace(n, spelled_out[i])
            continue
        if i > unary_max:
            i = unary_max
            # report text holding a number too large to encode exactly
            print(s)
        s = s.replace(n, unary_marker + unary_counter * i)

    return s
|
||||
|
||||
|
||||
# also handles the tap and untap symbols
|
||||
def compress_mana(manastring):
    """Compress one run of mana symbols into the encoded form.

    manastring -- lowercase symbols in mtgjson's brace notation, for example
                  '{3}{w}{2/b}'
    Returns the encoded symbols wrapped in a single pair of braces, except
    that a lone tap or untap symbol is returned as just its marker.
    """
    # every non-numeric symbol and the code that replaces it
    symbol_codes = {
        '{w}': 'WW',
        '{u}': 'UU',
        '{b}': 'BB',
        '{r}': 'RR',
        '{g}': 'GG',
        '{p}': 'PP',
        '{w/p}': 'WP',
        '{u/p}': 'UP',
        '{b/p}': 'BP',
        '{r/p}': 'RP',
        '{g/p}': 'GP',
        '{2/w}': 'VW',
        '{2/u}': 'VU',
        '{2/b}': 'VB',
        '{2/r}': 'VR',
        '{2/g}': 'VG',
        '{w/u}': 'WU',
        '{w/b}': 'WB',
        '{r/w}': 'RW',
        '{g/w}': 'GW',
        '{u/b}': 'UB',
        '{u/r}': 'UR',
        '{g/u}': 'GU',
        '{b/r}': 'BR',
        '{b/g}': 'BG',
        '{r/g}': 'RG',
        '{s}': 'SS',
        '{x}': x_marker * 2,
        '{t}': tap_marker,
        '{q}': untap_marker,
    }
    # the keys carry their own braces, so no key can match inside another
    # and the order of replacement does not matter
    for symbol, code in symbol_codes.items():
        manastring = manastring.replace(symbol, code)

    # generic costs such as '{3}' turn into that many unary counters
    manastring = re.sub(
        r'\{[0123456789]+\}',
        lambda found: unary_counter * int(found.group(0)[1:-1]),
        manastring)

    # we don't really need delimiters for tap, it's a unique symbol anyways
    if manastring in (tap_marker, untap_marker):
        return manastring
    return '{' + manastring + '}'
|
||||
|
||||
def replace_mana(s):
    """Compress every run of mana symbols found in s.

    A run is one or more adjacent brace-delimited symbols made of the
    lowercase characters accepted by the pattern below; each run is handed to
    compress_mana() and replaced by its result.

    s -- lowercase card text or mana cost
    Returns the text with all runs compressed.
    """
    # Substitute each match in place, in a single pass. Replacing by string
    # value instead (findall followed by str.replace) breaks when one run is
    # a prefix of a later one: rewriting '{1}' first turns '{1}{w}' into
    # '{^}{w}', which is then never found and stays half-compressed.
    return re.sub(r'\{[\{\}wubrgp/xtq0123456789]+\}',
                  lambda found: compress_mana(found.group(0)),
                  s)
|
||||
|
||||
|
||||
def strip_reminder_text(s):
    """Remove parenthesized reminder text from s.

    Only the parentheticals themselves are removed. Text sitting between two
    parentheticals on the same line is kept (a greedy '\\(.*\\)' would delete
    it as well). Nested parentheses are removed from the inside out, and a
    parenthetical never extends across a line break.

    s -- card text
    Returns the text without the parenthesized parts.
    """
    innermost = re.compile(r'\([^()\n]*\)')
    while True:
        stripped = innermost.sub('', s)
        if stripped == s:
            return s
        # removing an inner pair may have exposed an enclosing pair
        s = stripped
|
||||
|
||||
def replace_newlines(s):
    """Return s with every line break turned into a single backslash."""
    lines = s.split('\n')
    return '\\'.join(lines)
|
||||
|
||||
|
||||
def replace_cardname(s, name):
    """Replace a card's references to itself in its text with this_marker.

    s    -- the card text
    name -- the card's name (encode() passes it lowercased)
    Returns the text with the full name, the part of the name before a comma
    (if there is one) and a fixed list of nicknames replaced.
    """
    # here are some fun edge cases, thanks to jml34 on the forum for
    # pointing them out: 'fear' is left completely alone, and for 'sacrifice'
    # only the first occurrence is replaced
    if name == 'fear':
        return s
    if name == 'sacrifice':
        return s.replace(name, this_marker, 1)

    s = s.replace(name, this_marker)

    # so, some legends don't use the full cardname in their text box, only
    # the part in front of the comma... this check finds about 400 of them
    if ',' in name:
        short_name = name.split(',')[0]
        s = s.replace(short_name, this_marker)

    # a few others don't have a convenient comma to detect their nicknames,
    # so we override them here (note: applied to every card's text)
    nicknames = (
        # detectable by splitting on 'the', though that might cause other issues
        'crovax',
        'rashka',
        'phage',
        'shimatsu',
        # random and arbitrary: they have a last name, 1996 world champion, etc.
        'world champion',
        'axelrod',
        'hazezon',
        'rubinia',
        'rasputin',
        'hivis',
    )
    for nickname in nicknames:
        s = s.replace(nickname, this_marker)

    return s
|
||||
|
||||
|
||||
def sanitize_name(s):
    """Clean up a card name for the encoded output.

    Drops '!' and '?', turns hyphens into dash_marker and spells out the
    numbers listed below.

    s -- the card name
    Returns the cleaned name.
    """
    # applied strictly in this order
    rewrites = (
        ('!', ''),
        ('?', ''),
        ('-', dash_marker),
        ('100,000', 'one hundred thousand'),
        ('1,000', 'one thousand'),
        # NOTE(review): this hyphen is inserted after the '-' rewrite above,
        # so it stays a plain '-' in the result -- confirm that is intended
        ('1996', 'nineteen ninety-six'),
    )
    for old, new in rewrites:
        s = s.replace(old, new)
    return s
|
||||
|
||||
|
||||
# call this before replacing newlines
|
||||
# this one ends up being really bad because of the confusion
|
||||
# with 'counter target spell or ability'
|
||||
def replace_counters(s):
    """Replace named counter kinds in s with the generic counter_marker.

    Every '<kind> counter' phrase from the lists below becomes
    counter_marker + ' counter'. When exactly one kind was found, a
    'countertype' line naming that kind is put in front of the text.

    s -- card text that still contains real newlines
    Returns the rewritten text.
    """
    #so, big fat old dictionary time!!!!!!!!!
    allcounters = [
        'time counter',
        'devotion counter',
        'charge counter',
        'ki counter',
        'matrix counter',
        'spore counter',
        'poison counter',
        'quest counter',
        'hatchling counter',
        'storage counter',
        'growth counter',
        'paralyzation counter',
        'energy counter',
        'study counter',
        'glyph counter',
        'depletion counter',
        'sleight counter',
        'loyalty counter',
        'hoofprint counter',
        'wage counter',
        'echo counter',
        'lore counter',
        'page counter',
        'divinity counter',
        'mannequin counter',
        'ice counter',
        'fade counter',
        'pain counter',
        'gold counter',
        'muster counter',
        'infection counter',
        'plague counter',
        'fate counter',
        'slime counter',
        'shell counter',
        'credit counter',
        'despair counter',
        'globe counter',
        'currency counter',
        'blood counter',
        'soot counter',
        'carrion counter',
        'fuse counter',
        'filibuster counter',
        'wind counter',
        'hourglass counter',
        'trap counter',
        'corpse counter',
        'awakening counter',
        'verse counter',
        'scream counter',
        'doom counter',
        'luck counter',
        'intervention counter',
        'eyeball counter',
        'flood counter',
        'eon counter',
        'death counter',
        'delay counter',
        'blaze counter',
        'magnet counter',
        'feather counter',
        'shield counter',
        'wish counter',
        'petal counter',
        'music counter',
        'pressure counter',
        'manifestation counter',
        'velocity counter',
        'vitality counter',
        'treasure counter',
        'pin counter',
        'bounty counter',
        'rust counter',
        'mire counter',
        'tower counter',
        'cube counter',
        'strife counter',
        'elixir counter',
        'hunger counter',
        'level counter',
        'winch counter',
        'fungus counter',
        'training counter',
        'theft counter',
        'arrowhead counter',
        'sleep counter',
        'healing counter',
        'mining counter',
        'dream counter',
        'aim counter',
        'arrow counter',
        'javelin counter',
        'gem counter',
        'bribery counter',
        'mine counter',
        'omen counter',
        'phylactery counter',
        'tide counter',
        'polyp counter',
        'petrification counter',
        'shred counter',
        'pupa counter',
    ]
    # oh god some of the counter names are suffixes of others ('age' ends
    # 'storage', 'net' ends 'magnet', 'ore' ends 'spore'), so these may only
    # be tried once every name in the main list has been handled
    shortcounters = [
        'age counter',
        'net counter',
        'ore counter',
    ]

    found = []
    for phrase in allcounters + shortcounters:
        if phrase in s:
            found.append(phrase)
            s = s.replace(phrase, counter_marker + ' counter')

    # we haven't done newline replacement yet, so use actual newlines
    if len(found) == 1:
        # and yeah, this can blow up in all kinds of different ways
        kind = found[0].split()[0]
        header = 'countertype ' + counter_marker + ' ' + kind
        s = header + '\n' + s

    return s
|
||||
|
||||
|
||||
# run only after doing unary conversion
|
||||
def fix_dashes(s):
    """Turn hyphens into dash_marker, except in front of a unary number.

    Meant to run after unary conversion. A '-' directly followed by
    unary_marker is kept, unless it is the dash of a 'level N-M' range, which
    does become dash_marker. The '+' of 'level N+' becomes dash_marker too.

    s -- card text after to_unary()
    Returns the rewritten text.
    """
    minus_number = '-' + unary_marker

    # hide the '-<number>' cases behind a placeholder, rewrite every other
    # '-', then bring the hidden ones back
    s = s.replace(minus_number, reserved_indicator)
    s = s.replace('-', dash_marker)
    s = s.replace(reserved_indicator, minus_number)

    # level up is annoying: the range dash survived the step above because
    # it is followed by a number
    s = re.sub(r'level &\^*\-&',
               lambda found: found.group(0).replace('-', dash_marker),
               s)
    s = re.sub(r'level &\^*\+',
               lambda found: found.group(0).replace('+', dash_marker),
               s)

    # and we still have the ~x issue

    return s
|
||||
|
||||
|
||||
# run this after fixing dashes, because this unbreaks the ~x issue
|
||||
# also probably don't run this on names, there are a few names with x~ in them.
|
||||
def fix_x(s):
    """Replace a lowercase 'x' that stands for the variable X with x_marker.

    Only the contexts listed below are recognised. A dash_marker directly in
    front of such an x is turned back into '-'.

    s -- card text after fix_dashes()
    Returns the rewritten text.
    """
    # applied strictly in this order
    contexts = (
        (dash_marker + 'x', '-' + x_marker),
        ('+x', '+' + x_marker),
        (' x ', ' ' + x_marker + ' '),
        ('x:', x_marker + ':'),
        ('x~', x_marker + '~'),
        ('x.', x_marker + '.'),
        ('x,', x_marker + ','),
        ('x/x', x_marker + '/' + x_marker),
    )
    for lowercase, marked in contexts:
        s = s.replace(lowercase, marked)
    return s
|
||||
|
||||
|
||||
# do before removing newlines
|
||||
# might as well do this after countertype because we probably care more about
|
||||
# the location of the equip cost
|
||||
def relocate_equip(s):
    """Move an equip cost found at the very end of s to the front.

    The pattern is anchored with '$' and compiled without MULTILINE, so only
    an 'equip {...}' clause at the end of the whole text is recognised
    (optionally followed by one more character).

    s -- card text that still contains real newlines
    Returns the text with the equip clause first, or s unchanged when there
    is not exactly one match.
    """
    found = re.findall(r'equip \{[WUBRGPV/XTQ&^]*\}.?$', s)
    # there don't seem to be any cases with more than one
    if len(found) != 1:
        return s

    equip = found[0]
    # cut the clause out, together with the newline in front of it if any
    remainder = s.replace('\n' + equip, '').replace(equip, '')

    # the optional extra character may have picked up a trailing space
    if equip.endswith(' '):
        equip = equip[:-1]

    if remainder == '':
        return equip
    return equip + '\n' + remainder
|
||||
|
||||
|
||||
def encode(card):
    """Encode one card dict (mtgjson format) as a single string.

    The result is the fields name, supertypes, types, loyalty, subtypes,
    power/toughness, mana cost and text, in that order, each surrounded by
    fieldsep; optional fields that are missing are left empty. If the card
    has a 'bside', bsidesep and the encoding of the b-side follow.

    card -- dict with at least 'layout', 'name' and 'types'
    Returns the encoded string, or None for the layouts filtered out below.
    """
    # filter out things that aren't regular cards
    if card['layout'] in ['token', 'plane', 'scheme', 'phenomenon', 'vanguard']:
        return None

    name = card['name'].lower()

    supertypes = ''
    if 'supertypes' in card:
        supertypes = ' '.join(card['supertypes']).lower()

    types = ' '.join(card['types']).lower()

    loyalty = ''
    if 'loyalty' in card:
        loyalty = to_unary(str(card['loyalty']))

    subtypes = ''
    if 'subtypes' in card:
        subtypes = ' '.join(card['subtypes']).lower()

    pt = ''
    if 'power' in card and 'toughness' in card:
        pt = to_unary(card['power']) + '/' + to_unary(card['toughness'])

    cost = ''
    if 'manaCost' in card:
        cost = replace_mana(card['manaCost'].lower())

    text = ''
    if 'text' in card:
        # the order of these passes matters; see the notes on each helper
        text = card['text'].lower()
        text = strip_reminder_text(text)
        text = replace_cardname(text, name)
        text = replace_mana(text)
        text = to_unary(text)
        text = fix_dashes(text)
        text = fix_x(text)
        text = replace_counters(text)
        text = relocate_equip(text)
        text = replace_newlines(text)

    fields = [sanitize_name(name), supertypes, types, loyalty, subtypes,
              pt, cost, text]
    encoding = fieldsep + fieldsep.join(fields) + fieldsep

    # now output the bside if there is one
    if 'bside' in card:
        bside_encoding = encode(card['bside'])
        # encode() returns None for a filtered layout; concatenating that
        # would raise TypeError, so such a b-side is simply left out
        if bside_encoding is not None:
            encoding += bsidesep + bside_encoding

    return to_ascii(encoding)
|
||||
|
||||
def encode_duplicated(cards):
    """Encode a list of card dicts that share a name as one entry.

    cards -- non-empty list of card dicts
    Returns whatever encode() returns for the first card in the list.
    """
    # Boring solution: only write out the first one...
    first = cards[0]
    return encode(first)
|
||||
|
||||
|
||||
def main(fname, oname=None, verbose=True):
    """Encode every card in a json corpus and write out the results.

    fname   -- path of the json file, read via jdecode.mtg_open_json()
    oname   -- output path; when None the encodings are printed instead
    verbose -- print progress messages
    """
    if verbose:
        print('Opening json file: ' + fname)

    allcards = jdecode.mtg_open_json(fname, verbose)

    ofile = None
    if oname is not None:
        if verbose:
            print('Writing output to: ' + oname)
        ofile = codecs.open(oname, 'w', 'utf-8')

    try:
        for cardname in allcards:
            val = encode_duplicated(allcards[cardname])
            # encode() yields None for filtered cards
            if val is None or val == '':
                continue
            # either way, one blank line separates consecutive cards
            if ofile is None:
                print(val + '\n')
            else:
                ofile.write(val + '\n\n')
    finally:
        # close the output even when encoding a card raises
        if ofile is not None:
            ofile.close()
|
||||
|
||||
|
||||
# Command line entry point: <JSON file> [output filename]
if __name__ == '__main__':
    import sys
    if len(sys.argv) == 2:
        main(sys.argv[1])
    elif len(sys.argv) == 3:
        main(sys.argv[1], oname=sys.argv[2])
    else:
        print('Usage: ' + sys.argv[0] + ' ' + '<JSON file> [output filename]')
        # sys.exit rather than the bare exit() helper, which is installed by
        # the site module for interactive use
        sys.exit(1)
|
||||
|
66
jdecode.py
Normal file
66
jdecode.py
Normal file
|
@ -0,0 +1,66 @@
|
|||
import json
|
||||
|
||||
# to allow filtering of sets like un sets, etc...
|
||||
def legal_set(set):
    """Tell whether the cards of a set should be kept.

    set -- set dict with a 'type' entry
    Returns False for sets of type 'un', True for everything else.
    """
    return set['type'] != 'un'
|
||||
|
||||
def mtg_open_json(fname, verbose=False):
    """Load an mtgjson corpus and group its cards by name.

    Cards from sets rejected by legal_set() are skipped. Each kept card dict
    gets a 'setName' entry. A card whose number ends in 'b' is not listed
    under its own name; instead it is attached as 'bside' to the card of the
    same set whose number is identical except for ending in 'a'.

    fname   -- path of the json file
    verbose -- print how many uniquely named cards were found
    Returns a dict mapping card name to the list of card dicts with that name.
    """
    # the with-block closes the file even when json.load() raises
    with open(fname, 'r') as f:
        jobj = json.load(f)

    allcards = {}
    asides = {}
    bsides = {}

    for k_set in jobj:
        cardset = jobj[k_set]
        setname = cardset['name']

        if not legal_set(cardset):
            continue

        for card in cardset['cards']:
            card['setName'] = setname

            cardname = card['name']
            cardnumber = card.get('number')

            # without a number the uid ends in '_', so such a card can never
            # be taken for an a-side or a b-side below
            if cardnumber is None:
                uid = cardset['code'] + '_' + cardname + '_'
            else:
                uid = cardset['code'] + '_' + cardnumber

            # aggregate by name to avoid duplicates, not counting bsides
            if uid[-1] != 'b':
                allcards.setdefault(cardname, []).append(card)

            # also aggregate aside cards by uid so we can add bsides later
            if uid[-1:] == 'a':
                asides[uid] = card
            if uid[-1:] == 'b':
                bsides[uid] = card

    for uid in bsides:
        aside_uid = uid[:-1] + 'a'
        # b-sides without a matching a-side are dropped; this affects some
        # coldsnap theme deck bsides that aren't really bsides
        if aside_uid in asides:
            # the name check handles the brothers yamazaki edge case
            if asides[aside_uid]['name'] != bsides[uid]['name']:
                asides[aside_uid]['bside'] = bsides[uid]

    if verbose:
        print('Opened ' + str(len(allcards)) + ' uniquely named cards.')
    return allcards
|
Loading…
Reference in a new issue