From f0fd56d8eaa0c7dfffa25be15f4724d4347c5a97 Mon Sep 17 00:00:00 2001 From: billzorn Date: Sat, 27 Jun 2015 18:35:11 -0700 Subject: [PATCH] added initial files --- .gitignore | 4 + README.md | 16 ++ encode.py | 532 +++++++++++++++++++++++++++++++++++++++++++++++++++++ jdecode.py | 66 +++++++ 4 files changed, 618 insertions(+) create mode 100644 .gitignore create mode 100644 README.md create mode 100644 encode.py create mode 100644 jdecode.py diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..d15d99f --- /dev/null +++ b/.gitignore @@ -0,0 +1,4 @@ +*~ +*.pyc +AllSets.json +AllSets-x.json diff --git a/README.md b/README.md new file mode 100644 index 0000000..6a2e5fb --- /dev/null +++ b/README.md @@ -0,0 +1,16 @@ +mtgencode +====== + +Python scripts for encoding MTG cards in a way that is hopefully nice for neural networks. + +I apologize in advance for the quality of this code. Once I figure out the best way to do things, I might try to clean it up. Until then it's going to be a mess. + +To use the script, you'll need to get the json corpus of magic cards from mtgjson.com. I usually encode from AllSets.json, but if you want to extend the code you can change it to use the other fields from AllSets-x.json. + +Once you have the json corpus: +``` +python encode.py AllSets.json output.txt +``` +will read the corpus from AllSets.json and put the new encoding in output.txt. + +Apparently I'm running Python 2.7.6. diff --git a/encode.py b/encode.py new file mode 100644 index 0000000..e17985f --- /dev/null +++ b/encode.py @@ -0,0 +1,532 @@ +import jdecode +import re +import codecs + +#badwords = [] + +valid_encoded_char = r'[abcdefghijklmnopqrstuvwxyz\'+\-*",.:;WUBRGPV/XTQ|\\&^\{\}@ \n=~#]' + +dash_marker = '~' +bullet_marker = '=' +reserved_indicator = '\r' + +def to_ascii(s): + s = s.replace(u'\u2014', dash_marker) # unicode long dash + s = s.replace(u'\u2022', bullet_marker) # unicode bullet + s = s.replace(u'\u2019', '"') # single quote + s = s.replace(u'\u2018', '"') # single quote + s = s.replace(u'\u2212', '-') # minus sign + s = s.replace(u'\xe6', 'ae') # ae symbol + s = s.replace(u'\xfb', 'u') # u with caret + s = s.replace(u'\xfa', 'u') # u with accent + s = s.replace(u'\xe9', 'e') # e with accent + s = s.replace(u'\xe1', 'a') # a with accent + s = s.replace(u'\xe0', 'a') # a with accent going the other way + s = s.replace(u'\xe2', 'a') # a with caret + s = s.replace(u'\xf6', 'o') # o with umlaut + s = s.replace(u'\xed', 'i') # i with accent + return s + +# This whole things assumes the json format of mtgjson.com. + +# Here's a brief list of relevant fields: +# name - string +# names - list (used for split, flip, and double-faced) +# manaCost - string +# cmc - number +# colors - list +# type - string (the whole big long damn thing) +# supertypes - list +# types - list +# subtypes - list +# text - string +# power - string +# toughness - string +# loyalty - number + +# And some less useful ones, in case they're wanted for something: +# layout - string +# rarity - string +# flavor - string +# artis - string +# number - string +# multiverseid - number +# variations - list +# imageName - string +# watermark - string +# border - string +# timeshifted - boolean +# hand - number +# life - number +# reserved - boolean +# releaseDate - string +# starter - boolean + +fieldsep = '|' +newline = '\\' +unary_marker = '&' +unary_counter = '^' +mana_open_delimiter = '{' +mana_close_delimiter = '}' +x_marker = 'X' +tap_marker = 'T' +untap_marker = 'Q' +this_marker = '@' +counter_marker = '#' +bsidesep = '\n' + +unary_max = 30 + +def to_unary(s): + numbers = re.findall(r'[0123456789]+', s) + for n in sorted(numbers, cmp = lambda x,y: cmp(int(x), int(y)) * -1): + i = int(n) + if i == 40: + s = s.replace(n, 'forty') + elif i == 50: + s = s.replace(n, 'fifty') + elif i == 100: + s = s.replace(n, 'one hundred') + elif i == 200: + s = s.replace(n, 'two hundred') + else: + if i > unary_max: + i = unary_max + print s + s = s.replace(n, unary_marker + unary_counter * i) + + return s + + +# also handles the tap and untap symbols +def compress_mana(manastring): + # mana string is of the form '{3}{W}{2/B}', as specified by mtgjson + translations = { + '{w}' : 'WW', + '{u}' : 'UU', + '{b}' : 'BB', + '{r}' : 'RR', + '{g}' : 'GG', + '{p}' : 'PP', + '{w/p}' : 'WP', + '{u/p}' : 'UP', + '{b/p}' : 'BP', + '{r/p}' : 'RP', + '{g/p}' : 'GP', + '{2/w}' : 'VW', + '{2/u}' : 'VU', + '{2/b}' : 'VB', + '{2/r}' : 'VR', + '{2/g}' : 'VG', + '{w/u}' : 'WU', + '{w/b}' : 'WB', + '{r/w}' : 'RW', + '{g/w}' : 'GW', + '{u/b}' : 'UB', + '{u/r}' : 'UR', + '{g/u}' : 'GU', + '{b/r}' : 'BR', + '{b/g}' : 'BG', + '{r/g}' : 'RG', + '{s}' : 'SS', + '{x}' : x_marker * 2, + #'{xx}' : x_marker * 4, + #'{xxx}' : x_marker * 6, + '{t}' : tap_marker, + '{q}' : untap_marker, + } + for t in translations: + manastring = manastring.replace(t, translations[t]) + + numbers = re.findall(r'\{[0123456789]+\}', manastring) + for n in numbers: + i = int(re.findall(r'[0123456789]+', n)[0]) + manastring = manastring.replace(n, unary_counter * i) + + # we don't really need delimiters for tap, it's a unique symbol anyways + if manastring in [tap_marker, untap_marker]: + return manastring + else: + return '{' + manastring + '}' + +def replace_mana(s): + manastrings = re.findall(r'\{[\{\}wubrgp/xtq0123456789]+\}', s) + for manastring in manastrings: + s = s.replace(manastring, compress_mana(manastring)) + return s + + +def strip_reminder_text(s): + return re.sub(r'\(.*\)', '', s) + +def replace_newlines(s): + return s.replace('\n', '\\') + + +def replace_cardname(s, name): + # here are some fun edge cases, thanks to jml34 on the forum for + # pointing them out + if name == 'sacrifice': + s = s.replace(name, this_marker, 1) + return s + elif name == 'fear': + return s + + s = s.replace(name, this_marker) + + # so, some legends don't use the full cardname in their text box... + # this check finds about 400 of them + nameparts = name.split(',') + if len(nameparts) > 1: + mininame = nameparts[0] + new_s = s.replace(mininame, this_marker) + if not new_s == s: + s = new_s + # on first inspection, the replacements all look good + # print '------------------' + # print name + # print '----' + # print s + + # a few others don't have a convenient comma to detect their nicknames, + # so we override them here + overrides = [ + # detectable by splitting on 'the', though that might cause other issues + 'crovax', + 'rashka', + 'phage', + 'shimatsu', + # random and arbitrary: they have a last name, 1996 world champion, etc. + 'world champion', + 'axelrod', + 'hazezon', + 'rubinia', + 'rasputin', + 'hivis', + ] + + for override in overrides: + s = s.replace(override, this_marker) + + # some detection code when the overrides need to be fixed... + # global badwords + # bad = False + # for word in name.replace(',', '').split(): + # if word in s and not word in badwords: + # badwords += [word] + return s + + +def sanitize_name(s): + s = s.replace('!', '') + s = s.replace('?', '') + s = s.replace('-', dash_marker) + s = s.replace('100,000', 'one hundred thousand') + s = s.replace('1,000', 'one thousand') + s = s.replace('1996', 'nineteen ninety-six') + return s + + +# call this before replacing newlines +# this one ends up being really bad because of the confusion +# with 'counter target spell or ability' +def replace_counters(s): + #so, big fat old dictionary time!!!!!!!!! + allcounters = [ + 'time counter', + 'devotion counter', + 'charge counter', + 'ki counter', + 'matrix counter', + 'spore counter', + 'poison counter', + 'quest counter', + 'hatchling counter', + 'storage counter', + 'growth counter', + 'paralyzation counter', + 'energy counter', + 'study counter', + 'glyph counter', + 'depletion counter', + 'sleight counter', + 'loyalty counter', + 'hoofprint counter', + 'wage counter', + 'echo counter', + 'lore counter', + 'page counter', + 'divinity counter', + 'mannequin counter', + 'ice counter', + 'fade counter', + 'pain counter', + #'age counter', + 'gold counter', + 'muster counter', + 'infection counter', + 'plague counter', + 'fate counter', + 'slime counter', + 'shell counter', + 'credit counter', + 'despair counter', + 'globe counter', + 'currency counter', + 'blood counter', + 'soot counter', + 'carrion counter', + 'fuse counter', + 'filibuster counter', + 'wind counter', + 'hourglass counter', + 'trap counter', + 'corpse counter', + 'awakening counter', + 'verse counter', + 'scream counter', + 'doom counter', + 'luck counter', + 'intervention counter', + 'eyeball counter', + 'flood counter', + 'eon counter', + 'death counter', + 'delay counter', + 'blaze counter', + 'magnet counter', + 'feather counter', + 'shield counter', + 'wish counter', + 'petal counter', + 'music counter', + 'pressure counter', + 'manifestation counter', + #'net counter', + 'velocity counter', + 'vitality counter', + 'treasure counter', + 'pin counter', + 'bounty counter', + 'rust counter', + 'mire counter', + 'tower counter', + #'ore counter', + 'cube counter', + 'strife counter', + 'elixir counter', + 'hunger counter', + 'level counter', + 'winch counter', + 'fungus counter', + 'training counter', + 'theft counter', + 'arrowhead counter', + 'sleep counter', + 'healing counter', + 'mining counter', + 'dream counter', + 'aim counter', + 'arrow counter', + 'javelin counter', + 'gem counter', + 'bribery counter', + 'mine counter', + 'omen counter', + 'phylactery counter', + 'tide counter', + 'polyp counter', + 'petrification counter', + 'shred counter', + 'pupa counter', + ] + usedcounters = [] + for countername in allcounters: + if countername in s: + usedcounters += [countername] + s = s.replace(countername, counter_marker + ' counter') + + # oh god some of the counter names are suffixes of others... + shortcounters = [ + 'age counter', + 'net counter', + 'ore counter', + ] + for countername in shortcounters: + if countername in s: + usedcounters += [countername] + s = s.replace(countername, counter_marker + ' counter') + + # miraculously this doesn't seem to happen + # if len(usedcounters) > 1: + # print usedcounters + + # we haven't done newline replacement yet, so use actual newlines + if len(usedcounters) == 1: + # and yeah, this line of code can blow up in all kinds of different ways + s = 'countertype ' + counter_marker + ' ' + usedcounters[0].split()[0] + '\n' + s + + # random code for finding out all the counter names + # global badwords + # countertypes = re.findall(r'[| ][^ ]+ counter', s) + # for countertype in countertypes: + # minicounter = countertype[1:] + # if not minicounter in badwords: + # badwords += [minicounter] + return s + + +# run only after doing unary conversion +def fix_dashes(s): + s = s.replace('-' + unary_marker, reserved_indicator) + s = s.replace('-', dash_marker) + s = s.replace(reserved_indicator, '-' + unary_marker) + + # level up is annoying + levels = re.findall(r'level &\^*\-&', s) + for level in levels: + newlevel = level.replace('-', dash_marker) + s = s.replace(level, newlevel) + + levels = re.findall(r'level &\^*\+', s) + for level in levels: + newlevel = level.replace('+', dash_marker) + s = s.replace(level, newlevel) + + # and we still have the ~x issue + + return s + + +# run this after fixing dashes, because this unbreaks the ~x issue +# also probably don't run this on names, there are a few names with x~ in them. +def fix_x(s): + s = s.replace(dash_marker + 'x', '-' + x_marker) + s = s.replace('+x', '+' + x_marker) + s = s.replace(' x ', ' ' + x_marker + ' ') + s = s.replace('x:', x_marker + ':') + s = s.replace('x~', x_marker + '~') + s = s.replace('x.', x_marker + '.') + s = s.replace('x,', x_marker + ',') + s = s.replace('x/x', x_marker + '/' + x_marker) + return s + + +# do before removing newlines +# might as well do this after countertype because we probably care more about +# the location of the equip cost +def relocate_equip(s): + equips = re.findall(r'equip \{[WUBRGPV/XTQ&^]*\}.?$', s) + # there don't seem to be any cases with more than one + if len(equips) == 1: + equip = equips[0] + s = s.replace('\n' + equip, '') + s = s.replace(equip, '') + + if equip[-1:] == ' ': + equip = equip[0:-1] + + if s == '': + s = equip + else: + s = equip + '\n' + s + + return s + + +def encode(card): + # filter out vanguard cards + if card['layout'] in ['token', 'plane', 'scheme', 'phenomenon', 'vanguard']: + return + + encoding = fieldsep + name = card['name'].lower() + encoding += sanitize_name(name) + encoding += fieldsep + if 'supertypes' in card: + encoding += ' '.join(card['supertypes']).lower() + encoding += fieldsep + encoding += ' '.join(card['types']).lower() + encoding += fieldsep + if 'loyalty' in card: + encoding += to_unary(str(card['loyalty'])) + encoding += fieldsep + if 'subtypes' in card: + encoding += ' '.join(card['subtypes']).lower() + encoding += fieldsep + if 'power' in card and 'toughness' in card: + encoding += to_unary(card['power']) + '/' + to_unary(card['toughness']) + encoding += fieldsep + if 'manaCost' in card: + encoding += replace_mana(card['manaCost'].lower()) + encoding += fieldsep + if 'text' in card: + text = card['text'].lower() + text = strip_reminder_text(text) + text = replace_cardname(text, name) + text = replace_mana(text) + text = to_unary(text) + text = fix_dashes(text) + text = fix_x(text) + text = replace_counters(text) + text = relocate_equip(text) + text = replace_newlines(text) + encoding += text + encoding += fieldsep + # if 'flavor' in card: + # encoding += card['flavor'].lower() + # encoding += fieldsep + + # now output the bside if there is one + if 'bside' in card: + encoding += bsidesep + encoding += encode(card['bside']) + + encoding = to_ascii(encoding) + # encoding = re.sub(valid_encoded_char, '', encoding) + # if not encoding == '': + # print card + return encoding + +def encode_duplicated(cards): + # Boring solution: only write out the first one... + return encode(cards[0]) + + +def main(fname, oname = None, verbose = True): + if verbose: + print 'Opening json file: ' + fname + + allcards = jdecode.mtg_open_json(fname, verbose) + + if not oname == None: + if verbose: + print 'Writing output to: ' + oname + ofile = codecs.open(oname, 'w', 'utf-8') + + for card in allcards: + val = encode_duplicated(allcards[card]) + if not (val == None or val == ''): + if oname == None: + print val + '\n' + else: + ofile.write(val + '\n\n') + + # print len(badwords) + # for word in badwords: + # print word + + if not oname == None: + ofile.close() + + +if __name__ == '__main__': + import sys + if len(sys.argv) == 2: + main(sys.argv[1]) + elif len(sys.argv) == 3: + main(sys.argv[1], oname = sys.argv[2]) + else: + print 'Usage: ' + sys.argv[0] + ' ' + ' [output filename]' + exit(1) + diff --git a/jdecode.py b/jdecode.py new file mode 100644 index 0000000..3043527 --- /dev/null +++ b/jdecode.py @@ -0,0 +1,66 @@ +import json + +# to allow filtering of sets like un sets, etc... +def legal_set(set): + return not set['type'] == 'un' + +def mtg_open_json(fname, verbose = False): + + f = open(fname, 'r') + jobj = json.load(f) + f.close() + + allcards = {} + asides = {} + bsides = {} + + for k_set in jobj: + set = jobj[k_set] + setname = set['name'] + + if legal_set(set): + for card in set['cards']: + card['setName'] = setname + + cardnumber = None + if 'number' in card: + cardnumber = card['number'] + cardname = card['name'] + + uid = set['code'] + if cardnumber == None: + uid = uid + '_' + cardname + '_' + else: + uid = uid + '_' + cardnumber + + # aggregate by name to avoid duplicates, not counting bsides + if not uid[-1] == 'b': + if cardname in allcards: + allcards[cardname] += [card] + else: + allcards[cardname] = [card] + + # also aggregate aside cards by uid so we can add bsides later + if uid[-1:] == 'a': + asides[uid] = card + if uid[-1:] == 'b': + bsides[uid] = card + + #break + + for uid in bsides: + aside_uid = uid[:-1] + 'a' + if aside_uid in asides: + # the second check handles the brothers yamazaki edge case + if not asides[aside_uid]['name'] == bsides[uid]['name']: + asides[aside_uid]['bside'] = bsides[uid] + else: + pass + # this exposes some coldsnap theme deck bsides that aren't + # really bsides; shouldn't matter too much + #print aside_uid + #print bsides[uid] + + if verbose: + print 'Opened ' + str(len(allcards)) + ' uniquely named cards.' + return allcards