EVERYTHING HAS CHANGED

Added lib and script subdirs to organize things; the biggest change is that we now have a really powerful Card class that can handle all of the decoding and encoding for us. encode.py has been rewritten to take advantage of this; the other scripts have not been updated yet. Coming soon! As a side note, the changes to output.txt are purely cosmetic, though the order should be stable now.
2015-07-14 00:07:25 -07:00 · 2015-07-14 00:07:25 -07:00 · 1a4965fd83
parent cbf8ac34e5
commit 1a4965fd83
12 changed files with 28751 additions and 28543 deletions
--- a/.gitignore
+++ b/.gitignore
@ -2,3 +2,4 @@
 *.pyc
 AllSets.json
 AllSets-x.json
+lib/__init__.py
--- a/data/output.txt
+++ b/data/output.txt
--- a/datamine.py
+++ b/datamine.py
@ -3,7 +3,9 @@ import codecs
 import sys
 import random

-import utils
+import lib.utils as utils
+from lib.card import Card
+from lib.mana import Manacost

 # Format a list of rows of data into nice columns.
 # Note that it's the columns that are nice, not this code.
@ -29,351 +31,6 @@ def printrows(l):
    for row in l:
        print row

-def randomize_all_mana(text):
-    manastrs = re.findall(utils.mana_regex, text)
-    newtext = text
-    for manastr in sorted(manastrs, lambda x,y: cmp(len(x), len(y)), reverse = True):
-        newtext = newtext.replace(manastr, utils.reserved_marker)
-    for manastr in manastrs:
-        newtext = newtext.replace(utils.reserved_marker, 
-                                  Manacost(manastr).reencode(randomize = True),
-                                  1)
-    return newtext
-
-# so this stuff still needs to be cleaned up
-punctuation_chars = r'[+\-*",.:;WUBRGPV/XTQ|\\&^\{\}@ \n=~%\[\]]'
-creature_keywords = [
-    # evergreen
-    'deathtouch',
-    'defender',
-    'double strike',
-    'first strike',
-    'flash',
-    'flying',
-    'haste',
-    'hexproof',
-    'indestructible',
-    'lifelink',
-    'menace',
-    'prowess',
-    'reach',
-    'trample',
-    'vigilance',
-    # no longer evergreen
-    'banding',
-    'fear',
-    'shroud',
-    'intimidate',
-    # expert level keywords
-    'absorb',
-    'amplify',
-    'annihilator',
-    'battle cry',
-    'bolster',
-    'bloodthirst',
-    'bushido',
-    'changeling',
-    'convoke',
-    'devour',
-    'evolve',
-    'exalted',
-    'extort',
-    'fading',
-    'flanking',
-    'frenzy',
-    'graft',
-    'haunt',
-    'horsemanship',
-    'infect',
-    'modular',
-    #'morph',
-    #'ninjutsu',
-    'persist',
-    'poisonous',
-    'provoke',
-    #'prowl',
-    'rampage',
-    'ripple',
-    #'scavenge',
-    'shadow',
-    'soulbond',
-    'soulshift',
-    'split second',
-    'sunburst',
-    'undying',
-    #'unearth',
-    'unleash',
-    'vanishing',
-    'wither',
-] # there are other keywords out there, these are just easy to detect
-
-# data aggregating classes
-class Manacost:
-    '''mana cost representation with data'''
-    
-    # hardcoded to be dependent on the symbol structure... ah well
-    def get_colors(self):
-        colors = ''
-        for sym in self.symbols:
-            if self.symbols[sym] > 0:
-                symcolors = re.sub(r'2|P|S|X', '', sym)
-                for symcolor in symcolors:
-                    if symcolor not in colors:
-                        colors += symcolor
-        # sort so the order is always consistent
-        return ''.join(sorted(colors))
-
-    def check_colors(self, symbolstring):
-        for sym in symbolstring:
-            if not sym in self.colors:
-                return False
-        return True
-
-    def __init__(self, text):
-        self.raw = text
-        self.cmc = 0
-        self.colorless = 0
-        self.sequence = []
-        self.symbols = {sym : 0 for sym in utils.mana_syms}
-        self.allsymbols = {sym : 0 for sym in utils.mana_symall}
-
-        if text == '':
-            self._parsed = True
-            self._valid = True
-            self.none = True
-            self.inner = ''
-
-        elif not (len(self.raw) >= 2 and self.raw[0] == '{' and self.raw[-1] == '}'):
-            self._parsed = False
-            self._valid = False
-            self.none = False
-
-        else:
-            self._parsed = True
-            self._valid = True
-            self.none = False
-            self.inner = self.raw[1:-1]
-
-            # structure mirrors the decoding in utils, but we pull out different data here
-            idx = 0
-            while idx < len(self.inner):
-                # taking this branch is an infinite loop if unary_marker is empty
-                if (len(utils.mana_unary_marker) > 0 and 
-                    self.inner[idx:idx+len(utils.mana_unary_marker)] == utils.mana_unary_marker):
-                    idx += len(utils.mana_unary_marker)
-                    self.sequence += [utils.mana_unary_marker]
-                elif self.inner[idx:idx+len(utils.mana_unary_counter)] == utils.mana_unary_counter:
-                    idx += len(utils.mana_unary_counter)
-                    self.sequence += [utils.mana_unary_counter]
-                    self.colorless += 1
-                    self.cmc += 1
-                else:
-                    old_idx = idx
-                    for symlen in range(utils.mana_symlen_min, utils.mana_symlen_max + 1):
-                        encoded_sym = self.inner[idx:idx+symlen]
-                        if encoded_sym in utils.mana_symall_decode:
-                            idx += symlen
-                            # leave the sequence encoded for convenience
-                            self.sequence += [encoded_sym]
-                            sym = utils.mana_symall_decode[encoded_sym]
-                            self.allsymbols[sym] += 1
-                            if sym in utils.mana_symalt:
-                                self.symbols[utils.mana_alt(sym)] += 1
-                            else:
-                                self.symbols[sym] += 1
-                            if sym == utils.mana_X:
-                                self.cmc += 0
-                            elif utils.mana_2 in sym:
-                                self.cmc += 2
-                            else:
-                                self.cmc += 1
-                            break
-                    # otherwise we'll go into an infinite loop if we see a symbol we don't know
-                    if idx == old_idx:
-                        idx += 1
-                        self._valid = False
-
-        self.colors = self.get_colors()
-
-    def __str__(self):
-        return utils.mana_untranslate(utils.mana_open_delimiter + ''.join(self.sequence)
-                                      + utils.mana_close_delimiter)
-
-    def format(self, for_forum):
-        return utils.mana_untranslate(utils.mana_open_delimiter + ''.join(self.sequence, for_forum)
-                                      + utils.mana_close_delimiter)
-
-    def reencode(self, randomize = False):
-        if self.none:
-            return ''
-        elif randomize:
-            # so this won't work very well if mana_unary_marker isn't empty
-            return (utils.mana_open_delimiter 
-                    + ''.join(random.sample(self.sequence, len(self.sequence)))
-                    + utils.mana_close_delimiter)
-        else:
-            return utils.mana_open_delimiter + ''.join(self.sequence) + utils.mana_close_delimiter
-
-class Card:
-    '''card representation with data'''
-
-    def __init__(self, text):
-        self.raw = text
-        self._parsed = True
-        self._valid = True
-
-        if '\n' in self.raw:
-            halves = self.raw.split('\n')
-            if not len(halves) == 2:
-                self._parsed = False
-                self._valid = False
-                self.fields = halves
-                return
-            else:
-                self.raw = halves[0]
-                self.bside = Card(halves[1])
-                if not self.bside._valid:
-                    self._valid = False
-        else:
-            self.bside = None
-
-        fields = self.raw.split(utils.fieldsep)
-        if not len(fields) >= 10:
-            self._parsed = False
-            self._valid = False
-            self.fields = fields
-        else:
-            if not fields[1] == '':
-                self.name = fields[1]
-            else:
-                self.name = ''
-                self._valid = False
-
-            if not fields[2] == '':
-                self.supertypes = fields[2].split(' ')
-            else:
-                self.supertypes = []
-
-            if not fields[3] == '':
-                self.types = fields[3].split(' ')
-            else:
-                self.types = []
-                self._valid = False
-
-            if not fields[4] == '':
-                self.loyalty = fields[4]
-                try:
-                    self.loyalty_value = int(self.loyalty)
-                except ValueError:
-                    self.loyalty_value = None
-                    # strictly speaking, '* where * is something' is valid...
-                    # self._valid = False
-            else:
-                self.loyalty = None
-                self.loyalty_value = None
-
-            if not fields[5] == '':
-                self.subtypes = fields[5].split(' ')
-                if 'creature' in self.types:
-                    self.creaturetypes = self.subtypes
-                else:
-                    self.creaturetypes = []
-            else:
-                self.subtypes = []
-                self.creaturetypes = []
-
-            if not fields[6] == '':
-                self.pt = fields[6]
-                self.power = None
-                self.power_value = None
-                self.toughness = None
-                self.toughness_value = None
-                p_t = self.pt.split('/')
-                if len(p_t) == 2:
-                    self.power = p_t[0]
-                    try:
-                        self.power_value = int(self.power)
-                    except ValueError:
-                        self.power_value = None
-                    self.toughness = p_t[1]
-                    try:
-                        self.toughness_value = int(self.toughness)
-                    except ValueError:
-                        self.toughness_value = None
-                else:
-                    self._valid = False
-            else:
-                self.pt = None
-                self.power = None
-                self.power_value = None
-                self.toughness = None
-                self.toughness_value = None
-
-            # if there's no cost (lands) then cost.none will be True
-            self.cost = Manacost(fields[7])
-            
-            if not fields[8] == '':
-                self.text = fields[8]
-                self.text_lines = self.text.split(utils.newline)
-                self.text_words = re.sub(punctuation_chars, ' ', self.text).split()
-                self.creature_words = []
-                # SUPER HACK
-                if 'creature' in self.types:
-                    for line in self.text_lines:
-                        orig_line = line
-                        guess = []
-                        for keyword in creature_keywords:
-                            if keyword in line:
-                                guess += [keyword]
-                                line = line.replace(keyword, '')
-                        # yeah, I said it was a hack
-                        if re.sub(punctuation_chars, ' ', line).split() == [] or 'protect' in line or 'walk' in line or 'sliver creatures' in line or 'you control have' in line:
-                            for word in guess:
-                                if word not in self.creature_words:
-                                    self.creature_words += [word]
-                        # elif len(guess) > 0 and len(line) < 30:
-                        #     print orig_line
-            else:
-                self.text = ''
-                self.text_lines = []
-                self.text_words = []
-                self.creature_words = []
-
-    def __str__(self):
-        return ''.join([
-            utils.fieldsep,
-            self.name,
-            utils.fieldsep,
-            (' ' + utils.dash_marker + ' ').join([' '.join(self.supertypes + self.types),
-                                                   ' '.join(self.subtypes)]),
-            utils.fieldsep,
-            str(self.cost.cmc) if self.cost.colors == '' 
-            else str(self.cost.cmc) + ', ' + self.cost.colors,
-            utils.fieldsep,
-        ])
-        
-    def reencode(self, randomize = False):
-        return ''.join([
-            utils.fieldsep,
-            self.name,
-            utils.fieldsep,
-            ' '.join(self.supertypes),
-            utils.fieldsep,
-            ' '.join(self.types),
-            utils.fieldsep,
-            self.loyalty if self.loyalty else '',
-            utils.fieldsep,
-            ' '.join(self.subtypes),
-            utils.fieldsep,
-            self.pt if self.pt else '',
-            utils.fieldsep,
-            self.cost.reencode(randomize) if not self.cost.none else '',
-            utils.fieldsep,
-            self.text if not randomize else randomize_all_mana(self.text),
-            utils.fieldsep,
-            utils.bsidesep + self.bside.reencode(randomize) if self.bside else '',
-        ])
-
 # global card pools
 unparsed_cards = []
 invalid_cards = []
--- a/encode.py
+++ b/encode.py
@ -1,526 +1,77 @@
-import jdecode
 import re
-import codecs
+import random
 import sys

-import utils
-
-#badwords = []
+import lib.utils as utils
+from lib.cardlib import Card
+import lib.jdecode as jdecode

 valid_encoded_char = r'[abcdefghijklmnopqrstuvwxyz\'+\-*",.:;WUBRGPV/XTQ|\\&^\{\}@ \n=~%\[\]]'

-cardsep = utils.cardsep
-fieldsep = utils.fieldsep
-bsidesep = utils.bsidesep
-newline = utils.newline
-dash_marker = utils.dash_marker
-bullet_marker = utils.bullet_marker
-this_marker = utils.this_marker
-counter_marker = utils.counter_marker
-reserved_marker = utils.reserved_marker
-x_marker = utils.x_marker
-tap_marker = utils.tap_marker
-untap_marker = utils.untap_marker
-counter_rename = utils.counter_rename
-unary_marker = utils.unary_marker
-unary_counter = utils.unary_counter
+def exclude_sets(cardset):
+    return cardset == 'Unglued' or cardset == 'Unhinged' or cardset == 'Celebration'

-# This whole things assumes the json format of mtgjson.com.
+def exclude_types(cardtype):
+    return cardtype in ['conspiracy']

-# Here's a brief list of relevant fields:
-# name - string
-# names - list (used for split, flip, and double-faced)
-# manaCost - string
-# cmc - number
-# colors - list
-# type - string (the whole big long damn thing)
-# supertypes - list
-# types - list
-# subtypes - list
-# text - string
-# power - string
-# toughness - string
-# loyalty - number
+def exclude_layouts(layout):
+    return layout in ['token', 'plane', 'scheme', 'phenomenon', 'vanguard']

-# And some less useful ones, in case they're wanted for something:
-# layout - string
-# rarity - string
-# flavor - string
-# artis - string
-# number - string
-# multiverseid - number
-# variations - list
-# imageName - string
-# watermark - string
-# border - string
-# timeshifted - boolean
-# hand - number
-# life - number
-# reserved - boolean
-# releaseDate - string
-# starter - boolean
-
-
-def strip_reminder_text(s):
-    return re.sub(r'\(.*\)', '', s)
-
-
-def replace_newlines(s):
-    return s.replace('\n', '\\')
-
-
-def replace_cardname(s, name):
-    # here are some fun edge cases, thanks to jml34 on the forum for 
-    # pointing them out
-    if name == 'sacrifice':
-        s = s.replace(name, this_marker, 1)
-        return s
-    elif name == 'fear':
-        return s
-
-    s = s.replace(name, this_marker)
-    
-    # so, some legends don't use the full cardname in their text box...
-    # this check finds about 400 of them
-    nameparts = name.split(',')
-    if len(nameparts) > 1:
-        mininame = nameparts[0]
-        new_s = s.replace(mininame, this_marker)
-        if not new_s == s:
-            s = new_s
-            # on first inspection, the replacements all look good
-            # print '------------------'
-            # print name
-            # print '----'
-            # print s
-        
-    # a few others don't have a convenient comma to detect their nicknames,
-    # so we override them here
-    overrides = [
-        # detectable by splitting on 'the', though that might cause other issues
-        'crovax',
-        'rashka',
-        'phage',
-        'shimatsu',
-        # random and arbitrary: they have a last name, 1996 world champion, etc.
-        'world champion',
-        'axelrod',
-        'hazezon',
-        'rubinia',
-        'rasputin',
-        'hivis',
-    ]
-    
-    for override in overrides:
-        s = s.replace(override, this_marker)
-
-    # some detection code for when the overrides need to be fixed...
-    # global badwords
-    # bad = False
-    # for word in name.replace(',', '').split():
-    #     if word in s and not word in badwords:
-    #             badwords += [word]
-    return s
-
-
-def sanitize_name(s):
-    s = s.replace('!', '')
-    s = s.replace('?', '')
-    s = s.replace('-', dash_marker)
-    s = s.replace('100,000', 'one hundred thousand')
-    s = s.replace('1,000', 'one thousand')
-    s = s.replace('1996', 'nineteen ninety-six')
-    return s
-
-
-# call this before replacing newlines
-# this one ends up being really bad because of the confusion
-# with 'counter target spell or ability'
-def replace_counters(s):
-    #so, big fat old dictionary time!!!!!!!!!
-    allcounters = [
-        'time counter',
-        'devotion counter',
-        'charge counter',
-        'ki counter',
-        'matrix counter',
-        'spore counter',
-        'poison counter',
-        'quest counter',
-        'hatchling counter',
-        'storage counter',
-        'growth counter',
-        'paralyzation counter',
-        'energy counter',
-        'study counter',
-        'glyph counter',
-        'depletion counter',
-        'sleight counter',
-        'loyalty counter',
-        'hoofprint counter',
-        'wage counter',
-        'echo counter',
-        'lore counter',
-        'page counter',
-        'divinity counter',
-        'mannequin counter',
-        'ice counter',
-        'fade counter',
-        'pain counter',
-        #'age counter',
-        'gold counter',
-        'muster counter',
-        'infection counter',
-        'plague counter',
-        'fate counter',
-        'slime counter',
-        'shell counter',
-        'credit counter',
-        'despair counter',
-        'globe counter',
-        'currency counter',
-        'blood counter',
-        'soot counter',
-        'carrion counter',
-        'fuse counter',
-        'filibuster counter',
-        'wind counter',
-        'hourglass counter',
-        'trap counter',
-        'corpse counter',
-        'awakening counter',
-        'verse counter',
-        'scream counter',
-        'doom counter',
-        'luck counter',
-        'intervention counter',
-        'eyeball counter',
-        'flood counter',
-        'eon counter',
-        'death counter',
-        'delay counter',
-        'blaze counter',
-        'magnet counter',
-        'feather counter',
-        'shield counter',
-        'wish counter',
-        'petal counter',
-        'music counter',
-        'pressure counter',
-        'manifestation counter',
-        #'net counter',
-        'velocity counter',
-        'vitality counter',
-        'treasure counter',
-        'pin counter',
-        'bounty counter',
-        'rust counter',
-        'mire counter',
-        'tower counter',
-        #'ore counter',
-        'cube counter',
-        'strife counter',
-        'elixir counter',
-        'hunger counter',
-        'level counter',
-        'winch counter',
-        'fungus counter',
-        'training counter',
-        'theft counter',
-        'arrowhead counter',
-        'sleep counter',
-        'healing counter',
-        'mining counter',
-        'dream counter',
-        'aim counter',
-        'arrow counter',
-        'javelin counter',
-        'gem counter',
-        'bribery counter',
-        'mine counter',
-        'omen counter',
-        'phylactery counter',
-        'tide counter',
-        'polyp counter',
-        'petrification counter',
-        'shred counter',
-        'pupa counter',
-    ]
-    usedcounters = []
-    for countername in allcounters:
-        if countername in s:
-            usedcounters += [countername]
-            s = s.replace(countername, counter_marker + ' counter')
-    
-    # oh god some of the counter names are suffixes of others...
-    shortcounters = [
-        'age counter',
-        'net counter',
-        'ore counter',
-    ]
-    for countername in shortcounters:
-        # SUPER HACKY fix for doubling season
-        if countername in s and 'more counter' not in s:
-            usedcounters += [countername]
-            s = s.replace(countername, counter_marker + ' counter')
-    
-    # miraculously this doesn't seem to happen
-    # if len(usedcounters) > 1:
-    #     print usedcounters
-
-    # we haven't done newline replacement yet, so use actual newlines
-    if len(usedcounters) == 1:
-        # and yeah, this line of code can blow up in all kinds of different ways
-        s = 'countertype ' + counter_marker + ' ' + usedcounters[0].split()[0] + '\n' + s
-
-    # random code for finding out all the counter names
-    # global badwords
-    # countertypes = re.findall(r'[| ][^ ]+ counter', s)
-    # for countertype in countertypes:
-    #     minicounter = countertype[1:]
-    #     if not minicounter in badwords:
-    #         badwords += [minicounter]
-    return s
-
-
-# the word counter is confusing when used to refer to what we do to spells
-# and sometimes abilities to make them not happen. Let's rename that.
-# call this after doing the counter replacement to simplify the regexes
-counter_rename = 'uncast'
-def rename_uncast(s):
-    # pre-checks to make sure we aren't doing anything dumb
-    # if '# counter target ' in s or '^ counter target ' in s or '& counter target ' in s:
-    #     print s + '\n'
-    # if '# counter a ' in s or '^ counter a ' in s or '& counter a ' in s:
-    #     print s + '\n'
-    # if '# counter all ' in s or '^ counter all ' in s or '& counter all ' in s:
-    #     print s + '\n'
-    # if '# counter a ' in s or '^ counter a ' in s or '& counter a ' in s:
-    #     print s + '\n'
-    # if '# counter that ' in s or '^ counter that ' in s or '& counter that ' in s:
-    #     print s + '\n'
-    # if '# counter @' in s or '^ counter @' in s or '& counter @' in s:
-    #     print s + '\n'
-    # if '# counter the ' in s or '^ counter the ' in s or '& counter the ' in s:
-    #     print s + '\n'
-
-    # counter target
-    s = s.replace('counter target ', counter_rename + ' target ')
-    # counter a
-    s = s.replace('counter a ', counter_rename + ' a ')
-    # counter all
-    s = s.replace('counter all ', counter_rename + ' all ')
-    # counters a
-    s = s.replace('counters a ', counter_rename + 's a ')
-    # countered (this could get weird in terms of englishing the word)
-    s = s.replace('countered', counter_rename)
-    # counter that
-    s = s.replace('counter that ', counter_rename + ' that ')
-    # counter @
-    s = s.replace('counter @', counter_rename + ' @')
-    # counter it (this is tricky
-    s = s.replace(', counter it', ', ' + counter_rename + ' it')
-    # counter the (it happens at least once, thanks wizards!)
-    s = s.replace('counter the ', counter_rename + ' the ')
-    # counter up to
-    s = s.replace('counter up to ', counter_rename + ' up to ')
-
-    # check if the word exists in any other context
-    # if 'counter' in s.replace('# counter', '').replace('countertype', '').replace('^ counter', '').replace('& counter', ''):
-    #     print s + '\n'
-
-    # whew! by manual inspection of a few dozen texts, it looks like this about covers it.
-    return s    
-    
-
-# run only after doing unary conversion
-def fix_dashes(s):
-    s = s.replace('-' + unary_marker, reserved_marker)
-    s = s.replace('-', dash_marker)
-    s = s.replace(reserved_marker, '-' + unary_marker)
-    
-    # level up is annoying
-    levels = re.findall(r'level &\^*\-&', s)
-    for level in levels:
-        newlevel = level.replace('-', dash_marker)
-        s = s.replace(level, newlevel)
-
-    levels = re.findall(r'level &\^*\+', s)
-    for level in levels:
-        newlevel = level.replace('+', dash_marker)
-        s = s.replace(level, newlevel)
-
-    # and we still have the ~x issue
-
-    return s
-
-
-# run this after fixing dashes, because this unbreaks the ~x issue
-# also probably don't run this on names, there are a few names with x~ in them.
-def fix_x(s):
-    s = s.replace(dash_marker + 'x', '-' + x_marker)
-    s = s.replace('+x', '+' + x_marker)
-    s = s.replace(' x ', ' ' + x_marker + ' ')
-    s = s.replace('x:', x_marker + ':')
-    s = s.replace('x~', x_marker + '~')
-    s = s.replace('x.', x_marker + '.')
-    s = s.replace('x,', x_marker + ',')
-    s = s.replace('x/x', x_marker + '/' + x_marker)
-    return s
-
-
-# run after fixing dashes, it makes the regexes better, but before replacing newlines
-def reformat_choice(s):
-    # the idea is to take 'choose n ~\n=ability\n=ability\n'
-    # to '[n = ability = ability]\n'
-    
-    def choice_formatting_helper(s_helper, prefix, count):
-        single_choices = re.findall(ur'(' + prefix + ur'\n?(\u2022.*(\n|$))+)', s_helper)
-        for choice in single_choices:
-            newchoice = choice[0]
-            newchoice = newchoice.replace(prefix, unary_marker + (unary_counter * count))
-            newchoice = newchoice.replace('\n', ' ')
-            if newchoice[-1:] == ' ':
-                newchoice = '[' + newchoice[:-1] + ']\n'
-            else:
-                newchoice = '[' + newchoice + ']'
-            s_helper = s_helper.replace(choice[0], newchoice)
-        return s_helper
-
-    s = choice_formatting_helper(s, ur'choose one \u2014', 1)
-    s = choice_formatting_helper(s, ur'choose one \u2014 ', 1) # ty Promise of Power
-    s = choice_formatting_helper(s, ur'choose two \u2014', 2)
-    s = choice_formatting_helper(s, ur'choose one or both \u2014', 0)
-    s = choice_formatting_helper(s, ur'choose one or more \u2014', 0)
-
-    return s
-
-
-# do before removing newlines
-# might as well do this after countertype because we probably care more about
-# the location of the equip cost
-def relocate_equip(s):
-    equips = re.findall(r'equip \{[WUBRGPV/XTQ&^]*\}.?$', s)
-    # there don't seem to be any cases with more than one
-    if len(equips) == 1:
-        equip = equips[0]
-        s = s.replace('\n' + equip, '')
-        s = s.replace(equip, '')
-
-        if equip[-1:] == ' ':
-            equip = equip[0:-1]
-
-        if s == '':
-            s = equip
-        else:
-            s = equip + '\n' + s
-
-    nonmana = re.findall(ur'(equip\u2014.*(\n|$))', s)
-    if len(nonmana) == 1:
-        equip = nonmana[0][0]
-        s = s.replace('\n' + equip, '')
-        s = s.replace(equip, '')
-        
-        if equip[-1:] == ' ':
-            equip = equip[0:-1]
-
-        if s == '':
-            s = equip
-        else:
-            s = equip + '\n' + s
-        
-    return s
-
-
-def encode(card):
-    # filter out vanguard cards
-    if card['layout'] in ['token', 'plane', 'scheme', 'phenomenon', 'vanguard']:
-        return
-    if card['type'] in ['Conspiracy']: # just for now?
-        return
-
-    encoding = fieldsep
-    if 'name' in card:
-        name = card['name'].lower()
-        encoding += sanitize_name(name)
-    encoding += fieldsep
-    if 'supertypes' in card:
-        encoding += ' '.join(card['supertypes']).lower()
-    encoding += fieldsep
-    if 'types' in card:
-        encoding += ' '.join(card['types']).lower()
-    encoding += fieldsep
-    if 'loyalty' in card:
-        encoding += utils.to_unary(str(card['loyalty']))
-    encoding += fieldsep
-    if 'subtypes' in card:
-        encoding += ' '.join(card['subtypes']).lower()
-    encoding += fieldsep
-    if 'power' in card and 'toughness' in card:
-        encoding += utils.to_unary(card['power']) + '/' + utils.to_unary(card['toughness'])        
-    encoding += fieldsep
-    if 'manaCost' in card:
-        encoding += utils.to_mana(card['manaCost'].lower())
-    encoding += fieldsep
-    if 'text' in card:
-        text = card['text'].lower()
-        text = strip_reminder_text(text)
-        text = replace_cardname(text, name)
-        text = utils.to_mana(text)
-        text = utils.to_symbols(text)
-        text = utils.to_unary(text)
-        text = fix_dashes(text)
-        text = fix_x(text)
-        text = replace_counters(text)
-        text = rename_uncast(text)
-        text = reformat_choice(text)
-        text = relocate_equip(text)
-        text = replace_newlines(text)
-        encoding += text.strip()
-    encoding += fieldsep
-
-    # now output the bside if there is one
-    if 'bside' in card:
-        encoding += bsidesep
-        encoding += encode(card['bside'])
-
-    encoding = utils.to_ascii(encoding)
-    # encoding = re.sub(valid_encoded_char, '', encoding)
-    # if not encoding == '':
-    #     print card
-
-    return encoding
-    
-def encode_duplicated(cards):
+def compile_duplicated(jcards):
    # Boring solution: only write out the first one...
-    return encode(cards[0])
-
+    card = Card(jcards[0])
+    if (exclude_sets(jcards[0][utils.json_field_set_name])
+        or exclude_layouts(jcards[0]['layout'])):
+        return None
+    for cardtype in card.types:
+        if exclude_types(cardtype):
+            return None
+    return card

 def main(fname, oname = None, verbose = True):
    if verbose:
        print 'Opening json file: ' + fname

-    allcards = jdecode.mtg_open_json(fname, verbose)
+    jcards = jdecode.mtg_open_json(fname, verbose)
+    cards = []

-    if not oname == None:
+    valid = 0
+    skipped = 0
+    invalid = 0
+    unparsed = 0
+
+    for jcard_name in jcards:
+        card = compile_duplicated(jcards[jcard_name])
+        if card:
+            if card.valid:
+                valid += 1
+                cards += [card]
+            elif card.parsed:
+                invalid += 1
+            else:
+                unparsed += 1
+        else:
+            skipped += 1
+
+    if verbose:
+        print (str(valid) + ' valid, ' + str(skipped) + ' skipped, ' 
+               + str(invalid) + ' invalid, ' + str(unparsed) + ' failed to parse.')
+
+    # This should give a random but consistent ordering, to make comparing changes
+    # between the output of different versions easier.
+    random.seed(1371367)
+    random.shuffle(cards)
+
+    if oname:
        if verbose:
            print 'Writing output to: ' + oname
-        ofile = codecs.open(oname, 'w', 'utf-8')
-
-    for card in allcards:
-        val = encode_duplicated(allcards[card])
-        if not (val == None or val == ''):
-            if oname == None:
-                print val + '\n'
-            else:
-                ofile.write(val + cardsep)
-        
-    # print len(badwords)
-    # for word in badwords:
-    #     print word
-
-    if not oname == None:
-        ofile.close()
+        with open(oname, 'w') as ofile:
+            for card in cards:
+                ofile.write(card.encode() + utils.cardsep)
+    else:
+        for card in cards:
+            sys.stdout.write(card.encode() + utils.cardsep)
+        sys.stdout.flush()

    
 if __name__ == '__main__':
--- a/jdecode.py
+++ b/jdecode.py
@ -1,65 +0,0 @@
-import json
-
-# to allow filtering of sets like un sets, etc...
-def legal_set(set):
-    return not (set['type'] == 'un' or set['name'] == 'Celebration')
-
-def mtg_open_json(fname, verbose = False):
-
-    f = open(fname, 'r')    
-    jobj = json.load(f)
-    f.close()
-    
-    allcards = {}
-    asides = {}
-    bsides = {}
-
-    for k_set in jobj:
-        set = jobj[k_set]
-        setname = set['name']
-        
-        if legal_set(set):
-            for card in set['cards']:
-                card['setName'] = setname
-
-                cardnumber = None
-                if 'number' in card:
-                    cardnumber = card['number']
-                # the lower avoids duplication of at least one card (Will-o/O'-the-Wisp)
-                cardname = card['name'].lower()
-
-                uid = set['code']
-                if cardnumber == None:
-                    uid = uid + '_' + cardname + '_'
-                else:
-                    uid = uid + '_' + cardnumber
-
-                # aggregate by name to avoid duplicates, not counting bsides
-                if not uid[-1] == 'b':
-                    if cardname in allcards:
-                        allcards[cardname] += [card]
-                    else:
-                        allcards[cardname] = [card]
-                    
-                # also aggregate aside cards by uid so we can add bsides later
-                if uid[-1:] == 'a':
-                    asides[uid] = card
-                if uid[-1:] == 'b':
-                    bsides[uid] = card
-
-    for uid in bsides:
-        aside_uid = uid[:-1] + 'a'
-        if aside_uid in asides:
-            # the second check handles the brothers yamazaki edge case
-            if not asides[aside_uid]['name'] == bsides[uid]['name']:
-                asides[aside_uid]['bside'] = bsides[uid]
-        else:
-            pass
-            # this exposes some coldsnap theme deck bsides that aren't
-            # really bsides; shouldn't matter too much
-            #print aside_uid
-            #print bsides[uid]
-
-    if verbose:
-        print 'Opened ' + str(len(allcards)) + ' uniquely named cards.'
-    return allcards
--- a/lib/cardlib.py
+++ b/lib/cardlib.py
@ -0,0 +1,402 @@
+# card representation
+import random
+import re
+
+import utils
+import transforms
+from manalib import Manacost, Manatext
+
+# These are used later to determine what the fields of the Card object are called.
+# Define them here because they have nothing to do with the actual format.
+# Card stores each field as an instance attribute with this name; field_loyalty,
+# field_pt and field_text are also used as prefixes for derived attributes
+# (for example field_pt + '_p' and field_text + '_lines').
+field_name = 'name'
+field_rarity = 'rarity'
+field_cost = 'cost'
+field_supertypes = 'supertypes'
+field_types = 'types'
+field_subtypes = 'subtypes'
+field_loyalty = 'loyalty'
+field_pt = 'pt'
+field_text = 'text'
+field_other = 'other' # it's kind of a pseudo-field
+
+# Import the labels, because these do appear in the encoded text.
+# Card.encode() prepends a field's label to its value when a label mapping
+# is passed in as fmt_labeled.
+field_label_name = utils.field_label_name
+field_label_rarity = utils.field_label_rarity
+field_label_cost = utils.field_label_cost
+field_label_supertypes = utils.field_label_supertypes
+field_label_types = utils.field_label_types
+field_label_subtypes = utils.field_label_subtypes
+field_label_loyalty = utils.field_label_loyalty
+field_label_pt = utils.field_label_pt
+field_label_text = utils.field_label_text
+
+# The names of the card fields. field_other, the pseudo-field, is not listed.
+fieldnames = [
+    field_name,
+    field_rarity,
+    field_cost,
+    field_supertypes,
+    field_types,
+    field_subtypes,
+    field_loyalty,
+    field_pt,
+    field_text,
+]
+
+# Default value of the fmt_ordered argument of Card() and Card.encode(): the
+# fields that are written out, in output order. Note that field_rarity is
+# not part of it.
+fmt_ordered_default = [
+    field_name,
+    field_supertypes,
+    field_types,
+    field_loyalty,
+    field_subtypes,
+    field_pt,
+    field_cost,
+    field_text,
+]
+
+# Mapping from field name to the label for that field, suitable for the
+# fmt_labeled argument of Card() and Card.encode(). Every field that has a
+# label is listed, including subtypes (which used to be missing).
+fmt_labeled_default = {
+    field_name : field_label_name,
+    field_rarity : field_label_rarity,
+    field_cost : field_label_cost,
+    field_supertypes : field_label_supertypes,
+    field_types : field_label_types,
+    field_subtypes : field_label_subtypes,
+    field_loyalty : field_label_loyalty,
+    field_pt : field_label_pt,
+    field_text : field_label_text,
+}
+
+# Plausibility check on a dict of parsed fields ({fieldname : [(idx, value), ...]}).
+# A card needs a name and a type, and it must have a p/t field exactly when
+# one of its type values contains 'creature'.
+def fields_check_valid(fields):
+    if field_name not in fields or field_types not in fields:
+        return False
+    # one flag per (idx, value) entry of the types field
+    creature_flags = ['creature' in value for idx, value in fields[field_types]]
+    # creatures have p/t, other things don't
+    return (field_pt in fields) == any(creature_flags)
+
+# These functions take a bunch of source data in some format and turn
+# it into nicely labeled fields that we know how to initialize a card from.
+# Both return a dict that maps field names to lists of possible values,
+# paired with the index that we read that particular field value from.
+# So, {fieldname : [(idx, value), (idx, value)...]}.
+# Usually we want these lists to be length 1, but you never know.
+
+# Of course to make things nice and simple, that dict is the third element
+# of a triple that reports parsing success and valid success as its 
+# first two elements.
+
+# This whole thing assumes the json format of mtgjson.com.
+
+# Here's a brief list of relevant fields:
+# name - string
+# names - list (used for split, flip, and double-faced)
+# manaCost - string
+# cmc - number
+# colors - list
+# type - string (the whole big long damn thing)
+# supertypes - list
+# types - list
+# subtypes - list
+# text - string
+# power - string
+# toughness - string
+# loyalty - number
+
+# And some less useful ones, in case they're wanted for something:
+# layout - string
+# rarity - string
+# flavor - string
+# artist - string
+# number - string
+# multiverseid - number
+# variations - list
+# imageName - string
+# watermark - string
+# border - string
+# timeshifted - boolean
+# hand - number
+# life - number
+# reserved - boolean
+# releaseDate - string
+# starter - boolean
+
+# Turn an mtgjson card object into labeled fields.
+# Returns the triple (parsed, valid, fields), where fields maps field names
+# to one-element lists of (-1, value) pairs.
+#   parsed is False if the name or the types are missing, or if the mana
+#     cost did not parse.
+#   valid is False if the mana cost or the text is invalid, if only one of
+#     power / toughness is present, or if fields_check_valid rejects the result.
+def fields_from_json(src_json):
+    parsed = True
+    valid = True
+    fields = {}
+
+    # we hardcode in what the things are called in the mtgjson format
+    if 'name' in src_json:
+        name_val = src_json['name'].lower()
+        # keep the unsanitized lowercase name to find self-references in the text
+        name_orig = name_val
+        name_val = transforms.name_pass_1_sanitize(name_val)
+        name_val = utils.to_ascii(name_val)
+        fields[field_name] = [(-1, name_val)]
+    else:
+        name_orig = ''
+        parsed = False
+
+    # return the actual Manacost object
+    if 'manaCost' in src_json:
+        cost = Manacost(src_json['manaCost'], fmt = 'json')
+        valid = valid and cost.valid
+        parsed = parsed and cost.parsed
+        fields[field_cost] = [(-1, cost)]
+
+    if 'supertypes' in src_json:
+        fields[field_supertypes] = [(-1, [utils.to_ascii(s.lower())
+                                          for s in src_json['supertypes']])]
+
+    if 'types' in src_json:
+        fields[field_types] = [(-1, [utils.to_ascii(s.lower())
+                                     for s in src_json['types']])]
+    else:
+        parsed = False
+
+    if 'subtypes' in src_json:
+        fields[field_subtypes] = [(-1, [utils.to_ascii(s.lower())
+                                        for s in src_json['subtypes']])]
+
+    if 'loyalty' in src_json:
+        fields[field_loyalty] = [(-1, utils.to_unary(str(src_json['loyalty'])))]
+
+    # A half-specified p/t makes the card invalid. A complete one must not
+    # reset valid to True, or an invalid mana cost found above would be
+    # forgotten.
+    p_t = ''
+    if 'power' in src_json:
+        p_t = utils.to_ascii(utils.to_unary(src_json['power'])) + '/' # hardcoded
+        if 'toughness' in src_json:
+            p_t = p_t + utils.to_ascii(utils.to_unary(src_json['toughness']))
+        else:
+            valid = False
+    elif 'toughness' in src_json:
+        p_t = '/' + utils.to_ascii(utils.to_unary(src_json['toughness'])) # hardcoded
+        valid = False
+    if p_t:
+        fields[field_pt] = [(-1, p_t)]
+
+    # similarly, return the actual Manatext object
+    if 'text' in src_json:
+        text_val = src_json['text'].lower()
+        text_val = transforms.text_pass_1_strip_rt(text_val)
+        text_val = transforms.text_pass_2_cardname(text_val, name_orig)
+        text_val = transforms.text_pass_3_unary(text_val)
+        text_val = transforms.text_pass_4a_dashes(text_val)
+        text_val = transforms.text_pass_4b_x(text_val)
+        text_val = transforms.text_pass_5_counters(text_val)
+        text_val = transforms.text_pass_6_uncast(text_val)
+        text_val = transforms.text_pass_7_choice(text_val)
+        text_val = transforms.text_pass_8_equip(text_val)
+        text_val = transforms.text_pass_9_newlines(text_val)
+        text_val = transforms.text_pass_10_symbols(text_val)
+        text_val = utils.to_ascii(text_val)
+        text_val = text_val.strip()
+        mtext = Manatext(text_val, fmt = 'json')
+        valid = valid and mtext.valid
+        fields[field_text] = [(-1, mtext)]
+
+    # we don't need to worry about bsides because we handle that in the constructor
+    return parsed, valid and fields_check_valid(fields), fields
+
+# Counterpart of fields_from_json for the text encoding; not written yet.
+# Callers unpack a (parsed, valid, fields) triple from the result, so fail
+# with an explicit error here instead of returning None and letting that
+# unpacking blow up with an unrelated-looking TypeError.
+def fields_from_format(src_text, fmt_ordered, fmt_labeled, fieldsep):
+    raise NotImplementedError('fields_from_format: parsing cards from encoded text '
+                              'is not implemented yet')
+
+
+# Here's the actual Card class that other files should use.
+
+class Card:
+    '''card representation with data'''
+
+    # src is either an mtgjson card object (a dict) or, failing that, something
+    # assumed to be the text encoding. fmt_ordered, fmt_labeled and fieldsep
+    # describe the text encoding; they are passed on to fields_from_format and
+    # to the constructor of the bside, if there is one.
+    # Raises ValueError if the parsed fields contain a name that is neither a
+    # known attribute nor handled by a specialized setter.
+    def __init__(self, src, fmt_ordered = fmt_ordered_default,
+                            fmt_labeled = None,
+                            fieldsep = utils.fieldsep):
+        # source fields, exactly one will be set
+        self.json = None
+        self.raw = None
+        # flags
+        self.parsed = True
+        self.valid = True # only records broken pt right now (broken as in, no /)
+        # default values for all fields
+        self.__dict__[field_name] = ''
+        self.__dict__[field_rarity] = ''
+        self.__dict__[field_cost] = Manacost('')
+        self.__dict__[field_supertypes] = []
+        self.__dict__[field_types] = []
+        self.__dict__[field_subtypes] = []
+        self.__dict__[field_loyalty] = ''
+        self.__dict__[field_loyalty + '_value'] = None
+        self.__dict__[field_pt] = ''
+        self.__dict__[field_pt + '_p'] = None
+        self.__dict__[field_pt + '_p_value'] = None
+        self.__dict__[field_pt + '_t'] = None
+        self.__dict__[field_pt + '_t_value'] = None
+        self.__dict__[field_text] = Manatext('')
+        self.__dict__[field_text + '_lines'] = []
+        self.__dict__[field_text + '_words'] = []
+        self.__dict__[field_other] = []
+        self.bside = None
+        # format-independent view of processed input
+        self.fields = None # will be reset later
+
+        # looks like a json object
+        if isinstance(src, dict):
+            self.json = src
+            if utils.json_field_bside in src:
+                self.bside = Card(src[utils.json_field_bside],
+                                  fmt_ordered = fmt_ordered,
+                                  fmt_labeled = fmt_labeled,
+                                  fieldsep = fieldsep)
+            p_success, v_success, parsed_fields = fields_from_json(src)
+            self.parsed = p_success
+            self.valid = v_success
+            self.fields = parsed_fields
+        # otherwise assume text encoding
+        else:
+            self.raw = src
+            sides = src.split(utils.bsidesep)
+            if len(sides) > 1:
+                # everything after the first separator belongs to the bside
+                self.bside = Card(utils.bsidesep.join(sides[1:]),
+                                  fmt_ordered = fmt_ordered,
+                                  fmt_labeled = fmt_labeled,
+                                  fieldsep = fieldsep)
+            p_success, v_success, parsed_fields = fields_from_format(sides[0], fmt_ordered,
+                                                                     fmt_labeled, fieldsep)
+            self.parsed = p_success
+            self.valid = v_success
+            self.fields = parsed_fields
+        # amusingly enough, both encodings allow infinitely deep nesting of bsides...
+
+        # python name hackery
+        if self.fields:
+            for field in self.fields:
+                # Look for a specialized set function. Methods live on the class
+                # rather than in the instance __dict__, so use getattr to find them.
+                setter = getattr(self, '_set_' + field, None)
+                if setter is not None:
+                    setter(self.fields[field])
+                # otherwise use the default one
+                elif field in self.__dict__:
+                    self.set_field_default(field, self.fields[field])
+                # If we don't recognize the field, fail. This is a totally artificial
+                # limitation; if we just used the default handler for the else case,
+                # we could set arbitrarily named fields.
+                else:
+                    raise ValueError('python name mangling failure: unknown field for Card(): '
+                                     + field)
+        else:
+            # valid but not parsed indicates that the card was apparently empty
+            self.parsed = False
+
+    # These setters are invoked via name mangling, so they have to match
+    # the field names specified above to be used. Otherwise we just
+    # always fall back to the (uninteresting) default handler.
+
+    # Also note that all fields come wrapped in pairs, with the first member
+    # specifying the index the field was found at when parsing the card. These will
+    # all be -1 if the card was parsed from (unordered) json.
+
+    # Store the first value under the attribute named by field.
+    def set_field_default(self, field, values):
+        for idx, value in values:
+            self.__dict__[field] = value
+            break # only use the first one...
+
+    # Store the loyalty string, plus its integer value when it converts with int().
+    def _set_loyalty(self, values):
+        for idx, value in values:
+            self.__dict__[field_loyalty] = value
+            try:
+                self.__dict__[field_loyalty + '_value'] = int(value)
+            except ValueError:
+                self.__dict__[field_loyalty + '_value'] = None
+                # Technically '*' could still be valid, but it's unlikely...
+            break # only use the first one...
+
+    # Store the p/t string and split it into power and toughness, each kept
+    # both as a string and, when it converts with int(), as a number.
+    # Anything that does not split into exactly two parts marks the card invalid.
+    def _set_pt(self, values):
+        for idx, value in values:
+            self.__dict__[field_pt] = value
+            p_t = value.split('/') # hardcoded
+            if len(p_t) == 2:
+                self.__dict__[field_pt + '_p'] = p_t[0]
+                try:
+                    self.__dict__[field_pt + '_p_value'] = int(p_t[0])
+                except ValueError:
+                    self.__dict__[field_pt + '_p_value'] = None
+                self.__dict__[field_pt + '_t'] = p_t[1]
+                try:
+                    self.__dict__[field_pt + '_t_value'] = int(p_t[1])
+                except ValueError:
+                    self.__dict__[field_pt + '_t_value'] = None
+            else:
+                self.valid = False
+            break # only use the first one...
+
+    # Store the text and derive its lines (as Manatext objects) and its words.
+    def _set_text(self, values):
+        mtext = ''
+        # unlike the other setters, this keeps the last value rather than the first
+        for idx, value in values:
+            mtext = value
+        self.__dict__[field_text] = mtext
+        fulltext = mtext.encode()
+        if fulltext:
+            self.__dict__[field_text + '_lines'] = map(Manatext, fulltext.split(utils.newline))
+            self.__dict__[field_text + '_words'] = re.sub(utils.unletters_regex,
+                                                          ' ',
+                                                          fulltext).split()
+
+    def _set_other(self, values):
+        # just record these, we could do something to unset valid if we really wanted
+        for idx, value in values:
+            self.__dict__[field_other] += [(idx, value)]
+
+    # Output functions that produce various formats. encode() is specific to
+    # the NN representation, use str() or format() for output intended for human
+    # readers.
+
+    # Returns the fields named in fmt_ordered joined by fieldsep, followed by
+    # utils.bsidesep and the encoded bside if there is one.
+    #   fmt_labeled      - optional dict of field name to label; a label is
+    #                      prepended to each non-empty field it names
+    #   randomize_fields - shuffle the order of the fields
+    #   randomize_mana   - passed to the encode() of costs and text as randomize
+    #   initial_sep / final_sep - add an empty field at the start / end, so
+    #                      that the output begins / ends with fieldsep
+    # Raises ValueError if fmt_ordered names something that is not a field.
+    def encode(self, fmt_ordered = fmt_ordered_default,
+               fmt_labeled = None, fieldsep = utils.fieldsep,
+               randomize_fields = False, randomize_mana = False,
+               initial_sep = True, final_sep = True):
+        outfields = []
+
+        for field in fmt_ordered:
+            if field in self.__dict__:
+                if self.__dict__[field]:
+                    outfield = self.__dict__[field]
+                    # specialized field handling for the ones that aren't strings (sigh)
+                    if isinstance(outfield, list):
+                        outfield_str = ' '.join(outfield)
+                    elif isinstance(outfield, Manacost):
+                        outfield_str = outfield.encode(randomize = randomize_mana)
+                    elif isinstance(outfield, Manatext):
+                        outfield_str = outfield.encode(randomize = randomize_mana)
+                    else:
+                        outfield_str = outfield
+
+                    if fmt_labeled and field in fmt_labeled:
+                        outfield_str = fmt_labeled[field] + outfield_str
+
+                else:
+                    # empty fields still take up a slot in the output
+                    outfield_str = ''
+
+                outfields += [outfield_str]
+
+            else:
+                raise ValueError('unknown field for Card.encode(): ' + str(field))
+
+        if randomize_fields:
+            random.shuffle(outfields)
+        if initial_sep:
+            outfields = [''] + outfields
+        if final_sep:
+            outfields = outfields + ['']
+
+        outstr = fieldsep.join(outfields)
+
+        if self.bside:
+            outstr = (outstr + utils.bsidesep
+                      + self.bside.encode(fmt_ordered = fmt_ordered,
+                                          fmt_labeled = fmt_labeled,
+                                          fieldsep = fieldsep,
+                                          randomize_fields = randomize_fields,
+                                          randomize_mana = randomize_mana,
+                                          initial_sep = initial_sep, final_sep = final_sep))
+
+        return outstr
--- a/lib/config.py
+++ b/lib/config.py
@ -15,7 +15,8 @@ dash_marker = '~'
 bullet_marker = '='
 this_marker = '@'
 counter_marker = '%'
-reserved_marker = '\r'
+reserved_marker = '\v'
+# placeholder that Manatext puts in its text in place of each mana cost it
+# pulls out (read there as utils.reserved_mana_marker)
+reserved_mana_marker = '$'
 x_marker = 'X'
 tap_marker = 'T'
 untap_marker = 'Q'
@ -35,3 +36,19 @@ unary_exceptions = {
    100: 'one hundred',
    200: 'two hundred',
 }
+
+# field labels, to allow potential reordering of card format
+# Each field gets its own one-character label. cardlib reads these as
+# utils.field_label_* (presumably re-exported from here - confirm), and
+# Card.encode() writes a labeled field as label + value.
+field_label_name = '1'
+field_label_rarity = '2'
+field_label_cost = '3'
+field_label_supertypes = '4'
+field_label_types = '5'
+field_label_subtypes = '6'
+field_label_loyalty = '7'
+field_label_pt = '8'
+field_label_text = '9'
+# one left, could use for managing bsides
+
+# additional fields we add to the json cards
+# jdecode.mtg_open_json stores the bside card object under json_field_bside
+# on its aside, and the name of the containing set under json_field_set_name
+# on every card.
+json_field_bside = 'bside'
+json_field_set_name = 'setName'
--- a/lib/jdecode.py
+++ b/lib/jdecode.py
@ -0,0 +1,61 @@
+import json
+
+import config
+
+# Load the mtgjson file fname and return a dict that maps each lowercased
+# card name to the list of card objects with that name. Every card is tagged
+# with the name of its set, bsides are left out of the dict, and each bside
+# is attached to its aside instead. Prints a count when verbose is set.
+def mtg_open_json(fname, verbose = False):
+
+    with open(fname, 'r') as f:
+        jobj = json.load(f)
+
+    allcards = {}
+    asides = {}
+    bsides = {}
+
+    for set_key in jobj:
+        cardset = jobj[set_key]
+        setname = cardset['name']
+
+        for card in cardset['cards']:
+            card[config.json_field_set_name] = setname
+
+            cardnumber = card['number'] if 'number' in card else None
+            # the lower avoids duplication of at least one card (Will-o/O'-the-Wisp)
+            cardname = card['name'].lower()
+
+            # unique id: set code plus the card number, or the name if unnumbered
+            if cardnumber is None:
+                uid = cardset['code'] + '_' + cardname + '_'
+            else:
+                uid = cardset['code'] + '_' + cardnumber
+            suffix = uid[-1:]
+
+            # aggregate by name to avoid duplicates, not counting bsides
+            if suffix != 'b':
+                allcards.setdefault(cardname, []).append(card)
+
+            # also aggregate aside cards by uid so we can add bsides later
+            if suffix == 'a':
+                asides[uid] = card
+            elif suffix == 'b':
+                bsides[uid] = card
+
+    for uid, bside in bsides.items():
+        aside = asides.get(uid[:-1] + 'a')
+        if aside is None:
+            # this exposes some coldsnap theme deck bsides that aren't
+            # really bsides; shouldn't matter too much
+            continue
+        # the name check handles the brothers yamazaki edge case
+        if aside['name'] != bside['name']:
+            aside[config.json_field_bside] = bside
+
+    if verbose:
+        print 'Opened ' + str(len(allcards)) + ' uniquely named cards.'
+    return allcards
--- a/lib/manalib.py
+++ b/lib/manalib.py
@ -0,0 +1,173 @@
+# representation for mana costs and text with embedded mana costs
+# data aggregating classes
+import random
+import re
+
+import utils
+
+class Manacost:
+    '''mana cost representation with data'''
+
+    # Returns the distinct color characters of the symbols that occur in this
+    # cost, as a sorted string.
+    # hardcoded to be dependent on the symbol structure... ah well
+    def get_colors(self):
+        colors = ''
+        for sym in self.symbols:
+            if self.symbols[sym] > 0:
+                # drop the characters of a symbol that aren't colors
+                symcolors = re.sub(r'2|P|S|X', '', sym)
+                for symcolor in symcolors:
+                    if symcolor not in colors:
+                        colors += symcolor
+        # sort so the order is always consistent
+        return ''.join(sorted(colors))
+
+    # True if every character of symbolstring is one of this cost's colors.
+    def check_colors(self, symbolstring):
+        for sym in symbolstring:
+            if not sym in self.colors:
+                return False
+        return True
+
+    # src is a single mana cost. With fmt = 'json' it is uppercased and run
+    # through utils.mana_translate first, otherwise it is taken as already
+    # encoded. An empty cost sets the none flag; a cost that isn't wrapped in
+    # '{' and '}' is neither parsed nor valid; an unrecognized symbol inside
+    # the delimiters makes the cost invalid.
+    def __init__(self, src, fmt = ''):
+        # source fields, exactly one will be set
+        self.raw = None
+        self.json = None
+        # flags
+        self.parsed = True
+        self.valid = True
+        self.none = False
+        # default values for all fields
+        self.inner = None
+        self.cmc = 0
+        self.colorless = 0
+        self.sequence = []
+        self.symbols = {sym : 0 for sym in utils.mana_syms}
+        self.allsymbols = {sym : 0 for sym in utils.mana_symall}
+        self.colors = ''
+
+        if fmt == 'json':
+            self.json = src
+            text = utils.mana_translate(self.json.upper())
+        else:
+            self.raw = src
+            text = self.raw
+
+        if text == '':
+            self.inner = ''
+            self.none = True
+
+        elif not (len(text) >= 2 and text[0] == '{' and text[-1] == '}'):
+            self.parsed = False
+            self.valid = False
+
+        else:
+            self.inner = text[1:-1]
+
+            # structure mirrors the decoding in utils, but we pull out different data here
+            idx = 0
+            while idx < len(self.inner):
+                # taking this branch is an infinite loop if unary_marker is empty
+                if (len(utils.mana_unary_marker) > 0 and
+                    self.inner[idx:idx+len(utils.mana_unary_marker)] == utils.mana_unary_marker):
+                    idx += len(utils.mana_unary_marker)
+                    self.sequence += [utils.mana_unary_marker]
+                elif self.inner[idx:idx+len(utils.mana_unary_counter)] == utils.mana_unary_counter:
+                    # each unary counter is one colorless mana
+                    idx += len(utils.mana_unary_counter)
+                    self.sequence += [utils.mana_unary_counter]
+                    self.colorless += 1
+                    self.cmc += 1
+                else:
+                    old_idx = idx
+                    # try the possible symbol lengths, shortest first
+                    for symlen in range(utils.mana_symlen_min, utils.mana_symlen_max + 1):
+                        encoded_sym = self.inner[idx:idx+symlen]
+                        if encoded_sym in utils.mana_symall_decode:
+                            idx += symlen
+                            # leave the sequence encoded for convenience
+                            self.sequence += [encoded_sym]
+                            sym = utils.mana_symall_decode[encoded_sym]
+                            self.allsymbols[sym] += 1
+                            # alternate symbols are counted under utils.mana_alt(sym)
+                            if sym in utils.mana_symalt:
+                                self.symbols[utils.mana_alt(sym)] += 1
+                            else:
+                                self.symbols[sym] += 1
+                            # X adds nothing to the cmc, symbols containing
+                            # mana_2 add two, everything else adds one
+                            if sym == utils.mana_X:
+                                self.cmc += 0
+                            elif utils.mana_2 in sym:
+                                self.cmc += 2
+                            else:
+                                self.cmc += 1
+                            break
+                    # otherwise we'll go into an infinite loop if we see a symbol we don't know
+                    if idx == old_idx:
+                        idx += 1
+                        self.valid = False
+
+        self.colors = self.get_colors()
+
+    def __str__(self):
+        return utils.mana_untranslate(utils.mana_open_delimiter + ''.join(self.sequence)
+                                      + utils.mana_close_delimiter)
+
+    # Like str(), but hands for_forum on to utils.mana_untranslate.
+    def format(self, for_forum = False):
+        # for_forum used to be passed to ''.join(), which takes a single argument
+        # and raised a TypeError on every call.
+        # NOTE(review): assumes utils.mana_untranslate accepts for_forum as its
+        # second argument - confirm against utils.
+        return utils.mana_untranslate(utils.mana_open_delimiter + ''.join(self.sequence)
+                                      + utils.mana_close_delimiter, for_forum)
+
+    # Returns the encoded cost in its delimiters, or '' for an empty cost.
+    # With randomize set, the order of the symbols is shuffled.
+    def encode(self, randomize = False):
+        if self.none:
+            return ''
+        elif randomize:
+            # so this won't work very well if mana_unary_marker isn't empty
+            return (utils.mana_open_delimiter
+                    + ''.join(random.sample(self.sequence, len(self.sequence)))
+                    + utils.mana_close_delimiter)
+        else:
+            return utils.mana_open_delimiter + ''.join(self.sequence) + utils.mana_close_delimiter
+
+class Manatext:
+    '''text representation with embedded mana costs'''
+
+    # Pulls every mana cost out of src into self.costs (as Manacost objects)
+    # and leaves utils.reserved_mana_marker in self.text where each one was.
+    # fmt = 'json' selects the json cost syntax, anything else the encoded one.
+    # The text is invalid if any cost is, or if a delimiter is left behind.
+    def __init__(self, src, fmt = ''):
+        # source fields
+        self.raw = None
+        self.json = None
+        # flags
+        self.valid = True
+        # default values for all fields
+        self.text = src
+        self.costs = []
+
+        if fmt == 'json':
+            self.json = src
+            cost_regex = utils.mana_json_regex
+        else:
+            self.raw = src
+            cost_regex = utils.mana_regex
+
+        for manastr in re.findall(cost_regex, src):
+            cost = Manacost(manastr, fmt)
+            self.costs.append(cost)
+            if not cost.valid:
+                self.valid = False
+            self.text = self.text.replace(manastr, utils.reserved_mana_marker, 1)
+
+        # any delimiter still in the text belongs to a cost we failed to match
+        stray_delimiter = (utils.mana_open_delimiter in self.text
+                           or utils.mana_close_delimiter in self.text
+                           or utils.mana_json_open_delimiter in self.text
+                           or utils.mana_json_close_delimiter in self.text)
+        if stray_delimiter:
+            self.valid = False
+
+    # Puts the costs back into the text, one marker at a time and in order,
+    # using render to turn each cost into a string.
+    def _substitute(self, render):
+        filled = self.text
+        for cost in self.costs:
+            filled = filled.replace(utils.reserved_mana_marker, render(cost), 1)
+        return filled
+
+    def __str__(self):
+        return self._substitute(str)
+
+    def format(self, for_forum = False):
+        return self._substitute(lambda cost: cost.format(for_forum = for_forum))
+
+    def encode(self, randomize = False):
+        return self._substitute(lambda cost: cost.encode(randomize = randomize))
--- a/lib/transforms.py
+++ b/lib/transforms.py
@ -0,0 +1,390 @@
+# transform passes used to encode / decode cards
+import re
+
+# These could probably use a little love... They tend to hardcode in lots
+# of things very specific to the mtgjson format.
+
+import utils
+
+# Module-level aliases for the separator and marker constants defined in
+# utils, so the passes below can refer to them by their short names.
+cardsep = utils.cardsep
+fieldsep = utils.fieldsep
+bsidesep = utils.bsidesep
+newline = utils.newline
+dash_marker = utils.dash_marker
+bullet_marker = utils.bullet_marker
+this_marker = utils.this_marker
+counter_marker = utils.counter_marker
+reserved_marker = utils.reserved_marker
+x_marker = utils.x_marker
+tap_marker = utils.tap_marker
+untap_marker = utils.untap_marker
+counter_rename = utils.counter_rename
+unary_marker = utils.unary_marker
+unary_counter = utils.unary_counter
+
+
+# Name Passes.
+
+
+def name_pass_1_sanitize(s):
+    '''Clean up a card name: drop '!' and '?', turn '-' into dash_marker,
+    and spell out the few numbers that show up in names.
+
+    The substitutions run in the listed order, so the hyphen inside
+    'ninety-six' is introduced after the dash replacement and stays a
+    plain '-'.
+    '''
+    substitutions = (
+        ('!', ''),
+        ('?', ''),
+        ('-', dash_marker),
+        ('100,000', 'one hundred thousand'),
+        ('1,000', 'one thousand'),
+        ('1996', 'nineteen ninety-six'),
+    )
+    for old, new in substitutions:
+        s = s.replace(old, new)
+    return s
+
+
+# Text Passes.
+
+
+def text_pass_1_strip_rt(s):
+    '''Remove parenthesized (reminder) text from s.
+
+    The match is greedy and '.' does not cross newlines, so on each line
+    everything from the first '(' through the last ')' is removed.
+    '''
+    parenthesized = re.compile(r'\(.*\)')
+    return parenthesized.sub('', s)
+
+
+def text_pass_2_cardname(s, name):
+    '''Replace references to the card's own name in s with this_marker.
+
+    s is the card text and name is the card's name; the rewritten text is
+    returned. Besides the full name, the part of the name before the first
+    comma and a fixed list of nicknames are replaced as well.
+    '''
+    # Here are some fun edge cases, thanks to jml34 on the forum for
+    # pointing them out.
+    if name == 'sacrifice':
+        # only the first occurrence is treated as the card's name
+        return s.replace(name, this_marker, 1)
+    if name == 'fear':
+        return s
+
+    # Replacing an empty string inserts the marker between every character,
+    # so an empty name must not be passed to replace.
+    if name:
+        s = s.replace(name, this_marker)
+
+    # So, some legends don't use the full cardname in their text box...
+    # this check finds about 400 of them.
+    mininame, comma, _ = name.partition(',')
+    if comma and mininame:
+        s = s.replace(mininame, this_marker)
+
+    # A few others don't have a convenient comma to detect their nicknames,
+    # so we override them here.
+    overrides = [
+        # detectable by splitting on 'the', though that might cause other issues
+        'crovax',
+        'rashka',
+        'phage',
+        'shimatsu',
+        # random and arbitrary: they have a last name, 1996 world champion, etc.
+        'world champion',
+        'axelrod',
+        'hazezon',
+        'rubinia',
+        'rasputin',
+        'hivis',
+    ]
+
+    # note that these are applied to every text, whatever the card's name is
+    for override in overrides:
+        s = s.replace(override, this_marker)
+
+    return s
+
+
+def text_pass_3_unary(s):
+    '''Return utils.to_unary(s); this pass only delegates.
+
+    NOTE(review): presumably rewrites the numbers in s in unary notation;
+    confirm against utils.to_unary.
+    '''
+    return utils.to_unary(s)
+
+
+# Run only after doing unary conversion.
+def text_pass_4a_dashes(s):
+    '''Turn every '-' into dash_marker, except a '-' directly in front of
+    unary_marker, which is kept. 'level' ranges get special handling.
+    '''
+    minus_number = '-' + unary_marker
+
+    # park the minus signs we want to keep behind reserved_marker while
+    # every other dash is converted, then bring them back
+    s = s.replace(minus_number, reserved_marker)
+    s = s.replace('-', dash_marker)
+    s = s.replace(reserved_marker, minus_number)
+
+    # level up is annoying: the '-' in a 'level N-M' range was kept by the
+    # step above, and an open-ended 'level N+' uses a '+'; both become
+    # dash_marker
+    level_fixups = (
+        (r'level &\^*\-&', '-'),
+        (r'level &\^*\+', '+'),
+    )
+    for pattern, symbol in level_fixups:
+        for found in re.findall(pattern, s):
+            s = s.replace(found, found.replace(symbol, dash_marker))
+
+    # and we still have the ~x issue
+    return s
+
+
+# Run this after fixing dashes, because this unbreaks the ~x issue.
+# Also probably don't run this on names, there are a few names with x~ in them.
+def text_pass_4b_x(s):
+    '''Replace the variable 'x' with x_marker in the contexts listed below.
+
+    A dash_marker followed by 'x' becomes '-' followed by x_marker. The
+    substitutions are applied in order.
+    '''
+    substitutions = (
+        (dash_marker + 'x', '-' + x_marker),
+        ('+x', '+' + x_marker),
+        (' x ', ' ' + x_marker + ' '),
+        ('x:', x_marker + ':'),
+        ('x~', x_marker + '~'),
+        ('x.', x_marker + '.'),
+        ('x,', x_marker + ','),
+        ('x/x', x_marker + '/' + x_marker),
+    )
+    for old, new in substitutions:
+        s = s.replace(old, new)
+    return s
+
+
+# Call this before replacing newlines.
+# This one ends up being really bad because of the confusion
+# with 'counter target spell or ability'.
+def text_pass_5_counters(s):
+    '''Replace named counters in s with counter_marker + ' counter'.
+
+    If exactly one kind of counter was found, a line of the form
+    'countertype <counter_marker> <kind>' is put in front of the text.
+    '''
+    # The big dictionary of counter names. 'age counter', 'net counter' and
+    # 'ore counter' are deliberately missing here: they are suffixes of
+    # other names and are handled separately afterwards.
+    counter_names = [
+        'time counter',
+        'devotion counter',
+        'charge counter',
+        'ki counter',
+        'matrix counter',
+        'spore counter',
+        'poison counter',
+        'quest counter',
+        'hatchling counter',
+        'storage counter',
+        'growth counter',
+        'paralyzation counter',
+        'energy counter',
+        'study counter',
+        'glyph counter',
+        'depletion counter',
+        'sleight counter',
+        'loyalty counter',
+        'hoofprint counter',
+        'wage counter',
+        'echo counter',
+        'lore counter',
+        'page counter',
+        'divinity counter',
+        'mannequin counter',
+        'ice counter',
+        'fade counter',
+        'pain counter',
+        'gold counter',
+        'muster counter',
+        'infection counter',
+        'plague counter',
+        'fate counter',
+        'slime counter',
+        'shell counter',
+        'credit counter',
+        'despair counter',
+        'globe counter',
+        'currency counter',
+        'blood counter',
+        'soot counter',
+        'carrion counter',
+        'fuse counter',
+        'filibuster counter',
+        'wind counter',
+        'hourglass counter',
+        'trap counter',
+        'corpse counter',
+        'awakening counter',
+        'verse counter',
+        'scream counter',
+        'doom counter',
+        'luck counter',
+        'intervention counter',
+        'eyeball counter',
+        'flood counter',
+        'eon counter',
+        'death counter',
+        'delay counter',
+        'blaze counter',
+        'magnet counter',
+        'feather counter',
+        'shield counter',
+        'wish counter',
+        'petal counter',
+        'music counter',
+        'pressure counter',
+        'manifestation counter',
+        'velocity counter',
+        'vitality counter',
+        'treasure counter',
+        'pin counter',
+        'bounty counter',
+        'rust counter',
+        'mire counter',
+        'tower counter',
+        'cube counter',
+        'strife counter',
+        'elixir counter',
+        'hunger counter',
+        'level counter',
+        'winch counter',
+        'fungus counter',
+        'training counter',
+        'theft counter',
+        'arrowhead counter',
+        'sleep counter',
+        'healing counter',
+        'mining counter',
+        'dream counter',
+        'aim counter',
+        'arrow counter',
+        'javelin counter',
+        'gem counter',
+        'bribery counter',
+        'mine counter',
+        'omen counter',
+        'phylactery counter',
+        'tide counter',
+        'polyp counter',
+        'petrification counter',
+        'shred counter',
+        'pupa counter',
+    ]
+    # oh god some of the counter names are suffixes of others...
+    suffix_names = [
+        'age counter',
+        'net counter',
+        'ore counter',
+    ]
+
+    generic = counter_marker + ' counter'
+    found = []
+
+    for label in counter_names:
+        if label in s:
+            found.append(label)
+            s = s.replace(label, generic)
+
+    for label in suffix_names:
+        # SUPER HACKY fix for doubling season: leave the text alone
+        # whenever it talks about 'more counter(s)'
+        if 'more counter' in s:
+            continue
+        if label in s:
+            found.append(label)
+            s = s.replace(label, generic)
+
+    # miraculously, texts with more than one kind of counter don't seem
+    # to happen
+
+    # we haven't done newline replacement yet, so use actual newlines
+    if len(found) == 1:
+        # and yeah, this line of code can blow up in all kinds of different ways
+        kind = found[0].split()[0]
+        s = 'countertype ' + counter_marker + ' ' + kind + '\n' + s
+
+    return s
+
+
+# The word 'counter' is confusing when used to refer to what we do to spells
+# and sometimes abilities to make them not happen. Let's rename that.
+# Call this after doing the counter replacement to simplify the regexes.
+def text_pass_6_uncast(s):
+    '''Rename the verb 'counter' in s to counter_rename and return the result.
+
+    counter_rename is the module-level alias of utils.counter_rename; this
+    pass no longer overrides it with a hard-coded literal, so the value
+    configured there is the one that gets used.
+
+    Only the phrasings listed below are rewritten; by manual inspection of
+    a few dozen texts, it looks like this about covers it.
+    '''
+    rewrites = (
+        ('counter target ', counter_rename + ' target '),
+        ('counter a ', counter_rename + ' a '),
+        ('counter all ', counter_rename + ' all '),
+        ('counters a ', counter_rename + 's a '),
+        # 'countered' collapses to the bare rename (this could get weird in
+        # terms of englishing the word)
+        ('countered', counter_rename),
+        ('counter that ', counter_rename + ' that '),
+        ('counter @', counter_rename + ' @'),
+        # this one is tricky: 'counter it' is only rewritten after a comma
+        (', counter it', ', ' + counter_rename + ' it'),
+        # it happens at least once, thanks wizards!
+        ('counter the ', counter_rename + ' the '),
+        ('counter up to ', counter_rename + ' up to '),
+    )
+    for old, new in rewrites:
+        s = s.replace(old, new)
+    return s
+    
+
+# Run after fixing dashes, it makes the regexes better, but before replacing newlines.
+# NOTE(review): the patterns below match the literal \u2014 and \u2022
+# characters, so they must still be present in s at this point; confirm the
+# intended ordering against the caller.
+def text_pass_7_choice(s):
+    '''Collapse modal 'choose ...' blocks into a single bracketed line.
+
+    A 'choose ... \u2014' header followed by bullet lines becomes
+    '[<unary count> <bullet> ... <bullet> ...]', where the count is
+    unary_marker followed by unary_counter repeated count times.
+    '''
+    def bracket_choices(text, prefix, count):
+        header = unary_marker + (unary_counter * count)
+        matches = re.findall(ur'(' + prefix + ur'\n?(\u2022.*(\n|$))+)', text)
+        for match in matches:
+            # group 0 of each match is the whole header-plus-bullets block
+            original = match[0]
+            flattened = original.replace(prefix, header).replace('\n', ' ')
+            if flattened.endswith(' '):
+                # the block ended with a newline: keep it after the bracket
+                bracketed = '[' + flattened[:-1] + ']\n'
+            else:
+                bracketed = '[' + flattened + ']'
+            text = text.replace(original, bracketed)
+        return text
+
+    # 'one or both' and 'one or more' are both encoded with a count of zero
+    headers = (
+        (ur'choose one \u2014', 1),
+        (ur'choose one \u2014 ', 1), # ty Promise of Power
+        (ur'choose two \u2014', 2),
+        (ur'choose one or both \u2014', 0),
+        (ur'choose one or more \u2014', 0),
+    )
+    for prefix, count in headers:
+        s = bracket_choices(s, prefix, count)
+
+    return s
+
+
+# do before removing newlines
+# might as well do this after countertype because we probably care more about
+# the location of the equip cost
+def text_pass_8_equip(s):
+    '''Move the equip clause of s, if there is exactly one, to the first line.
+
+    Two forms are handled in turn: 'equip <json mana cost>' at the very end
+    of the text, and 'equip\u2014<other cost>' running to the end of its
+    line. The rewritten text is returned; it is unchanged when neither form
+    occurs exactly once.
+    '''
+    def hoist(equip, rest):
+        # Put equip (minus one trailing space) on its own line before rest.
+        if equip[-1:] == ' ':
+            equip = equip[:-1]
+        if rest == '':
+            return equip
+        return equip + '\n' + rest
+
+    equips = re.findall(r'equip ' + utils.mana_json_regex + r'.?$', s)
+    # there don't seem to be any cases with more than one
+    if len(equips) == 1:
+        equip = equips[0]
+        s = s.replace('\n' + equip, '')
+        s = s.replace(equip, '')
+        s = hoist(equip, s)
+
+    nonmana = re.findall(ur'(equip\u2014.*(\n|$))', s)
+    if len(nonmana) == 1:
+        equip = nonmana[0][0]
+        # When the clause is not on the last line the match includes the
+        # newline that ends it. Cutting that newline out together with the
+        # one in front would glue the neighbouring lines together, and
+        # re-adding it below would leave a blank line, so set it aside.
+        if equip.endswith('\n'):
+            equip = equip[:-1]
+
+        if '\n' + equip in s:
+            # clause starts a line: remove it with the newline before it
+            s = s.replace('\n' + equip, '')
+        elif s.startswith(equip + '\n'):
+            # clause is the first line: remove it with the newline after it
+            s = s[len(equip) + 1:]
+        else:
+            s = s.replace(equip, '')
+        s = hoist(equip, s)
+
+    return s
+
+
+def text_pass_9_newlines(s):
+    '''Return s with every newline replaced by a single backslash.'''
+    lines = s.split('\n')
+    return '\\'.join(lines)
+
+
+def text_pass_10_symbols(s):
+    '''Return utils.to_symbols(s); this pass only delegates.
+
+    NOTE(review): presumably rewrites json-style symbols in s into their
+    encoded markers; confirm against utils.to_symbols.
+    '''
+    return utils.to_symbols(s)
--- a/lib/utils.py
+++ b/lib/utils.py
@ -18,6 +18,7 @@ bullet_marker = config.bullet_marker
 this_marker = config.this_marker
 counter_marker = config.counter_marker
 reserved_marker = config.reserved_marker
+reserved_mana_marker = config.reserved_mana_marker
 x_marker = config.x_marker
 tap_marker = config.tap_marker
 untap_marker = config.untap_marker
@ -25,6 +26,21 @@ untap_marker = config.untap_marker
 # unambiguous synonyms
 counter_rename = config.counter_rename

+# field labels
+field_label_name = config.field_label_name
+field_label_rarity = config.field_label_rarity
+field_label_cost = config.field_label_cost
+field_label_supertypes = config.field_label_supertypes
+field_label_types = config.field_label_types
+field_label_subtypes = config.field_label_subtypes
+field_label_loyalty = config.field_label_loyalty
+field_label_pt = config.field_label_pt
+field_label_text = config.field_label_text
+
+# additional fields we add to the json cards
+json_field_bside = config.json_field_bside
+json_field_set_name = config.json_field_set_name
+
 # unicode / ascii conversion
 unicode_trans = {
    u'\u2014' : dash_marker, # unicode long dash
@ -297,16 +313,16 @@ mana_regex = (re.escape(mana_open_delimiter) + '['
              + ']*' + re.escape(mana_close_delimiter))

 # as a special case, we let unary or decimal numbers exist in json mana strings
-mana_jcharset_special = '0123456789' + mana_unary_marker + mana_unary_counter
-mana_jcharset_strict = unique_string(''.join(mana_symall_jdecode) + mana_jcharset_special)
-mana_jcharset = unique_string(mana_jcharset_strict + mana_jcharset_strict.lower())
+mana_json_charset_special = ('0123456789' + unary_marker + unary_counter)
+mana_json_charset_strict = unique_string(''.join(mana_symall_jdecode) + mana_json_charset_special)
+mana_json_charset = unique_string(mana_json_charset_strict + mana_json_charset_strict.lower())

 # note that json mana strings can't be empty between the delimiters
-mana_jregex_strict = (re.escape(mana_json_open_delimiter) + '['
-                     + re.escape(mana_jcharset_strict) 
+mana_json_regex_strict = (re.escape(mana_json_open_delimiter) + '['
+                     + re.escape(mana_json_charset_strict) 
                     + ']+' + re.escape(mana_json_close_delimiter))
-mana_jregex = (re.escape(mana_json_open_delimiter) + '['
-               + re.escape(mana_jcharset)
+mana_json_regex = (re.escape(mana_json_open_delimiter) + '['
+               + re.escape(mana_json_charset)
               + ']+' + re.escape(mana_json_close_delimiter))

 number_decimal_regex = r'[0123456789]+'
@ -322,7 +338,7 @@ def mana_translate(jmanastr):
    for n in sorted(re.findall(mana_unary_regex, manastr),
                    lambda x,y: cmp(len(x), len(y)), reverse = True):
        ns = re.findall(number_unary_regex, n)
-        i = (len(ns[0]) - len(mana_unary_marker)) / len(mana_unary_counter)
+        i = (len(ns[0]) - len(unary_marker)) / len(unary_counter)
        manastr = manastr.replace(n, mana_unary_marker + mana_unary_counter * i)
    for n in sorted(re.findall(mana_decimal_regex, manastr),
                        lambda x,y: cmp(len(x), len(y)), reverse = True):
@ -381,7 +397,7 @@ def mana_untranslate(manastr, for_forum = False):
 # finally, replacing all instances in a string
 # notice the calls to .upper(), this way we recognize lowercase symbols as well just in case
 def to_mana(s):
-    jmanastrs = re.findall(mana_jregex, s)
+    jmanastrs = re.findall(mana_json_regex, s)
    for jmanastr in sorted(jmanastrs, lambda x,y: cmp(len(x), len(y)), reverse = True):
        s = s.replace(jmanastr, mana_translate(jmanastr.upper()))
    return s
@ -434,3 +450,6 @@ def from_symbols(s, for_forum = False):
    for symstr in sorted(symstrs, lambda x,y: cmp(len(x), len(y)), reverse = True):
        s = s.replace(symstr, symbol_trans[symstr])
    return s
+
+# matches any single character that is not a lowercase ascii letter or an
+# apostrophe
+unletters_regex = r"[^abcdefghijklmnopqrstuvwxyz']"
+
--- a/scripts/randomize_mana.py
+++ b/scripts/randomize_mana.py
@ -15,11 +15,13 @@ def main(fname, oname = None, verbose = True):
    datamine.analyze(cardtexts)

    multicards = []
-    reps = 5
+    reps = 10

    for card in datamine.cards:
        for i in range(reps):
            multicards += [card.reencode(randomize = True)]
+            # multicards += [card.reencode(randomize = True) 
+            #                + card.cost.reencode(randomize = True) + utils.fieldsep]
            
    random.shuffle(multicards)