From 336ac2701c14f21cdca83356113e7fd7a95d54b6 Mon Sep 17 00:00:00 2001 From: Bill Zorn Date: Sat, 4 Jul 2015 02:39:41 -0700 Subject: [PATCH] significantly enhanced datamining code, maybe it even does something --- datamine.py | 321 +++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 304 insertions(+), 17 deletions(-) diff --git a/datamine.py b/datamine.py index c4fd164..5c98d4c 100644 --- a/datamine.py +++ b/datamine.py @@ -2,20 +2,242 @@ import re import codecs import sys +# grab the characters we need out of encode +# really we should probably make a settings file with that stuff in it +import encode + + +# global helpers +def prettymana(s, for_forum): + # this agorithm is pretty generic, intended fro use on Manacost.symbols keys + if len(s) == 1: + if for_forum: + return '[mana]' + s + '[/mana]' + else: + return '{' + s + '}' + elif len(s) == 2: + if for_forum: + return '[mana]{' + s[0] + '/' + s[1] + '}[/mana]' + else: + return '{' + s[0] + '/' + s[1] + '}' + +punctuation_chars = r'[+\-*",.:;WUBRGPV/XTQ|\\&^\{\}@ \n=~%\[\]]' +creature_keywords = [ + # evergreen + 'deathtouch', + 'defender', + 'double strike', + 'first strike', + 'flash', + 'flying', + 'haste', + 'hexproof', + 'indestructible', + 'lifelink', + 'menace', + 'prowess', + 'reach', + 'trample', + 'vigilance', + # no longer evergreen + 'banding', + 'fear', + 'shroud', + 'intimidate', + # rare ones that work the same way and interfere + 'rampage', + 'infect', + 'bushido', + 'exalted', + 'shadow', +] # there are other keywords out there, these are just easy to detect + +# data aggregating classes +class Manacost: + '''mana cost representation with data''' + + def get_colors(self): + colors = '' + for sym in self.symbols: + if self.symbols[sym] > 0: + symcolors = re.sub(r'2|P|S|X', '', sym) + for symcolor in symcolors: + if symcolor not in colors: + colors += symcolor + return colors + + def check_colors(self, symbolstring): + for sym in symbolstring: + if not sym in self.colors: + return False + return True + + def __init__(self, text): + self.raw = text + self.cmc = 0 + self.colorless = 0 + self.sequence = [] + self.symbols = { + 'W' : 0, # single color + 'U' : 0, + 'B' : 0, + 'R' : 0, + 'G' : 0, + 'P' : 0, # colorless phyrexian + 'S' : 0, # snow + 'X' : 0, # number of x symbols + 'WP' : 0, # single color phyrexian + 'UP' : 0, + 'BP' : 0, + 'RP' : 0, + 'GP' : 0, + '2W' : 0, # single color hybrid + '2U' : 0, + '2B' : 0, + '2R' : 0, + '2G' : 0, + 'WU' : 0, # dual color hybrid + 'WB' : 0, + 'RW' : 0, + 'GW' : 0, + 'UB' : 0, + 'UR' : 0, + 'GU' : 0, + 'BR' : 0, + 'BG' : 0, + 'RG' : 0, + } + + if text == '': + self._parsed = True + self._valid = True + self.none = True + self.inner = '' + + elif not (len(self.raw) >= 2 and self.raw[0] == '{' and self.raw[-1] == '}'): + self._parsed = False + self._valid = False + self.none = False + + else: + self._parsed = True + self._valid = True + self.none = False + self.inner = self.raw[1:-1] + + trans_decode = { + 'WW' : 'W', + 'UU' : 'U', + 'BB' : 'B', + 'RR' : 'R', + 'GG' : 'G', + 'PP' : 'P', + 'SS' : 'S', + 'XX' : 'X', + 'WP' : 'WP', + 'UP' : 'UP', + 'BP' : 'BP', + 'RP' : 'RP', + 'GP' : 'GP', + 'VW' : '2W', + 'VU' : '2U', + 'VB' : '2B', + 'VR' : '2R', + 'VG' : '2G', + 'WU' : 'WU', + 'WB' : 'WB', + 'RW' : 'RW', + 'GW' : 'GW', + 'UB' : 'UB', + 'UR' : 'UR', + 'GU' : 'GU', + 'BR' : 'BR', + 'BG' : 'BG', + 'RG' : 'RG', + } + + # read the symbols in a loop + inner_current = self.inner + while len(inner_current) > 0: + # look for counters to get the colorless cost + if inner_current[:1] == encode.unary_counter: + self.colorless += 1 + self.cmc += 1 + inner_current = inner_current[1:] + # or look for symbols to read + elif inner_current[:2] in trans_decode: + sym = trans_decode[inner_current[:2]] + self.sequence += [sym] + self.symbols[sym] += 1 + if sym == 'X': + self.cmc += 0 + elif sym[:1] == '2': + self.cmc += 2 + else: + self.cmc += 1 + inner_current = inner_current[2:] + # if we don't recognize the symbol, bail out + else: + self._valid = False + break + + self.colors = self.get_colors() + + def __str__(self): + if self.colorless == 0 and self.sequence == []: + return '{0}' + else: + if self.colorless > 0: + colorless_part = '{' + str(self.colorless) + '}' + else: + colorless_part = '' + return colorless_part + ''.join(map(lambda s: prettymana(s, False), self.sequence)) + + def format(self, for_forum): + if self.colorless == 0 and self.sequence == []: + if for_forum: + return '[mana]0[/mana]' + else: + return '{0}' + else: + if self.colorless > 0: + if for_forum: + colorless_part = '[mana]{' + str(self.colorless) + '}[/mana]' + else: + colorless_part = '{' + str(self.colorless) + '}' + else: + colorless_part = '' + return colorless_part + ''.join(map(lambda s: prettymana(s, for_forum), self.sequence)) + class Card: '''card representation with data''' def __init__(self, text): self.raw = text - fields = self.raw.split('|') + self._parsed = True + self._valid = True + + if '\n' in self.raw: + halves = self.raw.split('\n') + if not len(halves) == 2: + self._parsed = False + self._valid = False + self.fields = halves + return + else: + self.raw = halves[0] + self.bside = Card(halves[1]) + if not self.bside._valid: + self._valid = False + else: + self.bside = None + + fields = self.raw.split(encode.fieldsep) if not len(fields) == 10: self._parsed = False self._valid = False self.fields = fields else: - self._parsed = True - self._valid = True - if not fields[1] == '': self.name = fields[1] else: @@ -25,12 +247,12 @@ class Card: if not fields[2] == '': self.supertypes = fields[2].split(' ') else: - self.supertypes = None + self.supertypes = [] if not fields[3] == '': self.types = fields[3].split(' ') else: - self.types = '' + self.types = [] self._valid = False if not fields[4] == '': @@ -47,8 +269,13 @@ class Card: if not fields[5] == '': self.subtypes = fields[5].split(' ') + if 'creature' in self.types: + self.creaturetypes = self.subtypes + else: + self.creaturetypes = [] else: - self.subtypes = None + self.subtypes = [] + self.creaturetypes = [] if not fields[6] == '': self.pt = fields[6] @@ -77,17 +304,47 @@ class Card: self.toughness = None self.toughness_value = None - if not fields[7] == '': - self.cost = fields[7] - else: - self.cost = None - + # if there's no cost (lands) then cost.none will be True + self.cost = Manacost(fields[7]) + if not fields[8] == '': self.text = fields[8] + self.text_lines = self.text.split(encode.newline) + self.text_words = re.sub(punctuation_chars, ' ', self.text).split() + self.creature_words = [] + # SUPER HACK + if 'creature' in self.types: + for line in self.text_lines: + guess = [] + for keyword in creature_keywords: + if keyword in line: + guess += [keyword] + line = line.replace(keyword, '') + if re.sub(punctuation_chars, ' ', line).split() == [] or 'protect' in line: + for word in guess: + if word not in self.creature_words: + self.creature_words += [word] else: self.text = None + self.text_lines = [] + self.text_words = [] + self.creature_words = [] -def main(fname, oname = None, verbose = True): + def __str__(self): + return ''.join([ + encode.fieldsep, + self.name, + encode.fieldsep, + (' ' + encode.dash_marker + ' ').join([' '.join(self.supertypes + self.types), + ' '.join(self.subtypes)]), + encode.fieldsep, + str(self.cost.cmc) if self.cost.colors == '' + else str(self.cost.cmc) + ', ' + self.cost.colors, + encode.fieldsep, + ]) + + +def main(fname, oname = None, verbose = False): if verbose: print 'Opening encoded card file: ' + fname @@ -99,15 +356,45 @@ def main(fname, oname = None, verbose = True): cardtexts = text.split('\n\n')[1:-1] cards = [] + creatures = 0 + cwords = 0 + allwords = {} + + mcolor = 'G' + i = 0 for cardtext in cardtexts: - cards += [Card(cardtext)] i += 1 - if i >= 5: - break + card = Card(cardtext) + if not (card._parsed and card._valid): + print card.raw + continue + cards += [card] - print cards + if 'creature' in card.types: + creatures += 1 + if card.creature_words: + cwords += 1 + if card.cost.check_colors(mcolor): + print ' '.join(card.text_words) + + for word in card.text_words: + if word in allwords: + allwords[word] += 1 + else: + allwords[word] = 1 + + + # print str(creatures) + ' creatures, ' + str(cwords) + ' with keywords' + # print str(len(allwords)) + ' unique words in card text' + # i = 0 + # for word in sorted(allwords, key=allwords.get, reverse=True): + # i += 1 + # if i > 0: + # break + # print word + ': ' + str(allwords[word]) + if __name__ == '__main__': import sys