significantly enhanced datamining code, maybe it even does something

This commit is contained in:
Bill Zorn 2015-07-04 02:39:41 -07:00
parent 3223618873
commit 336ac2701c
1 changed files with 304 additions and 17 deletions

View File

@ -2,20 +2,242 @@ import re
import codecs
import sys
# grab the characters we need out of encode
# really we should probably make a settings file with that stuff in it
import encode
# global helpers
def prettymana(s, for_forum):
# this agorithm is pretty generic, intended fro use on Manacost.symbols keys
if len(s) == 1:
if for_forum:
return '[mana]' + s + '[/mana]'
else:
return '{' + s + '}'
elif len(s) == 2:
if for_forum:
return '[mana]{' + s[0] + '/' + s[1] + '}[/mana]'
else:
return '{' + s[0] + '/' + s[1] + '}'
punctuation_chars = r'[+\-*",.:;WUBRGPV/XTQ|\\&^\{\}@ \n=~%\[\]]'
creature_keywords = [
# evergreen
'deathtouch',
'defender',
'double strike',
'first strike',
'flash',
'flying',
'haste',
'hexproof',
'indestructible',
'lifelink',
'menace',
'prowess',
'reach',
'trample',
'vigilance',
# no longer evergreen
'banding',
'fear',
'shroud',
'intimidate',
# rare ones that work the same way and interfere
'rampage',
'infect',
'bushido',
'exalted',
'shadow',
] # there are other keywords out there, these are just easy to detect
# data aggregating classes
class Manacost:
'''mana cost representation with data'''
def get_colors(self):
colors = ''
for sym in self.symbols:
if self.symbols[sym] > 0:
symcolors = re.sub(r'2|P|S|X', '', sym)
for symcolor in symcolors:
if symcolor not in colors:
colors += symcolor
return colors
def check_colors(self, symbolstring):
for sym in symbolstring:
if not sym in self.colors:
return False
return True
def __init__(self, text):
self.raw = text
self.cmc = 0
self.colorless = 0
self.sequence = []
self.symbols = {
'W' : 0, # single color
'U' : 0,
'B' : 0,
'R' : 0,
'G' : 0,
'P' : 0, # colorless phyrexian
'S' : 0, # snow
'X' : 0, # number of x symbols
'WP' : 0, # single color phyrexian
'UP' : 0,
'BP' : 0,
'RP' : 0,
'GP' : 0,
'2W' : 0, # single color hybrid
'2U' : 0,
'2B' : 0,
'2R' : 0,
'2G' : 0,
'WU' : 0, # dual color hybrid
'WB' : 0,
'RW' : 0,
'GW' : 0,
'UB' : 0,
'UR' : 0,
'GU' : 0,
'BR' : 0,
'BG' : 0,
'RG' : 0,
}
if text == '':
self._parsed = True
self._valid = True
self.none = True
self.inner = ''
elif not (len(self.raw) >= 2 and self.raw[0] == '{' and self.raw[-1] == '}'):
self._parsed = False
self._valid = False
self.none = False
else:
self._parsed = True
self._valid = True
self.none = False
self.inner = self.raw[1:-1]
trans_decode = {
'WW' : 'W',
'UU' : 'U',
'BB' : 'B',
'RR' : 'R',
'GG' : 'G',
'PP' : 'P',
'SS' : 'S',
'XX' : 'X',
'WP' : 'WP',
'UP' : 'UP',
'BP' : 'BP',
'RP' : 'RP',
'GP' : 'GP',
'VW' : '2W',
'VU' : '2U',
'VB' : '2B',
'VR' : '2R',
'VG' : '2G',
'WU' : 'WU',
'WB' : 'WB',
'RW' : 'RW',
'GW' : 'GW',
'UB' : 'UB',
'UR' : 'UR',
'GU' : 'GU',
'BR' : 'BR',
'BG' : 'BG',
'RG' : 'RG',
}
# read the symbols in a loop
inner_current = self.inner
while len(inner_current) > 0:
# look for counters to get the colorless cost
if inner_current[:1] == encode.unary_counter:
self.colorless += 1
self.cmc += 1
inner_current = inner_current[1:]
# or look for symbols to read
elif inner_current[:2] in trans_decode:
sym = trans_decode[inner_current[:2]]
self.sequence += [sym]
self.symbols[sym] += 1
if sym == 'X':
self.cmc += 0
elif sym[:1] == '2':
self.cmc += 2
else:
self.cmc += 1
inner_current = inner_current[2:]
# if we don't recognize the symbol, bail out
else:
self._valid = False
break
self.colors = self.get_colors()
def __str__(self):
if self.colorless == 0 and self.sequence == []:
return '{0}'
else:
if self.colorless > 0:
colorless_part = '{' + str(self.colorless) + '}'
else:
colorless_part = ''
return colorless_part + ''.join(map(lambda s: prettymana(s, False), self.sequence))
def format(self, for_forum):
if self.colorless == 0 and self.sequence == []:
if for_forum:
return '[mana]0[/mana]'
else:
return '{0}'
else:
if self.colorless > 0:
if for_forum:
colorless_part = '[mana]{' + str(self.colorless) + '}[/mana]'
else:
colorless_part = '{' + str(self.colorless) + '}'
else:
colorless_part = ''
return colorless_part + ''.join(map(lambda s: prettymana(s, for_forum), self.sequence))
class Card:
'''card representation with data'''
def __init__(self, text):
self.raw = text
fields = self.raw.split('|')
self._parsed = True
self._valid = True
if '\n' in self.raw:
halves = self.raw.split('\n')
if not len(halves) == 2:
self._parsed = False
self._valid = False
self.fields = halves
return
else:
self.raw = halves[0]
self.bside = Card(halves[1])
if not self.bside._valid:
self._valid = False
else:
self.bside = None
fields = self.raw.split(encode.fieldsep)
if not len(fields) == 10:
self._parsed = False
self._valid = False
self.fields = fields
else:
self._parsed = True
self._valid = True
if not fields[1] == '':
self.name = fields[1]
else:
@ -25,12 +247,12 @@ class Card:
if not fields[2] == '':
self.supertypes = fields[2].split(' ')
else:
self.supertypes = None
self.supertypes = []
if not fields[3] == '':
self.types = fields[3].split(' ')
else:
self.types = ''
self.types = []
self._valid = False
if not fields[4] == '':
@ -47,8 +269,13 @@ class Card:
if not fields[5] == '':
self.subtypes = fields[5].split(' ')
if 'creature' in self.types:
self.creaturetypes = self.subtypes
else:
self.creaturetypes = []
else:
self.subtypes = None
self.subtypes = []
self.creaturetypes = []
if not fields[6] == '':
self.pt = fields[6]
@ -77,17 +304,47 @@ class Card:
self.toughness = None
self.toughness_value = None
if not fields[7] == '':
self.cost = fields[7]
else:
self.cost = None
# if there's no cost (lands) then cost.none will be True
self.cost = Manacost(fields[7])
if not fields[8] == '':
self.text = fields[8]
self.text_lines = self.text.split(encode.newline)
self.text_words = re.sub(punctuation_chars, ' ', self.text).split()
self.creature_words = []
# SUPER HACK
if 'creature' in self.types:
for line in self.text_lines:
guess = []
for keyword in creature_keywords:
if keyword in line:
guess += [keyword]
line = line.replace(keyword, '')
if re.sub(punctuation_chars, ' ', line).split() == [] or 'protect' in line:
for word in guess:
if word not in self.creature_words:
self.creature_words += [word]
else:
self.text = None
self.text_lines = []
self.text_words = []
self.creature_words = []
def main(fname, oname = None, verbose = True):
def __str__(self):
return ''.join([
encode.fieldsep,
self.name,
encode.fieldsep,
(' ' + encode.dash_marker + ' ').join([' '.join(self.supertypes + self.types),
' '.join(self.subtypes)]),
encode.fieldsep,
str(self.cost.cmc) if self.cost.colors == ''
else str(self.cost.cmc) + ', ' + self.cost.colors,
encode.fieldsep,
])
def main(fname, oname = None, verbose = False):
if verbose:
print 'Opening encoded card file: ' + fname
@ -99,15 +356,45 @@ def main(fname, oname = None, verbose = True):
cardtexts = text.split('\n\n')[1:-1]
cards = []
creatures = 0
cwords = 0
allwords = {}
mcolor = 'G'
i = 0
for cardtext in cardtexts:
cards += [Card(cardtext)]
i += 1
if i >= 5:
break
card = Card(cardtext)
if not (card._parsed and card._valid):
print card.raw
continue
cards += [card]
print cards
if 'creature' in card.types:
creatures += 1
if card.creature_words:
cwords += 1
if card.cost.check_colors(mcolor):
print ' '.join(card.text_words)
for word in card.text_words:
if word in allwords:
allwords[word] += 1
else:
allwords[word] = 1
# print str(creatures) + ' creatures, ' + str(cwords) + ' with keywords'
# print str(len(allwords)) + ' unique words in card text'
# i = 0
# for word in sorted(allwords, key=allwords.get, reverse=True):
# i += 1
# if i > 0:
# break
# print word + ': ' + str(allwords[word])
if __name__ == '__main__':
import sys