various improvements, added cbow

This commit is contained in:
Bill Zorn 2015-07-29 01:21:34 -07:00
parent 08dc3944f8
commit 49e386ac4f
9 changed files with 29818 additions and 34 deletions

BIN
data/cbow.bin Normal file

Binary file not shown.

29488
data/cbow.txt Normal file

File diff suppressed because it is too large Load diff

View file

@ -6529,7 +6529,7 @@
|sigil of the new dawn||enchantment||||{^^^WW}|whenever a creature is put into your graveyard from the battlefield, you may pay {^WW}. if you do, return that card to your hand.|
|benalish commander||creature||human soldier|*/*|{^^^WW}|countertype % time\@'s power and toughness are each equal to the number of soldiers you control.\suspend x~{XXWWWW}. X can't be &. \whenever a % counter is removed from @ while it's exiled, put a &^/&^ white soldier creature token onto the battlefield.|
|benalish commander||creature||human soldier|*/*|{^^^WW}|countertype % time\@'s power and toughness are each equal to the number of soldiers you control.\suspend X~{XXWWWW}. X can't be &. \whenever a % counter is removed from @ while it's exiled, put a &^/&^ white soldier creature token onto the battlefield.|
|deluge||instant||||{^^UU}|tap all creatures without flying.|
@ -8148,7 +8148,7 @@
|ventifact bottle||artifact||||{^^^}|countertype % charge\{XX^}, T: put X % counters on @. activate this ability only any time you could cast a sorcery.\at the beginning of your precombat main phase, if @ has a % counter on it, tap it and remove all % counters from it. add {^} to your mana pool for each % counter removed this way.|
|aeon chronicler||creature||avatar|*/*|{^^^UUUU}|countertype % time\@'s power and toughness are each equal to the number of cards in your hand.\suspend x~{XX^^^UU}. X can't be &. \whenever a % counter is removed from @ while it's exiled, draw a card.|
|aeon chronicler||creature||avatar|*/*|{^^^UUUU}|countertype % time\@'s power and toughness are each equal to the number of cards in your hand.\suspend X~{XX^^^UU}. X can't be &. \whenever a % counter is removed from @ while it's exiled, draw a card.|
|brine seer||creature||human wizard|&^/&^|{^^^UU}|{^^UU}, T: reveal any number of blue cards in your hand. uncast target spell unless its controller pays {^} for each card revealed this way.|
@ -9290,7 +9290,7 @@
|gaea's liege||creature||avatar|*/*|{^^^GGGGGG}|as long as @ isn't attacking, its power and toughness are each equal to the number of forests you control. as long as @ is attacking, its power and toughness are each equal to the number of forests defending player controls.\T: target land becomes a forest until @ leaves the battlefield.|
|swell of courage||instant||||{^^^WWWW}|creatures you control get +&^^/+&^^ until end of turn.\reinforce x~{XXWWWW}|
|swell of courage||instant||||{^^^WWWW}|creatures you control get +&^^/+&^^ until end of turn.\reinforce X~{XXWWWW}|
|battlegrace angel||creature||angel|&^^^^/&^^^^|{^^^WWWW}|flying\exalted \whenever a creature you control attacks alone, it gains lifelink until end of turn.|
@ -9919,7 +9919,7 @@
|bellowing saddlebrute||creature||orc warrior|&^^^^/&^^^^^|{^^^BB}|raid ~ when @ enters the battlefield, you lose &^^^^ life unless you attacked with a creature this turn.|
|detritivore||creature||lhurgoyf|*/*|{^^RRRR}|countertype % time\@'s power and toughness are each equal to the number of nonbasic land cards in your opponents' graveyards.\suspend x~{XX^^^RR}. X can't be &. \whenever a % counter is removed from @ while it's exiled, destroy target nonbasic land.|
|detritivore||creature||lhurgoyf|*/*|{^^RRRR}|countertype % time\@'s power and toughness are each equal to the number of nonbasic land cards in your opponents' graveyards.\suspend X~{XX^^^RR}. X can't be &. \whenever a % counter is removed from @ while it's exiled, destroy target nonbasic land.|
|gruul signet||artifact||||{^^}|{^}, T: add {RRGG} to your mana pool.|
@ -17008,7 +17008,7 @@
|viashino grappler||creature||viashino|&^^^/&^|{^^RR}|{GG}: @ gains trample until end of turn.|
|fungal behemoth||creature||fungus|*/*|{^^^GG}|countertype % time\@'s power and toughness are each equal to the number of +&^/+&^ counters on creatures you control.\suspend x~{XXGGGG}. X can't be &. \whenever a % counter is removed from @ while it's exiled, you may put a +&^/+&^ counter on target creature.|
|fungal behemoth||creature||fungus|*/*|{^^^GG}|countertype % time\@'s power and toughness are each equal to the number of +&^/+&^ counters on creatures you control.\suspend X~{XXGGGG}. X can't be &. \whenever a % counter is removed from @ while it's exiled, you may put a +&^/+&^ counter on target creature.|
|defang||enchantment||aura||{^WW}|enchant creature\prevent all damage that would be dealt by enchanted creature.|
@ -18757,7 +18757,7 @@
|arcane lighthouse||land|||||T: add {^} to your mana pool.\{^}, T: until end of turn, creatures your opponents control lose hexproof and shroud and can't have hexproof or shroud.|
|roiling horror||creature||horror|*/*|{^^^BBBB}|countertype % time\@'s power and toughness are each equal to your life total minus the life total of an opponent with the most life.\suspend x~{XXBBBBBB}. X can't be &. \whenever a % counter is removed from @ while it's exiled, target player loses &^ life and you gain &^ life.|
|roiling horror||creature||horror|*/*|{^^^BBBB}|countertype % time\@'s power and toughness are each equal to your life total minus the life total of an opponent with the most life.\suspend X~{XXBBBBBB}. X can't be &. \whenever a % counter is removed from @ while it's exiled, target player loses &^ life and you gain &^ life.|
|illusionary terrain||enchantment||||{UUUU}|cumulative upkeep {^^} \as @ enters the battlefield, choose two basic land types.\basic lands of the first chosen type are the second chosen type.|

View file

@ -7,8 +7,10 @@ sys.path.append(libdir)
import utils
import jdecode
import cardlib
from cbow import CBOW
def main(fname, oname = None, verbose = True, gatherer = False, for_forum = False):
def main(fname, oname = None, verbose = True,
gatherer = False, for_forum = False, creativity = False):
cards = []
valid = 0
invalid = 0
@ -51,17 +53,28 @@ def main(fname, oname = None, verbose = True, gatherer = False, for_forum = Fals
print (str(valid) + ' valid, ' + str(invalid) + ' invalid, '
+ str(unparsed) + ' failed to parse.')
if creativity:
cbow = CBOW()
def writecards(writer):
for card in cards:
writer.write((card.format(gatherer = gatherer, for_forum = for_forum)).encode('utf-8'))
if creativity:
writer.write('~~ closest cards ~~\n'.encode('utf-8'))
nearest = cbow.nearest(card)
for dist, cardname in nearest:
if for_forum:
cardname = '[card]' + cardname + '[/card]'
writer.write((cardname + ': ' + str(dist) + '\n').encode('utf-8'))
writer.write('\n'.encode('utf-8'))
if oname:
if verbose:
print 'Writing output to: ' + oname
with open(oname, 'w') as ofile:
for card in cards:
ofile.write((card.format(gatherer = gatherer, for_forum = for_forum)
+ '\n').encode('utf-8'))
writecards(ofile)
else:
for card in cards:
sys.stdout.write((card.format(gatherer = gatherer, for_forum = for_forum)
+ '\n').encode('utf-8'))
writecards(sys.stdout)
sys.stdout.flush()
@ -77,10 +90,12 @@ if __name__ == '__main__':
help='emulate Gatherer visual spoiler')
parser.add_argument('-f', '--forum', action='store_true',
help='use pretty mana encoding for mtgsalvation forum')
parser.add_argument('-c', '--creativity', action='store_true',
help='use CBOW fuzzy matching to check creativity of cards')
parser.add_argument('-v', '--verbose', action='store_true',
help='verbose output')
args = parser.parse_args()
main(args.infile, args.outfile, verbose = args.verbose,
gatherer = args.gatherer, for_forum = args.forum)
gatherer = args.gatherer, for_forum = args.forum, creativity = args.creativity)
exit(0)

View file

@ -29,7 +29,9 @@ def main(fname, oname = None, verbose = True, dupes = 0, encoding = 'std', stabl
final_sep = True
# set the properties of the encoding
if encoding in ['std']:
if encoding in ['vec']:
pass
elif encoding in ['std']:
if dupes == 0:
dupes = 1
elif encoding in ['rmana']:
@ -125,22 +127,12 @@ def main(fname, oname = None, verbose = True, dupes = 0, encoding = 'std', stabl
random.seed(1371367)
random.shuffle(cards)
if oname:
if verbose:
print 'Writing output to: ' + oname
with open(oname, 'w') as ofile:
for card in cards:
ofile.write(card.encode(fmt_ordered = fmt_ordered,
fmt_labeled = fmt_labeled,
fieldsep = fieldsep,
randomize_fields = randomize_fields,
randomize_mana = randomize_mana,
initial_sep = initial_sep,
final_sep = final_sep)
+ utils.cardsep)
else:
def writecards(writer):
for card in cards:
sys.stdout.write(card.encode(fmt_ordered = fmt_ordered,
if encoding in ['vec']:
writer.write(card.vectorize() + '\n\n')
else:
writer.write(card.encode(fmt_ordered = fmt_ordered,
fmt_labeled = fmt_labeled,
fieldsep = fieldsep,
randomize_fields = randomize_fields,
@ -148,6 +140,14 @@ def main(fname, oname = None, verbose = True, dupes = 0, encoding = 'std', stabl
initial_sep = initial_sep,
final_sep = final_sep)
+ utils.cardsep)
if oname:
if verbose:
print 'Writing output to: ' + oname
with open(oname, 'w') as ofile:
writecards(ofile)
else:
writecards(sys.stdout)
sys.stdout.flush()
@ -162,7 +162,7 @@ if __name__ == '__main__':
parser.add_argument('-d', '--duplicate', metavar='N', type=int, default=0,
help='number of times to duplicate each card')
parser.add_argument('-e', '--encoding', default='std',
choices=['std', 'rmana', 'rmana_dual', 'rfields'])
choices=['std', 'rmana', 'rmana_dual', 'rfields', 'vec'])
parser.add_argument('-s', '--stable', action='store_true',
help="don't randomize the order of the cards")
parser.add_argument('-v', '--verbose', action='store_true',

View file

@ -6,6 +6,32 @@ import utils
import transforms
from manalib import Manacost, Manatext
# Some text prettification stuff that people may not have installed
def titlecase(s):
    """Fallback title-casing built on str.title().

    Only used when the optional third-party `titlecase` package is missing.
    """
    return s.title()

try:
    # Prefer the third-party implementation whenever it is installed; it
    # replaces the fallback defined above.
    from titlecase import titlecase
except ImportError:
    # Optional dependency not available: keep the str.title() fallback.
    pass
try:
    import textwrap
    import nltk.data
    # nltk.data.load raises LookupError (not ImportError) when nltk itself is
    # installed but the punkt tokenizer model has not been downloaded, so
    # both exceptions must lead to the fallback below.
    sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

    def sentencecase(s):
        """Capitalize the first letter of every sentence in encoded card text.

        This is invoked as an unpass (between the symbol and cardname
        unpasses), so newlines are still encoded as utils.newline here and
        the text is processed one encoded line at a time. Empty lines are
        dropped.
        """
        # Swap x markers for the reserved marker while capitalizing and swap
        # them back afterwards -- presumably so that str.capitalize(), which
        # lowercases everything after the first character, cannot touch them.
        s = s.replace(utils.x_marker, utils.reserved_marker)
        clines = []
        for line in s.split(utils.newline):
            if line:
                sentences = sent_tokenizer.tokenize(line)
                clines.append(' '.join(sent.capitalize() for sent in sentences))
        cased = utils.newline.join(clines)
        return cased.replace(utils.reserved_marker, utils.x_marker)
except (ImportError, LookupError):
    def sentencecase(s):
        """Fallback used when nltk or its punkt model is unavailable.

        Returns the text unchanged.
        """
        return s
# These are used later to determine what the fields of the Card object are called.
# Define them here because they have nothing to do with the actual format.
field_name = 'name'
@ -495,7 +521,7 @@ class Card:
def format(self, gatherer = False, for_forum = False):
outstr = ''
if gatherer:
cardname = self.__dict__[field_name].title()
cardname = titlecase(self.__dict__[field_name])
if not cardname:
cardname = '_NONAME_'
if for_forum:
@ -516,11 +542,11 @@ class Card:
outstr += '\n'
basetypes = self.__dict__[field_types]
basetypes = map(str.capitalize, self.__dict__[field_types])
if len(basetypes) < 1:
basetypes = ['_NOTYPE_']
outstr += ' '.join(self.__dict__[field_supertypes] + basetypes)
outstr += ' '.join(map(str.capitalize, self.__dict__[field_supertypes]) + basetypes)
if self.__dict__[field_subtypes]:
outstr += (' ' + utils.dash_marker + ' ' +
@ -540,6 +566,7 @@ class Card:
mtext = transforms.text_unpass_2_counters(mtext)
mtext = transforms.text_unpass_3_unary(mtext)
mtext = transforms.text_unpass_4_symbols(mtext, for_forum)
mtext = sentencecase(mtext)
mtext = transforms.text_unpass_5_cardname(mtext, cardname)
mtext = transforms.text_unpass_6_newlines(mtext)
newtext = Manatext('')
@ -615,3 +642,41 @@ class Card:
outstr += self.bside.format(gatherer = gatherer, for_forum = for_forum)
return outstr
def vectorize(self):
    """Flatten this card into a single space-separated token string.

    Tokens appear in this order, each group only when present: rarity in
    parentheses, the delimited mana cost, supertypes and types (each in
    parentheses), bare subtypes, the two halves of power/toughness (each
    in parentheses), loyalty in double parentheses, and finally the
    vectorized rules text. For a card with a b-side, the result is
    '_ASIDE_ <this side>', a blank line, then '_BSIDE_ <other side>'.
    """
    def parenthesize(token):
        return '(' + token + ')'

    fields = self.__dict__
    chunks = []

    rarity = fields[field_rarity]
    if rarity:
        chunks.append(parenthesize(rarity))

    coststr = fields[field_cost].vectorize(delimit = True)
    if coststr:
        chunks.append(coststr)

    all_types = fields[field_supertypes] + fields[field_types]
    typestr = ' '.join([parenthesize(t) for t in all_types])
    if typestr:
        chunks.append(typestr)

    subtypes = fields[field_subtypes]
    if subtypes:
        chunks.append(' '.join(subtypes))

    pt = fields[field_pt]
    if pt:
        # 'a/b' is split into the two tokens 'a/' and '/b'
        halves = pt.replace('/', '/ /').split()
        chunks.append(' '.join([parenthesize(h) for h in halves]))

    loyalty = fields[field_loyalty]
    if loyalty:
        chunks.append('((' + loyalty + '))')

    # The rules text always comes last, even when it is empty, so every
    # preceding chunk is followed by exactly one space.
    chunks.append(fields[field_text].vectorize())
    outstr = ' '.join(chunks)

    if self.bside:
        outstr = '_ASIDE_ ' + outstr + '\n\n_BSIDE_ ' + self.bside.vectorize()
    return outstr

181
lib/cbow.py Normal file
View file

@ -0,0 +1,181 @@
# Infinite thanks to Talcos from the mtgsalvation forums, who among
# many, many other things wrote the original version of this code.
# I have merely ported it to fit my needs.
import re
import sys
import subprocess
import os
import struct
import math
import utils
import cardlib
import transforms
# NOTE: running the similarity comparisons in parallel with joblib would be
# nice, but doing it naively makes things worse, so it is left disabled:
# from joblib import Parallel, delayed
# import multiprocessing
# Directory containing this module, and the 'data' directory next to it
# (resolved relative to this file rather than the working directory).
libdir = os.path.dirname(os.path.realpath(__file__))
datadir = os.path.realpath(os.path.join(libdir, '../data'))
# Multithreading control parameters, disabled together with joblib above:
# cores = multiprocessing.cpu_count()
# segments = cores / 2 if cores / 2 > 0 else 1
# Maximum number of characters kept per vocabulary entry; longer words are
# truncated to this length.
max_w = 50

#### snip! ####

def read_vector_file(fname):
    """Load a word2vec-style binary vector file.

    The file starts with a text header line holding the number of words and
    the vector size. It is followed by one record per word: the word, a
    single space, then `size` native 4-byte floats. Newline bytes in front
    of (or inside) a word are skipped.

    Returns a tuple (vocab, M) where vocab is the list of words and M is a
    parallel list of vectors (lists of floats) scaled to unit length. A
    vector whose length is zero is returned unscaled instead of dividing by
    zero.

    Raises ValueError if the header is malformed and struct.error if a
    vector record is truncated.
    """
    with open(fname, 'rb') as f:
        # Parse the whole header line instead of assuming fixed-width
        # fields, so any vocabulary size / vector size works.
        header = f.readline().split()
        if len(header) != 2:
            raise ValueError('malformed vector file header in ' + fname)
        words = int(header[0])
        size = int(header[1])
        record = struct.Struct('%df' % size)

        vocab = []
        M = []
        for _ in range(words):
            chars = []
            while True:
                c = f.read(1)
                # stop at the separating space, or at end of file
                if not c or c == b' ':
                    break
                if c != b'\n' and len(chars) < max_w:
                    chars.append(c)
            word = b''.join(chars)
            if not isinstance(word, str):
                # bytes and str are distinct types here (Python 3): decode so
                # that the vocabulary can be compared against text tokens.
                word = word.decode('utf-8', 'replace')
            # One entry per record, so vocab always stays aligned with M.
            vocab.append(word)

            tmp = record.unpack(f.read(record.size))
            length = math.sqrt(sum(x * x for x in tmp))
            if length:
                M.append([x / length for x in tmp])
            else:
                M.append(list(tmp))
    return (vocab, M)
def makevector(vocabulary,vecs,sequence):
    """Build a unit-length vector for a whitespace-separated token sequence.

    vocabulary -- sequence of words, parallel to vecs
    vecs       -- one vector (list of floats) per vocabulary entry
    sequence   -- string of tokens separated by whitespace

    The vectors of all tokens found in the vocabulary are summed (tokens
    that are not in the vocabulary are ignored) and the sum is scaled to
    unit length. If no token is known, or the sum has zero length, a zero
    vector of the model's dimension is returned. The input vectors are
    never modified.
    """
    # Map each word to its first position (the same entry list.index()
    # would pick) so every token costs one hash lookup, not a linear scan.
    positions = {}
    for i, word in enumerate(vocabulary):
        positions.setdefault(word, i)

    dims = len(vecs[0]) if vecs else 0
    # Start from a fresh zero vector so the result never aliases an entry
    # of vecs (normalizing in place would corrupt the model).
    res = [0.0] * dims
    for word in sequence.split():
        i = positions.get(word)
        if i is None:
            continue
        res = [x + y for x, y in zip(res, vecs[i])]

    length = math.sqrt(sum(x * x for x in res))
    if length:
        res = [x / length for x in res]
    return res
#### !snip ####
try:
    import numpy

    def cosine_similarity(v1,v2):
        """Return the cosine similarity of v1 and v2.

        Computed as (v1 dot v2) / (||v1|| * ||v2||). If either vector has
        zero magnitude the similarity is defined as 0 rather than dividing
        by zero.
        """
        a = numpy.asarray(v1, dtype=float)
        b = numpy.asarray(v2, dtype=float)
        norm = numpy.sqrt(numpy.dot(a, a)) * numpy.sqrt(numpy.dot(b, b))
        if norm == 0:
            return 0.0
        return numpy.dot(a, b) / norm
except ImportError:
    def cosine_similarity(v1,v2):
        """Return the cosine similarity of v1 and v2 (pure-Python fallback).

        Computed as (v1 dot v2) / (||v1|| * ||v2||). If either vector has
        zero magnitude the similarity is defined as 0, matching the numpy
        implementation.
        """
        sumxx, sumxy, sumyy = 0.0, 0.0, 0.0
        for x, y in zip(v1, v2):
            sumxx += x * x
            sumyy += y * y
            sumxy += x * y
        norm = math.sqrt(sumxx * sumyy)
        if norm == 0:
            return 0.0
        return sumxy / norm
def cosine_similarity_name(cardvec, v, name):
    """Pair the similarity of cardvec and v with the given name.

    Returns a (similarity, name) tuple, so a list of results sorts by
    similarity first.
    """
    score = cosine_similarity(cardvec, v)
    return score, name
class CBOW:
    """Nearest-card lookup backed by a CBOW word-vector model.

    On construction, the word vectors are read from vector_fname and one
    unit vector is built for every encoded card found in card_fname.
    nearest() then ranks those cards by cosine similarity to a query.
    """

    def __init__(self, verbose = True,
                 vector_fname = os.path.join(datadir, 'cbow.bin'),
                 card_fname = os.path.join(datadir, 'output.txt')):
        """Load the vector model and vectorize every card in card_fname.

        verbose      -- print progress and summary statistics
        vector_fname -- binary word-vector file (see read_vector_file)
        card_fname   -- encoded cards separated by utils.cardsep
        """
        # Single-argument print(...) produces the same output whether it is
        # parsed as a print statement or as a function call.
        self.verbose = verbose
        self.cardvecs = []

        if self.verbose:
            print('Building a cbow model...')
            print(' Reading binary vector data from: ' + vector_fname)
        (vocab, vecs) = read_vector_file(vector_fname)
        self.vocab = vocab
        self.vecs = vecs

        if self.verbose:
            print(' Reading encoded cards from: ' + card_fname)
            print(' They\'d better be in the same order as the file used to build the vector model!')
        with open(card_fname, 'rt') as f:
            text = f.read()
        for card_src in text.split(utils.cardsep):
            if card_src:
                card = cardlib.Card(card_src)
                cardvec = makevector(self.vocab, self.vecs, card.vectorize())
                self.cardvecs.append((card.name, cardvec))

        # NOTE: a joblib Parallel pool for the comparisons in nearest() is
        # intentionally not created here.

        if self.verbose:
            print('... Done.')
            print(' vocab size: ' + str(len(self.vocab)))
            print(' raw vecs: ' + str(len(self.vecs)))
            print(' card vecs: ' + str(len(self.cardvecs)))

    def nearest(self, card, n=5):
        """Return the n known cards most similar to card.

        card -- a cardlib.Card, or a string that is already vectorized
        n    -- number of matches to return per card side

        Returns a list of (similarity, name) tuples, best match first. For
        a card with a b-side, the n best matches of the b-side are appended
        after those of the front side. An empty query yields an empty list.
        """
        is_card = isinstance(card, cardlib.Card)
        if is_card:
            # vectorize() puts the b-side after a blank line; keep only the
            # front side here, the b-side is handled by the recursion below.
            words = card.vectorize().split('\n\n')[0]
        else:
            # assume it's a string (that's already a vector)
            words = card

        if not words:
            return []

        cardvec = makevector(self.vocab, self.vecs, words)
        comparisons = [cosine_similarity_name(cardvec, v, name)
                       for (name, v) in self.cardvecs]
        comparisons.sort(reverse = True)
        comp_n = comparisons[:n]

        if is_card and card.bside:
            # Pass n along so the b-side honours the caller's limit instead
            # of silently falling back to the default.
            comp_n += self.nearest(card.bside, n)
        return comp_n

View file

@ -126,6 +126,18 @@ class Manacost:
else:
return utils.mana_open_delimiter + ''.join(self.sequence) + utils.mana_close_delimiter
def vectorize(self, delimit = False):
    """Return the cost as space-separated symbols, one token per symbol.

    A cost flagged as none yields the empty string. With delimit set,
    every symbol is wrapped in parentheses.
    """
    if self.none:
        return ''
    ld, rd = ('(', ')') if delimit else ('', '')
    return ' '.join([ld + symbol + rd for symbol in self.sequence])
class Manatext:
'''text representation with embedded mana costs'''
@ -176,3 +188,25 @@ class Manatext:
for cost in self.costs:
text = text.replace(utils.reserved_mana_marker, cost.encode(randomize = randomize), 1)
return text
def vectorize(self):
    """Return the text as a normalized, space-separated token string.

    Every special marker and punctuation character listed below is
    surrounded by spaces so it becomes its own token, each '/' is
    duplicated so both sides of a slash keep one, each embedded mana
    placeholder is replaced (in order) by the vectorized form of the
    corresponding cost, and finally all runs of whitespace are collapsed
    to single spaces.
    """
    text = self.text
    # utils.x_marker is not isolated: it was already disabled in this list.
    # utils.newline is listed once; a repeated entry would only add
    # padding that the final whitespace collapse removes again.
    special_chars = [utils.reserved_mana_marker,
                     utils.dash_marker,
                     utils.bullet_marker,
                     utils.this_marker,
                     utils.counter_marker,
                     utils.choice_open_delimiter,
                     utils.choice_close_delimiter,
                     utils.newline,
                     utils.tap_marker,
                     utils.untap_marker,
                     ';', ':', '"', ',', '.']
    for char in special_chars:
        text = text.replace(char, ' ' + char + ' ')
    text = text.replace('/', '/ /')
    # Placeholders are consumed left to right, one per cost.
    for cost in self.costs:
        text = text.replace(utils.reserved_mana_marker, cost.vectorize(), 1)
    return ' '.join(text.split())

View file

@ -121,6 +121,7 @@ def text_pass_4b_x(s):
s = s.replace(' x ', ' ' + x_marker + ' ')
s = s.replace('x:', x_marker + ':')
s = s.replace('x~', x_marker + '~')
s = s.replace(u'x\u2014', x_marker + u'\u2014')
s = s.replace('x.', x_marker + '.')
s = s.replace('x,', x_marker + ',')
s = s.replace('x/x', x_marker + '/' + x_marker)