Updated to new mtgjson version and added legacy support for formats without a rarity field.
This commit is contained in:
Bill Zorn 2015-08-01 13:26:03 -07:00
parent f0e631e015
commit 758f48b790
5 changed files with 42511 additions and 41737 deletions

Binary file not shown.

3
data/cbow.sh Executable file
View file

@ -0,0 +1,3 @@
#!/bin/bash
# Run word2vec over cbow.txt with the CBOW option switched on (-cbow 1) and
# write the result to cbow.bin with binary output enabled (-binary 1).
# Both file names are relative, so they resolve against the directory the
# script is invoked from. One option per line so individual settings are easy
# to spot and change.
word2vec \
    -train cbow.txt \
    -output cbow.bin \
    -cbow 1 \
    -size 200 \
    -window 8 \
    -negative 25 \
    -hs 0 \
    -sample 1e-4 \
    -threads 8 \
    -binary 1 \
    -iter 15

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -10,12 +10,27 @@ import cardlib
from cbow import CBOW from cbow import CBOW
def main(fname, oname = None, verbose = True, def main(fname, oname = None, verbose = True,
gatherer = False, for_forum = False, creativity = False): gatherer = False, for_forum = False, creativity = False, norarity = False):
cards = [] cards = []
valid = 0 valid = 0
invalid = 0 invalid = 0
unparsed = 0 unparsed = 0
if norarity:
decode_fields = [
cardlib.field_name,
cardlib.field_supertypes,
cardlib.field_types,
cardlib.field_loyalty,
cardlib.field_subtypes,
#cardlib.field_rarity,
cardlib.field_pt,
cardlib.field_cost,
cardlib.field_text,
]
else:
decode_fields = cardlib.fmt_ordered_default
if fname[-5:] == '.json': if fname[-5:] == '.json':
if verbose: if verbose:
print 'This looks like a json file: ' + fname print 'This looks like a json file: ' + fname
@ -23,7 +38,7 @@ def main(fname, oname = None, verbose = True,
for json_cardname in sorted(json_srcs): for json_cardname in sorted(json_srcs):
if len(json_srcs[json_cardname]) > 0: if len(json_srcs[json_cardname]) > 0:
jcards = json_srcs[json_cardname] jcards = json_srcs[json_cardname]
card = cardlib.Card(json_srcs[json_cardname][0]) card = cardlib.Card(json_srcs[json_cardname][0], fmt_ordered = decode_fields)
if card.valid: if card.valid:
valid += 1 valid += 1
elif card.parsed: elif card.parsed:
@ -40,7 +55,7 @@ def main(fname, oname = None, verbose = True,
text = f.read() text = f.read()
for card_src in text.split(utils.cardsep): for card_src in text.split(utils.cardsep):
if card_src: if card_src:
card = cardlib.Card(card_src) card = cardlib.Card(card_src, fmt_ordered = decode_fields)
if card.valid: if card.valid:
valid += 1 valid += 1
elif card.parsed: elif card.parsed:
@ -53,6 +68,20 @@ def main(fname, oname = None, verbose = True,
print (str(valid) + ' valid, ' + str(invalid) + ' invalid, ' print (str(valid) + ' valid, ' + str(invalid) + ' invalid, '
+ str(unparsed) + ' failed to parse.') + str(unparsed) + ' failed to parse.')
good_count = 0
bad_count = 0
for card in cards:
if not card.parsed and not card.text.text:
bad_count += 1
else:
good_count += 1
if good_count + bad_count > 15:
break
# random heuristic
if bad_count > 10:
print 'Saw a bunch of unparsed cards with no text:'
print 'If this is a legacy format, try rerunning with --norarity'
if creativity: if creativity:
cbow = CBOW() cbow = CBOW()
@ -92,10 +121,13 @@ if __name__ == '__main__':
help='use pretty mana encoding for mtgsalvation forum') help='use pretty mana encoding for mtgsalvation forum')
parser.add_argument('-c', '--creativity', action='store_true', parser.add_argument('-c', '--creativity', action='store_true',
help='use CBOW fuzzy matching to check creativity of cards') help='use CBOW fuzzy matching to check creativity of cards')
parser.add_argument('--norarity', action='store_true',
help='the card format has no rarity field; use for legacy input')
parser.add_argument('-v', '--verbose', action='store_true', parser.add_argument('-v', '--verbose', action='store_true',
help='verbose output') help='verbose output')
args = parser.parse_args() args = parser.parse_args()
main(args.infile, args.outfile, verbose = args.verbose, main(args.infile, args.outfile, verbose = args.verbose,
gatherer = args.gatherer, for_forum = args.forum, creativity = args.creativity) gatherer = args.gatherer, for_forum = args.forum, creativity = args.creativity,
norarity = args.norarity)
exit(0) exit(0)