first version of something that torch-rnn can use with streaming

This commit is contained in:
Bill Zorn 2016-05-09 02:13:17 -07:00
parent d4b5ef2104
commit 0132345ebe
5 changed files with 203 additions and 3 deletions

1
data/mtgvocab.json Normal file
View File

@ -0,0 +1 @@
{"idx_to_token": {"1": "\n", "2": " ", "3": "\"", "4": "%", "5": "&", "6": "'", "7": "*", "8": "+", "9": ",", "10": "-", "11": ".", "12": "/", "13": "0", "14": "1", "15": "2", "16": "3", "17": "4", "18": "5", "19": "6", "20": "7", "21": "8", "22": "9", "23": ":", "24": "=", "25": "@", "26": "A", "27": "B", "28": "C", "29": "E", "30": "G", "31": "L", "32": "N", "33": "O", "34": "P", "35": "Q", "36": "R", "37": "S", "38": "T", "39": "U", "40": "W", "41": "X", "42": "Y", "43": "[", "44": "\\", "45": "]", "46": "^", "47": "a", "48": "b", "49": "c", "50": "d", "51": "e", "52": "f", "53": "g", "54": "h", "55": "i", "56": "j", "57": "k", "58": "l", "59": "m", "60": "n", "61": "o", "62": "p", "63": "q", "64": "r", "65": "s", "66": "t", "67": "u", "68": "v", "69": "w", "70": "x", "71": "y", "72": "z", "73": "{", "74": "|", "75": "}", "76": "~"}, "token_to_idx": {"\n": 1, " ": 2, "\"": 3, "%": 4, "'": 6, "&": 5, "+": 8, "*": 7, "-": 10, ",": 9, "/": 12, ".": 11, "1": 14, "0": 13, "3": 16, "2": 15, "5": 18, "4": 17, "7": 20, "6": 19, "9": 22, "8": 21, ":": 23, "=": 24, "A": 26, "@": 25, "C": 28, "B": 27, "E": 29, "G": 30, "L": 31, "O": 33, "N": 32, "Q": 35, "P": 34, "S": 37, "R": 36, "U": 39, "T": 38, "W": 40, "Y": 42, "X": 41, "[": 43, "]": 45, "\\": 44, "^": 46, "a": 47, "c": 49, "b": 48, "e": 51, "d": 50, "g": 53, "f": 52, "i": 55, "h": 54, "k": 57, "j": 56, "m": 59, "l": 58, "o": 61, "n": 60, "q": 63, "p": 62, "s": 65, "r": 64, "u": 67, "t": 66, "w": 69, "v": 68, "y": 71, "x": 70, "{": 73, "z": 72, "}": 75, "|": 74, "~": 76}}

View File

@ -565,9 +565,9 @@ class Card:
# the NN representation, use str() or format() for output intended for human
# readers.
def encode(self, fmt_ordered = fmt_ordered_default, fmt_labeled = None,
fieldsep = utils.fieldsep, randomize_fields = False, randomize_mana = False,
initial_sep = True, final_sep = True):
def encode(self, fmt_ordered = fmt_ordered_default, fmt_labeled = fmt_labeled_default,
fieldsep = utils.fieldsep, initial_sep = True, final_sep = True,
randomize_fields = False, randomize_mana = False, randomize_lines = False):
outfields = []
for field in fmt_ordered:
@ -581,6 +581,8 @@ class Card:
outfield_str = outfield.encode(randomize = randomize_mana)
elif isinstance(outfield, Manatext):
outfield_str = outfield.encode(randomize = randomize_mana)
if randomize_lines:
outfield_str = transforms.randomize_lines(outfield_str)
else:
outfield_str = outfield
else:

View File

@ -1,5 +1,6 @@
# transform passes used to encode / decode cards
import re
import random
# These could probably use a little love... They tend to hardcode in lots
# of things very specific to the mtgjson format.
@ -482,6 +483,7 @@ def text_pass_11_linetrans(s):
alllines = prelines + keylines + mainlines + postlines
return utils.newline.join(alllines)
# randomize the order of the lines
# not a text pass, intended to be invoked dynamically when encoding a card
# call this on fully encoded text, with mana symbols expanded
@ -491,6 +493,7 @@ def separate_lines(text):
return [],[],[],[],[]
preline_search = ['equip', 'fortify', 'enchant ', 'bestow']
# probably could use optimization with a regex
costline_search = [
'multikicker', 'kicker', 'suspend', 'echo', 'awaken',
'buyback', 'dash', 'entwine', 'evoke', 'flashback',
@ -537,6 +540,48 @@ def separate_lines(text):
return prelines, keylines, mainlines, costlines, postlines
# Matches one full choice clause, from the opening delimiter through the
# (greedy) last closing delimiter on the line.
choice_re = re.compile(re.escape(utils.choice_open_delimiter) + r'.*' +
                       re.escape(utils.choice_close_delimiter))
choice_divider = ' ' + utils.bullet_marker + ' '

def randomize_choice(line):
    """Shuffle the options inside each choice clause of an encoded line.

    The header part (before the first divider) keeps its position; only the
    option parts after it are permuted. Lines with no choice clause, or
    clauses with fewer than two options, come back unchanged.
    """
    clauses = choice_re.findall(line)
    if not clauses:
        return line
    result = line
    for clause in clauses:
        pieces = clause[1:-1].split(choice_divider)
        # need a header plus at least two options for shuffling to matter
        if len(pieces) < 3:
            continue
        options = pieces[1:]
        random.shuffle(options)
        replacement = (utils.choice_open_delimiter +
                       choice_divider.join(pieces[:1] + options) +
                       utils.choice_close_delimiter)
        result = result.replace(clause, replacement, 1)
    return result
def randomize_lines(text):
    """Randomly reorder the lines (and choice options) of encoded card text.

    Not a text pass: intended to be invoked dynamically when encoding a
    card, on fully encoded text with mana symbols expanded. Leveler cards
    ('level up') are returned untouched, as their line order is structural.
    """
    if text == '' or 'level up' in text:
        return text
    prelines, keylines, mainlines, costlines, postlines = separate_lines(text)
    random.shuffle(prelines)
    random.shuffle(keylines)
    # shuffle the options of any main line that ends in a choice clause
    shuffled_main = [randomize_choice(line)
                     if line.endswith(utils.choice_close_delimiter) else line
                     for line in mainlines]
    random.shuffle(shuffled_main)
    random.shuffle(costlines)
    # postlines deliberately not shuffled: only one kind ever (countertype)
    return utils.newline.join(prelines + keylines + shuffled_main
                              + costlines + postlines)
# Text unpasses, for decoding. All assume the text inside a Manatext, so don't do anything
# weird with the mana cost symbol.

View File

@ -2,11 +2,13 @@
import sys
import os
import re
import json
libdir = os.path.join(os.path.dirname(os.path.realpath(__file__)), '../lib')
sys.path.append(libdir)
import utils
import jdecode
import cardlib
import transforms
def check_lines(fname):
@ -122,6 +124,28 @@ def check_vocab(fname):
print(card.encode())
break
def check_characters(fname, vname):
    """Build and display the character vocabulary used to encode cards.

    Reads cards from fname, encodes each one, and collects the set of every
    character that appears (plus the card separator). Prints the resulting
    token table; if vname is given, also writes the two index maps to it as
    a torch-rnn compatible json vocabulary file.
    """
    cards = jdecode.mtg_open_file(fname, verbose=True, linetrans=True)
    tokens = set(utils.cardsep)
    for card in cards:
        tokens.update(card.encode())
    # sort once and index from 1, as torch-rnn expects
    ordered = sorted(tokens)
    token_to_idx = {tok: i for i, tok in enumerate(ordered, 1)}
    idx_to_token = {i: tok for i, tok in enumerate(ordered, 1)}
    print('Vocabulary: ({:d} symbols)'.format(len(token_to_idx)))
    for token in ordered:
        print('{:8s} : {:4d}'.format(repr(token), token_to_idx[token]))
    # compliant with torch-rnn
    if vname:
        json_data = {'token_to_idx': token_to_idx, 'idx_to_token': idx_to_token}
        print('writing vocabulary to {:s}'.format(vname))
        with open(vname, 'w') as f:
            json.dump(json_data, f)
if __name__ == '__main__':
import argparse
parser = argparse.ArgumentParser()
@ -132,11 +156,17 @@ if __name__ == '__main__':
help='show behavior of line separation')
parser.add_argument('-vocab', action='store_true',
help='show vocabulary counts from encoded card text')
parser.add_argument('-chars', action='store_true',
help='generate and display vocabulary of characters used in encoding')
parser.add_argument('--vocab_name', default=None,
help='json file to write vocabulary to')
args = parser.parse_args()
if args.lines:
check_lines(args.infile)
if args.vocab:
check_vocab(args.infile)
if args.chars:
check_characters(args.infile, args.vocab_name)
exit(0)

122
scripts/streamcards.py Executable file
View File

@ -0,0 +1,122 @@
#!/usr/bin/env python
# -- STOLEN FROM torch-rnn/scripts/streamfile.py -- #
import os
import threading
import time
import signal
import traceback
import psutil
# correctly setting up a stream that won't get orphaned and left cluttering the operating
# system proceeds in 3 parts:
# 1) invoke install_suicide_handlers() to ensure correct behavior on interrupt
# 2) get threads by invoking spawn_stream_threads
# 3) invoke wait_and_kill_self_noreturn(threads)
# or, use the handy wrapper that does it for you
def spawn_stream_threads(fds, runthread, mkargs):
    """Start one daemon thread per file descriptor and return the threads.

    runthread is the thread target; mkargs(i, fd) builds its argument tuple
    from the stream index and the descriptor. Threads are daemonized so they
    never block interpreter shutdown.
    """
    workers = [threading.Thread(target=runthread, args=mkargs(idx, descriptor))
               for idx, descriptor in enumerate(fds)]
    for worker in workers:
        worker.daemon = True
        worker.start()
    return workers
def force_kill_self_noreturn():
    """Terminate this process and all its children via the OS, not exit().

    Our stream threads are typically blocked in write() calls on full pipes
    and refuse to die to a normal exit()/sys.exit(), so the simplest
    workaround is to ask the OS to terminate us. psutil is used (rather
    than a raw os.kill(os.getpid(), signal.SIGTERM)) because it can handle
    child processes — e.g. l2e luajits — and has niceties like checking
    whether a pid has been reused before killing it.
    """
    me = psutil.Process(os.getpid())
    for child in me.children(recursive=True):
        child.terminate()
    me.terminate()
def handler_kill_self(signum, frame):
    """Signal handler that tears the streamer down via force_kill_self_noreturn.

    SIGQUIT skips the stack trace; every other handled signal prints one
    first for debugging.
    """
    want_trace = (signum != signal.SIGQUIT)
    if want_trace:
        traceback.print_stack(frame)
    print('caught signal {:d} - streamer sending SIGTERM to self'.format(signum))
    force_kill_self_noreturn()
def install_suicide_handlers():
    """Route SIGHUP, SIGINT, and SIGQUIT to handler_kill_self.

    Ensures an interrupted streamer kills itself (and its children) instead
    of leaving orphaned writers behind.
    """
    for signum in (signal.SIGHUP, signal.SIGINT, signal.SIGQUIT):
        signal.signal(signum, handler_kill_self)
def wait_and_kill_self_noreturn(threads):
    """Poll until every stream thread dies (or our parent does), then self-kill.

    Checks thread liveness once per second. If the parent process died and
    we were reparented to init (ppid <= 1), stop waiting immediately.
    Never returns: ends by forcing our own termination.
    """
    while True:
        alive = any(thread.is_alive() for thread in threads)
        if os.getppid() <= 1:
            # exit if parent process died (and we were reparented to init)
            break
        time.sleep(1)
        if not alive:
            break
    force_kill_self_noreturn()
def streaming_noreturn(fds, write_stream, mkargs):
    """Handy wrapper performing the full 3-step streaming setup; never returns.

    Installs the suicide signal handlers, spawns one stream thread per fd,
    then waits on them and kills the process when they finish.
    """
    install_suicide_handlers()
    wait_and_kill_self_noreturn(spawn_stream_threads(fds, write_stream, mkargs))
    assert False, 'should not return from streaming'
# -- END STOLEN FROM torch-rnn/scripts/streamfile.py -- #
import sys
import random
libdir = os.path.join(os.path.dirname(os.path.realpath(__file__)), '../lib')
sys.path.append(libdir)
import utils
import jdecode
import transforms
def main(args):
    """Stream randomized card encodings endlessly to the given descriptors.

    Each fd gets its own daemon thread with an independently decorrelated
    RNG, writing shuffled card encodings forever; never returns (the
    process is killed when the threads or the parent die).
    """
    fds = args.fds
    fname = args.fname
    block_size = args.block_size  # NOTE(review): parsed but unused here — confirm intent
    # seed 0 means "unseeded": fall back to system entropy
    main_seed = args.seed if args.seed != 0 else None
    # simple default encoding for now, will add more options with the curriculum
    # learning feature
    cards = jdecode.mtg_open_file(fname, verbose=True, linetrans=True)
    def write_stream(i, fd):
        # Python 2 only: Random.jumpahead decorrelates the per-stream RNGs
        # so each fd sees a different card order from the same base seed
        local_random = random.Random(main_seed)
        local_random.jumpahead(i)
        # private copy so concurrent threads don't shuffle a shared list
        local_cards = [card for card in cards]
        # /proc/self/fd/N reopens an inherited descriptor as a file object
        # (Linux-specific — TODO confirm portability requirements)
        with open('/proc/self/fd/'+str(fd), 'wt') as f:
            while True:
                local_random.shuffle(local_cards)
                for card in local_cards:
                    f.write(card.encode(randomize_mana=True, randomize_lines=True))
                    f.write(utils.cardsep)
    def mkargs(i, fd):
        # each thread receives (stream index, file descriptor)
        return i, fd
    streaming_noreturn(fds, write_stream, mkargs)
if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser()
    # positional: one stream thread is spawned per descriptor given
    parser.add_argument('fds', type=int, nargs='+',
                        help='file descriptors to write streams to')
    parser.add_argument('-f', '--fname', default=os.path.join(libdir, '../data/output.txt'),
                        help='file to read cards from')
    # NOTE(review): block_size is accepted but not yet consumed by main()
    parser.add_argument('-n', '--block_size', type=int, default=10000,
                        help='number of characters each stream should read/write at a time')
    # 0 is treated as "no fixed seed" by main()
    parser.add_argument('-s', '--seed', type=int, default=0,
                        help='random seed')
    args = parser.parse_args()
    main(args)