added unscrambling script to make reading the format easier
This commit is contained in:
parent
41e269d9b3
commit
92d9dc8bd8
2 changed files with 133 additions and 0 deletions
|
@ -13,6 +13,8 @@ python encode.py AllSets.json output.txt
|
|||
```
|
||||
will read the corpus from AllSets.json and put the new encoding in output.txt.
|
||||
|
||||
You can also use unscramble.py to take data formatted like hte output of encode.py and make it more human readable (though definitely not valid json). Works the same way as encode.
|
||||
|
||||
Apparently I'm running Python 2.7.6.
|
||||
|
||||
======
|
||||
|
|
131
unscramble.py
Normal file
131
unscramble.py
Normal file
|
@ -0,0 +1,131 @@
|
|||
import re
|
||||
import codecs
|
||||
import sys
|
||||
|
||||
# there should really be a separate file to store the character choices and such
|
||||
|
||||
def from_unary(s):
|
||||
numbers = re.findall(r'&\^*', s)
|
||||
for number in sorted(numbers, cmp = lambda x,y: cmp(len(x), len(y)) * -1):
|
||||
i = len(number) - 1
|
||||
s = s.replace(number, str(i))
|
||||
return s
|
||||
|
||||
def cleanup_mana(s):
|
||||
untranslations = {
|
||||
'WW' : '{W}',
|
||||
'UU' : '{U}',
|
||||
'BB' : '{B}',
|
||||
'RR' : '{R}',
|
||||
'GG' : '{G}',
|
||||
'PP' : '{P}',
|
||||
'WP' : '{W/P}',
|
||||
'UP' : '{U/P}',
|
||||
'BP' : '{B/P}',
|
||||
'RP' : '{R/P}',
|
||||
'GP' : '{G/P}',
|
||||
'VW' : '{2/W}',
|
||||
'VU' : '{2/U}',
|
||||
'VB' : '{2/B}',
|
||||
'VR' : '{2/R}',
|
||||
'VG' : '{2/G}',
|
||||
'WU' : '{W/U}',
|
||||
'WB' : '{W/B}',
|
||||
'RW' : '{R/W}',
|
||||
'GW' : '{G/W}',
|
||||
'UB' : '{U/B}',
|
||||
'UR' : '{U/R}',
|
||||
'GU' : '{G/U}',
|
||||
'BR' : '{B/R}',
|
||||
'BG' : '{B/G}',
|
||||
'RG' : '{R/G}',
|
||||
'SS' : '{S}',
|
||||
'XX' : '{X}',
|
||||
}
|
||||
|
||||
manacosts = re.findall(r'\{[WUBRGPVSX\^]*\}', s)
|
||||
for cost in manacosts:
|
||||
if cost == '{}':
|
||||
s = s.replace(cost, '{0}')
|
||||
continue
|
||||
|
||||
innercost = cost[1:-1]
|
||||
newcost = ''
|
||||
colorless_total = 0
|
||||
|
||||
# pull out unary countingses
|
||||
colorless_counts = re.findall(r'\^+', innercost)
|
||||
for count in colorless_counts:
|
||||
innercost = innercost.replace(count, '')
|
||||
colorless_total += len(count)
|
||||
if colorless_total > 0:
|
||||
newcost += '{' + str(colorless_total) + '}'
|
||||
|
||||
# now try to read the remaining characters in pairs
|
||||
success = True
|
||||
while len(innercost) > 1:
|
||||
fragment = innercost[0:2]
|
||||
if fragment in untranslations:
|
||||
newcost += untranslations[fragment]
|
||||
else:
|
||||
success = False
|
||||
break
|
||||
innercost = innercost[2:]
|
||||
|
||||
if len(innercost) == 0 and success:
|
||||
s = s.replace(cost, newcost)
|
||||
else:
|
||||
print cost
|
||||
print newcost
|
||||
|
||||
return s
|
||||
|
||||
|
||||
def unreplace_newlines(s):
|
||||
return s.replace('\\', '\n')
|
||||
|
||||
def unscramble(s):
|
||||
s = from_unary(s)
|
||||
s = cleanup_mana(s)
|
||||
s = unreplace_newlines(s)
|
||||
return s
|
||||
|
||||
|
||||
def main(fname, oname = None, verbose = True):
|
||||
if verbose:
|
||||
print 'Opening encoded card file: ' + fname
|
||||
|
||||
f = open(fname, 'r')
|
||||
lines = f.readlines()
|
||||
f.close()
|
||||
|
||||
if not oname == None:
|
||||
if verbose:
|
||||
print 'Writing output to: ' + oname
|
||||
ofile = codecs.open(oname, 'w', 'utf-8')
|
||||
|
||||
for line in lines:
|
||||
val = unscramble(line)
|
||||
if oname == None:
|
||||
sys.stdout.write(val)
|
||||
else:
|
||||
ofile.write(val)
|
||||
|
||||
# print len(badwords)
|
||||
# for word in badwords:
|
||||
# print word
|
||||
|
||||
if not oname == None:
|
||||
ofile.close()
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
import sys
|
||||
if len(sys.argv) == 2:
|
||||
main(sys.argv[1])
|
||||
elif len(sys.argv) == 3:
|
||||
main(sys.argv[1], oname = sys.argv[2])
|
||||
else:
|
||||
print 'Usage: ' + sys.argv[0] + ' ' + '<encoded file> [output filename]'
|
||||
exit(1)
|
||||
|
Loading…
Reference in a new issue