"""Filter the results of an XPath query through a command.

Usage:
    tools/filter_xpath.py [options] <xpath_nodes> <xpath_elems> <tool_name> [<tool_args>]

Options:
    -h --help                    Show this screen.
    -v --verbose                 Verbose.
    -e --encoding-format=FMT     Encoding format. [default: m4a]
    -l --local-audio-source      Use local file source, do not download [default: false]
    -o --output-file=PATH        Destination file for edited XML
"""

# NOTE(review): the positional placeholders in the usage pattern above were
# reconstructed from the variable names used in main() (the lookups there
# used an empty key, which docopt can never populate) -- confirm the names
# and whether <tool_args> should be mandatory.

# TODO: actually add commandline tool args

# cat dicts/crkeng.xml| python tools/filter_xpath.py "./e" "./lg/l" "test_cmd.sh" 1> dicts/crkeng-cans.xml

# TODO: option for no fetch, in case they are stored locally: is
# the target compressed audio store for now, but could serve as local copy too
#
#   python tools/extract_audio.py dicts/sms-all.xml static/aud/sms --verbose > test_aud.xml

# TODO: only download updated files, storing in manifest in path/to/stored/audio/

import copy
import io
import os
import sys

from docopt import docopt
import requests
from lxml import etree

command = None

from sh import hfst_lookup


def _log_count(count):
    """Write a diagnostic count to stderr, one number per line."""
    sys.stderr.write('%d\n' % count)


def run_cmd(_in, args):
    """Run ``hfst_lookup`` over ``_in`` and return one result per output chunk.

    Args:
        _in: Text fed to the tool on stdin (UTF-8 encoded before sending).
        args: Sequence of command-line arguments passed to ``hfst_lookup``.

    Returns:
        A list of strings. The tool output is split on blank lines; a chunk
        with exactly three tab-separated fields contributes its second field,
        any other chunk contributes the placeholder ``'--'``. Output that ends
        with a blank line therefore yields one extra trailing ``'--'``.
    """
    # NOTE(review): relies on the object returned by sh proxying ``split`` to
    # the command's captured stdout -- confirm against the installed sh.
    tool = hfst_lookup(*args, _in=_in.encode('utf-8'), _bg=True)

    results = []
    for chunk in tool.split('\n\n'):
        fields = chunk.split('\t')
        results.append(fields[1] if len(fields) == 3 else '--')
    return results


def replace_xpath(xml_root, nodes, elems, tool_name=False, tool_args=False):
    """Return a copy of ``xml_root`` with selected element texts converted.

    Args:
        xml_root: Parsed lxml document; it is deep-copied and never modified.
        nodes: XPath expression evaluated against the copy to select nodes.
        elems: XPath expression evaluated relative to each selected node; the
            text of every matching element that has text is converted.
        tool_name: Accepted for interface compatibility; currently unused
            because ``run_cmd`` always invokes ``hfst_lookup``.
        tool_args: Sequence of arguments forwarded to ``run_cmd``. A falsy
            value is treated as "no arguments".

    Returns:
        The modified deep copy of ``xml_root``.

    Raises:
        IndexError: If the tool returned fewer results than texts sent to it.
    """
    root_duplicate = copy.deepcopy(xml_root)
    all_nodes = etree.XPath(nodes)(root_duplicate)
    _log_count(len(all_nodes))

    # Collect the target elements once so the texts sent to the tool and the
    # elements updated afterwards are guaranteed to line up, even if changing
    # a text would alter what the XPath selects.
    targets = [
        elem
        for node in all_nodes
        for elem in node.xpath(elems)
        if elem.text is not None
    ]
    _log_count(len(targets))
    if not targets:
        # Nothing to convert, so there is no need to start the tool.
        return root_duplicate

    converted = run_cmd('\n'.join(elem.text for elem in targets),
                        tool_args or [])
    _log_count(len(converted))
    if len(converted) < len(targets):
        raise IndexError(
            'tool returned %d results for %d inputs'
            % (len(converted), len(targets))
        )

    # zip() ignores surplus results such as the trailing placeholder that
    # run_cmd produces for output ending in a blank line.
    for elem, new_text in zip(targets, converted):
        elem.text = new_text.strip()

    # new xml root
    return root_duplicate


def write_xml(root, output_file=False):
    """Serialize ``root`` as pretty-printed XML, encoded as UTF-8.

    Args:
        root: lxml element or tree to serialize.
        output_file: Path of the destination file. Any falsy value (``None``,
            ``False``, ``''``) sends the XML to stdout instead.
    """
    # TODO: strips some headers
    stringed = etree.tostring(root, pretty_print=True, method='xml',
                              encoding='unicode')
    if output_file:
        with io.open(output_file, 'w', encoding='utf-8') as out:
            out.write(stringed)
    else:
        # Write bytes so the result is UTF-8 regardless of the terminal
        # encoding; Python 3 exposes the binary stream as ``.buffer``.
        stdout = getattr(sys.stdout, 'buffer', sys.stdout)
        stdout.write(stringed.encode('utf-8') + b'\n')


# def init_tool(path):
#     from sh import hfst_lookup
#     tool = hfst_lookup('/Users/pyry/gtsvn/langs/crk/src/orthography/Latn-to-Cans.lookup.hfst', _in="nipi", _bg=True)
#     return tool


def main():
    """Read XML from stdin, convert the selected texts, and write the result.

    Returns:
        0, for use as the process exit status.
    """
    arguments = docopt(__doc__, version='asdf')

    xp_n = arguments.get('<xpath_nodes>')
    xp = arguments.get('<xpath_elems>')
    tool_name = arguments.get('<tool_name>')
    tool_args = arguments.get('<tool_args>')

    # Hand lxml the raw byte stream where one exists (Python 3) so it can
    # honour the encoding declared in the document itself.
    root = etree.parse(getattr(sys.stdin, 'buffer', sys.stdin))

    updated_xml = replace_xpath(
        root,
        nodes=xp_n,
        elems=xp,
        tool_name=tool_name,
        tool_args=tool_args.split(' ') if tool_args else [],
    )

    write_xml(updated_xml, arguments.get('--output-file'))

    return 0


if __name__ == "__main__":
    sys.exit(main())