#!/usr/bin/python
# -*- coding: utf-8 -*-
# =============================================================================
#  Version: 1.5 (Oct 17, 2009)
#  Author: Antonio Fuschetto (fuschett@di.unipi.it), University of Pisa
# =============================================================================
# =============================================================================
#  This file is part of Tanl.
#
#  Tanl is free software; you can redistribute it and/or modify it
#  under the terms of the GNU General Public License, version 3,
#  as published by the Free Software Foundation.
#
#  Tanl is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU General Public License for more details.
#
#  You should have received a copy of the GNU General Public License
#  along with this program.  If not, see <http://www.gnu.org/licenses/>.
# =============================================================================

"""Wikipedia Extractor: extracts and cleans text from a Wikipedia database
dump and stores the output in a number of files of similar size in a given
directory. Each file contains several documents in Tanl document format.

Usage:
  WikiExtractor.py [options]

Trond's usage example (reading the dump from stdin):
  mkdir lgdir
  cat wikipediafile_lg.xml | WikiExtractor.py -o lgdir

Options:
  -c, --compress       : compress output files using the bzip2 algorithm
  -b ..., --bytes=...  : put the specified number of bytes per output file
                         (500KB by default)
  -o ..., --output=... : place output files in the specified directory
                         (current directory by default)
  --help               : display this help and exit
  --usage              : display script usage
"""

import sys
import getopt
import urllib
import re
import bz2
import os.path

### PARAMS ####################################################################

prefix = 'http://it.wikipedia.org/wiki/'

### SUPPORT CLASSES ###########################################################

class WikiDocument:
    def __init__(self):
        self.id = None
        self.url = None
        self.text = None

    def __str__(self):
        # Serialize in Tanl document format
        return '<doc id="%s" url="%s">\n%s\n</doc>\n' % (self.id, self.url, self.text)

def get_wiki_document_url(wiki_document_title, prefix):
    quoted_title = urllib.quote(wiki_document_title.replace(' ', '_').encode('utf-8'))
    quoted_title = quoted_title.replace('%28', '(').replace('%29', ')')
    return prefix + quoted_title[0].upper() + quoted_title[1:]
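# A hand-traced example of get_wiki_document_url (hypothetical title, not
# taken from an actual run): spaces become underscores, the title is
# percent-quoted except for parentheses, and the first letter is uppercased:
#   get_wiki_document_url(u'pagina di prova (test)', prefix)
#   -> 'http://it.wikipedia.org/wiki/Pagina_di_prova_(test)'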
#------------------------------------------------------------------------------

class WikiExtractor:
    __garbage_tags = ('ref', 'gallery', 'timeline', 'noinclude', 'pre', 'table',
                      'tr', 'td', 'ul', 'li', 'ol', 'dl', 'dt', 'dd', 'menu', 'dir')

    __wrapper_tags = ('nowiki', 'cite', 'source', 'hiero', 'div', 'font', 'span',
                      'strong', 'strike', 'blockquote', 'tt', 'var', 'sup', 'sub',
                      'big', 'small', 'center', 'h1', 'h2', 'h3', 'em', 'b', 'i',
                      'u', 'a', 's', 'p')

    __single_tags = ('references', 'ref', 'img', 'br', 'hr', 'li', 'dt', 'dd')

    __placeholder_tags = {'math':'formula', 'code':'codice'}

    __project_namespaces = ('wikipedia', 'mediawiki', 'wikiquote', 'wikibooks',
                            'wikisource', 'wiktionary', 'wikispecies', 'wikinews',
                            'wikiversita', 'commons')

    __garbage_namespaces = ('immagine', 'image', 'categoria', 'category', 'file')

    __char_entities = {
        '&nbsp;':u'\u00A0', '&iexcl;':u'\u00A1', '&cent;':u'\u00A2', '&pound;':u'\u00A3',
        '&curren;':u'\u00A4', '&yen;':u'\u00A5', '&brvbar;':u'\u00A6', '&sect;':u'\u00A7',
        '&uml;':u'\u00A8', '&copy;':u'\u00A9', '&ordf;':u'\u00AA', '&laquo;':u'\u00AB',
        '&not;':u'\u00AC', '&shy;':u'\u00AD', '&reg;':u'\u00AE', '&macr;':u'\u00AF',
        '&deg;':u'\u00B0', '&plusmn;':u'\u00B1', '&sup2;':u'\u00B2', '&sup3;':u'\u00B3',
        '&acute;':u'\u00B4', '&micro;':u'\u00B5', '&para;':u'\u00B6', '&middot;':u'\u00B7',
        '&cedil;':u'\u00B8', '&sup1;':u'\u00B9', '&ordm;':u'\u00BA', '&raquo;':u'\u00BB',
        '&frac14;':u'\u00BC', '&frac12;':u'\u00BD', '&frac34;':u'\u00BE', '&iquest;':u'\u00BF',
        '&Agrave;':u'\u00C0', '&Aacute;':u'\u00C1', '&Acirc;':u'\u00C2', '&Atilde;':u'\u00C3',
        '&Auml;':u'\u00C4', '&Aring;':u'\u00C5', '&AElig;':u'\u00C6', '&Ccedil;':u'\u00C7',
        '&Egrave;':u'\u00C8', '&Eacute;':u'\u00C9', '&Ecirc;':u'\u00CA', '&Euml;':u'\u00CB',
        '&Igrave;':u'\u00CC', '&Iacute;':u'\u00CD', '&Icirc;':u'\u00CE', '&Iuml;':u'\u00CF',
        '&ETH;':u'\u00D0', '&Ntilde;':u'\u00D1', '&Ograve;':u'\u00D2', '&Oacute;':u'\u00D3',
        '&Ocirc;':u'\u00D4', '&Otilde;':u'\u00D5', '&Ouml;':u'\u00D6', '&times;':u'\u00D7',
        '&Oslash;':u'\u00D8', '&Ugrave;':u'\u00D9', '&Uacute;':u'\u00DA', '&Ucirc;':u'\u00DB',
        '&Uuml;':u'\u00DC', '&Yacute;':u'\u00DD', '&THORN;':u'\u00DE', '&szlig;':u'\u00DF',
        '&agrave;':u'\u00E0', '&aacute;':u'\u00E1', '&acirc;':u'\u00E2', '&atilde;':u'\u00E3',
        '&auml;':u'\u00E4', '&aring;':u'\u00E5', '&aelig;':u'\u00E6', '&ccedil;':u'\u00E7',
        '&egrave;':u'\u00E8', '&eacute;':u'\u00E9', '&ecirc;':u'\u00EA', '&euml;':u'\u00EB',
        '&igrave;':u'\u00EC', '&iacute;':u'\u00ED', '&icirc;':u'\u00EE', '&iuml;':u'\u00EF',
        '&eth;':u'\u00F0', '&ntilde;':u'\u00F1', '&ograve;':u'\u00F2', '&oacute;':u'\u00F3',
        '&ocirc;':u'\u00F4', '&otilde;':u'\u00F5', '&ouml;':u'\u00F6', '&divide;':u'\u00F7',
        '&oslash;':u'\u00F8', '&ugrave;':u'\u00F9', '&uacute;':u'\u00FA', '&ucirc;':u'\u00FB',
        '&uuml;':u'\u00FC', '&yacute;':u'\u00FD', '&thorn;':u'\u00FE', '&yuml;':u'\u00FF',
        '&fnof;':u'\u0192', '&Alpha;':u'\u0391', '&Beta;':u'\u0392', '&Gamma;':u'\u0393',
        '&Delta;':u'\u0394', '&Epsilon;':u'\u0395', '&Zeta;':u'\u0396', '&Eta;':u'\u0397',
        '&Theta;':u'\u0398', '&Iota;':u'\u0399', '&Kappa;':u'\u039A', '&Lambda;':u'\u039B',
        '&Mu;':u'\u039C', '&Nu;':u'\u039D', '&Xi;':u'\u039E', '&Omicron;':u'\u039F',
        '&Pi;':u'\u03A0', '&Rho;':u'\u03A1', '&Sigma;':u'\u03A3', '&Tau;':u'\u03A4',
        '&Upsilon;':u'\u03A5', '&Phi;':u'\u03A6', '&Chi;':u'\u03A7', '&Psi;':u'\u03A8',
        '&Omega;':u'\u03A9', '&alpha;':u'\u03B1', '&beta;':u'\u03B2', '&gamma;':u'\u03B3',
        '&delta;':u'\u03B4', '&epsilon;':u'\u03B5', '&zeta;':u'\u03B6', '&eta;':u'\u03B7',
        '&theta;':u'\u03B8', '&iota;':u'\u03B9', '&kappa;':u'\u03BA', '&lambda;':u'\u03BB',
        '&mu;':u'\u03BC', '&nu;':u'\u03BD', '&xi;':u'\u03BE', '&omicron;':u'\u03BF',
        '&pi;':u'\u03C0', '&rho;':u'\u03C1', '&sigmaf;':u'\u03C2', '&sigma;':u'\u03C3',
        '&tau;':u'\u03C4', '&upsilon;':u'\u03C5', '&phi;':u'\u03C6', '&chi;':u'\u03C7',
        '&psi;':u'\u03C8', '&omega;':u'\u03C9', '&thetasym;':u'\u03D1', '&upsih;':u'\u03D2',
        '&piv;':u'\u03D6', '&bull;':u'\u2022', '&hellip;':u'\u2026', '&prime;':u'\u2032',
        '&Prime;':u'\u2033', '&oline;':u'\u203E', '&frasl;':u'\u2044', '&weierp;':u'\u2118',
        '&image;':u'\u2111', '&real;':u'\u211C', '&trade;':u'\u2122', '&alefsym;':u'\u2135',
        '&larr;':u'\u2190', '&uarr;':u'\u2191', '&rarr;':u'\u2192', '&darr;':u'\u2193',
        '&harr;':u'\u2194', '&crarr;':u'\u21B5', '&lArr;':u'\u21D0', '&uArr;':u'\u21D1',
        '&rArr;':u'\u21D2', '&dArr;':u'\u21D3', '&hArr;':u'\u21D4', '&forall;':u'\u2200',
        '&part;':u'\u2202', '&exist;':u'\u2203', '&empty;':u'\u2205', '&nabla;':u'\u2207',
        '&isin;':u'\u2208', '&notin;':u'\u2209', '&ni;':u'\u220B', '&prod;':u'\u220F',
        '&sum;':u'\u2211', '&minus;':u'\u2212', '&lowast;':u'\u2217', '&radic;':u'\u221A',
        '&prop;':u'\u221D', '&infin;':u'\u221E', '&ang;':u'\u2220', '&and;':u'\u2227',
        '&or;':u'\u2228', '&cap;':u'\u2229', '&cup;':u'\u222A', '&int;':u'\u222B',
        '&there4;':u'\u2234', '&sim;':u'\u223C', '&cong;':u'\u2245', '&asymp;':u'\u2248',
        '&ne;':u'\u2260', '&equiv;':u'\u2261', '&le;':u'\u2264', '&ge;':u'\u2265',
        '&sub;':u'\u2282', '&sup;':u'\u2283', '&nsub;':u'\u2284', '&sube;':u'\u2286',
        '&supe;':u'\u2287', '&oplus;':u'\u2295', '&otimes;':u'\u2297', '&perp;':u'\u22A5',
        '&sdot;':u'\u22C5', '&lceil;':u'\u2308', '&rceil;':u'\u2309', '&lfloor;':u'\u230A',
        '&rfloor;':u'\u230B', '&lang;':u'\u2329', '&rang;':u'\u232A', '&loz;':u'\u25CA',
        '&spades;':u'\u2660', '&clubs;':u'\u2663', '&hearts;':u'\u2665', '&diams;':u'\u2666',
        '&quot;':u'\u0022', '&lt;':u'\u003C', '&gt;':u'\u003E', '&OElig;':u'\u0152',
        '&oelig;':u'\u0153', '&Scaron;':u'\u0160', '&scaron;':u'\u0161', '&Yuml;':u'\u0178',
        '&circ;':u'\u02C6', '&tilde;':u'\u02DC', '&ensp;':u'\u2002', '&emsp;':u'\u2003',
        '&thinsp;':u'\u2009', '&zwnj;':u'\u200C', '&zwj;':u'\u200D', '&lrm;':u'\u200E',
        '&rlm;':u'\u200F', '&ndash;':u'\u2013', '&mdash;':u'\u2014', '&lsquo;':u'\u2018',
        '&rsquo;':u'\u2019', '&sbquo;':u'\u201A', '&ldquo;':u'\u201C', '&rdquo;':u'\u201D',
        '&bdquo;':u'\u201E', '&dagger;':u'\u2020', '&Dagger;':u'\u2021', '&permil;':u'\u2030',
        '&lsaquo;':u'\u2039', '&rsaquo;':u'\u203A', '&euro;':u'\u20AC'}
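    # A note on the table above: once '&amp;' has been unescaped, __clean
    # replaces each named entity found in the page text with its Unicode
    # character, e.g. u'perch&egrave;' -> u'perch\u00E8'.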
    def __init__(self):
        # Matches HTML comments
        self.__comment_pattern = re.compile(r'<!--.*?-->', re.DOTALL)
        # Matches garbage HTML tags (dropped together with their content)
        self.__garbage_tag_patterns = list()
        for tag in self.__class__.__garbage_tags:
            pattern = re.compile(r'<\s*%s(\s*| [^/]+?)>.*?<\s*/\s*%s\s*>' % (tag, tag), re.DOTALL | re.IGNORECASE)
            self.__garbage_tag_patterns.append(pattern)
        # Matches wrapper HTML tags (the tags are dropped, the content is kept)
        self.__wrapper_tag_patterns = list()
        for tag in self.__class__.__wrapper_tags:
            left_pattern = re.compile(r'<\s*%s(\s*| [^/]+?)>' % tag, re.DOTALL | re.IGNORECASE)
            right_pattern = re.compile(r'<\s*/\s*%s\s*>' % tag, re.DOTALL | re.IGNORECASE)
            self.__wrapper_tag_patterns.append((left_pattern, right_pattern))
        # Matches single (self-closing or stray) HTML tags
        self.__single_tag_patterns = list()
        for tag in self.__class__.__single_tags:
            good_pattern = re.compile(r'<\s*%s(\s*| .+?)/\s*>' % tag, re.DOTALL | re.IGNORECASE)
            bad_pattern = re.compile(r'<\s*(/|\\)?\s*%s(\s*| [^/]+?)\\?\s*>' % tag, re.DOTALL | re.IGNORECASE)
            self.__single_tag_patterns.append((good_pattern, bad_pattern))
        # Matches placeholder HTML tags (the content is replaced by a placeholder)
        self.__placeholder_tag_patterns = list()
        for tag in self.__class__.__placeholder_tags.iterkeys():
            pattern = re.compile(r'<\s*%s(\s*| [^/]+?)>.*?<\s*/\s*%s\s*>' % (tag, tag), re.DOTALL | re.IGNORECASE)
            self.__placeholder_tag_patterns.append((pattern, self.__class__.__placeholder_tags[tag]))
        # Matches tables and templates (after brace normalization in __clean)
        self.__table_pattern = re.compile(r'\{[^{]*?\}', re.DOTALL)
        # Matches wikilinks
        good_wikilink_pattern = re.compile(r'\[\[[^[]*?\]\]', re.DOTALL)
        bad_left_wikilink_pattern = re.compile(r'\[[^[]*?\]\]', re.DOTALL)
        bad_right_wikilink_pattern = re.compile(r'\[\[[^[]*?\]', re.DOTALL)
        self.__wikilink_pattern = (good_wikilink_pattern, bad_left_wikilink_pattern, bad_right_wikilink_pattern)
        # Matches HTTP links
        self.__http_link_pattern = re.compile(r'\[http.*?\]', re.DOTALL | re.IGNORECASE)
        # Matches the apostrophes preceding bold and italic markup
        apostrophe_bold_pattern = re.compile(r"\w'('''[^\s'][^']*?[^\s']''')[^']", re.DOTALL)
        apostrophe_italic_pattern = re.compile(r"\w'(''[^\s'][^']*?[^\s']'')[^']", re.DOTALL)
        self.__apostrophe_pattern = (apostrophe_bold_pattern, apostrophe_italic_pattern)
        # Matches numeric character entities
        self.__numeric_entity_pattern = re.compile(r'&#\d+?;')
        # Matches runs of multiple spaces
        self.__multi_space_pattern = re.compile(r' {2,}')
        # Matches runs of multiple dots
        self.__multi_dot_pattern = re.compile(r'\.{4,}')

    def extract(self, wiki_document):
        wiki_document = self.__clean(wiki_document)
        if not wiki_document: return None
        wiki_document = self.__compact(wiki_document)
        return wiki_document
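    # A sketch of what __clean does to one line of wiki markup (hypothetical
    # input, traced by hand rather than produced by an actual run):
    #   "'''Pisa''' is a [[city]] in [[Toscana]].<!-- note -->"
    # becomes
    #   'Pisa is a <a href="City">city</a> in <a href="Toscana">Toscana</a>.'
    # The comment is removed, the bold quotes are stripped, and each wikilink
    # is rewritten as an anchor via __handle_wikilink and __get_anchor_tag.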
    def __clean(self, wiki_document):
        # Make the tags easier to recognize
        wiki_document.text = wiki_document.text.replace('&lt;', '<').replace('&gt;', '>')
        wiki_document.text = wiki_document.text.replace('<<', u'«').replace('>>', u'»')
        # Remove HTML comments
        wiki_document.text = self.__comment_pattern.sub('', wiki_document.text)
        # Remove garbage HTML tags together with their content
        for pattern in self.__garbage_tag_patterns:
            wiki_document.text = pattern.sub('', wiki_document.text)
        # Remove wrapper HTML tags, keeping their content
        for left_pattern, right_pattern in self.__wrapper_tag_patterns:
            wiki_document.text = left_pattern.sub('', wiki_document.text)
            wiki_document.text = right_pattern.sub('', wiki_document.text)
        # Remove single HTML tags
        for good_pattern, bad_pattern in self.__single_tag_patterns:
            wiki_document.text = good_pattern.sub('', wiki_document.text)
            wiki_document.text = bad_pattern.sub('', wiki_document.text)
        # Replace placeholder HTML tags (e.g. math) with numbered placeholders
        for pattern, placeholder in self.__placeholder_tag_patterns:
            index = 1
            for match in pattern.finditer(wiki_document.text):
                wiki_document.text = wiki_document.text.replace(match.group(), '%s_%d' % (placeholder, index))
                index += 1
        # Remove tables and templates: braces are first collapsed to single
        # characters, then the table pattern is applied three times to strip
        # up to three levels of nesting
        wiki_document.text = wiki_document.text.replace('{{end box}}', '}')
        wiki_document.text = wiki_document.text.replace('{{', '{').replace('}}', '}')
        wiki_document.text = wiki_document.text.replace('{|', '{').replace('|}', '}')
        wiki_document.text = self.__table_pattern.sub('', wiki_document.text)
        wiki_document.text = self.__table_pattern.sub('', wiki_document.text)
        wiki_document.text = self.__table_pattern.sub('', wiki_document.text)
        # Handle well-formed wikilinks (two levels of nesting)
        good_wikilink_pattern = self.__wikilink_pattern[0]
        for match in good_wikilink_pattern.finditer(wiki_document.text):
            wikilink = match.group()
            document_title, link_text = self.__handle_wikilink(wikilink[2:-2])
            wiki_document.text = wiki_document.text.replace(wikilink, self.__get_anchor_tag(document_title, link_text))
        for match in good_wikilink_pattern.finditer(wiki_document.text):
            wikilink = match.group()
            wiki_document.text = wiki_document.text.replace(wikilink, self.__handle_wikilink(wikilink[2:-2])[1])
        # Handle malformed wikilinks
        bad_left_wikilink_pattern = self.__wikilink_pattern[1]
        for match in bad_left_wikilink_pattern.finditer(wiki_document.text):
            wikilink = match.group()
            document_title, link_text = self.__handle_wikilink(wikilink[1:-2])
            wiki_document.text = wiki_document.text.replace(wikilink, self.__get_anchor_tag(document_title, link_text))
        bad_right_wikilink_pattern = self.__wikilink_pattern[2]
        for match in bad_right_wikilink_pattern.finditer(wiki_document.text):
            wikilink = match.group()
            document_title, link_text = self.__handle_wikilink(wikilink[2:-1])
            wiki_document.text = wiki_document.text.replace(wikilink, self.__get_anchor_tag(document_title, link_text))
        wiki_document.text = wiki_document.text.replace('[[', '').replace(']]', '')
        # Remove HTTP links
        wiki_document.text = self.__http_link_pattern.sub('', wiki_document.text).replace('[]', '')
        # Handle bold and italic markup
        apostrophe_bold_pattern = self.__apostrophe_pattern[0]
        for match in apostrophe_bold_pattern.finditer(wiki_document.text):
            bold_text = match.group(1)
            wiki_document.text = wiki_document.text.replace(bold_text, bold_text[3:-3])
        apostrophe_italic_pattern = self.__apostrophe_pattern[1]
        for match in apostrophe_italic_pattern.finditer(wiki_document.text):
            italic_text = match.group(1)
            wiki_document.text = wiki_document.text.replace(italic_text, '"%s"' % italic_text[2:-2])
        wiki_document.text = wiki_document.text.replace("'''", '').replace("''", '"')
        # Handle special characters (named entities)
        wiki_document.text = wiki_document.text.replace('&amp;', '&').replace('""', '"')
        for entity in self.__class__.__char_entities.iterkeys():
            wiki_document.text = wiki_document.text.replace(entity, self.__class__.__char_entities[entity])
        # Handle special characters (numeric entities)
        for match in self.__numeric_entity_pattern.finditer(wiki_document.text):
            entity = match.group()
            wiki_document.text = wiki_document.text.replace(entity, self.__handle_unicode(entity))
        # Fix minor text imperfections
        wiki_document.text = wiki_document.text.replace('\t', ' ')
        wiki_document.text = self.__multi_space_pattern.sub(' ', wiki_document.text)
        wiki_document.text = self.__multi_dot_pattern.sub('...', wiki_document.text)
        wiki_document.text = wiki_document.text.replace(' ,', ',').replace(' .', '.')
        wiki_document.text = wiki_document.text.replace(' :', ':').replace(' ;', ';')
        wiki_document.text = wiki_document.text.replace(',,', ',').replace(',.', '.')
        wiki_document.text = wiki_document.text.replace('( ', '(').replace(' )', ')')
        wiki_document.text = wiki_document.text.replace('[ ', '[').replace(' ]', ']')
        wiki_document.text = wiki_document.text.replace(u'« ', u'«').replace(u' »', u'»')
        return wiki_document
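    # A sketch of __compact's behaviour on the cleaned text (hypothetical
    # lines, traced by hand): a page title line '++Pisa++' becomes 'Pisa.',
    # a section title line '==Storia==' becomes 'Storia.', list items
    # ('* ...', '# ...') and lines of fewer than six tokens are dropped, and
    # a page that keeps nothing but its title is discarded altogether.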
    def __compact(self, wiki_document):
        page = list()
        paragraph = list()
        for line in wiki_document.text.split('\n'):
            line = line.strip()
            if not line: continue
            # Handle the page title
            if line.startswith('++'):
                title = line[2:-2]
                if title and title[-1] not in '!?':
                    title = '%s.' % title
                page = [title]
            # Handle section titles
            elif line.startswith('=='):
                if len(paragraph) > 1:
                    page.extend(paragraph)
                title = line[2:-2]
                if title and title[-1] not in '!?':
                    title = '%s.' % title
                paragraph = [title]
            # Drop bulleted and numbered lists (and lines introducing them)
            elif line[-1] == ':' or line[0] in '*#:;':
                continue
            # Drop table leftovers
            elif line[0] in '{|' or line[-1] in '}':
                continue
            # Drop non-meaningful lines
            elif (line[0] == '(' and line[-1] == ')') or line.strip('.-') == '':
                continue
            # Drop lines with a low token count (placeholders are kept)
            elif not '_' in line and len(line.split()) < 6:
                continue
            # Handle page-level text
            elif len(paragraph) == 0:
                page.append(line)
            # Handle section-level text
            else:
                paragraph.append(line)
        if len(paragraph) > 1:
            page.extend(paragraph)
        elif len(page) == 1:
            return None
        wiki_document.text = '\n'.join(page)
        return wiki_document

    def __handle_wikilink(self, wikilink):
        tokens = wikilink.split(':')
        while not tokens[0]:
            if len(tokens) < 2: return '', ''
            tokens = tokens[1:]
        if len(tokens) == 1 or tokens[0].strip().lower() in self.__class__.__project_namespaces:
            tokens = tokens[-1].split('|')
            while not tokens[-1]:
                if len(tokens) < 2: return '', ''
                tokens = tokens[:-1]
            link_text = tokens[-1].split('#')[-1].split('/')[-1].strip()
            if len(tokens) > 1:
                article_title = tokens[-2].strip()
            else:
                article_title = link_text
            return article_title, link_text
        if tokens[0].strip().lower() in self.__class__.__garbage_namespaces:
            return '', ''
        tokens = tokens[-1].split('|')
        while not tokens[-1]:
            if len(tokens) < 2: return '', ''
            tokens = tokens[:-1]
        if len(tokens) == 1: return '', ''
        link_text = tokens[-1].split('#')[-1].split('/')[-1].strip()
        if len(tokens) > 1:
            article_title = tokens[-2].strip()
        else:
            article_title = link_text
        return article_title, link_text
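    # Hand-traced examples for __handle_wikilink (the argument is the text
    # found between the square brackets; titles here are made up):
    #   'Torino|the city'      -> ('Torino', 'the city')
    #   'Toscana'              -> ('Toscana', 'Toscana')
    #   'Categoria:Storia'     -> ('', '')              # garbage namespace
    #   'wikipedia:Aiuto|help' -> ('Aiuto', 'help')     # project namespace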
    def __get_anchor_tag(self, document_title, link_text):
        if not link_text: return ''
        if not document_title: return link_text
        return '<a href="%s">%s</a>' % (get_wiki_document_url(document_title, ''), link_text)

    def __handle_unicode(self, entity):
        numeric_code = int(entity[2:-1])
        # Characters outside the Basic Multilingual Plane are dropped
        if numeric_code >= 0x10000: return ''
        return unichr(numeric_code)

#------------------------------------------------------------------------------

class OutputSplitter:
    def __init__(self, compress, max_file_size, path_name):
        self.__dir_index = 0
        self.__file_index = -1
        self.__cur_file_size = 0
        self.__compress = compress
        self.__max_file_size = max_file_size
        self.__path_name = path_name
        self.__out_file = self.__open_next_file()

    def write(self, text):
        text_len = len(text)
        # Start a new file when the size cap would be exceeded (the incoming
        # chunk is counted at half weight in the check)
        if self.__cur_file_size + text_len / 2 > self.__max_file_size:
            self.__close_cur_file()
            self.__out_file = self.__open_next_file()
            self.__cur_file_size = 0
        self.__out_file.write(text)
        self.__cur_file_size += text_len

    def close(self):
        self.__close_cur_file()

    def __open_next_file(self):
        self.__file_index += 1
        # After 100 files (wiki00..wiki99), move on to the next directory
        if self.__file_index == 100:
            self.__dir_index += 1
            self.__file_index = 0
        dir_name = self.__get_dir_name()
        if not os.path.isdir(dir_name):
            os.makedirs(dir_name)
        file_name = os.path.join(dir_name, self.__get_file_name())
        if self.__compress:
            return bz2.BZ2File('%s.bz2' % file_name, 'w')
        else:
            return open(file_name, 'w')

    def __close_cur_file(self):
        self.__out_file.close()

    def __get_dir_name(self):
        # Directories are named AA, AB, ..., AZ, BA, ... (base-26 encoding)
        char1 = self.__dir_index % 26
        char2 = self.__dir_index / 26 % 26
        return os.path.join(self.__path_name, '%c%c' % (ord('A') + char2, ord('A') + char1))

    def __get_file_name(self):
        return 'wiki%02d' % self.__file_index

### CORE ######################################################################

def process_data(input_file, wiki_extractor, output_splitter):
    page = []
    for line in input_file:
        line = line.decode('utf-8').strip()
        if line == '<page>':
            page = []
        elif line == '</page>':
            process_page(page, wiki_extractor, output_splitter)
        else:
            page.append(line)
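# process_data expects the <page> elements of a MediaWiki XML dump on its
# input, one node per line; a minimal hypothetical fragment (real dumps
# carry additional nodes, which extract_document skips) looks like:
#
#   <page>
#     <title>Pagina di prova</title>
#     <id>12</id>
#     <text xml:space="preserve">Testo della pagina...
#     ...
#     </text>
#   </page>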
#------------------------------------------------------------------------------

def process_page(page, wiki_extractor, output_splitter):
    wiki_document = extract_document(page)
    if not wiki_document: return
    wiki_document = wiki_extractor.extract(wiki_document)
    if not wiki_document: return
    output_splitter.write(wiki_document.__str__().encode('utf-8'))

#------------------------------------------------------------------------------

def extract_document(page):
    wiki_document = WikiDocument()
    for line in page:
        if not line: continue
        # Page identifier (XML node)
        if not wiki_document.id and line.startswith('<id>') and line.endswith('</id>'):
            wiki_document.id = int(line[4:-5])
            continue
        # Page title (XML node)
        elif not wiki_document.url and line.startswith('<title>') and line.endswith('</title>'):
            title = line[7:-8].replace('&amp;', '&')
            # Titles containing a colon belong to non-article namespaces
            if ':' in title: return None
            wiki_document.url = get_wiki_document_url(title, prefix)
            wiki_document.text = '++%s++' % title
            continue
        # Start of the page text (XML node); single-line text nodes
        # (e.g. redirects) are discarded
        elif line.startswith('<text xml:space="preserve">'):
            if line.endswith('</text>'): return None
            line = line[27:]
            if not line: continue
        # End of the page text (XML node)
        elif line.endswith('</text>'):
            line = line[:-7]
            if not line: continue
        # Superfluous information (XML node)
        elif line[0] == '<':
            continue
        # Section title (page text)
        elif line[0] == '=':
            line = '==%s==' % line.strip('= ')
        wiki_document.text += '\n%s' % line
    return wiki_document

### USER INTERFACE ############################################################

def show_help():
    print >> sys.stdout, __doc__,

def show_usage(output_file, script_name):
    print >> output_file, 'Usage: %s [options]' % script_name

def show_suggestion(output_file, script_name):
    print >> output_file, 'Try \'%s --help\' for more information.' % script_name

def show_size_error(script_name, file_size):
    print >> sys.stderr, '%s: %s: Insufficient or invalid number of bytes' % (script_name, file_size)

def show_file_error(script_name, file_name):
    print >> sys.stderr, '%s: %s: No such file or directory' % (script_name, file_name)

def main():
    script_name = os.path.basename(sys.argv[0])

    try:
        long_opts = ['help', 'usage', 'compress', 'bytes=', 'output=']
        opts, args = getopt.gnu_getopt(sys.argv[1:], 'cb:o:', long_opts)
    except getopt.GetoptError:
        show_usage(sys.stderr, script_name)
        show_suggestion(sys.stderr, script_name)
        sys.exit(1)

    compress = False
    file_size = 500 * 1024
    output_dir = '.'

    for opt, arg in opts:
        if opt == '--help':
            show_help()
            sys.exit()
        elif opt == '--usage':
            show_usage(sys.stdout, script_name)
            sys.exit()
        elif opt in ('-c', '--compress'):
            compress = True
        elif opt in ('-b', '--bytes'):
            try:
                if arg[-1] in 'kK':
                    file_size = int(arg[:-1]) * 1024
                elif arg[-1] in 'mM':
                    file_size = int(arg[:-1]) * 1024 * 1024
                else:
                    file_size = int(arg)
                if file_size < 200 * 1024: raise ValueError()
            except ValueError:
                show_size_error(script_name, arg)
                sys.exit(2)
        elif opt in ('-o', '--output'):
            if os.path.isdir(arg):
                output_dir = arg
            else:
                show_file_error(script_name, arg)
                sys.exit(3)

    if len(args) > 0:
        show_usage(sys.stderr, script_name)
        show_suggestion(sys.stderr, script_name)
        sys.exit(4)

    wiki_extractor = WikiExtractor()
    output_splitter = OutputSplitter(compress, file_size, output_dir)
    process_data(sys.stdin, wiki_extractor, output_splitter)
    output_splitter.close()

if __name__ == '__main__':
    main()
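# A hypothetical end-to-end run (file names assumed for illustration):
#
#   mkdir lgdir
#   cat wikipediafile_lg.xml | python WikiExtractor.py -o lgdir
#
# would populate lgdir/AA/wiki00 ... lgdir/AA/wiki99, then lgdir/AB/wiki00,
# and so on, each file holding <doc id="..." url="..."> ... </doc> blocks of
# roughly the configured size (500KB by default; see OutputSplitter).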