Generování FAQ z wiki (Python)
Skočit na navigaci
Skočit na vyhledávání
Pavouk v Pythonu, který z FAQ na wiki vygeneruje FAQ v HTML v Bruceově formátu:
## coding: UTF-8 import urllib2 import sgmllib from sets import Set import re import sys # Parser položky FAQ. Musí odfiltrovat wiki balast .. inside_c, # přeskočit případně vloženou tabulku s obsahem .. skip_toc class ParserItem(sgmllib.SGMLParser): def reset(self): sgmllib.SGMLParser.reset(self) self.flags = Set([]) def start_div(self,attrs): id = [v for k, v in attrs if k == 'id'] classv = [v for k, v in attrs if k == 'class'] if id and id[0] == "contentSub": self.flags.add('content') if classv and classv[0] == "printfooter": self.flags.discard('content') def end_div(self): pass def end_td(self): if 'content' in self.flags: self.fags.discard('td') sys.stdout.write("</td>") def start_h3(self, data): if 'content' in self.flags: self.flags.add('h3') sys.stdout.write("<h4>") def end_h3(self): if 'content' in self.flags: self.flags.discard('h3') sys.stdout.write("</h4>") def start_table(self, attrs): if 'content' in self.flags: id = [v for k, v in attrs if k == 'id'] if id and id[0] == "toc": self.flags.add('skip_toc') self.flags.discard('content') else: print "<table border=\"0\">" def end_table(self): if 'skip_toc' in self.flags: self.flags.discard('skip_toc') self.flags.add('content') else: if 'content' in self.flags: print "</table>" # Párové tagy <pre> a <p> nesou zajímavý obsah def unknown_starttag(self, tag, attrs): if 'content' in self.flags: sys.stdout.write("<%s>" % tag) if tag in ['pre','p','li','td']: self.flags.add(tag) def unknown_endtag(self, tag): if 'content' in self.flags: print "</%s>" % tag if tag in ['pre','p','li','td']: self.flags.discard(tag) def handle_data(self, data): if 'content' in self.flags: if len(Set(['pre','p','li','td']) & self.flags) > 0: sys.stdout.write(data) def start_a(self, attrs): if 'content' in self.flags: href = [v for k, v in attrs if k == 'href'] title = [v for k, v in attrs if k == 'title'] if title and re.match('^Edit', title[0]): self.flags.add('skip_close_a') else: if title and href: self.flags.add('a') 
sys.stdout.write("<a href=\"%s\" title=\"%s\">" % (href[0], title[0])) def end_a(self): if 'content' in self.flags: self.flags.discard('a') if 'skip_close_a' in self.flags: self.flags.discard('skip_close_a') else: sys.stdout.write("</a>") # Parser hlavní stránky. V podstatě jenom filtruje odkazy # začínající číslicí a skrz regulární výrazy z nich dělá nadpisy. # Pro každý identifikovaný odkaz spustí načtení detailu FAQ class ParserMainPage(sgmllib.SGMLParser): def reset(self): sgmllib.SGMLParser.reset(self) self.inside_h1 = 0 def start_h1(self, data): self.inside_h1 = 1 def end_h1(self): self.inside_h1 = 0 def handle_data(self, data): if self.inside_h1: if re.match('^\d', data): print "<hr>" print "<h2 align=\"center\">%s</h2>" % re.sub('\d\. ','',data) def start_a(self, attrs): href = [v for k, v in attrs if k == 'href'] title = [v for k, v in attrs if k == 'title'] if title and re.match('^\d\.',title[0]): if re.match('(\d)\.0?(\d{1,2})+(\.\d)', title[0]): print "<h3>%s</h3>" % re.sub('(\d)\.0?(\d{1,2})+(\.\d)','<a name="item\\1.\\2\\3">\\1.\\2\\3</a>)', title[0]) else: print "<h3>%s</h3>" % re.sub('(\d)\.0?(\d{1,2})+','<a name="item\\1.\\2">\\1.\\2</a>)', title[0]) iresp = urllib2.urlopen("http://www.pgsql.cz" + href[0]) item = iresp.read() iresp.close() parserItem = ParserItem() parserItem.feed(item) parserItem.close() # Parsuje odstavce v FAQ (metainformace) a seznam otázek # uvedený na začátku FAQ class ParserMainPageProlog(sgmllib.SGMLParser): def reset(self): sgmllib.SGMLParser.reset(self) self.flags = Set([]) def start_div(self,attrs): id = [v for k, v in attrs if k == 'id'] classv = [v for k, v in attrs if k == 'class'] if id and id[0] == "contentSub": self.flags.add('content') if classv and classv[0] == "printfooter": self.flags.discard('content') def end_div(self): pass def unknown_starttag(self, tag, attrs): if 'content' in self.flags: sys.stdout.write("<%s>" % tag) if tag in ['p']: self.flags.add(tag) def unknown_endtag(self, tag): if 'content' in 
self.flags: sys.stdout.write("</%s>" % tag) if tag in ['p']: self.flags.discard(tag) def start_h1(self, data): self.flags.add('h1') def end_h1(self): self.flags.discard('h1') def handle_data(self, data): if 'h1' in self.flags: if re.match('^\d', data): if re.match('1', data): print "<hr>" print "<h2 align=\"center\">%s</h2>" % re.sub('\d\. ','',data) else: if 'p' in self.flags: sys.stdout.write(data) def start_a(self, attrs): href = [v for k, v in attrs if k == 'href'] title = [v for k, v in attrs if k == 'title'] if title and re.match('^\d\.',title[0]): if re.match('(\d)\.0?(\d{1,2})+(\.\d)', title[0]): print "%s<br>" % re.sub('(\d)\.0?(\d{1,2})+(\.\d)','<a href="#item\\1.\\2\\3">\\1.\\2\\3</a>)', title[0]) else: print "%s<br>" % re.sub('(\d)\.0?(\d{1,2})+','<a href="#item\\1.\\2">\\1.\\2</a>)', title[0]) self.flags.add('skip_close_a') elif title and re.match('^Edit', title[0]): self.flags.add('skip_close_a') else: if title and href and 'content' in self.flags: self.flags.add('a') sys.stdout.write("<a href=\"%s\" title=\"%s\">" % (href[0], title[0])) def end_a(self): if 'content' in self.flags: self.flags.discard('a') if 'skip_close_a' not in self.flags: sys.stdout.write("</a>") else: self.flags.discard('skip_close_a') def start_ul(self, data): pass def start_li(self, data): pass def end_ul(self): pass def end_li(self): pass def end_div(self): pass response = urllib2.urlopen("http://www.pgsql.cz/index.php/Frequently_Asked_Questions") html = response.read() response.close() print "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\">" print "<html>" print "<head>" print "<title>PostgreSQL FAQ</title>" print "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\" />" print "<meta http-equiv=\"Content-language\" content=\"cs\">" print "<meta name=\"description\" lang=\"en\" content=\"Czech translation of FAQ for PostgreSQL\">" print "<meta name=\"description\" lang=\"cs\" content=\"Český překlad FAQ PostgreSQL\">" print "</head>" print 
"<body bgcolor=\"#ffffff\" text=\"#000000\" link=\"#ff0000\" vlink=\"#a00000\" alink=\"#0000ff\">" print "<h1>Frequently Asked Questions</h1>" parser2 = ParserMainPageProlog() parser2.short_run = 0 parser2.feed(html) parser2.close() parser = ParserMainPage() parser.feed(html) parser.close() print "</body>" print "</htm>"