Generování FAQ z wiki (Python)
Pavouk v Pythonu, který z FAQ na wiki vygeneruje FAQ v html v Bruceově formátu:
## coding: UTF-8
import urllib2
import sgmllib
from sets import Set
import re
import sys
# Parser položky FAQ. Musí odfiltrovat wiki balast .. inside_c,
# přeskočit případně vloženou tabulku s obsahem .. skip_toc
class ParserItem(sgmllib.SGMLParser):
def reset(self):
sgmllib.SGMLParser.reset(self)
self.flags = Set([])
def start_div(self,attrs):
id = [v for k, v in attrs if k == 'id']
classv = [v for k, v in attrs if k == 'class']
if id and id[0] == "contentSub":
self.flags.add('content')
if classv and classv[0] == "printfooter":
self.flags.discard('content')
def end_div(self):
pass
def end_td(self):
if 'content' in self.flags:
self.fags.discard('td')
sys.stdout.write("</td>")
def start_h3(self, data):
if 'content' in self.flags:
self.flags.add('h3')
sys.stdout.write("<h4>")
def end_h3(self):
if 'content' in self.flags:
self.flags.discard('h3')
sys.stdout.write("</h4>")
def start_table(self, attrs):
if 'content' in self.flags:
id = [v for k, v in attrs if k == 'id']
if id and id[0] == "toc":
self.flags.add('skip_toc')
self.flags.discard('content')
else:
print "<table border=\"0\">"
def end_table(self):
if 'skip_toc' in self.flags:
self.flags.discard('skip_toc')
self.flags.add('content')
else:
if 'content' in self.flags:
print "</table>"
# Párové tagy <pre> a <p> nesou zajímavý obsah
def unknown_starttag(self, tag, attrs):
if 'content' in self.flags:
sys.stdout.write("<%s>" % tag)
if tag in ['pre','p','li','td']:
self.flags.add(tag)
def unknown_endtag(self, tag):
if 'content' in self.flags:
print "</%s>" % tag
if tag in ['pre','p','li','td']:
self.flags.discard(tag)
def handle_data(self, data):
if 'content' in self.flags:
if len(Set(['pre','p','li','td']) & self.flags) > 0:
sys.stdout.write(data)
def start_a(self, attrs):
if 'content' in self.flags:
href = [v for k, v in attrs if k == 'href']
title = [v for k, v in attrs if k == 'title']
if title and re.match('^Edit', title[0]):
self.flags.add('skip_close_a')
else:
if title and href:
self.flags.add('a')
sys.stdout.write("<a href=\"%s\" title=\"%s\">" % (href[0], title[0]))
def end_a(self):
if 'content' in self.flags:
self.flags.discard('a')
if 'skip_close_a' in self.flags:
self.flags.discard('skip_close_a')
else:
sys.stdout.write("</a>")
# Parser hlavní stránky. V podstatě jenom filtruje odkazy
# začínající číslicí a skrz regulární výrazy z nich dělá nadpisy.
# Pro každý identifikovaný odkaz spustí načtení detailu FAQ
class ParserMainPage(sgmllib.SGMLParser):
def reset(self):
sgmllib.SGMLParser.reset(self)
self.inside_h1 = 0
def start_h1(self, data):
self.inside_h1 = 1
def end_h1(self):
self.inside_h1 = 0
def handle_data(self, data):
if self.inside_h1:
if re.match('^\d', data):
print "<hr>"
print "<h2 align=\"center\">%s</h2>" % re.sub('\d\. ','',data)
def start_a(self, attrs):
href = [v for k, v in attrs if k == 'href']
title = [v for k, v in attrs if k == 'title']
if title and re.match('^\d\.',title[0]):
if re.match('(\d)\.0?(\d{1,2})+(\.\d)', title[0]):
print "<h3>%s</h3>" % re.sub('(\d)\.0?(\d{1,2})+(\.\d)','<a name="item\\1.\\2\\3">\\1.\\2\\3</a>)', title[0])
else:
print "<h3>%s</h3>" % re.sub('(\d)\.0?(\d{1,2})+','<a name="item\\1.\\2">\\1.\\2</a>)', title[0])
iresp = urllib2.urlopen("http://www.pgsql.cz" + href[0])
item = iresp.read()
iresp.close()
parserItem = ParserItem()
parserItem.feed(item)
parserItem.close()
# Parsuje odstavce v FAQ (metainformace) a seznam otázek
# uvedený na začátku FAQ
class ParserMainPageProlog(sgmllib.SGMLParser):
def reset(self):
sgmllib.SGMLParser.reset(self)
self.flags = Set([])
def start_div(self,attrs):
id = [v for k, v in attrs if k == 'id']
classv = [v for k, v in attrs if k == 'class']
if id and id[0] == "contentSub":
self.flags.add('content')
if classv and classv[0] == "printfooter":
self.flags.discard('content')
def end_div(self):
pass
def unknown_starttag(self, tag, attrs):
if 'content' in self.flags:
sys.stdout.write("<%s>" % tag)
if tag in ['p']:
self.flags.add(tag)
def unknown_endtag(self, tag):
if 'content' in self.flags:
sys.stdout.write("</%s>" % tag)
if tag in ['p']:
self.flags.discard(tag)
def start_h1(self, data):
self.flags.add('h1')
def end_h1(self):
self.flags.discard('h1')
def handle_data(self, data):
if 'h1' in self.flags:
if re.match('^\d', data):
if re.match('1', data):
print "<hr>"
print "<h2 align=\"center\">%s</h2>" % re.sub('\d\. ','',data)
else:
if 'p' in self.flags:
sys.stdout.write(data)
def start_a(self, attrs):
href = [v for k, v in attrs if k == 'href']
title = [v for k, v in attrs if k == 'title']
if title and re.match('^\d\.',title[0]):
if re.match('(\d)\.0?(\d{1,2})+(\.\d)', title[0]):
print "%s<br>" % re.sub('(\d)\.0?(\d{1,2})+(\.\d)','<a href="#item\\1.\\2\\3">\\1.\\2\\3</a>)', title[0])
else:
print "%s<br>" % re.sub('(\d)\.0?(\d{1,2})+','<a href="#item\\1.\\2">\\1.\\2</a>)', title[0])
self.flags.add('skip_close_a')
elif title and re.match('^Edit', title[0]):
self.flags.add('skip_close_a')
else:
if title and href and 'content' in self.flags:
self.flags.add('a')
sys.stdout.write("<a href=\"%s\" title=\"%s\">" % (href[0], title[0]))
def end_a(self):
if 'content' in self.flags:
self.flags.discard('a')
if 'skip_close_a' not in self.flags:
sys.stdout.write("</a>")
else:
self.flags.discard('skip_close_a')
def start_ul(self, data):
pass
def start_li(self, data):
pass
def end_ul(self):
pass
def end_li(self):
pass
def end_div(self):
pass
# Download the FAQ index page from the wiki.  The same HTML is parsed twice
# below: once for the prolog (introduction and question list) and once for
# the questions themselves.
response = urllib2.urlopen("http://www.pgsql.cz/index.php/Frequently_Asked_Questions")
try:
    html = response.read()
finally:
    # Release the connection even when the read fails.
    response.close()

# Static document head.
sys.stdout.write("\n".join((
    "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\">",
    "<html>",
    "<head>",
    "<title>PostgreSQL FAQ</title>",
    "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\" />",
    "<meta http-equiv=\"Content-language\" content=\"cs\">",
    "<meta name=\"description\" lang=\"en\" content=\"Czech translation of FAQ for PostgreSQL\">",
    "<meta name=\"description\" lang=\"cs\" content=\"Český překlad FAQ PostgreSQL\">",
    "</head>",
    "<body bgcolor=\"#ffffff\" text=\"#000000\" link=\"#ff0000\" vlink=\"#a00000\" alink=\"#0000ff\">",
    "<h1>Frequently Asked Questions</h1>",
)) + "\n")

# First pass: introduction, section headings and the list of questions.
parser2 = ParserMainPageProlog()
parser2.feed(html)
parser2.close()

# Second pass: every question heading followed by its downloaded answer.
parser = ParserMainPage()
parser.feed(html)
parser.close()

# Close the document (the original emitted a misspelled "</htm>").
sys.stdout.write("</body>\n")
sys.stdout.write("</html>\n")