Generování FAQ z wiki (Python)

Z PostgreSQL
Skočit na navigaci Skočit na vyhledávání

Pavouk v Pythonu, který z FAQ na wiki vygeneruje FAQ v HTML v Bruceově formátu:

## coding: UTF-8
import urllib2
import sgmllib
from sets import Set
import re
import sys

# Parser položky FAQ. Musí odfiltrovat wiki balast .. inside_c,
# přeskočit případně vloženou tabulku s obsahem .. skip_toc
class ParserItem(sgmllib.SGMLParser):
    def reset(self):
        sgmllib.SGMLParser.reset(self)
        self.flags = Set([])

    def start_div(self,attrs):
        id = [v for k, v in attrs if k == 'id']
        classv = [v for k, v in attrs if k == 'class']
        if id and id[0] == "contentSub":
            self.flags.add('content')
        if classv and classv[0] == "printfooter":
            self.flags.discard('content')

    def end_div(self):
        pass

    def end_td(self):
        if 'content' in self.flags:
            self.fags.discard('td')
            sys.stdout.write("</td>")

    def start_h3(self, data):
        if 'content' in self.flags:
            self.flags.add('h3')
            sys.stdout.write("<h4>")

    def end_h3(self):
        if 'content' in self.flags:
            self.flags.discard('h3')
            sys.stdout.write("</h4>")

    def start_table(self, attrs):
        if 'content' in self.flags:
            id = [v for k, v in attrs if k == 'id']
            if id and id[0] == "toc":
                self.flags.add('skip_toc')
                self.flags.discard('content')
            else:
                print "<table border=\"0\">"
            
    def end_table(self):
        if 'skip_toc' in self.flags:
            self.flags.discard('skip_toc')
            self.flags.add('content')
        else:
            if 'content' in self.flags:
                print "</table>"
    
    # Párové tagy <pre> a <p> nesou zajímavý obsah 
    def unknown_starttag(self, tag, attrs):
        if 'content' in self.flags:
            sys.stdout.write("<%s>" % tag)
            if tag in ['pre','p','li','td']:
                self.flags.add(tag)

    def unknown_endtag(self, tag):         
        if 'content' in self.flags:
            print "</%s>" % tag
            if tag in ['pre','p','li','td']:
                self.flags.discard(tag)

    def handle_data(self, data):
        if 'content' in self.flags:
            if len(Set(['pre','p','li','td']) & self.flags) > 0:
                sys.stdout.write(data)

    def start_a(self, attrs):
        if 'content' in self.flags:
            href = [v for k, v in attrs if k == 'href']
            title = [v for k, v in attrs if k == 'title']
            if title and re.match('^Edit', title[0]):
                self.flags.add('skip_close_a')
            else:
                if title and href:
                    self.flags.add('a')
                    sys.stdout.write("<a href=\"%s\" title=\"%s\">" % (href[0], title[0]))

    def end_a(self):
        if 'content' in self.flags:
            self.flags.discard('a')
            if 'skip_close_a' in self.flags: 
                self.flags.discard('skip_close_a')
            else:
                sys.stdout.write("</a>")


# Parser hlavní stránky. V podstatě jenom filtruje odkazy
# začínající číslicí a skrz regulární výrazy z nich dělá nadpisy.
# Pro každý identifikovaný odkaz spustí načtení detailu FAQ
class ParserMainPage(sgmllib.SGMLParser):
    def reset(self):
        sgmllib.SGMLParser.reset(self)
        self.inside_h1 = 0

    def start_h1(self, data):
        self.inside_h1 = 1

    def end_h1(self):
        self.inside_h1 = 0

    def handle_data(self, data):
        if self.inside_h1:
            if re.match('^\d', data):
                print "<hr>"
                print "<h2 align=\"center\">%s</h2>" % re.sub('\d\. ','',data)

    def start_a(self, attrs):
        href = [v for k, v in attrs if k == 'href']
        title = [v for k, v in  attrs if k == 'title']
        if title and re.match('^\d\.',title[0]):
            if re.match('(\d)\.0?(\d{1,2})+(\.\d)', title[0]):
                print "<h3>%s</h3>" % re.sub('(\d)\.0?(\d{1,2})+(\.\d)','<a name="item\\1.\\2\\3">\\1.\\2\\3</a>)', title[0])
            else:
                print "<h3>%s</h3>" % re.sub('(\d)\.0?(\d{1,2})+','<a name="item\\1.\\2">\\1.\\2</a>)', title[0])
            iresp = urllib2.urlopen("http://www.pgsql.cz" + href[0])
            item = iresp.read()
            iresp.close()
            parserItem = ParserItem()
            parserItem.feed(item)
            parserItem.close()

# Parsuje odstavce v FAQ (metainformace) a seznam otázek
# uvedený na začátku FAQ
class ParserMainPageProlog(sgmllib.SGMLParser):
    def reset(self):
        sgmllib.SGMLParser.reset(self)
        self.flags = Set([])

    def start_div(self,attrs):
        id = [v for k, v in attrs if k == 'id']
        classv = [v for k, v in attrs if k == 'class']
        if id and id[0] == "contentSub":
            self.flags.add('content')
        if classv and classv[0] == "printfooter":
            self.flags.discard('content')

    def end_div(self):
        pass

    def unknown_starttag(self, tag, attrs):
        if 'content' in self.flags:
            sys.stdout.write("<%s>" % tag)
            if tag in ['p']:
                self.flags.add(tag)

    def unknown_endtag(self, tag):         
        if 'content' in self.flags:
            sys.stdout.write("</%s>" % tag)
            if tag in ['p']:
                self.flags.discard(tag)

    def start_h1(self, data):
        self.flags.add('h1')

    def end_h1(self):
        self.flags.discard('h1')

    def handle_data(self, data):
        if 'h1' in self.flags:
            if re.match('^\d', data):
                if re.match('1', data):
                    print "<hr>"
                print "<h2 align=\"center\">%s</h2>" % re.sub('\d\. ','',data)
        else:
            if 'p' in self.flags:
                sys.stdout.write(data)

    def start_a(self, attrs):
        href = [v for k, v in attrs if k == 'href']
        title = [v for k, v in  attrs if k == 'title']
        if title and re.match('^\d\.',title[0]):
            if re.match('(\d)\.0?(\d{1,2})+(\.\d)', title[0]):
                print "%s<br>" % re.sub('(\d)\.0?(\d{1,2})+(\.\d)','<a href="#item\\1.\\2\\3">\\1.\\2\\3</a>)', title[0])
            else:
                print "%s<br>" % re.sub('(\d)\.0?(\d{1,2})+','<a href="#item\\1.\\2">\\1.\\2</a>)', title[0])
            self.flags.add('skip_close_a')
        elif title and re.match('^Edit', title[0]):
            self.flags.add('skip_close_a')
        else:
            if title and href and 'content' in self.flags:
                self.flags.add('a')
                sys.stdout.write("<a href=\"%s\" title=\"%s\">" % (href[0], title[0]))

    def end_a(self):
        if 'content' in self.flags:
            self.flags.discard('a')
            if 'skip_close_a' not in self.flags:
                sys.stdout.write("</a>")
            else:
                self.flags.discard('skip_close_a')

    def start_ul(self, data):
        pass

    def start_li(self, data):
        pass

    def end_ul(self):
        pass

    def end_li(self):
        pass 

    def end_div(self):
        pass


# Fetch the FAQ main page once; both parsing passes below reuse the HTML.
response = urllib2.urlopen("http://www.pgsql.cz/index.php/Frequently_Asked_Questions")
try:
    html = response.read()
finally:
    # Close the connection even when read() fails.
    response.close()

# Static document head.
print("<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\">")
print("<html>")
print("<head>")
print("<title>PostgreSQL FAQ</title>")
print("<meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\" />")
print("<meta http-equiv=\"Content-language\" content=\"cs\">")
print("<meta name=\"description\" lang=\"en\" content=\"Czech translation of FAQ for PostgreSQL\">")
print("<meta name=\"description\" lang=\"cs\" content=\"Český překlad FAQ PostgreSQL\">")
print("</head>")
print("<body bgcolor=\"#ffffff\" text=\"#000000\" link=\"#ff0000\" vlink=\"#a00000\" alink=\"#0000ff\">")
print("<h1>Frequently Asked Questions</h1>")

# First pass: intro paragraphs and the index of questions.
parser2 = ParserMainPageProlog()
# NOTE(review): nothing visible in this file reads short_run -- confirm
# whether it can be dropped.
parser2.short_run = 0
parser2.feed(html)
parser2.close()

# Second pass: section headings and the body of every FAQ item.
parser = ParserMainPage()
parser.feed(html)
parser.close()

print("</body>")
# The original emitted the misspelled closing tag "</htm>".
print("</html>")