29 lines
825 B
Python
29 lines
825 B
Python
from html.parser import HTMLParser
|
|
from html.entities import name2codepoint
|
|
|
|
class MyHTMLParser(HTMLParser):
    """Collect the text of the first link inside each ``<p class="block">``.

    The parser is a three-state machine driven by ``self.save``:

    * ``0`` - idle, nothing is being tracked;
    * ``1`` - inside a ``<p>`` whose ``class`` attribute equals ``"block"``;
    * ``2`` - inside an ``<a>`` opened while in state 1; the next text
      node is stored (stripped) in ``self.vdm`` and the state returns to 0.

    Closing tags reset the state, so a paragraph without a link or an
    empty link cannot cause unrelated text later in the document to be
    captured.
    """

    # Values stored in ``self.save``; kept as plain ints so external code
    # that inspects the attribute keeps seeing 0 / 1 / 2.
    _IDLE = 0
    _IN_BLOCK = 1
    _IN_LINK = 2

    def __init__(self):
        super().__init__()
        self.vdm = []  # captured link texts, in document order
        self.save = self._IDLE

    def _searchClass(self, listTuple, className="post article"):
        """Return True if *listTuple* has a ``class`` attribute equal to *className*.

        *listTuple* is the ``(name, value)`` attribute list that
        ``HTMLParser`` passes to ``handle_starttag``.  The comparison is an
        exact string match on the whole attribute value, not a per-token
        match.
        """
        return any(
            attr[0] == 'class' and attr[1] == className
            for attr in listTuple
        )

    def handle_starttag(self, tag, attrs):
        """Advance the state machine on ``<p class="block">`` and ``<a>``."""
        if tag == "p" and self._searchClass(attrs, className="block"):
            self.save = self._IN_BLOCK

        if tag == "a" and self.save == self._IN_BLOCK:
            self.save = self._IN_LINK

    def handle_endtag(self, tag):
        """Drop stale state when the tracked element closes."""
        if tag == "p":
            # Leaving the paragraph: a later <a> or text node must not be
            # attributed to it.
            self.save = self._IDLE
        elif tag == "a" and self.save == self._IN_LINK:
            # The link closed without any text; keep looking for another
            # link inside the same paragraph.
            self.save = self._IN_BLOCK

    def handle_data(self, data):
        """Store the first text node seen inside the tracked link."""
        if self.save == self._IN_LINK:
            self.vdm.append(data.strip())
            # Only one text node is captured per paragraph.
            self.save = self._IDLE

    def getText(self):
        """Return the list of captured link texts."""
        return self.vdm