26 lines
764 B
Python
26 lines
764 B
Python
from html.parser import HTMLParser
|
|
from html.entities import name2codepoint
|
|
|
|
class MyHTMLParser(HTMLParser):
    """Collect the first text chunk of every ``<p class="content">`` element.

    Feed HTML with ``feed()`` and read the captured strings with
    ``getText()``. For each matching paragraph only the first text node
    reported by the parser is stored; later text in the same paragraph
    (for example after a nested tag) is ignored.
    """

    def __init__(self):
        super().__init__()
        # Captured text fragments, in document order.
        self.vdm = []
        # True from a matching <p> start tag until its first text chunk
        # has been stored or the paragraph is closed.
        self.save = False

    def _searchClass(self, listTuple, className="post article"):
        """Return True if the attributes carry the requested CSS class(es).

        Args:
            listTuple: iterable of ``(name, value)`` attribute pairs, as
                passed by ``HTMLParser`` to ``handle_starttag``.
            className: class string to look for. It may contain several
                whitespace-separated class names, all of which must be
                present on the element.

        Returns:
            True when a ``class`` attribute equals ``className`` exactly,
            or contains every class name listed in ``className`` (in any
            order, possibly among other classes); False otherwise.
        """
        wanted = set(className.split()) if className else set()
        for name, value in listTuple:
            if name != 'class':
                continue
            # Exact comparison first: keeps the historical behaviour,
            # including the empty-string and valueless-attribute cases.
            if value == className:
                return True
            # HTML class attributes are whitespace-separated token sets,
            # so "content big" must still count as having "content".
            if value is not None and wanted and wanted <= set(value.split()):
                return True
        return False

    def handle_starttag(self, tag, attrs):
        """Arm capturing when a ``<p>`` with class ``content`` opens."""
        if tag == "p" and self._searchClass(attrs, className="content"):
            self.save = True

    def handle_endtag(self, tag):
        """Disarm capturing when a paragraph closes.

        Without this, an empty matching paragraph would leave the flag
        set and the next unrelated text node in the document would be
        captured by mistake.
        """
        if tag == "p":
            self.save = False

    def handle_data(self, data):
        """Store ``data`` if a matching paragraph is waiting for its text."""
        if self.save:
            self.vdm.append(data)
            # Only the first text chunk of each paragraph is kept.
            self.save = False

    def getText(self):
        """Return the list of captured text fragments (the live list)."""
        return self.vdm