from html.parser import HTMLParser
from html.entities import name2codepoint
class MyHTMLParser(HTMLParser):
    """Collect text found inside ``<p class="content">`` elements.

    For every ``<p>`` start tag whose ``class`` attribute is exactly
    ``"content"``, the first text chunk reported by the parser afterwards
    is appended to ``self.vdm``. Only that first chunk is kept; text that
    follows a nested tag inside the same paragraph is not collected.
    """

    def __init__(self):
        super().__init__()
        # Captured text chunks, in document order.
        self.vdm = []
        # True while the next text chunk should be captured.
        self.save = False

    def _searchClass(self, listTuple, className="post article"):
        """Return True if *listTuple* has a ``class`` attribute equal to *className*.

        Args:
            listTuple: Iterable of ``(name, value)`` attribute pairs, as
                passed to ``handle_starttag``.
            className: Exact attribute value to look for. The comparison is
                a plain string equality, not a per-token match.
        """
        return any(
            name == 'class' and value == className
            for name, value in listTuple
        )

    def handle_starttag(self, tag, attrs):
        """Arm capturing when a ``<p class="content">`` start tag is seen."""
        if tag == "p" and self._searchClass(attrs, className="content"):
            self.save = True

    def handle_endtag(self, tag):
        """Disarm capturing when a paragraph closes.

        Without this, an empty ``<p class="content"></p>`` would leave
        ``self.save`` set and the next unrelated text node in the document
        would be captured by mistake.
        """
        if tag == "p":
            self.save = False

    def handle_data(self, data):
        """Store *data* if capturing is armed, then disarm."""
        if self.save:
            self.vdm.append(data)
            # Only the first text chunk after the start tag is kept.
            self.save = False

    def getText(self):
        """Return the list of captured text chunks (the live list, not a copy)."""
        return self.vdm