from html.parser import HTMLParser
from html.entities import name2codepoint


class MyHTMLParser(HTMLParser):
    """Collect the text of the first link inside each ``<p class="block">``.

    The parser is a three-state machine driven by ``self.save``:

    * ``0`` - idle, nothing is being tracked;
    * ``1`` - inside a matching ``<p>``, waiting for an ``<a>`` to open;
    * ``2`` - inside that ``<a>``, waiting for its text.

    Captured strings accumulate in ``self.vdm`` in document order.
    """

    def __init__(self):
        super().__init__()
        # Captured link texts, in the order they were encountered.
        self.vdm = []
        # Current state of the capture state machine (see class docstring).
        self.save = 0

    def _searchClass(self, listTuple, className="post article"):
        """Return True if *listTuple* has a ``class`` attribute equal to *className*.

        *listTuple* is the ``(name, value)`` attribute list that ``HTMLParser``
        passes to ``handle_starttag``. The comparison is an exact match on the
        whole attribute string, not a per-token CSS class test.
        """
        return any(
            name == "class" and value == className
            for name, value in listTuple
        )

    def handle_starttag(self, tag, attrs):
        """Advance the state machine on ``<p class="block">`` and ``<a>``."""
        if tag == "p" and self._searchClass(attrs, className="block"):
            self.save = 1
        # Deliberately a second ``if`` (not ``elif``), mirroring the original
        # evaluation order; the two tag tests are mutually exclusive anyway.
        if tag == "a" and self.save == 1:
            self.save = 2

    def handle_endtag(self, tag):
        """Drop any pending capture when its enclosing element closes.

        Without this, a matching paragraph that contains no link (or a link
        that contains no text) would leave the parser armed, and the next
        unrelated ``<a>`` or text node in the document would be captured.
        """
        if tag == "p" or (tag == "a" and self.save == 2):
            self.save = 0

    def handle_data(self, data):
        """Store the first text node seen after the link opened, then go idle."""
        if self.save == 2:
            self.vdm.append(data.strip())
            self.save = 0

    def getText(self):
        """Return the list of captured link texts (the live internal list)."""
        return self.vdm