from html.parser import HTMLParser
from html.entities import name2codepoint


class MyHTMLParser(HTMLParser):
    """Collect text nodes that look like a story from an HTML page.

    A text node is kept when, after trimming surrounding whitespace, it
    starts with ``"Aujourd'hui,"`` and ends with ``"VDM"``.  Matching is done
    on the text alone; the tags surrounding the text are not inspected.

    Typical use::

        parser = MyHTMLParser()
        parser.feed(html_source)
        parser.close()
        stories = parser.getText()
    """

    # Markers a text node must begin / end with to be collected.
    _PREFIX = "Aujourd'hui,"
    _SUFFIX = "VDM"

    def __init__(self):
        """Initialise the underlying parser and the empty result list."""
        super().__init__()
        # Matching text nodes, in document order.
        self.vdm = []

    def handle_data(self, data: str) -> None:
        """Store *data* if it matches the expected prefix and suffix.

        Called by ``HTMLParser`` for each run of text found between tags.
        The text is stored with leading and trailing whitespace removed.
        """
        text = data.strip()
        if text.startswith(self._PREFIX) and text.endswith(self._SUFFIX):
            self.vdm.append(text)

    def getText(self) -> list:
        """Return the list of collected texts.

        The parser's own list object is returned (not a copy), so texts
        collected by later ``feed`` calls appear in it as well.
        """
        return self.vdm