87 lines
3.4 KiB
Python
87 lines
3.4 KiB
Python
from html.parser import HTMLParser
|
|
from html.entities import name2codepoint
|
|
|
|
class MyHTMLParser(HTMLParser):
    """Collect "VDM" anecdotes from the text nodes of an HTML document.

    Feed markup with ``feed()``; every text node that, once stripped of
    surrounding whitespace, starts with ``VDM_PREFIX`` and ends with
    ``VDM_SUFFIX`` is appended to ``self.vdm``. Retrieve the collected
    strings with ``getText()``.
    """

    # Markers that delimit an anecdote. Kept as class attributes so a
    # subclass can override them without re-implementing handle_data().
    VDM_PREFIX = "Aujourd'hui,"
    VDM_SUFFIX = "VDM"

    def __init__(self):
        """Initialise the underlying parser and the empty result list."""
        super().__init__()
        # Matching text nodes, stripped, in the order they were encountered.
        self.vdm = []

    # NOTE: a previous, disabled implementation tracked the nesting of
    # <article class="art-panel col-xs-12"> / <div class="panel ..."> /
    # <div class="panel-body"> / <div class="panel-content"> /
    # <p class="block hidden-xs"> / <a> through handle_starttag() and
    # handle_endtag() and only collected text found inside that chain.
    # It was replaced by the prefix/suffix check in handle_data() below.

    def handle_data(self, data):
        """Record *data* when it looks like an anecdote.

        Args:
            data: Raw text of a text node, as passed in by ``HTMLParser``.
        """
        text = data.strip()
        if text.startswith(self.VDM_PREFIX) and text.endswith(self.VDM_SUFFIX):
            self.vdm.append(text)

    def getText(self):
        """Return the list of anecdotes collected so far.

        The returned object is the parser's own list (not a copy), so it
        keeps growing if more markup is fed afterwards.
        """
        return self.vdm