vdmAPI/VDMAPI/HtmlParser.py

87 lines
3.4 KiB
Python

from html.parser import HTMLParser
from html.entities import name2codepoint
class MyHTMLParser(HTMLParser):
    """Collect "VDM" stories from the text nodes of an HTML document.

    Every chunk of text handed to :meth:`handle_data` is stripped of
    surrounding whitespace and kept when it starts with ``Aujourd'hui,``
    and ends with ``VDM``. Matching is done on the text alone; the
    surrounding tags are not inspected.

    Each chunk is tested on its own, so a story whose text is delivered
    in several ``handle_data`` calls (for example because it is
    interrupted by inline markup) is not recognised.

    Usage: call ``feed(html)`` and then read the results with
    :meth:`getText` (or the ``vdm`` attribute).
    """

    # Markers a stripped text chunk must start / end with to be kept.
    _STORY_PREFIX = "Aujourd'hui,"
    _STORY_SUFFIX = "VDM"

    def __init__(self):
        """Initialise the underlying parser with an empty result list."""
        super().__init__()
        # Matching stories, in the order they were encountered.
        self.vdm = []

    def handle_data(self, data: str) -> None:
        """Record *data* if, once stripped, it looks like a story.

        Called by ``HTMLParser`` for text content. Chunks that do not
        match both markers are ignored.
        """
        text = data.strip()
        if text.startswith(self._STORY_PREFIX) and text.endswith(self._STORY_SUFFIX):
            self.vdm.append(text)

    def getText(self) -> list:
        """Return the list of collected stories.

        This is the parser's own list (not a copy), so it keeps growing
        if more data is fed afterwards.
        """
        return self.vdm