|
|
@@ -5,25 +5,82 @@ class MyHTMLParser(HTMLParser): |
|
|
|
def __init__(self):
    """Set up the base parser and the extraction state."""
    super(MyHTMLParser, self).__init__()
    # Text snippets collected by handle_data(); returned by getText().
    self.vdm = []
    # Capture state used by handle_starttag()/handle_data():
    #   0 -> idle, 1 -> matching <p> seen, 2 -> <a> seen after that <p>.
    self.save = 0
|
|
|
|
|
|
|
def _searchClass(self, listTuple, className="post article"):
    """Tell whether a tag's attributes carry exactly the given class value.

    Args:
        listTuple: Iterable of ``(name, value)`` attribute pairs.
        className: Class string to look for. The comparison is a plain
            string equality against the whole attribute value, so
            ``"block"`` does not match ``"block hidden-xs"``.

    Returns:
        True if some pair is ``("class", className)``, otherwise False.
    """
    return any(
        pair[0] == 'class' and pair[1] == className
        for pair in listTuple
    )
|
|
|
# self.article = False |
|
|
|
# self.panel = False |
|
|
|
# self.panelBody = False |
|
|
|
# self.panelContent = False |
|
|
|
# self.p = False |
|
|
|
# self.a = False |
|
|
|
|
|
|
|
def handle_starttag(self, tag, attrs):
    """Advance the capture state when a relevant opening tag is seen.

    A ``<p>`` whose class attribute is exactly ``"block"`` arms the parser
    (``save = 1``). An ``<a>`` seen while armed switches it to capturing
    (``save = 2``), which handle_data() then acts upon.

    Args:
        tag: Name of the opening tag.
        attrs: List of ``(name, value)`` attribute pairs of the tag.
    """
    if tag == "p" and self._searchClass(attrs, className="block"):
        self.save = 1
    elif tag == "a" and self.save == 1:
        # Only an <a> that follows a matching <p> starts the capture.
        self.save = 2
|
|
|
|
|
|
|
def handle_data(self, data):
    """Store a text node if the parser is in the capturing state.

    Text is only considered while ``self.save == 2``. It is stripped of
    surrounding whitespace and kept when it starts with ``"Aujourd'hui,"``
    and ends with ``"VDM"``.

    Args:
        data: Raw text content handed over by the parser.
    """
    if self.save != 2:
        return

    text = data.strip()
    if text.startswith("Aujourd'hui,") and text.endswith("VDM"):
        self.vdm.append(text)
        # NOTE(review): the original indentation was lost, so it is unclear
        # whether this reset belongs inside the match branch or should run
        # for every text node seen while capturing -- confirm.
        self.save = 0
|
|
|
|
|
|
|
|
|
|
|
def getText(self):
    """Return the list of text snippets collected so far.

    The list object itself is returned, not a copy.
    """
    return self.vdm