Correction du parseur HTML après un changement sur le site

This commit is contained in:
antoine 2017-08-14 21:37:46 +02:00
parent 51d120df26
commit 0c56442ad0
8 changed files with 80 additions and 22 deletions

2
.gitignore vendored
View File

@ -1,2 +1,4 @@
.idea/
.vscode/
*__pycache__/
dist/

View File

@ -5,25 +5,82 @@ class MyHTMLParser(HTMLParser):
def __init__(self):
super(MyHTMLParser, self).__init__()
self.vdm = []
self.save = 0
def _searchClass(self, listTuple, className="post article"):
for v in listTuple:
if v[0] == 'class' and v[1] == className:
return True
return False
# self.article = False
# self.panel = False
# self.panelBody = False
# self.panelContent = False
# self.p = False
# self.a = False
def handle_starttag(self, tag, attrs):
if tag == "p" and self._searchClass(attrs, className="block"):
self.save = 1
# self.skip = []
# @classmethod
# def _hasClass(cls, list_tuple, class_name):
# for name_value in list_tuple:
# array_class = name_value[1].strip().split(" ")
# array_class = [ x.strip() for x in array_class ]
# contain_all = set(array_class).issubset(class_name)
# if name_value[0] == 'class' and contain_all:
# return True
# return False
# def handle_starttag(self, tag, attrs):
# # print(self.get_starttag_text())
# if tag == "article" and self._hasClass(attrs, ["art-panel", "col-xs-12"]):
# self.article = True
# self.skip = []
# else:
# if tag == "div" and self._hasClass(attrs, ["panel", "panel-default"]) and self.article:
# self.panel = True
# else:
# if tag == "div" and self._hasClass(attrs, ["panel-body"]) and self.article and self.panel:
# self.panelBody = True
# else:
# if tag == "div" and self._hasClass(attrs, ["panel-content"]) and self.article and self.panel and self.panelBody:
# self.panelContent = True
# else:
# if tag == "p" and self._hasClass(attrs, ["block", "hidden-xs"]) and self.article and self.panel and self.panelBody and self.panelContent:
# self.p = True
# else:
# if tag == "a" and self.article and self.panel and self.panelBody and self.panelContent and self.p:
# self.a = True
# else:
# self.skip.append(self.get_starttag_text())
# def handle_endtag(self, tag):
# if len(self.skip) == 0:
# if tag == "a" and self.a:
# self.a = False
# else:
# if tag == "p" and self.p:
# self.p = False
# else:
# if tag == "div" and self.panelContent:
# self.panelContent = False
# else:
# if tag == "div" and self.panelBody:
# self.panelBody = False
# else:
# if tag == "div" and self.panel:
# self.panel = False
# else:
# if tag == "article" and self.article:
# self.article = False
# else:
# self.skip.pop()
if tag == "a" and self.save == 1:
self.save = 2
def handle_data(self, data):
if self.save == 2:
# print("#"+data+"#")
# if self.article and self.panel and self.panelBody and self.panelContent and self.p and self.a:
# self.vdm.append(data.strip())
data = data.strip()
if data.startswith("Aujourd'hui,") and data.endswith("VDM"):
self.vdm.append(data.strip())
self.save = 0
def getText(self):
return self.vdm
return self.vdm

View File

@ -32,5 +32,4 @@ class VDM(object):
class errorVDM(Exception):
def __init__(self, message):
super(errorVDM, self).__init__(message)
super(errorVDM, self).__init__(message)

View File

@ -1 +1 @@
version = '1.1'
version = '1.2'

Binary file not shown.

Binary file not shown.

View File

@ -2,6 +2,6 @@
pip uninstall VDM-API
pip wheel --wheel-dir=dist ./
pip install dist/VDM_API-1.1-py3-none-any.whl
pip install dist/VDM_API-1.2-py3-none-any.whl
# python setup.py sdist  -> generates a tar.gz archive installable with pip

View File

@ -5,11 +5,11 @@ VDMAPI
import os
import sys
from setuptools import find_packages, setup
from VDMAPI.version import version
module_path = os.path.join(
os.path.dirname(os.path.realpath(__file__)), 'VDMAPI')
sys.path.insert(0, module_path)
from version import version
sys.path.remove(module_path)
@ -28,8 +28,8 @@ setup(
author_email='antoinroux@hotmail.fr',
description='VDM API: an API for recover random VDM from vdm.fr website',
long_description=read('README.rst'),
url='git://176.189.130.29/python/vdmAPI.git',
download_url='git://176.189.130.29/python/vdmAPI.git/tags',
url='git://antoine-roux.fr.to/python/vdmAPI.git',
download_url='git://antoine-roux.fr.to/python/vdmAPI.git/tags',
license='Beerware',
platforms='any',
packages=find_packages(),