cronpy/providers/lequipe.py

60 lines
1.9 KiB
Python
Raw Permalink Normal View History

2020-10-03 21:17:53 +00:00
from datetime import datetime
from html import unescape
from bs4 import BeautifulSoup
import bs4.element
import feedparser
from providers.base import BaseProvider
from lib.news import News, NewsImage
class Lequipe(BaseProvider):
    """News provider for L'Equipe (www.lequipe.fr).

    Builds ``News`` objects from the site's feed and fills in the teaser,
    body and author of a news item from the HTML of its article page.
    """

    # Domain names associated with this provider (presumably used by the
    # caller to pick a provider for a URL -- confirm against BaseProvider).
    DOMAINS = {'www.lequipe.fr'}
    # Encoding used to decode the raw bytes fetched from the site.
    CHARSET = 'utf-8'
    # Basename handed to ``NewsImage`` for every news item of this provider.
    IMAGE = 'big-lequipe.png'

    @classmethod
    def get_newss_from_source(cls, news_source, data):
        """Yield one ``News`` per entry of the feed contained in *data*.

        Args:
            news_source: Object whose ``sport`` attribute is copied onto
                every yielded news item.
            data: Raw feed document, as bytes.

        Yields:
            A ``News`` with source, sport, title, url, description, image,
            tags (when the entry has any) and ``pub_date`` set.
        """
        # Decode exactly like get_news_content does: honour CHARSET and drop
        # undecodable bytes, so one bad byte does not abort the whole feed.
        xml = data.decode(encoding=cls.CHARSET, errors='ignore')
        tree = feedparser.parse(xml)
        for item in tree.entries:
            news = News()
            news.source = 'lequipe'
            news.sport = news_source.sport
            news.title = item.title
            news.url = item.link
            news.description = item.description
            # The first enclosure, when there is one, is used as the image.
            image_url = item.enclosures[0].href if item.enclosures else None
            news.image = NewsImage(
                url=image_url,
                title=news.title,
                basename=cls.IMAGE,
                id_news=news.id,
            )
            if hasattr(item, 'tags'):
                news.tags = [tag.term for tag in item.tags]
            # Set current date as pub_date to prevent disorder between id
            # and pub_date
            news.pub_date = datetime.now()
            yield news

    @classmethod
    def get_news_content(cls, news, data):
        """Fill *news* with the teaser, content and author found in *data*.

        Each attribute is only assigned when the matching element exists in
        the page; otherwise it is left untouched.

        Args:
            news: News item to update in place.
            data: Raw HTML of the article page, as bytes.
        """
        # NOTE(review): entities are unescaped *before* parsing, so escaped
        # markup in the page (e.g. "&lt;b&gt;") becomes real tags. Kept as-is
        # because it also repairs double-escaped text -- confirm it is wanted.
        html = unescape(data.decode(encoding=cls.CHARSET, errors='ignore'))
        soup = BeautifulSoup(html, 'html.parser')

        div_teaser = soup.find('h2', class_='Article__chapo')
        if div_teaser is not None:
            news.teaser = div_teaser.text.strip()

        div_paraphs = soup.find('div', class_='article__body')
        if div_paraphs is not None:
            # Keep only direct child tags (bare text nodes are skipped) and
            # leave out <script> elements.
            news.content = ''.join(
                str(child)
                for child in div_paraphs.contents
                if isinstance(child, bs4.element.Tag) and child.name != 'script'
            )

        div_author = soup.find('span', class_='Author__name')
        if div_author is not None:
            news.author = div_author.text.strip()