cronpy/providers/lequipe.py

60 lines
1.9 KiB
Python

from datetime import datetime
from html import unescape
from bs4 import BeautifulSoup
import bs4.element
import feedparser
from providers.base import BaseProvider
from lib.news import News, NewsImage
class Lequipe(BaseProvider):
    """Provider that turns L'Equipe feed and article payloads into ``News``.

    ``get_newss_from_source`` builds one ``News`` per feed entry, and
    ``get_news_content`` enriches an existing ``News`` with the teaser, body
    and author scraped from the article page.
    """

    # Domain(s) associated with this provider.
    # NOTE(review): presumably used by BaseProvider for URL dispatch — confirm.
    DOMAINS = {'www.lequipe.fr'}
    # Charset used to decode every raw payload handled by this provider.
    CHARSET = 'utf-8'
    # Passed as ``basename`` to every NewsImage built by this provider.
    IMAGE = 'big-lequipe.png'

    @classmethod
    def get_newss_from_source(cls, news_source, data):
        """Yield one ``News`` for each entry of a raw feed payload.

        Args:
            news_source: object whose ``sport`` attribute is copied onto
                every yielded news.
            data: raw feed document, as bytes.

        Yields:
            A ``News`` filled with source, sport, title, url, description,
            image, tags (when the entry has any) and pub_date.
        """
        # Decode exactly like get_news_content does: use the provider charset
        # and drop undecodable bytes, so that a single malformed byte does not
        # abort the whole feed with a UnicodeDecodeError.
        xml = data.decode(encoding=cls.CHARSET, errors='ignore')
        feed = feedparser.parse(xml)

        for item in feed.entries:
            news = News()
            news.source = 'lequipe'
            news.sport = news_source.sport
            news.title = item.title
            news.url = item.link
            news.description = item.description

            # The first enclosure, when there is one, is used as the image.
            enclosures = item.enclosures
            image_url = enclosures[0].href if enclosures else None
            news.image = NewsImage(
                url=image_url,
                title=news.title,
                basename=cls.IMAGE,
                id_news=news.id,
            )

            if hasattr(item, 'tags'):
                news.tags = [tag.term for tag in item.tags]

            # Set current date as pub_date to prevent disorder between id and
            # pub_date.
            news.pub_date = datetime.now()
            yield news

    @classmethod
    def get_news_content(cls, news, data):
        """Fill ``news`` in place with data scraped from its article page.

        Sets ``news.teaser``, ``news.content`` and ``news.author`` when the
        matching element is found in the page; an attribute whose element is
        missing is left untouched.

        Args:
            news: the ``News`` object to enrich (mutated in place).
            data: raw HTML of the article page, as bytes.
        """
        html = unescape(data.decode(encoding=cls.CHARSET, errors='ignore'))
        soup = BeautifulSoup(html, 'html.parser')

        teaser = soup.find('h2', class_='Article__chapo')
        if teaser is not None:
            news.teaser = teaser.text.strip()

        body = soup.find('div', class_='article__body')
        if body is not None:
            # Keep only the direct child tags of the body (bare text nodes
            # are skipped) and leave out <script> elements.
            news.content = ''.join(
                str(child)
                for child in body.contents
                if isinstance(child, bs4.element.Tag) and child.name != 'script'
            )

        author = soup.find('span', class_='Author__name')
        if author is not None:
            news.author = author.text.strip()