60 lines
2.0 KiB
Python
60 lines
2.0 KiB
Python
from datetime import datetime
|
|
|
|
from bs4 import BeautifulSoup
|
|
import bs4.element
|
|
import feedparser
|
|
|
|
from providers.base import BaseProvider
|
|
from lib.news import News, NewsImage
|
|
|
|
|
|
class FootMercato(BaseProvider):
    """Provider for footmercato.net: parses its feed and its article pages.

    ``get_newss_from_source`` turns a downloaded feed into ``News`` objects;
    ``get_news_content`` enriches one ``News`` from its downloaded HTML page.
    """

    # Hostnames handled by this provider.
    # NOTE(review): presumably consumed by BaseProvider for URL matching -- confirm.
    DOMAINS = {'www.footmercato.net'}
    # Charset used to decode every downloaded payload.
    CHARSET = 'utf-8'
    # Basename handed to NewsImage for every news item of this provider.
    IMAGE = 'big-foot-mercato.png'

    @classmethod
    def get_newss_from_source(cls, news_source, data):
        """Yield one ``News`` per entry of the feed contained in *data*.

        Args:
            news_source: Object whose ``sport`` attribute is copied onto
                every yielded news.
            data: Raw feed payload as bytes.

        Yields:
            A ``News`` with source, sport, title, url, description and image
            set, tags set when the entry has any, and ``pub_date`` set to the
            current local time.
        """
        # Decode leniently with the provider charset, exactly like
        # get_news_content does, so a single invalid byte does not abort the
        # whole feed with UnicodeDecodeError.
        xml = data.decode(encoding=cls.CHARSET, errors='ignore')
        tree = feedparser.parse(xml)
        for item in tree.entries:
            news = News()
            news.source = 'foot-mercato'
            news.sport = news_source.sport
            news.title = item.title
            news.url = item.link
            news.description = item.description

            # The first enclosure, when present, is used as the news image.
            enclosures = item.enclosures
            image_url = enclosures[0].href if enclosures else None
            news.image = NewsImage(url=image_url, title=news.title, basename=cls.IMAGE, id_news=news.id)

            if hasattr(item, 'tags'):
                news.tags = [tag.term for tag in item.tags]

            # Set current date as pub_date to prevent disorder between id and pub_date
            news.pub_date = datetime.now()

            yield news

    @classmethod
    def get_news_content(cls, news, data):
        """Fill ``teaser``, ``content`` and ``author`` of *news* from its page.

        Each attribute is only assigned when the matching element is found in
        the page; otherwise it is left untouched. *news* is mutated in place
        and nothing is returned.

        Args:
            news: The news object to enrich.
            data: Raw HTML payload of the article page as bytes.
        """
        html = data.decode(encoding=cls.CHARSET, errors='ignore')
        soup = BeautifulSoup(html, 'html.parser')

        # A multi-word class_ string is matched against the exact value of
        # the class attribute, so both classes must appear in this order.
        h2_teaser = soup.find('h2', class_='line h3-like')
        if h2_teaser is not None:
            news.teaser = str(h2_teaser.text).strip()

        div_paraphs = soup.find('div', class_='article-text')
        if div_paraphs is not None:
            # Keep only direct child tags (bare text nodes are dropped) and
            # leave out <script> elements.
            news.content = ''.join(
                str(child)
                for child in div_paraphs.contents
                if isinstance(child, bs4.element.Tag) and child.name != 'script'
            )

        div_author = soup.find('div', class_='article-author')
        if div_author is not None:
            span_author = div_author.find('span', 'name')
            # An empty <span class="name"></span> has no children; skip it
            # instead of raising IndexError on contents[0].
            if span_author is not None and span_author.contents:
                news.author = str(span_author.contents[0]).strip()
|