from datetime import datetime

from bs4 import BeautifulSoup
import bs4.element
import feedparser

from providers.base import BaseProvider
from lib.news import News, NewsImage


class FootMercato(BaseProvider):
    """News provider for www.footmercato.net.

    Builds ``News`` objects from the site's feed and completes them with
    details scraped from the article HTML page.
    """

    DOMAINS = {'www.footmercato.net'}
    CHARSET = 'utf-8'
    IMAGE = 'big-foot-mercato.png'

    @classmethod
    def get_newss_from_source(cls, news_source, data):
        """Yield one ``News`` per entry of the feed contained in *data*.

        Args:
            news_source: Object whose ``sport`` attribute is copied onto
                every yielded news.
            data: Encoded feed document (decoded with ``cls.CHARSET``).

        Yields:
            News: A news item filled with source, sport, title, url,
            description, image, optional tags and publication date.
        """
        xml = data.decode(cls.CHARSET)
        tree = feedparser.parse(xml)
        for item in tree.entries:
            news = News()
            news.source = 'foot-mercato'
            news.sport = news_source.sport
            news.title = item.title
            news.url = item.link
            news.description = item.description

            # An entry may have no enclosure, or an enclosure without a
            # link: fall back to no image URL instead of raising.
            enclosures = item.get('enclosures') or []
            image_url = enclosures[0].get('href') if enclosures else None
            news.image = NewsImage(
                url=image_url,
                title=news.title,
                basename=cls.IMAGE,
                id_news=news.id,
            )

            if hasattr(item, 'tags'):
                news.tags = [tag.term for tag in item.tags]

            # Set current date as pub_date to prevent disorder between id
            # and pub_date
            news.pub_date = datetime.now()
            yield news

    @classmethod
    def get_news_content(cls, news, data):
        """Fill *news* in place with details parsed from the article page.

        Sets ``news.teaser``, ``news.content`` and ``news.author`` when the
        corresponding HTML elements are found; attributes whose element is
        missing are left untouched.

        Args:
            news: News object to complete.
            data: Encoded HTML of the article page; bytes that cannot be
                decoded with ``cls.CHARSET`` are ignored.
        """
        html = data.decode(encoding=cls.CHARSET, errors='ignore')
        soup = BeautifulSoup(html, 'html.parser')

        h2_teaser = soup.find('h2', class_='line h3-like')
        if h2_teaser is not None:
            news.teaser = str(h2_teaser.text).strip()

        div_paraphs = soup.find('div', class_='article-text')
        if div_paraphs is not None:
            # Keep only the direct child tags of the article body, dropping
            # bare text nodes and <script> elements.
            news.content = ''.join(
                str(child)
                for child in div_paraphs.contents
                if isinstance(child, bs4.element.Tag)
                and child.name != 'script'
            )

        div_author = soup.find('div', class_='article-author')
        if div_author is not None:
            span_author = div_author.find('span', 'name')
            # An empty <span> has no children; skip it rather than fail
            # with IndexError.
            if span_author is not None and span_author.contents:
                news.author = str(span_author.contents[0]).strip()