cronpy/providers/footmercato.py

from datetime import datetime

from bs4 import BeautifulSoup
import bs4.element
import feedparser

from providers.base import BaseProvider
from lib.news import News, NewsImage


class FootMercato(BaseProvider):
    """News provider for Foot Mercato (www.footmercato.net).

    Builds ``News`` objects from the entries of a feed and fills in the
    teaser, body and author of a news item from its article HTML page.
    """

    # NOTE(review): presumably consumed by BaseProvider to match URLs to this
    # provider -- confirm against the base class.
    DOMAINS = {'www.footmercato.net'}
    # Encoding used to decode the downloaded payloads.
    CHARSET = 'utf-8'
    # Passed as ``basename`` to every NewsImage created by this provider.
    IMAGE = 'big-foot-mercato.png'

    @classmethod
    def get_newss_from_source(cls, news_source, data):
        """Yield one ``News`` per entry found in the feed payload.

        Args:
            news_source: Object whose ``sport`` attribute is copied onto
                every yielded news item.
            data: Raw feed document as bytes.

        Yields:
            News: Item populated with source, sport, title, url, description,
            image, tags (when the entry has any) and ``pub_date``.

        Raises:
            UnicodeDecodeError: If ``data`` is not valid ``CHARSET`` text.
        """
        xml = data.decode(encoding=cls.CHARSET)
        tree = feedparser.parse(xml)
        for item in tree.entries:
            news = News()
            news.source = 'foot-mercato'
            news.sport = news_source.sport
            news.title = item.title
            news.url = item.link
            news.description = item.description

            # The first enclosure, when present, is used as the item picture.
            enclosures = item.enclosures
            image_url = enclosures[0].href if enclosures else None
            news.image = NewsImage(url=image_url, title=news.title, basename=cls.IMAGE, id_news=news.id)

            if hasattr(item, 'tags'):
                news.tags = [tag.term for tag in item.tags]

            # Set current date as pub_date to prevent disorder between id and pub_date
            news.pub_date = datetime.now()

            yield news

    @classmethod
    def get_news_content(cls, news, data):
        """Fill ``news`` in place with details scraped from its article page.

        Each of ``news.teaser``, ``news.content`` and ``news.author`` is only
        assigned when the matching element exists in the page; otherwise the
        attribute is left untouched.

        Args:
            news: News item to update.
            data: Raw HTML page as bytes; undecodable bytes are dropped.
        """
        html = data.decode(encoding=cls.CHARSET, errors='ignore')
        soup = BeautifulSoup(html, 'html.parser')

        h2_teaser = soup.find('h2', class_='line h3-like')
        if h2_teaser is not None:
            news.teaser = str(h2_teaser.text).strip()

        div_paraphs = soup.find('div', class_='article-text')
        if div_paraphs is not None:
            # Keep only direct child tags (bare text nodes are skipped) and
            # drop <script> elements from the stored markup.
            news.content = ''.join(
                str(child)
                for child in div_paraphs.contents
                if isinstance(child, bs4.element.Tag) and child.name != 'script'
            )

        div_author = soup.find('div', class_='article-author')
        if div_author is not None:
            span_author = div_author.find('span', 'name')
            # An empty <span> has no children: indexing contents[0] would
            # raise IndexError, so the author is simply left unset then.
            if span_author is not None and span_author.contents:
                news.author = str(span_author.contents[0]).strip()
Initial commit 2020-10-03 21:17:53 +00:00			`from datetime import datetime`

			`from bs4 import BeautifulSoup`
			`import bs4.element`
			`import feedparser`

			`from providers.base import BaseProvider`
			`from lib.news import News, NewsImage`


			`class FootMercato(BaseProvider):`

			`DOMAINS = {'www.footmercato.net'}`
			`CHARSET = 'utf-8'`
			`IMAGE = 'big-foot-mercato.png'`

			`@classmethod`
			`def get_newss_from_source(cls, news_source, data):`
			`xml = data.decode()`
			`tree = feedparser.parse(xml)`
			`for item in tree.entries:`
			`news = News()`
			`news.source = 'foot-mercato'`
			`news.sport = news_source.sport`
			`news.title = item.title`
			`news.url = item.link`
			`news.description = item.description`

			`image_url = item.enclosures[0].href if len(item.enclosures) > 0 else None`
			`news.image = NewsImage(url=image_url, title=news.title, basename=cls.IMAGE, id_news=news.id)`

			`if hasattr(item, 'tags'):`
			`news.tags = [tag.term for tag in item.tags]`

			`# Set current date as pub_date to prevent disorder between id and pub_date`
			`news.pub_date = datetime.now()`

			`yield news`

			`@classmethod`
			`def get_news_content(cls, news, data):`
			`html = data.decode(encoding=cls.CHARSET, errors='ignore')`
			`soup = BeautifulSoup(html, 'html.parser')`

			`h2_teaser = soup.find('h2', class_='line h3-like')`
			`if h2_teaser is not None:`
			`news.teaser = str(h2_teaser.text).strip()`

			`div_paraphs = soup.find('div', class_='article-text')`
			`if div_paraphs is not None:`
			`news.content = ''.join(`
			`[str(c) for c in div_paraphs.contents if isinstance(c, bs4.element.Tag) and c.name != 'script']`
			`)`

			`div_author = soup.find('div', class_='article-author')`
			`if div_author is not None:`
			`span_author = div_author.find('span', 'name')`
			`if span_author is not None:`
			`news.author = str(span_author.contents[0]).strip()`