cronpy/providers/eurosport.py

371 lines
17 KiB
Python
Raw Permalink Normal View History

2020-10-03 21:17:53 +00:00
from datetime import datetime, timedelta
from urllib.parse import urljoin
import json
import re
from bs4 import BeautifulSoup
import bs4.element
import feedparser
from lib.news import News, NewsImage
from providers.base import BaseProvider
from lib.match import Comm, Match
from lib.country import Country
from lib.league import Group
from lib.team import Team
class Eurosport(BaseProvider):
    """Provider scraping Eurosport / Rugbyrama pages, feeds and web-api JSON."""

    # Host names associated with this provider
    # (NOTE(review): presumably used by BaseProvider to route URLs - confirm).
    DOMAINS = {
        'www.eurosport.fr',
        'www.rugbyrama.fr',
        'video.eurosport.fr',
        'video.rugbyrama.fr',
        'web-api.eurosport.com',
    }
    # Encoding used to decode the downloaded payloads
    CHARSET = 'utf-8'
    # Image basename handed to NewsImage for the news built by this provider
    IMAGE = 'big-eurosport.png'
@classmethod
def get_match_info(cls, match, data):
    """
    Update ``match`` in place from a downloaded Eurosport payload.

    When ``match.json_parser`` is truthy the payload is read as the JSON match
    header (players, date, set scores, status). Otherwise it is read as the
    HTML live page (team names, start date, score, minute, live comments).

    :param match: match object to update; its names are checked against the payload
    :param data: raw response body (bytes), decoded with ``cls.CHARSET``
    :raises NameError: when an expected element is missing from the page or the
        names in the payload differ from the ones stored on ``match``
    """
    html = data.decode(cls.CHARSET)

    # Get score in json
    if match.json_parser:
        payload = json.loads(html)['match']
        player_home = payload['players'][0]
        player_away = payload['players'][1]
        if match.home.name != '{} {}'.format(player_home['firstname'], player_home['lastname']):
            raise NameError('home name does not match')
        if match.away.name != '{} {}'.format(player_away['firstname'], player_away['lastname']):
            raise NameError('away name does not match')

        match_datetime = '{} {}'.format(payload['date']['date'], payload['date']['time'])
        match.start_date = datetime.strptime(match_datetime, '%Y-%m-%d %H:%M')

        if 'score' in payload:
            first, second = payload['score'][0], payload['score'][1]
            # The score entries are keyed by player id, not ordered home/away
            sets_by_player = {
                first['playerid']: first['sets'],
                second['playerid']: second['sets'],
            }
            match.score_sets = {
                'home': sets_by_player[player_home['id']],
                'away': sets_by_player[player_away['id']],
            }
        # NOTE(review): the status is treated as independent of the score block - confirm
        if 'name' in payload['status']:
            match.minute = payload['status']['name']
        return

    # Get all data in html
    soup = BeautifulSoup(html, 'html.parser')
    div_match = soup.find(id='livehero')
    if div_match is None:
        raise NameError('div livehero not found')

    # Check team names
    div_teams = div_match.find_all('div', class_='heromatch__team-name')
    if len(div_teams) != 2:
        raise NameError('divs team not found')
    link_home, link_away = div_teams[0].a, div_teams[1].a
    # A team div without a link is reported like any other missing team
    # instead of failing with an AttributeError on None
    if link_home is None or link_away is None or not link_home.text or not link_away.text:
        raise NameError('divs team not found')
    if link_home.text.strip() != match.home.names['eurosport']:
        raise NameError('home name does not match')
    if link_away.text.strip() != match.away.names['eurosport']:
        raise NameError('away name does not match')

    # Check start_date
    div_date = div_match.find('div', class_='heromatch__date')
    if div_date is None:
        raise NameError('div date not found')
    div_time = div_match.find('div', class_='heromatch__time')
    if div_time is None:
        raise NameError('div time not found')
    match.start_date = datetime.strptime(
        '{} {}:00'.format(div_date.text.strip(), div_time.text.strip()), '%d/%m/%y %H:%M:%S'
    )

    # Get score (left untouched while the page does not show a number)
    div_scores = div_match.find_all('div', class_='heromatch__score')
    if len(div_scores) < 2:
        raise NameError('divs score not found')
    score_home = div_scores[0].text.strip()
    score_away = div_scores[1].text.strip()
    if score_home.isnumeric():
        match.score_home = int(score_home)
    if score_away.isnumeric():
        match.score_away = int(score_away)

    # Get minute, falling back on the status div when the minute is missing or empty
    div_minute = div_match.find('div', class_='heromatch__minute')
    if div_minute is None or not div_minute.text:
        div_minute = div_match.find('div', class_='heromatch__status')
    if div_minute is None:
        raise NameError('div minute not found')
    match.minute = div_minute.text.strip().lower()

    # Get live comments
    div_comms = soup.find(class_='live_comments_v8_5_bis')
    if div_comms is None:
        return
    for article_comm in div_comms.find_all('article'):
        comm = Comm(type_='', minute='', text='')

        left_col = article_comm.find(class_='left-col')
        left_span = left_col.find('span') if left_col is not None else None
        # Only a leading text node is used as the minute
        if left_span is not None and left_span.contents and isinstance(left_span.contents[0], str):
            comm.minute = left_span.contents[0].strip()

        right_col = article_comm.find(class_='right-col')
        right_p = right_col.find('p') if right_col is not None else None
        if right_p is not None:
            comm.text = right_p.text.strip()

        # Comments without text are dropped
        if not comm.text:
            continue
        # Fully upper-case comments are emphasised
        if comm.text == comm.text.upper():
            comm.text = '<strong>{}</strong>'.format(comm.text)
        match.comms.append(comm)
@classmethod
def get_match_comms(cls, match, data):
    """
    Append the live comments found in a JSON payload to ``match.comms``.

    Entries of ``livecomments`` lacking either the ``marker`` or the ``text``
    key are skipped. A payload decoding to ``null`` is ignored.

    :param match: match object whose ``comms`` list is extended
    :param data: raw JSON response body (bytes), decoded with ``cls.CHARSET``
    """
    payload = json.loads(data.decode(cls.CHARSET))
    if payload is None:
        return
    for entry in payload['livecomments']:
        if 'marker' in entry and 'text' in entry:
            match.comms.append(
                Comm(minute=entry['marker'], type_='', text=entry['text'])
            )
@classmethod
def get_team_staff(cls, data):
    """
    Extract the squad listed on a team page, grouped by role.

    Every ``<li>`` of the page is visited in order. Items carrying the
    ``team_global_title`` class are section headings; the link text of all
    other items is collected as a player name.

    :param data: raw HTML response body (bytes), decoded with ``cls.CHARSET``
    :return: dict with the keys ``goalkeepers``, ``defenders``, ``midfielders``
        and ``attackers`` for each section whose closing heading was met
    """
    soup = BeautifulSoup(data.decode(cls.CHARSET), 'html.parser')

    # A heading closes the section listed before it, so each heading maps to
    # the role of the names collected so far (presumably the page lists the
    # goalkeepers first - verify against a live page).
    role_closed_by = {
        'Défenseur(s)': 'goalkeepers',
        'Milieu(x)': 'defenders',
        'Attaquant(s)': 'midfielders',
        'Entraîneur': 'attackers',
    }

    staff = dict()
    collected = list()
    for item in soup.find_all('li'):
        if 'team_global_title' not in item.attrs.get('class', ()):
            # Non-breaking spaces in the name are turned into plain spaces
            collected.append(item.find('a').text.replace('\xa0', ' '))
            continue

        heading = item.text
        role = role_closed_by.get(heading)
        if role is not None:
            staff[role] = collected
        # Nothing is read past the coach heading
        if heading == 'Entraîneur':
            break
        collected = list()
    return staff
@classmethod
def get_league_ranking(cls, league, data):
    """
    List the ranking groups advertised on a league standing page.

    Each ``tab-content`` div of the standing block yields one ``Group``. A tab
    with a ``data-ajax-url`` attribute is named after its navigation tab label
    and points to that URL (resolved against ``league.url``). A tab without it
    becomes the group ``'0'`` pointing to ``league.url`` itself.

    :param league: league the groups belong to
    :param data: raw HTML response body (bytes), decoded with ``cls.CHARSET``
    :return: list of ``Group`` objects, in page order
    """
    soup = BeautifulSoup(data.decode(cls.CHARSET), 'html.parser')
    standing = soup.find('div', class_='standing_v8_5')

    groups = list()
    for tab in standing.find_all('div', class_='tab-content'):
        if 'data-ajax-url' not in tab.attrs:
            groups.append(Group(name='0', url=league.url, league=league))
            continue

        ajax_url = tab.attrs['data-ajax-url']
        # The second '_'-separated part of the content id is the CSS class of the tab link
        tab_class = tab.attrs['data-navtab-content-id'].split('_')[1]
        tab_link = soup.find('a', class_=tab_class)
        label = tab_link.find('span', class_='navtab-label').text.strip()
        groups.append(Group(name=label, url=urljoin(league.url, ajax_url), league=league))
    return groups
@classmethod
def get_group_ranking(cls, group, data):
    """
    Fill the ranking figures of the teams of ``group.league`` from a standing table.

    Rows whose team name matches no ``names['eurosport']`` of the league's
    teams are skipped. Matching teams are updated in place.

    :param group: group the ranking belongs to; assigned to every matched team
    :param data: raw HTML response body (bytes), decoded with ``cls.CHARSET``
    """
    soup = BeautifulSoup(data.decode(cls.CHARSET), 'html.parser')
    rows = soup.find('table').find_all('tr', class_='standing-table__row')

    # Statistics are read from the last eight cells of a row, in this order
    trailing_stats = ('played', 'wins', 'ties', 'loss', 'g_for', 'g_against', 'g_diff', 'points')

    for row in rows:
        row_name = row.find('span', class_='text').text.strip()
        team = next(
            (
                candidate for candidate in group.league.teams
                if 'eurosport' in candidate.names and candidate.names['eurosport'] == row_name
            ),
            None,
        )
        if team is None:
            continue

        cells = row.find_all('td')
        team.group = group
        team.rank = int(cells[0].text.strip())
        for offset, stat in enumerate(trailing_stats, start=-len(trailing_stats)):
            setattr(team, stat, int(cells[offset].text.strip()))
@classmethod
def get_newss_from_source(cls, news_source, data):
    """
    Yield one ``News`` per entry of a downloaded feed.

    :param news_source: source of the feed; its ``sport`` is copied on each news
    :param data: raw feed body (bytes)
    :return: generator of ``News`` objects
    """
    tree = feedparser.parse(data.decode())
    for item in tree.entries:
        news = News()
        news.source = 'eurosport'
        news.sport = news_source.sport
        news.title = item.title
        news.url = item.link

        # The pattern is greedy: on each line it removes everything from the
        # first '<' to the last '>', including any text between two tags.
        description = re.sub(r'<.*>', '', item.description)
        # A description that is not valid JSON used to raise here and abort the
        # whole feed; the stripped text is now kept as is in that case.
        # NOTE(review): confirm whether JSON decoding is wanted at all.
        try:
            news.description = json.loads(description)
        except ValueError:
            news.description = description

        summary = BeautifulSoup(item.summary, 'html.parser')
        image = summary.find('img')
        image_url = image.attrs.get('src') if image is not None else None
        news.image = NewsImage(url=image_url, title=news.title, basename=cls.IMAGE, id_news=news.id)

        if hasattr(item, 'tags'):
            news.tags = [tag.term for tag in item.tags]

        # Set current date as pub_date to prevent disorder between id and pub_date
        news.pub_date = datetime.now()
        yield news
@classmethod
def get_news_content(cls, news, data):
    """
    Fill ``news`` from its downloaded page.

    For a URL ending with ``/video.shtml`` the content is a ``<video>`` tag
    built from the first CDN link found in the page (empty when there is
    none), followed by the teaser container markup. For any other URL the
    teaser, the content and the author are read from the article markup.

    :param news: news object updated in place (``url`` is read)
    :param data: raw HTML response body (bytes), decoded with ``cls.CHARSET``;
        undecodable bytes are ignored
    """
    def inner_html(container):
        """Serialise the child tags of ``container``, scripts and bare text excluded."""
        return ''.join(
            str(child) for child in container.contents
            if isinstance(child, bs4.element.Tag) and child.name != 'script'
        )

    html = data.decode(encoding=cls.CHARSET, errors='ignore')
    soup = BeautifulSoup(html, 'html.parser')

    if news.url.endswith('/video.shtml'):
        # Dots are escaped so that only the real CDN host matches
        video = re.search(r'https://vod-eurosport\.akamaized\.net/[^"]*', html)
        if video is None:
            news.content = ''
        else:
            news.video_src = video.group(0)
            news.content = '<video controls src="{}" type="video/mp4"></video>'.format(news.video_src)
        div_teaser = soup.find('div', class_='teaser_container')
        if div_teaser is not None:
            news.content += inner_html(div_teaser)
        return

    h2_teaser = soup.find('h2', class_='storyfull__teaser')
    if h2_teaser is not None:
        news.teaser = str(h2_teaser.text).strip()

    div_paraphs = soup.find('div', class_='storyfull__paragraphs')
    if div_paraphs is not None:
        news.content = inner_html(div_paraphs)

    # NOTE(review): the author is only looked up on article pages - confirm
    div_author = soup.find('div', class_='storyfull__publisher-author-name')
    if div_author is None:
        return
    a_author = div_author.find('a')
    if a_author is not None:
        author = a_author.text.strip()
    elif div_author.contents:
        author = str(div_author.contents[0]).strip()
    else:
        # An empty author div used to raise an IndexError; the author is left unset
        return
    # Drop the leading "Par " of the byline
    if author.startswith('Par '):
        author = author.replace('Par ', '', 1)
    news.author = author
@classmethod
def get_schedule_url(cls, match):
    """
    Return the URL to download for the schedule of ``match``: the match's own URL.

    :param match: match object exposing a ``url`` attribute
    :return: ``match.url``, unchanged
    """
    schedule_url = match.url
    return schedule_url
@classmethod
def get_schedule(cls, scheduler, data):
    """
    Refresh the start date of the scheduled match shown on a downloaded match page.

    Nothing happens unless the page holds exactly two non-empty player name
    spans. Every match of ``scheduler.matches`` whose ``names['eurosport']``
    of both sides equal those names gets ``new_start_date`` set from the page
    date and time, and is flagged with ``task_done``.

    :param scheduler: object exposing the ``matches`` to look through
    :param data: raw HTML response body (bytes), decoded with ``cls.CHARSET``
    """
    soup = BeautifulSoup(data.decode(cls.CHARSET), 'html.parser')
    spans = soup.find_all('span', class_='tennismatch--hidemobile')
    if len(spans) != 2 or not spans[0].text or not spans[1].text:
        return

    home = spans[0].text.strip()
    away = spans[1].text.strip()
    for match in scheduler.matches:
        if match.home.names['eurosport'] != home or match.away.names['eurosport'] != away:
            continue
        # The first child of the date div is parsed as the day
        div_date = soup.find('div', class_='livehero__date')
        day = datetime.strptime(div_date.contents[0], '%d/%m/%y')
        div_time = soup.find('div', class_='tennismatch__time-value')
        hours, minutes = div_time.text.strip().split(':')
        match.new_start_date = day + timedelta(hours=int(hours), minutes=int(minutes))
        match.task_done = True
@classmethod
def create_schedule(cls, league, data):
    """
    Yield the matches of the bracket shown on a downloaded league page.

    One ``Match`` is yielded per bracket entry having two named players whose
    country logos are both present. Each side is a new ``Team`` with a new
    ``Country``. When ``league.sport.id`` is 2, the droplet id found in the
    page is used to build the web-api score and comments URLs of each match.

    :param league: league the matches and teams are attached to
    :param data: raw HTML response body (bytes), decoded with ``cls.CHARSET``
    :return: generator of ``Match`` objects
    :raises Exception: when the droplet id is needed but not found in the page
    """
    score_url = 'https://web-api.eurosport.com/json/getmatchheaderweb.json'
    comms_url = 'https://web-api.eurosport.com/json/getlivecomments.json'

    def build_player(div_name, div_logo):
        """Build the Team of one side, or return None when its country logo is missing."""
        img_country = div_logo.find('img')
        if img_country is None:
            return None
        team = Team(idt=0)
        team.league = league
        team.country = Country(idt=0)
        team.country.name = img_country.attrs['title'].strip()
        team.name = div_name.text.strip()
        team.long_name = team.name
        words = team.name.split(' ')
        # Every word but the last is cut to its initial when longer than three characters
        initials = [word[0] + '.' if len(word) > 3 else word for word in words[:-1]]
        team.short_name = ' '.join(initials + words[-1:])
        team.names = {'eurosport': team.name}
        return team

    html = data.decode(cls.CHARSET)
    soup = BeautifulSoup(html, 'html.parser')
    current_year = datetime.now().year

    # None (rather than 0) so that the guard below really skips the web-api
    # URLs for leagues that have no droplet id
    droplet_id = None
    if league.sport.id == 2:
        ajax_container = soup.find('div', class_='ajax-container')
        droplet_match = re.search(r'&dropletid=(\d+)&', ajax_container.attrs['data-ajax-url'])
        if droplet_match is None:
            raise Exception('no droplet_id found')
        droplet_id = int(droplet_match.group(1))

    div_rounds = soup.find('div', class_='rounds-dropdown__rounds')
    rounds = [
        div_round.text.strip()
        for div_round in div_rounds.find_all('div', class_='rounds-dropdown__round')
    ]

    div_matches = soup.find('div', class_='bracket-matches-wrapper')
    for div_matches_round in div_matches.find_all('div', class_='bracket-matches'):
        # The round name comes from the 1-based number in the "bracket-round--N" class.
        # NOTE(review): when no usable class is found the previous round name is
        # kept, and the name is undefined on the very first block - confirm.
        for class_ in div_matches_round.attrs['class']:
            if not class_.startswith('bracket-round--'):
                continue
            nb_round = class_.replace('bracket-round--', '')
            if nb_round.isnumeric() and int(nb_round) <= len(rounds):
                current_round = rounds[int(nb_round) - 1]

        idof10 = 0
        for a_match in div_matches_round.find_all('a', class_='match-sets'):
            match = Match(idt=0)
            match.idof10 = idof10
            match.url = urljoin(league.url, a_match.attrs['href'])
            if droplet_id is not None:
                # The match id follows "mtc" in the second to last segment of the link
                match_id = int(a_match.attrs['href'].split('/')[-2].split('mtc')[-1])
                match.url_score = '{}?d={}&ids={}'.format(score_url, droplet_id, match_id)
                match.url_comms = '{}?d={}&ids={}'.format(comms_url, droplet_id, match_id)
            match.league = league
            match.leg = 0
            match.round = current_round
            match.mday = 0

            # The page only gives day/month: the current year is assumed
            div_time = a_match.find('div', class_='match-sets__start-time')
            match_date = '{}/{}'.format(div_time.text.strip(), current_year)
            match.start_date = datetime.strptime(match_date, '%d/%m/%Y')

            divs_name = a_match.find_all('div', class_='player__name')
            divs_logo = a_match.find_all('div', class_='player__logo')
            if len(divs_name) != 2 or not divs_name[0].text or not divs_name[1].text:
                continue

            home = build_player(divs_name[0], divs_logo[0])
            if home is None:
                continue
            away = build_player(divs_name[1], divs_logo[1])
            if away is None:
                continue
            match.home = home
            match.away = away

            idof10 += 1
            yield match