from datetime import datetime, timedelta
from urllib.parse import urljoin
import json
import re

from bs4 import BeautifulSoup
import bs4.element
import feedparser

from lib.news import News, NewsImage
from providers.base import BaseProvider
from lib.match import Comm, Match
from lib.country import Country
from lib.league import Group
from lib.team import Team


class Eurosport(BaseProvider):
    DOMAINS = {
        'www.eurosport.fr',
        'www.rugbyrama.fr',
        'video.eurosport.fr',
        'video.rugbyrama.fr',
        'web-api.eurosport.com',
    }
    CHARSET = 'utf-8'
    IMAGE = 'big-eurosport.png'

    @classmethod
    def get_match_info(cls, match, data):
        html = data.decode(cls.CHARSET)
        if match.json_parser:
            # Get the score from the JSON payload
            json_body = json.loads(html)
            player_home = json_body['match']['players'][0]
            player_away = json_body['match']['players'][1]
            if match.home.name != '{} {}'.format(player_home['firstname'], player_home['lastname']):
                raise NameError('home name does not match')
            if match.away.name != '{} {}'.format(player_away['firstname'], player_away['lastname']):
                raise NameError('away name does not match')
            match_datetime = '{} {}'.format(json_body['match']['date']['date'],
                                            json_body['match']['date']['time'])
            match.start_date = datetime.strptime(match_datetime, '%Y-%m-%d %H:%M')
            if 'score' in json_body['match']:
                # Sets are keyed by player id; remap them to home/away
                score_sets = {
                    json_body['match']['score'][0]['playerid']: json_body['match']['score'][0]['sets'],
                    json_body['match']['score'][1]['playerid']: json_body['match']['score'][1]['sets'],
                }
                match.score_sets = {
                    'home': score_sets[player_home['id']],
                    'away': score_sets[player_away['id']],
                }
            if 'name' in json_body['match']['status']:
                match.minute = json_body['match']['status']['name']
        else:
            # Get all the data from the HTML page
            soup = BeautifulSoup(html, 'html.parser')
            div_match = soup.find(id='livehero')
            if div_match is None:
                raise NameError('div livehero not found')
            # Check team names
            div_teams = div_match.find_all('div', class_='heromatch__team-name')
            if len(div_teams) != 2 or not div_teams[0].a.text or not div_teams[1].a.text:
                raise NameError('divs team not found')
            if div_teams[0].a.text.strip() != match.home.names['eurosport']:
                raise NameError('home name does not match')
            if div_teams[1].a.text.strip() != match.away.names['eurosport']:
                raise NameError('away name does not match')
            # Check start_date
            div_date = div_match.find('div', class_='heromatch__date')
            if div_date is None:
                raise NameError('div date not found')
            div_time = div_match.find('div', class_='heromatch__time')
            if div_time is None:
                raise NameError('div time not found')
            date_ = div_date.text.strip()
            time_ = div_time.text.strip()
            match.start_date = datetime.strptime('{} {}:00'.format(date_, time_),
                                                 '%d/%m/%y %H:%M:%S')
            # Get the score
            div_scores = div_match.find_all('div', class_='heromatch__score')
            if len(div_scores) < 2:
                raise NameError('divs score not found')
            if div_scores[0].text.strip().isnumeric():
                match.score_home = int(div_scores[0].text.strip())
            if div_scores[1].text.strip().isnumeric():
                match.score_away = int(div_scores[1].text.strip())
            # Get the minute, falling back to the match status
            div_minute = div_match.find('div', class_='heromatch__minute')
            if div_minute is None or not div_minute.text:
                div_minute = div_match.find('div', class_='heromatch__status')
                if div_minute is None:
                    raise NameError('div minute not found')
            match.minute = div_minute.text.strip().lower()
            # Get the live comments
            div_comms = soup.find(class_='live_comments_v8_5_bis')
            if div_comms is not None:
                for article_comm in div_comms.find_all('article'):
                    comm = Comm(type_='', minute='', text='')
                    left_col = article_comm.find(class_='left-col')
                    if left_col is not None:
                        left_span = left_col.find('span')
                        if left_span is not None:
                            if len(left_span.contents) > 0 and isinstance(left_span.contents[0], str):
                                comm.minute = left_span.contents[0].strip()
                    right_col = article_comm.find(class_='right-col')
                    if right_col is not None:
                        right_p = right_col.find('p')
                        if right_p is not None:
                            comm.text = right_p.text.strip()
                    if comm.text:
                        if comm.text == comm.text.upper():
                            # All-caps comments are section headings; the original
                            # wrapping markup was lost, so <b> is assumed here
                            comm.text = '<b>{}</b>'.format(comm.text)
                    else:
                        continue
                    match.comms.append(comm)
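
    # For reference, the JSON branch above expects a payload shaped roughly as
    # follows; the field names come from the parsing code, the values are
    # purely illustrative:
    #
    #   {"match": {"players": [{"id": 1, "firstname": "...", "lastname": "..."},
    #                          {"id": 2, "firstname": "...", "lastname": "..."}],
    #              "date": {"date": "2024-06-01", "time": "14:30"},
    #              "score": [{"playerid": 1, "sets": [...]},
    #                        {"playerid": 2, "sets": [...]}],
    #              "status": {"name": "..."}}}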

    @classmethod
    def get_match_comms(cls, match, data):
        html = data.decode(cls.CHARSET)
        json_body = json.loads(html)
        if json_body is not None:
            for comment in json_body['livecomments']:
                if all(key in comment for key in ('marker', 'text')):
                    match.comms.append(
                        Comm(minute=comment['marker'], type_='', text=comment['text'])
                    )

    @classmethod
    def get_team_staff(cls, data):
        html = data.decode(cls.CHARSET)
        soup = BeautifulSoup(html, 'html.parser')
        players = list()
        staff = dict()
        for li in soup.find_all('li'):
            if 'class' in li.attrs and 'team_global_title' in li.attrs['class']:
                # Each section title closes the *preceding* section: the players
                # collected so far belong to the position announced before it,
                # e.g. the list gathered when 'Défenseur(s)' appears holds the
                # goalkeepers.
                if li.text == 'Défenseur(s)':
                    staff['goalkeepers'] = players
                elif li.text == 'Milieu(x)':
                    staff['defenders'] = players
                elif li.text == 'Attaquant(s)':
                    staff['midfielders'] = players
                elif li.text == 'Entraîneur':
                    staff['attackers'] = players
                    break
                players = list()
            else:
                players.append(li.find('a').text.replace('\xa0', ' '))
        return staff

    @classmethod
    def get_league_ranking(cls, league, data):
        html = data.decode(cls.CHARSET)
        soup = BeautifulSoup(html, 'html.parser')
        div_standing = soup.find('div', class_='standing_v8_5')
        groups = list()
        for div in div_standing.find_all('div', class_='tab-content'):
            if 'data-ajax-url' in div.attrs:
                link = div.attrs['data-ajax-url']
                group_class = div.attrs['data-navtab-content-id'].split('_')[1]
                group_name = soup.find('a', class_=group_class) \
                    .find('span', class_='navtab-label').text.strip()
                groups.append(Group(name=group_name, url=urljoin(league.url, link), league=league))
            else:
                groups.append(Group(name='0', url=league.url, league=league))
        return groups

    @classmethod
    def get_group_ranking(cls, group, data):
        html = data.decode(cls.CHARSET)
        soup = BeautifulSoup(html, 'html.parser')
        table = soup.find('table')
        for tr in table.find_all('tr', class_='standing-table__row'):
            eur_name = tr.find('span', class_='text').text.strip()
            for tm in group.league.teams:
                if 'eurosport' in tm.names and tm.names['eurosport'] == eur_name:
                    team = tm
                    break
            else:
                # No known team matches this row
                continue
            tds = tr.find_all('td')
            team.group = group
            team.rank = int(tds[0].text.strip())
            # The last eight cells are fixed: P, W, D, L, GF, GA, GD, Pts
            team.played = int(tds[-8].text.strip())
            team.wins = int(tds[-7].text.strip())
            team.ties = int(tds[-6].text.strip())
            team.loss = int(tds[-5].text.strip())
            team.g_for = int(tds[-4].text.strip())
            team.g_against = int(tds[-3].text.strip())
            team.g_diff = int(tds[-2].text.strip())
            team.points = int(tds[-1].text.strip())

    @classmethod
    def get_newss_from_source(cls, news_source, data):
        xml = data.decode()
        tree = feedparser.parse(xml)
        for item in tree.entries:
            news = News()
            news.source = 'eurosport'
            news.sport = news_source.sport
            news.title = item.title
            news.url = item.link
            # Strip any markup from the description
            news.description = re.sub(r'<[^>]*>', '', item.description)
            summary = BeautifulSoup(item.summary, 'html.parser')
            image_object = summary.find('img')
            image_url = image_object.attrs.get('src') if image_object is not None else None
            news.image = NewsImage(url=image_url, title=news.title,
                                   basename=cls.IMAGE, id_news=news.id)
            if hasattr(item, 'tags'):
                news.tags = [tag.term for tag in item.tags]
            # Set the current date as pub_date to prevent any disorder
            # between id and pub_date
            news.pub_date = datetime.now()
            yield news
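
    # A minimal usage sketch for the RSS parsing above; `source` here is a
    # hypothetical stand-in for the real news-source object, which only needs
    # a `sport` attribute:
    #
    #   from types import SimpleNamespace
    #   source = SimpleNamespace(sport='tennis')
    #   with open('eurosport.rss', 'rb') as fh:
    #       for news in Eurosport.get_newss_from_source(source, fh.read()):
    #           print(news.title, news.url)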

    @classmethod
    def get_news_content(cls, news, data):
        html = data.decode(encoding=cls.CHARSET, errors='ignore')
        soup = BeautifulSoup(html, 'html.parser')
        if news.url.endswith('/video.shtml'):
            res = re.search(r'https://vod-eurosport\.akamaized\.net/[^"]*', html)
            if res is not None:
                news.video_src = res.group(0)
                # The original embed markup was stripped from this file;
                # a plain <video> tag is assumed here
                news.content = '<video src="{}" controls></video>'.format(news.video_src)
            else:
                news.content = ''
            div_paragraphs = soup.find('div', class_='teaser_container')
            if div_paragraphs is not None:
                news.content += ''.join(
                    str(c) for c in div_paragraphs.contents
                    if isinstance(c, bs4.element.Tag) and c.name != 'script'
                )
        else:
            h2_teaser = soup.find('h2', class_='storyfull__teaser')
            if h2_teaser is not None:
                news.teaser = str(h2_teaser.text).strip()
            div_paragraphs = soup.find('div', class_='storyfull__paragraphs')
            if div_paragraphs is not None:
                news.content = ''.join(
                    str(c) for c in div_paragraphs.contents
                    if isinstance(c, bs4.element.Tag) and c.name != 'script'
                )
            div_author = soup.find('div', class_='storyfull__publisher-author-name')
            if div_author is not None:
                a_author = div_author.find('a')
                if a_author is not None:
                    news.author = a_author.text.strip()
                else:
                    news.author = str(div_author.contents[0]).strip()
                if news.author.startswith('Par '):
                    news.author = news.author.replace('Par ', '', 1)

    @classmethod
    def get_schedule_url(cls, match):
        return match.url

    @classmethod
    def get_schedule(cls, scheduler, data):
        html = data.decode(cls.CHARSET)
        soup = BeautifulSoup(html, 'html.parser')
        divs_name = soup.find_all('span', class_='tennismatch--hidemobile')
        if len(divs_name) == 2 and divs_name[0].text and divs_name[1].text:
            home = divs_name[0].text.strip()
            away = divs_name[1].text.strip()
            for match in scheduler.matches:
                if match.home.names['eurosport'] == home and match.away.names['eurosport'] == away:
                    date = datetime.strptime(
                        str(soup.find('div', class_='livehero__date').contents[0]).strip(),
                        '%d/%m/%y')
                    hours, minutes = soup.find('div', class_='tennismatch__time-value') \
                        .text.strip().split(':')
                    match.new_start_date = date + timedelta(hours=int(hours), minutes=int(minutes))
                    match.task_done = True
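
    # create_schedule below derives the numeric match id from the second-to-last
    # path segment of each bracket link; an illustrative (made-up) example:
    #
    #   '/tennis/.../mtc1234567/live.shtml'
    #       .split('/')[-2]    -> 'mtc1234567'
    #       .split('mtc')[-1]  -> '1234567'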

    @classmethod
    def create_schedule(cls, league, data):
        html = data.decode(cls.CHARSET)
        soup = BeautifulSoup(html, 'html.parser')
        current_year = datetime.now().year
        droplet_id = None
        if league.sport.id == 2:
            ajax_container = soup.find('div', class_='ajax-container')
            droplet_match = re.search(r'&dropletid=(\d+)&', ajax_container.attrs['data-ajax-url'])
            if droplet_match is not None:
                droplet_id = int(droplet_match.group(1))
            else:
                raise Exception('no droplet_id found')
        # Round names, in bracket order
        rounds = list()
        div_rounds = soup.find('div', class_='rounds-dropdown__rounds')
        for div_round in div_rounds.find_all('div', class_='rounds-dropdown__round'):
            rounds.append(div_round.text.strip())
        current_round = None
        div_matches = soup.find('div', class_='bracket-matches-wrapper')
        for div_matches_round in div_matches.find_all('div', class_='bracket-matches'):
            # The round number is encoded in a 'bracket-round--N' class
            for class_ in div_matches_round.attrs['class']:
                if class_.startswith('bracket-round--'):
                    nb_round = class_.replace('bracket-round--', '')
                    if nb_round.isnumeric():
                        nb_round = int(nb_round)
                        if nb_round <= len(rounds):
                            current_round = rounds[nb_round - 1]
            idof10 = 0
            for a_match in div_matches_round.find_all('a', class_='match-sets'):
                match = Match(idt=0)
                match.idof10 = idof10
                match.url = urljoin(league.url, a_match.attrs['href'])
                if droplet_id is not None:
                    match_id = int(a_match.attrs['href'].split('/')[-2].split('mtc')[-1])
                    score_url = 'https://web-api.eurosport.com/json/getmatchheaderweb.json'
                    comms_url = 'https://web-api.eurosport.com/json/getlivecomments.json'
                    match.url_score = '{}?d={}&ids={}'.format(score_url, droplet_id, match_id)
                    match.url_comms = '{}?d={}&ids={}'.format(comms_url, droplet_id, match_id)
                match.league = league
                match.leg = 0
                match.round = current_round
                match.mday = 0
                div_time = a_match.find('div', class_='match-sets__start-time')
                match_date = '{}/{}'.format(div_time.text.strip(), current_year)
                match.start_date = datetime.strptime(match_date, '%d/%m/%Y')
                divs_name = a_match.find_all('div', class_='player__name')
                divs_logo = a_match.find_all('div', class_='player__logo')
                if len(divs_name) == 2 and divs_name[0].text and divs_name[1].text:
                    match.home = Team(idt=0)
                    match.home.league = league
                    match.home.country = Country(idt=0)
                    img_country_home = divs_logo[0].find('img')
                    if img_country_home is None:
                        continue
                    match.home.country.name = img_country_home.attrs['title'].strip()
                    match.home.name = divs_name[0].text.strip()
                    match.home.long_name = match.home.name
                    # Abbreviate every word but the last: 'Novak Djokovic' -> 'N. Djokovic'
                    words = match.home.name.split(' ')
                    for idx in range(len(words)):
                        if idx < len(words) - 1 and len(words[idx]) > 3:
                            words[idx] = words[idx][0] + '.'
                    match.home.short_name = ' '.join(words)
                    match.home.names = {'eurosport': match.home.name}
                    match.away = Team(idt=0)
                    match.away.league = league
                    match.away.country = Country(idt=0)
                    img_country_away = divs_logo[1].find('img')
                    if img_country_away is None:
                        continue
                    match.away.country.name = img_country_away.attrs['title'].strip()
                    match.away.name = divs_name[1].text.strip()
                    match.away.long_name = match.away.name
                    words = match.away.name.split(' ')
                    for idx in range(len(words)):
                        if idx < len(words) - 1 and len(words[idx]) > 3:
                            words[idx] = words[idx][0] + '.'
                    match.away.short_name = ' '.join(words)
                    match.away.names = {'eurosport': match.away.name}
                    idof10 += 1
                    yield match
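

if __name__ == '__main__':
    # Minimal manual smoke test, a sketch only: it assumes a locally saved
    # tennis bracket page, and `fake_league` is a hypothetical stand-in for
    # the real League object (create_schedule only reads `url` and `sport.id`).
    from types import SimpleNamespace

    fake_league = SimpleNamespace(url='https://www.eurosport.fr/',
                                  sport=SimpleNamespace(id=2))
    with open('bracket.html', 'rb') as fh:
        for scheduled in Eurosport.create_schedule(fake_league, fh.read()):
            print(scheduled.round, scheduled.start_date, scheduled.url)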