# cronpy/providers/matchendirect.py
#
# Repository-viewer header kept for reference: 515 lines, 24 KiB, Python.
# Timestamp shown by the viewer: 2020-10-03 21:17:53 +00:00
from datetime import datetime, timedelta
from urllib.parse import urljoin
import locale
# noinspection PyProtectedMember
from bs4 import BeautifulSoup, NavigableString
from lib.match import Match, Event, Squad, Stat, Comm
from providers.base import BaseProvider
from lib.country import Country
from lib.league import Group
from lib.team import Team
class Matchendirect(BaseProvider):
    """Provider that scrapes match, ranking and schedule pages of matchendirect.fr.

    The parsing methods receive the raw bytes of a downloaded page, decode
    them with ``CHARSET`` and walk the HTML with BeautifulSoup.

    NOTE(review): this class was restored from a copy whose indentation had
    been lost.  Where the nesting could not be derived from syntax alone the
    most plausible reading was chosen and is flagged with ``NOTE(review)``
    next to the code; please confirm those spots against the original file.
    """

    DOMAINS = {'www.matchendirect.fr'}
    CHARSET = 'utf-8'

    # Role codes of the line-up schema, looked up as
    # ROLES[id_table // 5][id_table % 5][id_td].  The second grid is the mirror
    # image of the first one (keeper 'G' in the first vs. the last column).
    # Presumably one grid per half of the pitch drawing -- TODO confirm.
    ROLES = [
        [
            [None, 'DLG', 'DLG', 'MDG', 'MG', 'MOG', 'ALG'],
            [None, 'DCG', None, 'MDG', 'MCG', 'MOG', 'AG'],
            ['G', 'DC', None, 'MDC', 'MC', 'MOC', 'AC'],
            [None, 'DCD', None, 'MDD', 'MCD', 'MOD', 'AD'],
            [None, 'DLD', 'DLD', 'MDD', 'MD', 'MOD', 'ALD'],
        ],
        [
            ['ALD', 'MOD', 'MD', 'MDD', 'DLD', 'DLD', None],
            ['AD', 'MOD', 'MCD', 'MDD', None, 'DCD', None],
            ['AC', 'MOC', 'MC', 'MDC', None, 'DC', 'G'],
            ['AG', 'MOG', 'MCG', 'MDG', None, 'DCG', None],
            ['ALG', 'MOG', 'MG', 'MDG', 'DLG', 'DLG', None],
        ],
    ]

    # French month name -> two-digit month number (used for the kick-off date).
    MONTH_NUMBERS = {
        'janvier': '01',
        'février': '02',
        'mars': '03',
        'avril': '04',
        'mai': '05',
        'juin': '06',
        'juillet': '07',
        'août': '08',
        'septembre': '09',
        'octobre': '10',
        'novembre': '11',
        'décembre': '12',
    }

    # Icon CSS class -> (event type, suffix appended to the player name).
    # 'P' / 'CSC' presumably stand for penalty / own goal -- TODO confirm.
    EVENT_TYPES = {
        'ico_evenement1': ('goal', None),
        'ico_evenement2': ('goal', 'P'),
        'ico_evenement3': ('red-card', None),
        'ico_evenement4': ('yellow-card', None),
        'ico_evenement5': ('yellow-red-card', None),
        'ico_evenement7': ('goal', 'CSC'),
        'ico_evenement81': ('switch-out', None),
        'ico_evenement82': ('switch-out', None),
        'ico_evenement91': ('switch-in', None),
        'ico_evenement92': ('switch-in', None),
    }

    # Label of a statistics row on the page -> key used in ``match.stats``.
    STAT_NAMES = {
        'Possession': 'possession',
        'Buts': 'goals',
        'Tirs': 'attempts',
        'Corners': 'corners',
        'Hors-jeu': 'offsides',
        'Fautes': 'fouls',
        'Carton jaune': 'yellow_cards',
        'Carton rouge': 'red_cards',
    }

    # Icon CSS class of a live comment -> comment type.
    COMM_TYPES = {
        'ico_com_occasion': 'chance',
        'ico_com_but': 'goal',
        'ico_com_carton-jaune': 'yellow-card',
        'ico_com_remplacement': 'switch',
        'ico_com_sifflet': 'whistle',
        'ico_com_carton-rouge': 'red-card',
    }

    # ------------------------------------------------------------------ match

    @classmethod
    def get_match_info(cls, match, data):
        """Fill ``match`` with everything found on a match detail page.

        The sections are read in this order: team-name check, start date,
        shoot-out, score, minute, events, squad, squad roles, line
        re-ordering, statistics and live comments.

        Args:
            match: match object updated in place; its ``home``/``away`` teams
                must carry a ``names['matchendirect']`` entry.
            data: raw page bytes, decoded with ``CHARSET``.

        Raises:
            NameError: when a mandatory element is missing from the page or
                the team names on the page do not match ``match``.
        """
        html = data.decode(cls.CHARSET)
        soup = BeautifulSoup(html, 'html.parser')
        div_match = soup.find(id='ajax-match-detail-1')
        if div_match is None:
            raise NameError('div ajax-match-detail-1 not found')

        cls._check_team_names(div_match, match)
        match.start_date = cls._parse_start_date(div_match)
        cls._parse_shootout(soup, match)
        cls._parse_score(div_match, match)
        cls._parse_minute(div_match, match)
        cls._parse_events(soup, match)
        cls._parse_squad(soup, match)
        cls._assign_squad_roles(soup, match)
        cls._reorder_squad_lines(match)
        cls._parse_stats(soup, match)
        cls._parse_comments(soup, match)

    @staticmethod
    def _check_team_names(div_match, match):
        """Raise NameError unless the page shows the expected home/away teams."""
        div_teams = div_match.find_all('div', class_='team')
        if len(div_teams) < 2:
            raise NameError('divs team not found')
        page_home = div_teams[0].a.text.strip()
        if page_home == match.away.names['matchendirect'] \
                and div_teams[1].a.text.strip() == match.home.names['matchendirect']:
            raise NameError('team names are inverted')
        if page_home != match.home.names['matchendirect']:
            raise NameError('home name does not match')
        if div_teams[1].a.text.strip() != match.away.names['matchendirect']:
            raise NameError('away name does not match')

    @classmethod
    def _parse_start_date(cls, div_match):
        """Return the kick-off datetime read from the ``info1`` div."""
        div_info = div_match.find('div', class_='info1')
        if div_info is None:
            raise NameError('div info1 is not found')
        # First child holds "<weekday> <day> <month name> <year>".
        date_words = div_info.contents[0].text.split(' ')
        day = date_words[1]
        month = cls.MONTH_NUMBERS[date_words[2]]
        year = date_words[3]
        # Second child holds the time as "<hour>h<minute>" with a leading " à".
        time_parts = div_info.contents[1].strip(' à').split('h')
        if len(time_parts) == 2:
            hour, minute = time_parts
        else:
            # No usable time on the page: fall back to midnight.
            hour, minute = 0, 0
        return datetime.strptime(
            '{}-{}-{} {}:{}:00'.format(year, month, day, hour, minute), '%Y-%m-%d %H:%M:%S'
        )

    @staticmethod
    def _parse_shootout(soup, match):
        """Record extra time / penalty shoot-out information when present."""
        table_shootout = soup.find('table', id='match_evenement_score')
        if table_shootout is None:
            return
        for tr in table_shootout.find_all('tr'):
            tds = tr.find_all('td')
            if len(tds) != 3:
                continue
            label = tds[0].text.strip()
            if label == 'Score après prolongation':
                match.extra_time = 'extratime'
            if label == 'Tirs au but':
                shots = tds[1].text.split(' - ')
                match.extra_time = 'shootout'
                match.shootout_home = int(shots[0])
                match.shootout_away = int(shots[1])

    @staticmethod
    def _parse_score(div_match, match):
        """Copy the numeric scores to ``match``; non-numeric cells are ignored."""
        span_scores = div_match.find_all('span', class_='score')
        if len(span_scores) < 2:
            raise NameError('spans score not found')
        home_text = span_scores[0].text.strip()
        if home_text.isnumeric():
            match.score_home = int(home_text)
        away_text = span_scores[1].text.strip()
        if away_text.isnumeric():
            match.score_away = int(away_text)

    @staticmethod
    def _parse_minute(div_match, match):
        """Store the lower-cased text of the last child of the status div."""
        div_status = div_match.find('div', class_='status')
        if div_status is None:
            raise NameError('div status not found')
        content_minute = div_status.contents[-1]
        if isinstance(content_minute, NavigableString):
            match.minute = content_minute.strip().lower()
        else:
            match.minute = content_minute.text.strip().lower()

    @classmethod
    def _parse_events(cls, soup, match):
        """Append goals, cards and substitutions to ``match.events``."""
        table_events = soup.find('table', id='match_evenement')
        if table_events is None:
            return
        nb_goals = {'home': 0, 'away': 0}
        for span_event in table_events.find_all('span'):
            # The event icon is identified by the third CSS class; spans with
            # fewer classes (or none) cannot be event icons and are skipped
            # instead of raising KeyError/IndexError.
            classes = span_event.get('class') or []
            if len(classes) < 3 or classes[2] not in cls.EVENT_TYPES:
                continue
            type_, particularity = cls.EVENT_TYPES[classes[2]]
            event = Event(type_=type_)
            td_event = span_event.parent
            event.side = 'home' if td_event['class'][0] == 'c1' else 'away'
            event.player = td_event.find('a').text.strip()
            if particularity is not None:
                event.player += ' ({})'.format(particularity)
            event.minute = td_event.parent.find('td', class_='c2').text
            if event.type == 'goal':
                nb_goals[event.side] += 1
                # Goals beyond the score already read for that side are dropped.
                # NOTE(review): nesting of this check under the goal test is
                # inferred -- confirm.
                if nb_goals[event.side] > getattr(match, 'score_' + event.side):
                    continue
            match.events.append(event)

    @classmethod
    def _parse_squad(cls, soup, match):
        """Append the players of both line-up columns to ``match.squad``."""
        div_squad = soup.find('div', class_='MEDpanelcomposition')
        if div_squad is None:
            return
        td_squads = div_squad.find_all('td')
        if len(td_squads) > 1:
            # Home names precede their icon span, away names follow it.
            cls._read_squad_column(td_squads[0], 'home', 'previous_sibling', match)
            cls._read_squad_column(td_squads[1], 'away', 'next_sibling', match)

    @staticmethod
    def _read_squad_column(td_squad, side, sibling_attr, match):
        """Create one Squad entry per icon span found in a line-up column.

        ``sibling_attr`` is ``'previous_sibling'`` or ``'next_sibling'`` and
        tells on which side of the icon span the player name is located.
        """
        for span_squad in td_squad.find_all('span'):
            near = getattr(span_squad, sibling_attr)
            far = getattr(near, sibling_attr)
            # The name is normally two siblings away; fall back to the direct
            # sibling when there is nothing further.
            squad_name = far.text.strip() if far is not None else near
            if 'ico_compo_titulaire' in span_squad.attrs['class']:
                role = 'STR'
            else:
                role = 'SUB'
            squad = Squad(role=role, name=squad_name, side=side)
            for event in match.events:
                if event.player.replace(' (P)', '') == squad.name:
                    squad.events.append(event.type)
            match.squad.append(squad)

    @classmethod
    def _assign_squad_roles(cls, soup, match):
        """Replace 'STR'/'SUB' roles with the position found in the schema."""
        table_squad = soup.find('table', id='schema_compo')
        if table_squad is None:
            return
        # sides[half] is the team drawn in that half of the schema; it is
        # decided by the first player that can be identified.
        sides = [None, None]
        for id_table, table in enumerate(table_squad.find_all('table')):
            half, row = divmod(id_table, 5)
            for id_td, cell in enumerate(table.find_all('td')):
                label = cell.find('b')
                if label is None or not label.text:
                    continue
                name_parts = label.text.split()
                if not name_parts:
                    # Whitespace-only label: it would match every player.
                    continue
                for squad in match.squad:
                    if squad.role not in ('SUB', 'STR'):
                        continue
                    if not all(part in squad.name for part in name_parts):
                        continue
                    if squad.side not in sides:
                        sides[half] = squad.side
                        sides[1 - half] = 'home' if squad.side == 'away' else 'away'
                    if squad.side == sides[half]:
                        squad.lastname = label.text.strip()
                        squad.role = cls.ROLES[half][row][id_td]
                        # NOTE(review): position of this break is inferred;
                        # it stops at the first player of the right side.
                        break

    @staticmethod
    def _count_roles(squads, side, roles):
        """Return how many players of ``side`` have a role listed in ``roles``."""
        return sum(1 for squad in squads if squad.side == side and squad.role in roles)

    @staticmethod
    def _remap_roles(squads, side, mapping):
        """Rename the roles of ``side`` according to ``mapping`` (old -> new)."""
        for squad in squads:
            if squad.side == side and squad.role in mapping:
                squad.role = mapping[squad.role]

    @classmethod
    def _reorder_squad_lines(cls, match):
        """Normalise role codes so that every line has a plausible size.

        The rules are applied one after another for each side; each rule
        sees the roles as left by the previous one.
        NOTE(review): the inner if/else pairs below are assumed to be nested
        under the preceding count test -- confirm.
        """
        squads = match.squad
        defensive_mids = ('MDG', 'MDC', 'MDD')
        for side in ('home', 'away'):
            if cls._count_roles(squads, side, ('DCG', 'DC', 'DCD')) > 2:
                cls._remap_roles(squads, side, {'DCG': 'DG', 'DCD': 'DD'})

            # A role may be None when the schema cell has no code in ROLES.
            defenders = sum(
                1 for squad in squads
                if squad.side == side and squad.role is not None and squad.role.startswith('D')
            )
            if defenders < 4:
                cls._remap_roles(squads, side, {'MG': 'DLG', 'MD': 'DLD'})

            if cls._count_roles(squads, side, defensive_mids) > 2:
                cls._remap_roles(squads, side, {'MDG': 'MG', 'MDD': 'MD'})

            if cls._count_roles(squads, side, ('MCG', 'MC', 'MCD')) > 2:
                if cls._count_roles(squads, side, defensive_mids) == 0:
                    cls._remap_roles(squads, side, {'MC': 'MDC'})
                else:
                    cls._remap_roles(squads, side, {'MCG': 'MG', 'MCD': 'MD'})

            if cls._count_roles(squads, side, ('MG', 'MCG', 'MC', 'MCD', 'MD')) > 3:
                if cls._count_roles(squads, side, defensive_mids) == 0:
                    cls._remap_roles(squads, side, {'MC': 'MDC', 'MCG': 'MDG', 'MCD': 'MDD'})
                else:
                    cls._remap_roles(squads, side, {'MC': 'MOC', 'MG': 'MOG', 'MD': 'MOD'})

            if cls._count_roles(squads, side, ('AG', 'AC', 'AD')) > 2:
                cls._remap_roles(squads, side, {'AG': 'ALG', 'AD': 'ALD'})

    @classmethod
    def _parse_stats(cls, soup, match):
        """Read the statistics panel into ``match.stats``."""
        div_stats = soup.find('div', class_='MEDpanelstats')
        if div_stats is None:
            return
        match.stats = {name: Stat() for name in cls.STAT_NAMES.values()}
        for tr in div_stats.find_all('tr'):
            tds = tr.find_all('td')
            if len(tds) < 5:
                # Not a "home | . | label | . | away" row.
                continue
            stat_name = tds[2].text.strip()
            stat = Stat(home=int(tds[0].text.strip()), away=int(tds[4].text.strip()))
            if stat_name in cls.STAT_NAMES:
                match.stats[cls.STAT_NAMES[stat_name]] = stat
            elif stat_name == 'Tirs cadrés':
                match.stats['attempts'].home += stat.home
                match.stats['attempts'].away += stat.away
                match.stats['in_attempts'] = Stat(home=stat.home, away=stat.away)
            elif stat_name == 'Tirs non cadrés':
                match.stats['attempts'].home += stat.home
                match.stats['attempts'].away += stat.away
            elif stat_name == 'Tirs arrêtés':
                match.stats['block_attempts'] = stat
                match.stats['attempts'].home += stat.home
                match.stats['attempts'].away += stat.away
            elif stat_name == 'Tirs sur le poteau':
                match.stats['pole_attempts'] = stat
                match.stats['attempts'].home += stat.home
                match.stats['attempts'].away += stat.away

    @classmethod
    def _parse_comments(cls, soup, match):
        """Append the live comments to ``match.comms``."""
        table_comms = soup.find('table', id='commentaire')
        if table_comms is None:
            return
        for tr in table_comms.find_all('tr'):
            tds = tr.find_all('td')
            if len(tds) != 3:
                continue
            span_icon = tds[1].find('span')
            # "45+2" becomes "45'+2"; plain minutes get a trailing apostrophe.
            minute = tds[0].text.strip().replace('+', "'+")
            if minute and not minute.endswith("'") and '+' not in minute:
                minute += "'"
            comm_type = ''
            if span_icon is not None:
                icon_class = span_icon.attrs['class'][1]
                # NOTE(review): rows with an unknown icon are skipped, rows
                # without icon are kept with an empty type.  The attachment of
                # the original "else: continue" is inferred -- confirm.
                if icon_class not in cls.COMM_TYPES:
                    continue
                comm_type = cls.COMM_TYPES[icon_class]
            match.comms.append(Comm(minute=minute, type_=comm_type, text=tds[2].text.strip()))

    # ---------------------------------------------------------------- ranking

    @classmethod
    def get_league_ranking(cls, league, data):
        """Update the teams of ``league`` from a ranking page.

        Rows with a single cell start a new group; the other rows update the
        team of ``league.teams`` whose ``names['matchendirect']`` equals the
        name on the page (unknown teams are skipped).

        Args:
            league: league whose ``teams`` are updated in place.
            data: raw page bytes, decoded with ``CHARSET``.

        Returns:
            The list of groups named on the page.  Teams listed before any
            group row are attached to a group named '0' that is not part of
            the returned list.
        """
        html = data.decode(cls.CHARSET)
        soup = BeautifulSoup(html, 'html.parser')
        table = soup.find('table', id='tableau_classement')
        groups = []
        group = Group(name='0', url=None, league=league)
        stat_fields = ('points', 'played', 'wins', 'ties', 'loss', 'g_for', 'g_against', 'g_diff')
        for tr in table.find_all('tr'):
            tds = tr.find_all('td')
            # row titles
            if not tds:
                continue
            # row group
            if len(tds) == 1:
                group_name = tds[0].text.strip()
                for grp in groups:
                    if grp.name == group_name:
                        group = grp
                        break
                else:
                    group = Group(name=group_name, url=None, league=league)
                    groups.append(group)
                continue
            # get team
            med_name = tds[0].find('a').contents[-1].strip()
            for tm in league.teams:
                if 'matchendirect' in tm.names and tm.names['matchendirect'] == med_name:
                    team = tm
                    break
            else:
                continue
            # get rank (sometimes wrapped in a span inside the header cell)
            th = tr.find('th')
            span = th.find('span')
            if span is not None:
                team.rank = int(span.text.strip())
            else:
                team.rank = int(th.text.strip())
            # get stats: cells 1..8 follow the order of ``stat_fields``
            team.group = group
            for offset, field in enumerate(stat_fields, start=1):
                setattr(team, field, int(tds[offset].text.strip()))
        return groups

    # --------------------------------------------------------------- schedule

    @classmethod
    def get_schedule_url(cls, match):
        """Return the weekly schedule URL of the league for ``match``.

        The URL is ``<league url>/<year>-<week>`` computed from the match
        start date shifted by seven days.
        """
        # In matchendirect.fr dates are shifted in 2019
        shift_date = match.start_date + timedelta(days=7)
        return '{}/{}'.format(match.league.url.rstrip('/'), shift_date.strftime('%Y-%W'))

    @staticmethod
    def _kickoff(date, td_hour):
        """Return ``date`` plus the "HH:MM" time of ``td_hour`` when numeric.

        Any other content (placeholder, missing colon, ...) yields ``date``
        unchanged instead of raising.
        """
        hours, _, minutes = td_hour.text.strip().partition(':')
        if hours.isnumeric() and minutes.isnumeric():
            return date + timedelta(hours=int(hours), minutes=int(minutes))
        return date

    @classmethod
    def get_schedule(cls, scheduler, data):
        """Update the matches of ``scheduler`` from a weekly schedule page.

        Every ``scheduler.matches`` entry whose home and away names are found
        on the page gets ``new_url``, ``new_start_date`` and ``task_done``
        set.  ``scheduler.previous_url`` / ``next_url`` are set when the page
        has the corresponding navigation links.
        """
        # NOTE: this changes the locale of the whole process (it is not
        # restored) and needs the fr_FR locale to be installed; it is
        # required for the French day and month names parsed below.
        locale.setlocale(locale.LC_ALL, 'fr_FR.utf-8')
        html = data.decode(cls.CHARSET)
        soup = BeautifulSoup(html, 'html.parser')
        date = None
        for table in soup.find_all('table', class_='table table-striped table-hover'):
            for tr in table.find_all('tr'):
                th = tr.find('th')
                if th is not None:
                    # Header row: date shared by the match rows that follow.
                    date = datetime.strptime(th.text.strip(), '%A %d %B %Y')
                elif 'data-matchid' in tr.attrs and date is not None:
                    start_date = cls._kickoff(date, tr.find('td', class_='lm1'))
                    td_score = tr.find('td', class_='lm3')
                    home = td_score.find('span', class_='lm3_eq1').contents[0].strip(' \n\t*')
                    away = td_score.find('span', class_='lm3_eq2').contents[-1].strip(' \n\t*')
                    url = urljoin('http://www.matchendirect.fr/', td_score.find('a').attrs['href'])
                    for match in scheduler.matches:
                        if match.home.names['matchendirect'] == home \
                                and match.away.names['matchendirect'] == away:
                            match.new_url = url
                            match.new_start_date = start_date
                            match.task_done = True
                            break
        a_previous = soup.find('a', class_='objselect_prevnext objselect_prec')
        if a_previous is not None:
            scheduler.previous_url = urljoin('http://www.matchendirect.fr/', a_previous.attrs['href'])
        a_next = soup.find('a', class_='objselect_prevnext objselect_suiv')
        if a_next is not None:
            scheduler.next_url = urljoin('http://www.matchendirect.fr/', a_next.attrs['href'])

    @classmethod
    def create_schedule(cls, league, data):
        """Yield the schedule URLs from the selected option onwards.

        The options of the first ``<select>`` inside the ``filtre_haut`` div
        are walked in order; the selected one and all following ones are
        yielded as absolute URLs.  Nothing is yielded when the div or the
        select is missing.
        """
        html = data.decode(cls.CHARSET)
        soup = BeautifulSoup(html, 'html.parser')
        div_top = soup.find('div', id='filtre_haut')
        if div_top is None:
            return
        select_url = div_top.find('select')
        if select_url is None:
            return
        selected = False
        for option_url in select_url.find_all('option'):
            if selected or 'selected' in option_url.attrs:
                yield urljoin(league.url, option_url.attrs['value'])
                selected = True

    @classmethod
    def _current_mday_round_leg(cls, league, date):
        """Return ``(mday, round, leg)`` in force at ``date``.

        ``league.round_dates`` maps 'YYYY-MM-DD' strings to dicts with the
        keys 'mday', 'round' and 'leg'.  The entry with the latest date not
        after ``date`` wins, whatever the order of the mapping.  Without
        applicable entry ``(0, None, 0)`` is returned.
        """
        _mday = 0
        _round = None
        _leg = 0
        if league.round_dates is not None:
            milestones = sorted(
                ((datetime.strptime(key, '%Y-%m-%d'), value)
                 for key, value in league.round_dates.items()),
                key=lambda milestone: milestone[0],
            )
            for starts_on, value in milestones:
                if date < starts_on:
                    break
                _mday = value['mday']
                _round = value['round']
                _leg = value['leg']
        return _mday, _round, _leg

    @classmethod
    def _build_team(cls, league, site_name):
        """Create a new Team (``idt=0``) for ``league`` from its name on the site."""
        team = Team(idt=0)
        team.league = league
        team.name = (site_name + ' F') if league.gender == 'F' else site_name
        team.short_name = team.name[:3].upper()
        team.long_name = team.name
        team.names = {cls.__name__.lower(): site_name}
        team.id_sport = league.sport.id
        team.country = Country(idt=league.country.id)
        team.gender = league.gender
        team.images = {'png': 'default-team.png', '50': 'h50-default-team.svg',
                       '30': 'h30-default-team.svg', '80': 'h80-default-team.svg'}
        return team

    @classmethod
    def create_schedule_from_url(cls, league, data):
        """Yield a new Match (``idt=0``) for every match row of a schedule page.

        Matchday, round and leg come from ``league.round_dates``.  ``idof10``
        numbers the matches from 0 and restarts whenever matchday, round or
        leg changes.
        """
        # NOTE: this changes the locale of the whole process (it is not
        # restored) and needs the fr_FR locale to be installed.
        locale.setlocale(locale.LC_ALL, 'fr_FR.utf-8')
        html = data.decode(cls.CHARSET)
        soup = BeautifulSoup(html, 'html.parser')
        date = None
        current_mday = 0
        current_round = None
        current_leg = 0
        idof10 = 0
        for table in soup.find_all('table', class_='table table-striped table-hover'):
            for tr in table.find_all('tr'):
                th = tr.find('th')
                if th is not None:
                    date = datetime.strptime(th.text.strip(), '%A %d %B %Y')
                    position = cls._current_mday_round_leg(league, date)
                    if position != (current_mday, current_round, current_leg):
                        current_mday, current_round, current_leg = position
                        # NOTE(review): the counter reset is assumed to be
                        # part of this branch -- confirm.
                        idof10 = 0
                elif 'data-matchid' in tr.attrs and date is not None:
                    match = Match(idt=0)
                    match.idof10 = idof10
                    match.league = league
                    match.mday = current_mday
                    match.round = current_round
                    match.leg = current_leg
                    match.start_date = cls._kickoff(date, tr.find('td', class_='lm1'))
                    td_score = tr.find('td', class_='lm3')
                    match.url = urljoin('http://www.matchendirect.fr/', td_score.find('a').attrs['href'])
                    home_name = td_score.find('span', class_='lm3_eq1').contents[0].strip(' \n\t*')
                    match.home = cls._build_team(league, home_name)
                    away_name = td_score.find('span', class_='lm3_eq2').contents[-1].strip(' \n\t*')
                    match.away = cls._build_team(league, away_name)
                    idof10 += 1
                    yield match