cronpy/providers/transfermarkt.py

157 lines
6.8 KiB
Python

from urllib.parse import urlsplit
from bs4 import BeautifulSoup
from datetime import datetime
import locale
from providers.base import BaseProvider
from lib.player import Player
class TransferMarkt(BaseProvider):
    """Provider that parses team pages served by www.transfermarkt.fr.

    All label look-ups below use the French strings found in the page
    markup, so this class only understands the French edition of the site.
    """

    DOMAINS = {'www.transfermarkt.fr'}
    CHARSET = 'UTF-8'
    # Role label (as found in the page) -> Player role constant.
    ROLES = {
        'Gardien': Player.ROLE_GOALKEEPER,
        'Défense': Player.ROLE_DEFENDER,
        'Milieu de terrain': Player.ROLE_MIDFIELDER,
        'Attaquant': Player.ROLE_ATTACKER
    }
    # Preferred-foot label (as found in the page) -> Player foot constant.
    FEET = {
        'droit': Player.FOOT_RIGHT,
        'gauche': Player.FOOT_LEFT,
        'des deux pieds': Player.FOOT_BOTH
    }
    # <optgroup> label -> key of the dict returned by get_team_staff().
    # Any label missing from this table is filed under 'attackers'.
    _STAFF_KEYS = {
        'Gardien': 'goalkeepers',
        'Défense': 'defenders',
        'Milieu de terrain': 'midfielders'
    }
    # Market-value suffix -> multiplier used to convert the value to euros.
    _PRICE_UNITS = (('mio. €', 1e6), ('K €', 1e3))

    @classmethod
    def get_team_staff(cls, data):
        """Return the player names of a team page grouped by line.

        :param data: raw page content as bytes, encoded with ``CHARSET``.
        :return: dict whose keys are among ``'goalkeepers'``,
            ``'defenders'``, ``'midfielders'`` and ``'attackers'`` (only the
            groups present in the page) and whose values are lists of names.
            An empty dict is returned when the page has no
            ``spieler_select_breadcrumb`` <select> element.
        """
        html = data.decode(cls.CHARSET)
        soup = BeautifulSoup(html, 'html.parser')
        select = soup.find('select', id='spieler_select_breadcrumb')
        staff = dict()
        if select is None:
            # find() returns None when nothing matches; report "no staff"
            # instead of failing with an AttributeError on None.
            return staff
        for optgroup in select.find_all('optgroup'):
            # Everything before the first space of an option is dropped
            # (presumably the shirt number - TODO confirm against the page).
            players = [
                option.text.strip().partition(' ')[2]
                for option in optgroup.find_all('option')
            ]
            label = optgroup.attrs['label']
            staff[cls._STAFF_KEYS.get(label, 'attackers')] = players
        return staff

    @classmethod
    def get_team_players(cls, data, team, countries):
        """Yield one ``Player`` per player row found in a team page.

        This is a generator: nothing (including the locale change below)
        happens until iteration starts.

        :param data: raw page content as bytes, encoded with ``CHARSET``.
        :param team: value passed as ``team`` to every created ``Player``.
        :param countries: re-iterable collection of objects exposing a
            ``names`` mapping with a ``'transfermarkt'`` entry; it is scanned
            for every flag image found.
        :return: generator of ``Player`` objects. Parsing problems do not
            raise; they are collected as strings in ``player.error``, which
            is set to ``None`` when the row was parsed without any problem.
        """
        # NOTE(review): this changes the locale of the whole process and
        # never restores it. It is needed so that '%b' in strptime() matches
        # French month abbreviations. Kept as is because callers may rely on
        # the side effect.
        locale.setlocale(locale.LC_ALL, 'fr_FR.utf-8')
        html = data.decode(cls.CHARSET)
        soup = BeautifulSoup(html, 'html.parser')
        table = soup.find('div', id='yw1')
        if table is None:
            return
        for tr in table.find_all('tr'):
            tds = tr.find_all('td', recursive=False)
            # Only rows with exactly ten direct cells are player rows.
            if len(tds) == 10:
                yield cls._parse_player_row(tds, team, countries)

    @classmethod
    def _parse_player_row(cls, tds, team, countries):
        """Build a ``Player`` from the ten cells of one table row."""
        player = Player(team=team, error=list())

        # Cell 0: shirt number as text, role as the cell's title attribute.
        number = tds[0].text.strip()
        if number.isnumeric():
            player.number = int(number)
        elif number != '-':
            player.error.append("bad format number '{}'".format(number))
        role = tds[0].attrs.get('title')
        if role in cls.ROLES:
            player.role = cls.ROLES[role]
        else:
            player.error.append("bad format role '{}'".format(role))

        # Cell 2: birth date, followed by a parenthesised part that is cut
        # off. 'avr.' is rewritten to 'avril' before parsing with '%b'.
        birth_date = tds[2].text.split('(')[0].strip().replace('avr.', 'avril')
        try:
            player.set_age(datetime.strptime(birth_date, '%d %b %Y'))
        except ValueError:
            player.error.append("bad format birth_date '{}'".format(birth_date))

        # Cell 3: one or two flag images.
        cls._parse_countries(player, tds[3].find_all('img'), countries)

        # Cell 4: the text before the first 'm' with the comma removed,
        # e.g. '1,85' becomes 185.
        size = tds[4].text.split('m')[0].strip().replace(',', '')
        if size.isnumeric():
            player.size = int(size)
        elif size:
            player.error.append("bad format size '{}'".format(size))

        # Cell 5: preferred foot.
        foot = tds[5].text.strip()
        if foot in cls.FEET:
            player.foot = cls.FEET[foot]
        elif foot != '-':
            player.error.append("bad format foot '{}'".format(foot))

        # Cell 8: contract end as dd.mm.yyyy, or '-' when unknown.
        contract_date = tds[8].text.strip().replace('avr.', 'avril')
        if contract_date != '-':
            try:
                player.contract_end = datetime.strptime(contract_date, '%d.%m.%Y')
            except ValueError:
                player.error.append("bad format contract_end '{}'".format(contract_date))

        # Cell 9: market value.
        cls._parse_price(player, tds[9].text.strip())

        # Cell 1: nested table holding name, picture and position.
        cls._parse_identity(player, tds[1].find_all('tr'))

        if not player.error:
            player.error = None
        return player

    @staticmethod
    def _find_country(countries, img):
        """Return the country whose transfermarkt name equals the image's alt text, or None."""
        alt = img.attrs.get('alt')
        for country in countries:
            if country.names['transfermarkt'] == alt:
                return country
        return None

    @classmethod
    def _parse_countries(cls, player, imgs, countries):
        """Set ``country1`` / ``country2`` on *player* from the flag images."""
        if len(imgs) == 0:
            player.error.append("no country found")
            return
        # Only the first two flags are considered.
        for index, img in enumerate(imgs[:2], start=1):
            country = cls._find_country(countries, img)
            if country is not None:
                setattr(player, 'country{}'.format(index), country)
            else:
                player.error.append(
                    "unknown country{} '{}'".format(index, img.attrs.get('title')))

    @classmethod
    def _parse_price(cls, player, price):
        """Convert a market value such as '1,50 mio. €' or '500 K €' to euros."""
        for suffix, factor in cls._PRICE_UNITS:
            if price.endswith(suffix):
                try:
                    amount = float(price.split(' ')[0].replace(',', '.'))
                    # Round before converting: plain int() truncates, and
                    # binary float error can leave the product just below
                    # the intended integer.
                    player.set_price(int(round(amount * factor)))
                except ValueError:
                    player.error.append("price '{}' bad format".format(price))
                return
        if price != '-':
            player.error.append("bad format price '{}'".format(price))

    @staticmethod
    def _parse_identity(player, name_trs):
        """Fill names, image and position from the rows of the name cell."""
        if len(name_trs) == 0:
            player.error.append("tr containing full name not found")
            return

        span_name = name_trs[0].find('span', class_='hide-for-small')
        if span_name is not None:
            player.set_names(span_name.text.strip())
        else:
            player.error.append("span containing full name not found")

        image = name_trs[0].find('img', class_='bilderrahmen-fixed')
        image_url = image.attrs.get('src') if image is not None else None
        if image_url is not None:
            player.set_image(image_url)
            # What is left of the query string once 'lm=' is removed.
            player.image.set_lm(urlsplit(image_url).query.replace('lm=', ''))
        else:
            # Covers both a missing <img> and an <img> without a src
            # attribute; the latter used to raise a TypeError because
            # urlsplit(None) yields bytes components.
            player.error.append('no image found')

        if len(name_trs) > 1:
            player.position = name_trs[1].text.strip()
        else:
            player.error.append("tr containing position not found")