New function updateSchedule

This commit is contained in:
Samuel Campos 2020-10-19 11:26:23 +02:00
parent bd0366fdc3
commit f07ed95702
13 changed files with 500 additions and 212 deletions

1
go.mod
View File

@ -4,6 +4,7 @@ go 1.12
require ( require (
github.com/PuerkitoBio/goquery v1.6.0 github.com/PuerkitoBio/goquery v1.6.0
github.com/go-redis/redis v6.15.9+incompatible
github.com/lib/pq v1.8.0 github.com/lib/pq v1.8.0
github.com/mmcdole/gofeed v1.1.0 github.com/mmcdole/gofeed v1.1.0
golang.org/x/net v0.0.0-20201009032441-dbdefad45b89 golang.org/x/net v0.0.0-20201009032441-dbdefad45b89

2
go.sum
View File

@ -7,6 +7,8 @@ github.com/andybalholm/cascadia v1.1.0/go.mod h1:GsXiBklL0woXo1j/WYWtSYYC4ouU9Pq
github.com/cpuguy83/go-md2man/v2 v2.0.0-20190314233015-f79a8a8ca69d/go.mod h1:maD7wRr/U5Z6m/iR4s+kqSMx2CaBsrgA7czyZG/E6dU= github.com/cpuguy83/go-md2man/v2 v2.0.0-20190314233015-f79a8a8ca69d/go.mod h1:maD7wRr/U5Z6m/iR4s+kqSMx2CaBsrgA7czyZG/E6dU=
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/go-redis/redis v6.15.9+incompatible h1:K0pv1D7EQUjfyoMql+r/jZqCLizCGKFlFgcHWWmHQjg=
github.com/go-redis/redis v6.15.9+incompatible/go.mod h1:NAIEuMOZ/fxfXJIrKDQDz8wamY7mA7PouImQ2Jvg6kA=
github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg=
github.com/json-iterator/go v1.1.10 h1:Kz6Cvnvv2wGdaG/V8yMvfkmNiXq9Ya2KUv4rouJJr68= github.com/json-iterator/go v1.1.10 h1:Kz6Cvnvv2wGdaG/V8yMvfkmNiXq9Ya2KUv4rouJJr68=
github.com/json-iterator/go v1.1.10/go.mod h1:KdQUCv79m/52Kvf8AW2vK1V8akMuk1QjK/uOdHXbAo4= github.com/json-iterator/go v1.1.10/go.mod h1:KdQUCv79m/52Kvf8AW2vK1V8akMuk1QjK/uOdHXbAo4=

67
main.go
View File

@ -1,13 +1,13 @@
package main package main
import ( import (
"1bet.fr/scraper/utils"
"flag" "flag"
"fmt" "fmt"
"github.com/lib/pq"
"log" "log"
"sync" "sync"
"github.com/lib/pq"
"1bet.fr/scraper/match" "1bet.fr/scraper/match"
"1bet.fr/scraper/news" "1bet.fr/scraper/news"
"1bet.fr/scraper/postgres" "1bet.fr/scraper/postgres"
@ -84,38 +84,59 @@ func updateSchedule() {
} }
waitGroup := sync.WaitGroup{} waitGroup := sync.WaitGroup{}
sourceChannel := make(chan *match.Source) leagueChannel := make(chan *match.League)
for i := 0; i < nbProcesses; i++ { for i := 0; i < nbProcesses; i++ {
waitGroup.Add(1) waitGroup.Add(1)
go func(sc chan *match.Source, wg *sync.WaitGroup) { go func(lc chan *match.League, wg *sync.WaitGroup) {
defer wg.Done() defer wg.Done()
for s := range sc { for l := range lc {
matches, err := s.GetMatches() sources, err := l.ListSources()
if err != nil { if err != nil {
log.Fatal(err) l.Error = utils.StringPointer("list sources error")
l.Trace = utils.StringPointer(fmt.Sprint(err))
if updated, err := postgres.UpdateLeague(l); err != nil {
log.Fatalf("error while updating league : %s", err)
} else if updated != 1 {
log.Fatalf("error while updating league : %d league(s) updated", updated)
}
break
} }
for _, s := range sources {
for _, m := range matches { log.Printf("[+] Parsing source %s", s.URL)
fmt.Println(m) matches, err := s.GetMatches()
//if err = postgres.InsertMatch(m); err != nil { if err != nil {
// log.Fatal(err) log.Printf("[-] error while getting matches from league source %s : %s", s.URL, err)
//} s.League.Error = utils.StringPointer("league source error")
s.League.Trace = utils.StringPointer(fmt.Sprint(err))
if updated, err := postgres.UpdateLeague(s.League); err != nil {
log.Fatalf("error while updating league : %s", err)
} else if updated != 1 {
log.Fatalf("error while updating league : %d league(s) updated", updated)
}
break
}
for _, m := range matches {
if err := postgres.InsertTeamBySourceName(m.TeamHome, s.League); err != nil {
log.Fatalf("error while saving team home : %s", err)
}
if err := postgres.InsertTeamBySourceName(m.TeamAway, s.League); err != nil {
log.Fatalf("error while saving team away : %s", err)
}
if err := postgres.InsertMatch(m); err != nil {
log.Fatalf("error while saving match : %s", err)
}
log.Printf("New match #%d (%s - %s)", m.Id, m.TeamHome.Name, m.TeamAway.Name)
}
} }
continue
} }
}(sourceChannel, &waitGroup) }(leagueChannel, &waitGroup)
} }
for _, league := range leagues { for _, league := range leagues {
sources, err := league.ListSources() leagueChannel <- league
if err != nil {
log.Fatal(err)
}
for _, s := range sources {
log.Printf("Adding source %s", s.Url.String())
sourceChannel <- s
}
} }
close(leagueChannel)
waitGroup.Wait()
} }
func main() { func main() {

View File

@ -2,6 +2,7 @@ package match
import ( import (
"fmt" "fmt"
"log"
"net/url" "net/url"
"regexp" "regexp"
"strings" "strings"
@ -13,122 +14,206 @@ import (
"1bet.fr/scraper/utils" "1bet.fr/scraper/utils"
) )
const (
_ = iota
GenderMale = iota
//GenderFemale = iota
)
type Sport struct { type Sport struct {
Id int Id int
Name string Name string
CleanName string CleanName string
} }
const ( type Country struct {
_ = iota Id int
GenderMale = iota }
GenderFemale = iota
) type Player struct {
Id int
FullName string
CleanName string
Gender int
}
type Team struct { type Team struct {
Id int Id int
SportId int Sport *Sport
CountryId int Country *Country
Name string Name string
CleanName string CleanName string
ShortName string ShortName *string
LongName string LongName *string
Gender int Gender *int
Names interface{} Names *map[string]string
Url string PlayersUrl *string
Images interface{} Images *interface{}
Tags []string Tags *[]string
CleanTags []string CleanTags *[]string
NewsCount int NewsCount int
Error string Error *string
Trace string Trace *string
} }
const (
_ = iota
LegFirst = iota
LegSecond = iota
LegReplay = iota
)
const (
_ = iota
StatusFirstTime = iota
StatusHalfTime = iota
StatusSecondTime = iota
StatusFirstExtra = iota
StatusHalfExtra = iota
StatusSecondExtra = iota
StatusShootout = iota
StatusWaitScores = iota
StatusOver = iota
StatusPostponed = iota
StatusCancelled = iota
)
const (
_ = iota
WinnerHome = iota
WinnerAway = iota
WinnerDraw = iota
)
const (
_ = iota
ExtraTimeExtraTime = iota
ExtraTimeShootout = iota
)
type Match struct { type Match struct {
Id int Id int
LeagueId int League *League
TeamHomeId int TeamHome *Team
TeamAwayId int TeamAway *Team
PlayerHomeId int PlayerHome *Player
PlayerAwayId int PlayerAway *Player
MatchDay int MatchDay *int
MatchDayId int Round *string
Round string Leg *int
Leg int
BaseUrl string BaseUrl *string
ScoreUrl string ScoreUrl *string
LiveUrl string LiveUrl *string
TvChannels []string TvChannels *[]string
Status int MatchDayId *int
Minute int Status *int
Minute *string
StartDate *time.Time StartDate *time.Time
EndDate *time.Time EndDate *time.Time
HomeScore int HomeScore *int
AwayScore int AwayScore *int
SetsScore int SetsScore *[]interface{}
Winner int Winner *int
ExtraTime int ExtraTime *int
ShootoutHome int ShootoutHome *int
ShootoutAway int ShootoutAway *int
Squad []interface{} Squad *[]interface{}
Events []interface{} Events *[]interface{}
Stats []interface{} Stats *[]interface{}
Live []interface{} Live *[]interface{}
LastEvent interface{} LastEvent *interface{}
LastEventDate *time.Time LastEventDate *time.Time
Error string Error *string
Trace string Trace *string
}
// NewMatch allocates a Match bound to the given league, teams and players.
//
// round, matchDay and leg are stored through utils.StringPointer and
// utils.IntPointer; every other field of the returned Match keeps its zero
// value.
func NewMatch(league *League, teamHome *Team, teamAway *Team, playerHome *Player, playerAway *Player, round string, matchDay int, leg int) *Match {
	m := new(Match)

	// Participants.
	m.League = league
	m.TeamHome = teamHome
	m.TeamAway = teamAway
	m.PlayerHome = playerHome
	m.PlayerAway = playerAway

	// Position of the match inside the competition.
	m.MatchDay = utils.IntPointer(matchDay)
	m.Round = utils.StringPointer(round)
	m.Leg = utils.IntPointer(leg)

	return m
}
// source describes one schedule page of a league from which matches are
// scraped by GetMatches. Values are built with newSource.
type source struct {
// League is handed to every match built from this page.
League *League
// URL locates the page; GetMatches selects its parser from URL.Host.
URL *url.URL
// matchDay and round are forwarded to NewMatch for each match of the page.
matchDay int
round string
// currentDate holds the date of the last date header row seen by GetMatches;
// it stays nil until such a row has been parsed.
currentDate *time.Time
}
// newSource builds the source of a league page.
//
// scheme, host, path and query are assembled into the source URL (query is
// used verbatim as the raw query string); round and matchDay are stored as
// given. currentDate is left nil.
func newSource(league *League, scheme string, host string, path string, query string, round string, matchDay int) *source {
	location := url.URL{Scheme: scheme, Host: host, Path: path, RawQuery: query}

	src := source{League: league, URL: &location}
	src.round = round
	src.matchDay = matchDay
	return &src
}
func (s *source) GetMatches() ([]*Match, error) {
var matches []*Match
switch s.URL.Host {
case utils.HostMatchendirect:
doc, err := requests.GetDocumentFromURL(s.URL.String())
if err != nil {
return nil, err
}
doc.Find("#livescore tr").Each(func (i int, row *goquery.Selection) {
headCell := row.Find("th")
if len(headCell.Nodes) == 1 {
curDate, err := time.Parse("Monday 02 January 2006", utils.EnglishDateString(headCell.Text()))
if err != nil {
log.Fatalf("unexpected error while parsing date : %s", err)
return
}
s.currentDate = &curDate
} else {
match := NewMatch(s.League, &Team{Gender: s.League.Gender}, &Team{Gender: s.League.Gender}, &Player{}, &Player{}, s.round, s.matchDay, 0)
startTime := strings.TrimSpace(row.Find("td.lm1").Text())
if startTime == "-- : --" {
startTime = "00:00"
}
startDate, err := time.Parse(
"2006-01-02 15:04 MST",
s.currentDate.Format("2006-01-02 ") + startTime + " CEST",
)
if err != nil {
match.Error = utils.StringPointer("parse date error")
match.Trace = utils.StringPointer(fmt.Sprint(err))
matches = append(matches, match)
return
}
match.StartDate = &startDate
homeNames := map[string]string{utils.HostMatchendirect: strings.TrimSuffix(strings.TrimSpace(row.Find(".lm3_eq1").Text()), "*")}
match.TeamHome.Names = &homeNames
awayNames := map[string]string{utils.HostMatchendirect: strings.TrimSuffix(strings.TrimSpace(row.Find(".lm3_eq2").Text()), "*")}
match.TeamAway.Names = &awayNames
basePath, ok := row.Find(".lm3 a").First().Attr("href")
if !ok {
match.Error = utils.StringPointer("unable to find href attr")
match.Trace = utils.StringPointer("unable to find href attribute for element '.lm3 a'")
matches = append(matches, match)
return
}
baseUrl := &url.URL{Scheme: s.URL.Scheme, Host: s.URL.Host, Path: basePath}
match.BaseUrl = utils.StringPointer(baseUrl.String())
if match.Round == nil && match.MatchDay == nil && s.League.MatchDays != nil && s.League.MatchesByMatchDay != nil {
if s.League.currentMatchDayId == 0 {
s.League.currentMatchDay ++
}
curMatchDay := s.League.currentMatchDay
curMatchDayId := s.League.currentMatchDayId
match.MatchDay = &curMatchDay
match.MatchDayId = &curMatchDayId
s.League.currentMatchDayId = (s.League.currentMatchDayId + 1) % *s.League.MatchesByMatchDay
}
matches = append(matches, match)
}
})
default:
return nil, fmt.Errorf("unexpected source url %s", s.URL.String())
}
return matches, nil
} }
type League struct { type League struct {
Id int Id int
Sport *Sport Sport *Sport
CountryId int Country *Country
Name string Name string
CleanName string CleanName string
@ -140,7 +225,6 @@ type League struct {
ChannelUrl *string ChannelUrl *string
MatchDays *int MatchDays *int
CurrentMatchDay *int
MatchesByMatchDay *int MatchesByMatchDay *int
TeamCount int TeamCount int
@ -158,10 +242,13 @@ type League struct {
Error *string Error *string
Trace *string Trace *string
currentMatchDay int
currentMatchDayId int
} }
func (l *League) ListSources() ([]*Source, error) { func (l *League) ListSources() ([]*source, error) {
var sources []*Source var sources []*source
if l.ScheduleUrl == nil { if l.ScheduleUrl == nil {
return nil, nil return nil, nil
@ -187,19 +274,13 @@ func (l *League) ListSources() ([]*Source, error) {
parsedDate := strings.Split(strings.Split(value, "/")[3], "-") parsedDate := strings.Split(strings.Split(value, "/")[3], "-")
year := utils.AtoI(parsedDate[0]) year := utils.AtoI(parsedDate[0])
week := utils.AtoI(parsedDate[1]) week := utils.AtoI(parsedDate[1])
if year >= 2020 && week >= 34 { if (year == 2020 && week >= 34) || year > 2020 {
sources = append(sources, &Source{ sources = append(sources, newSource(l, originUrl.Scheme, originUrl.Host, value, "", "", 0))
League: l,
Url: &url.URL{
Scheme: originUrl.Scheme,
Host: originUrl.Host,
Path: value,
},
})
} }
}) })
case utils.HostEurosport, utils.HostRugbyrama: case utils.HostEurosport, utils.HostRugbyrama:
eurosportRegexp := regexp.MustCompile(`(\d)+e\s+Journée`) curRegexp := regexp.MustCompile(`(\d)+e\s+Journée`)
ajaxUrl, ok := doc.Find(".ajax-container").Attr("data-ajax-url") ajaxUrl, ok := doc.Find(".ajax-container").Attr("data-ajax-url")
if !ok { if !ok {
@ -212,74 +293,23 @@ func (l *League) ListSources() ([]*Source, error) {
ajaxQuery := ajaxParsedUrl.Query() ajaxQuery := ajaxParsedUrl.Query()
doc.Find("#results-match-nav .rounds-dropdown__round").Each(func (i int, s *goquery.Selection) { doc.Find("#results-match-nav .rounds-dropdown__round").Each(func (i int, s *goquery.Selection) {
var round *string matchDay := 0
var matchDay *int round, _ := s.Attr("data-label")
reMatch := curRegexp.FindStringSubmatch(round)
roundStr, _ := s.Attr("data-label")
reMatch := eurosportRegexp.FindStringSubmatch(roundStr)
if reMatch != nil { if reMatch != nil {
mdayInt := utils.AtoI(reMatch[1]) round = ""
matchDay = &mdayInt matchDay = utils.AtoI(reMatch[1])
} else {
round = &roundStr
} }
roundId, _ := s.Attr("data-round-id") roundId, _ := s.Attr("data-round-id")
ajaxQuery.Set("roundid", roundId) ajaxQuery.Set("roundid", roundId)
sources = append(sources, newSource(l, originUrl.Scheme, originUrl.Host, ajaxParsedUrl.Path, ajaxQuery.Encode(), round, matchDay))
sources = append(sources, &Source{
League: l,
Url: &url.URL{
Scheme: originUrl.Scheme,
Host: originUrl.Host,
Path: ajaxParsedUrl.Path,
RawQuery: ajaxQuery.Encode(),
},
Round: round,
MatchDay: matchDay,
})
}) })
default: default:
return nil, fmt.Errorf("unknown source url : %s", *l.ScheduleUrl) return nil, fmt.Errorf("unknown source url : %s", *l.ScheduleUrl)
} }
return sources, nil return sources, nil
} }
type Source struct {
League *League
Url *url.URL
MatchDay *int
Round *string
currentDate *time.Time
}
func (s *Source) GetMatches() ([]*Match, error) {
var matches []*Match
switch s.Url.Host {
case utils.HostMatchendirect:
doc, err := requests.GetDocumentFromURL(s.Url.String())
if err != nil {
return nil, err
}
doc.Find("#livescore tr").Each(func (i int, row *goquery.Selection) {
row.Children().Each(func (j int, col *goquery.Selection) {
colspan, ok := col.Attr("colspan")
if ok && colspan == "4" {
currentDate, err := time.Parse("Monday 02 January 2006", utils.EnglishDateString(col.Text()))
if err != nil {
fmt.Println(err)
return
}
s.currentDate = &currentDate
fmt.Println(s.currentDate)
}
})
})
case utils.HostEurosport, utils.HostRugbyrama:
default:
return nil, fmt.Errorf("unexpected source url %s", s.Url.String())
}
return matches, nil
}

View File

@ -1,6 +1,7 @@
package match package match
import ( import (
"1bet.fr/scraper/utils"
"net/url" "net/url"
"strings" "strings"
@ -21,8 +22,8 @@ func TestLeague_ListSources(t *testing.T) {
t.Errorf("no sources found") t.Errorf("no sources found")
} }
for _, s := range sources { for _, s := range sources {
if !strings.HasPrefix(s.Url.String(), "http://www.matchendirect.fr/france/ligue-1/") { if !strings.HasPrefix(s.URL.String(), "http://www.matchendirect.fr/france/ligue-1/") {
t.Errorf("unexpected source url %s", s.Url) t.Errorf("unexpected source url %s", s.URL)
} }
} }
@ -39,8 +40,8 @@ func TestLeague_ListSources(t *testing.T) {
t.Errorf("no sources found") t.Errorf("no sources found")
} }
for _, s := range sources { for _, s := range sources {
if !strings.HasPrefix(s.Url.String(), "https://www.eurosport.fr/") { if !strings.HasPrefix(s.URL.String(), "https://www.eurosport.fr/") {
t.Errorf("unexpected source url %s", s.Url) t.Errorf("unexpected source url %s", s.URL)
} }
} }
@ -57,17 +58,47 @@ func TestLeague_ListSources(t *testing.T) {
t.Errorf("no sources found") t.Errorf("no sources found")
} }
for _, s := range sources { for _, s := range sources {
if !strings.HasPrefix(s.Url.String(), "https://www.rugbyrama.fr/") { if !strings.HasPrefix(s.URL.String(), "https://www.rugbyrama.fr/") {
t.Errorf("unexpected source url %s", s.Url) t.Errorf("unexpected source url %s", s.URL)
} }
} }
} }
func TestSource_GetMatches(t *testing.T) { func TestSource_GetMatches(t *testing.T) {
sourceUrl, _ := url.Parse("https://www.matchendirect.fr/france/ligue-1/2020-37/") sourceUrl, _ := url.Parse("https://www.matchendirect.fr/france/ligue-1/2020-37/")
source := &Source{ league := &League{Id: 1, MatchDays: utils.IntPointer(38), MatchesByMatchDay: utils.IntPointer(10)}
League: &League{Id: 1}, source := newSource(league, sourceUrl.Scheme, sourceUrl.Host, sourceUrl.Path, "", "", 0)
Url: sourceUrl, matches, err := source.GetMatches()
if err != nil {
t.Error(err)
}
for _, m := range matches {
if m.League == nil {
t.Error("unexpected nil match.League")
}
if m.StartDate == nil {
t.Error("unexpected nil match.StartDate")
}
if m.TeamHome.Names == nil {
t.Error("unexpected nil match.TeamHome.Names")
}
if m.TeamAway.Names == nil {
t.Error("unexpected nil match.TeamAway.Names")
}
if m.PlayerHome == nil {
t.Error("unexpected nil match.PlayerHome")
}
if m.PlayerAway == nil {
t.Error("unexpected nil match.PlayerAway")
}
if m.MatchDay == nil {
t.Error("unexpected nil match.MatchDay")
}
if m.MatchDayId == nil {
t.Error("unexpected nil match.MatchDayId")
}
if m.Error != nil {
t.Errorf("unexpected not nil match.Error : %s", *m.Error)
}
} }
source.GetMatches()
} }

View File

@ -137,7 +137,8 @@ func (n *News) Feed() error {
n.Author = utils.StringPointer(doc.Find(".article__author a").Text()) n.Author = utils.StringPointer(doc.Find(".article__author a").Text())
default: default:
n.Error = utils.StringPointer("unknown host " + parsedLink.Host) n.Error = utils.StringPointer("unknown link host")
n.Trace = utils.StringPointer("unknown link host : " + n.Link)
} }
if n.Content == nil { if n.Content == nil {

View File

@ -94,7 +94,7 @@ func TestNews_Feed(t *testing.T) {
t.Errorf("cleanTags does not contain football") t.Errorf("cleanTags does not contain football")
} }
t.Logf("testing feed from Foot Mercato") t.Logf("testing feed from Rugbyrama")
n = &News{ n = &News{
Source: &Source{Sport: &Sport{Name: "Rugby", CleanName: "rugby"}}, Source: &Source{Sport: &Sport{Name: "Rugby", CleanName: "rugby"}},
Link: "https://www.rugbyrama.fr/rugby/top-14/2018-2019/top-14-face-au-racing-92-toulouse-n-aura-pas-de-marge-de-manoeuvre_sto7939622/story.shtml", Link: "https://www.rugbyrama.fr/rugby/top-14/2018-2019/top-14-face-au-racing-92-toulouse-n-aura-pas-de-marge-de-manoeuvre_sto7939622/story.shtml",
@ -136,10 +136,10 @@ func TestSource_ListNews(t *testing.T) {
} }
for _, n := range newsList { for _, n := range newsList {
if n.Title == "" { if n.Title == "" {
t.Errorf("unexpected empty title") t.Errorf("unexpected empty news title")
} }
if n.Image == nil { if n.Link == "" {
t.Errorf("unexpected nil image") t.Errorf("unexpected empty news link")
} }
} }
} }

View File

@ -1,10 +1,13 @@
package postgres package postgres
import ( import (
"crypto/sha256"
"database/sql" "database/sql"
"fmt" "fmt"
"log" "log"
"math"
"os" "os"
"strings"
"github.com/lib/pq" "github.com/lib/pq"
@ -46,6 +49,25 @@ func aValue(a *[]string) interface{} {
return pq.Array(*a) return pq.Array(*a)
} }
// foreignId converts an identifier into a query argument: 0 becomes a nil
// interface value, any other identifier is returned unchanged.
func foreignId(i int) interface{} {
	var arg interface{}
	if i != 0 {
		arg = i
	}
	return arg
}
// concatWS joins the elements of a with the separator s. Each element is
// rendered with fmt.Sprint, except nil elements which are rendered as an
// empty string.
func concatWS(a []interface{}, s string) string {
	parts := make([]string, len(a))
	for idx := range a {
		// Slots of nil elements keep the empty string they were created with.
		if a[idx] != nil {
			parts[idx] = fmt.Sprint(a[idx])
		}
	}
	return strings.Join(parts, s)
}
var pg *Postgres var pg *Postgres
func init() { func init() {
@ -131,9 +153,9 @@ func ListLeagues() ([]*match.League, error) {
} }
for rows.Next() { for rows.Next() {
league := &match.League{Sport: &match.Sport{}} league := &match.League{Sport: &match.Sport{}, Country: &match.Country{}}
if err = rows.Scan( if err = rows.Scan(
&league.Id, &league.Sport.Id, &league.CountryId, &league.Name, &league.CleanName, &league.Gender, &league.Id, &league.Sport.Id, &league.Country.Id, &league.Name, &league.CleanName, &league.Gender,
&league.ScheduleUrl, &league.RankingUrl, &league.ChannelUrl, &league.ScheduleUrl, &league.RankingUrl, &league.ChannelUrl,
&league.MatchDays, &league.MatchesByMatchDay, pq.Array(&league.Rounds), pq.Array(&league.Groups), &league.MatchDays, &league.MatchesByMatchDay, pq.Array(&league.Rounds), pq.Array(&league.Groups),
&league.Sport.Name, &league.Sport.CleanName, &league.Sport.Name, &league.Sport.CleanName,
@ -145,9 +167,22 @@ func ListLeagues() ([]*match.League, error) {
return leagues, nil return leagues, nil
} }
// UpdateLeague stores the error and trace of l (each converted by sValue) in
// the mainapp_league row whose id is l.Id.
//
// It returns the number of rows affected by the statement, or 0 and the error
// when the statement could not be executed.
func UpdateLeague(l *match.League) (int64, error) {
	result, execErr := pg.psqlConn.Exec(`
UPDATE mainapp_league
SET error = $1, trace = $2
WHERE id = $3
`, sValue(l.Error), sValue(l.Trace), l.Id,
	)
	if execErr == nil {
		return result.RowsAffected()
	}
	return 0, execErr
}
func InsertNews(n *news.News) error { func InsertNews(n *news.News) error {
return pg.psqlConn.QueryRow(` return pg.psqlConn.QueryRow(`
INSERT INTO public.mainapp_news INSERT INTO mainapp_news
(title, clean_title, link, pub_date, description, image, teaser, author, (title, clean_title, link, pub_date, description, image, teaser, author,
content, redirect, haystack, tags, clean_tags, error, trace, content, redirect, haystack, tags, clean_tags, error, trace,
league_id, source_id, team_id) league_id, source_id, team_id)
@ -165,7 +200,7 @@ func InsertNews(n *news.News) error {
func UpdateNews(n *news.News) (int64, error) { func UpdateNews(n *news.News) (int64, error) {
res, err := pg.psqlConn.Exec(` res, err := pg.psqlConn.Exec(`
UPDATE public.mainapp_news UPDATE mainapp_news
SET title = $1, clean_title = $2, pub_date = $3, link = $4, description = $5, SET title = $1, clean_title = $2, pub_date = $3, link = $4, description = $5,
image = $6, teaser = $7, author = $8, content = $9, redirect = $10, image = $6, teaser = $7, author = $8, content = $9, redirect = $10,
haystack = $11, tags = $12, clean_tags = $13, error = $14, trace = $15, haystack = $11, tags = $12, clean_tags = $13, error = $14, trace = $15,
@ -185,7 +220,7 @@ func UpdateNews(n *news.News) (int64, error) {
func DeleteNews(n *news.News) (int64, error) { func DeleteNews(n *news.News) (int64, error) {
res, err := pg.psqlConn.Exec(` res, err := pg.psqlConn.Exec(`
DELETE FROM public.mainapp_news DELETE FROM mainapp_news
WHERE id = $1 WHERE id = $1
`, n.Id) `, n.Id)
if err != nil { if err != nil {
@ -193,3 +228,84 @@ func DeleteNews(n *news.News) (int64, error) {
} }
return res.RowsAffected() return res.RowsAffected()
} }
// InsertTeamBySourceName resolves the team t from the name it carries on a
// source site, creating the team when it is not known yet.
//
// One (host, name) pair is taken from t.Names. Map iteration order is not
// specified, so when several pairs are present the one used is arbitrary.
// The team is first looked up in mainapp_team by that pair; when no row
// matches, a row is inserted with the sport, country and gender of l, or, on
// a custom_unique_team conflict, the name is added to the names of the
// existing row. In every successful case t.Id and t.Name are filled from the
// database.
//
// An error is returned when t.Names is nil or empty, when the lookup fails
// for a reason other than "no row found", or when the insert fails.
func InsertTeamBySourceName(t *match.Team, l *match.League) error {
	if t.Names == nil || len(*t.Names) == 0 {
		return fmt.Errorf("no source name given")
	}

	var host, name string
	for host, name = range *t.Names {
		break
	}

	// Short name: the first three runes (or fewer) of the upper-cased name.
	// math.Min works on float64, hence the conversions.
	runeName := []rune(strings.ToUpper(name))
	shortName := string(runeName[:int(math.Min(3, float64(len(runeName))))])

	err := pg.psqlConn.QueryRow("SELECT id, name FROM mainapp_team WHERE names->>$1 = $2", host, name).Scan(&t.Id, &t.Name)
	if err == nil {
		return nil
	}
	// Only an empty result means the team has to be created; any other
	// failure (connection, scan, ...) is reported as is.
	if err != sql.ErrNoRows {
		return err
	}

	cleanName := utils.Sanitize(name)
	// NOTE(review): the lookup above uses the raw host as JSON key while the
	// path below uses the sanitized host; if utils.Sanitize alters the host
	// the two keys differ — confirm.
	jsonHost := fmt.Sprintf("{\"%s\"}", utils.Sanitize(host))
	return pg.psqlConn.QueryRow(`
INSERT INTO mainapp_team
(sport_id, country_id, name, clean_name, short_name, long_name, gender,
names, tags, clean_tags, news_count)
VALUES
($1, $2, $3, $4, $5, $6, $7,
jsonb_set('{}', $8, to_jsonb($9::text), true), $10, $11, 0)
ON CONFLICT ON CONSTRAINT custom_unique_team DO UPDATE SET
names = jsonb_set(mainapp_team.names, $12, to_jsonb($13::text), true)
RETURNING id, name
`, l.Sport.Id, l.Country.Id, name, cleanName, shortName, name, iValue(l.Gender),
		jsonHost, name, pq.Array([]string{name}), pq.Array([]string{cleanName}),
		jsonHost, name).Scan(&t.Id, &t.Name)
}
// DeleteTeam removes the mainapp_team row whose id is t.Id.
//
// It returns the number of rows affected by the statement, or 0 and the error
// when the statement could not be executed.
func DeleteTeam(t *match.Team) (int64, error) {
	result, execErr := pg.psqlConn.Exec(`
DELETE FROM mainapp_team
WHERE id = $1
`, t.Id)
	if execErr == nil {
		return result.RowsAffected()
	}
	return 0, execErr
}
// InsertMatch saves m into mainapp_match and stores the returned row id in
// m.Id.
//
// The row is identified by its sign: the hexadecimal SHA-256 of the league
// id, both team ids, both player ids (each passed through foreignId) and the
// round (passed through sValue), joined with "/". When a row with the same
// sign already exists (constraint mainapp_match_sign_key), only its base_url,
// start_date, error and trace are refreshed.
func InsertMatch(m *match.Match) error {
	signParts := []interface{}{
		m.League.Id,
		foreignId(m.TeamHome.Id),
		foreignId(m.TeamAway.Id),
		foreignId(m.PlayerHome.Id),
		foreignId(m.PlayerAway.Id),
		sValue(m.Round),
	}
	digest := sha256.Sum256([]byte(concatWS(signParts, "/")))
	sign := fmt.Sprintf("%x", digest[:])

	row := pg.psqlConn.QueryRow(`
INSERT INTO mainapp_match
(league_id, team_home_id, team_away_id, player_home_id, player_away_id, mday, round, leg, sign,
mday_id, base_url, start_date, error, trace)
VALUES
($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14)
ON CONFLICT ON CONSTRAINT mainapp_match_sign_key DO UPDATE SET
base_url = $11, start_date = $12, error = $13, trace = $14
RETURNING id
`, m.League.Id, foreignId(m.TeamHome.Id), foreignId(m.TeamAway.Id), foreignId(m.PlayerHome.Id),
		foreignId(m.PlayerAway.Id), iValue(m.MatchDay), sValue(m.Round), iValue(m.Leg), sign,
		iValue(m.MatchDayId), sValue(m.BaseUrl), m.StartDate, sValue(m.Error), sValue(m.Trace))
	return row.Scan(&m.Id)
}
// DeleteMatch removes the mainapp_match row whose id is m.Id.
//
// It returns the number of rows affected by the statement, or 0 and the error
// when the statement could not be executed.
func DeleteMatch(m *match.Match) (int64, error) {
	result, execErr := pg.psqlConn.Exec(`
DELETE FROM mainapp_match
WHERE id = $1
`, m.Id)
	if execErr == nil {
		return result.RowsAffected()
	}
	return 0, execErr
}

View File

@ -1,6 +1,7 @@
package postgres package postgres
import ( import (
"1bet.fr/scraper/match"
"testing" "testing"
"time" "time"
@ -8,7 +9,12 @@ import (
"1bet.fr/scraper/utils" "1bet.fr/scraper/utils"
) )
var n *news.News var (
se *news.Source
ns *news.News
tm *match.Team
mh *match.Match
)
func TestConnect(t *testing.T) { func TestConnect(t *testing.T) {
return return
@ -24,8 +30,20 @@ func TestListLeagues(t *testing.T) {
} }
} }
func TestUpdateLeague(t *testing.T) {
league := &match.League{Id: 1}
updated, err := UpdateLeague(league)
if err != nil {
t.Error(err)
}
if updated != 1 {
t.Errorf("unexpected %d updated rows", updated)
}
}
func TestListSources(t *testing.T) { func TestListSources(t *testing.T) {
sources, err := ListSources() sources, err := ListSources()
se = sources[0]
if err != nil { if err != nil {
t.Errorf("unexpected error : %s", err) t.Errorf("unexpected error : %s", err)
} }
@ -38,8 +56,8 @@ func TestInsertNews(t *testing.T) {
tags := []string{"Test", "Hello Toto"} tags := []string{"Test", "Hello Toto"}
cleanTags := []string{"test", "hello-toto"} cleanTags := []string{"test", "hello-toto"}
nowTime := time.Now() nowTime := time.Now()
n = &news.News{ ns = &news.News{
Source: &news.Source{Id: 1, Sport: &news.Sport{Id: 1}}, Source: se,
PubDate: &nowTime, PubDate: &nowTime,
Link: "https://test.com/toto", Link: "https://test.com/toto",
Title: "Hello toto", Title: "Hello toto",
@ -47,30 +65,30 @@ func TestInsertNews(t *testing.T) {
Tags: &tags, Tags: &tags,
CleanTags: &cleanTags, CleanTags: &cleanTags,
} }
err := InsertNews(n) err := InsertNews(ns)
if err != nil { if err != nil {
t.Error(err) t.Error(err)
} }
if n.Id == 0 { if ns.Id == 0 {
t.Errorf("unexpected value 0 for n.Id") t.Errorf("unexpected value 0 for n.Id")
} }
} }
func TestUpdateNews(t *testing.T) { func TestUpdateNews(t *testing.T) {
content := []string{"toto", "test"} content := []string{"toto", "test"}
n.Content = &content ns.Content = &content
n.Author = utils.StringPointer("T. Toto") ns.Author = utils.StringPointer("T. Toto")
updated, err := UpdateNews(n) updated, err := UpdateNews(ns)
if err != nil { if err != nil {
t.Error(err) t.Error(err)
} }
if updated != 1 { if updated != 1 {
t.Errorf("unexpected %d update rows", updated) t.Errorf("unexpected %d updated rows", updated)
} }
} }
func TestDeleteNews(t *testing.T) { func TestDeleteNews(t *testing.T) {
deleted, err := DeleteNews(n) deleted, err := DeleteNews(ns)
if err != nil { if err != nil {
t.Error(err) t.Error(err)
} }
@ -79,6 +97,60 @@ func TestDeleteNews(t *testing.T) {
} }
} }
// TestInsertTeamBySourceName saves the same team twice and expects both calls
// to succeed and to leave a non-zero id on the team. The team is kept in the
// package-level variable tm.
func TestInsertTeamBySourceName(t *testing.T) {
	names := map[string]string{utils.HostMatchendirect: "Toto"}
	league := &match.League{
		Id:      1,
		Sport:   &match.Sport{Id: 1},
		Country: &match.Country{Id: 1},
		Gender:  utils.IntPointer(match.GenderMale),
	}
	tm = &match.Team{Names: &names}

	// Two passes with the same team.
	for pass := 0; pass < 2; pass++ {
		err := InsertTeamBySourceName(tm, league)
		if err != nil {
			t.Errorf("unexpected error : %s", err)
		}
		if tm.Id == 0 {
			t.Error("unexpected zero team.Id")
		}
	}
}
// TestInsertMatch saves the same match twice and expects both calls to
// succeed and to leave a non-zero id on the match. The match uses the
// package-level team tm on both sides and is kept in the package-level
// variable mh.
func TestInsertMatch(t *testing.T) {
	now := time.Now()
	mh = match.NewMatch(&match.League{Id: 1}, tm, tm, &match.Player{}, &match.Player{}, "", 0, 0)
	mh.StartDate = &now
	mh.BaseUrl = utils.StringPointer("https://test.com/toto")

	// Two passes with the same match.
	for pass := 0; pass < 2; pass++ {
		err := InsertMatch(mh)
		if err != nil {
			t.Error(err)
		}
		if mh.Id == 0 {
			t.Errorf("unexpected zero match.Id")
		}
	}
}
// TestDeleteMatch deletes the package-level match mh and expects exactly one
// row to be removed.
func TestDeleteMatch(t *testing.T) {
	rows, err := DeleteMatch(mh)
	if err != nil {
		t.Error(err)
	}
	if rows == 1 {
		return
	}
	t.Errorf("unexpected %d matches deleted", rows)
}
// TestDeleteTeam deletes the package-level team tm and expects exactly one
// row to be removed.
func TestDeleteTeam(t *testing.T) {
	rows, err := DeleteTeam(tm)
	if err != nil {
		t.Error(err)
	}
	if rows == 1 {
		return
	}
	t.Errorf("unexpected %d teams deleted", rows)
}
func TestClose(t *testing.T) { func TestClose(t *testing.T) {
Close() Close()
} }

View File

@ -32,7 +32,7 @@ predictions between friends.
I decline any responsibility about your eventual usages of this project. I decline any responsibility about your eventual usages of this project.
## 2- DEPLOYMENT ## 2- DEPLOYMENT
The deployment is very simple as the binary `scraper` can be used directly. The deployment is very simple as the binary `scraper` can be used directly.

View File

@ -2,20 +2,23 @@ package requests
import ( import (
"fmt" "fmt"
"github.com/PuerkitoBio/goquery"
"log" "log"
"net/http" "net/http"
"net/url" "net/url"
"github.com/PuerkitoBio/goquery"
"github.com/go-redis/redis"
"golang.org/x/net/proxy" "golang.org/x/net/proxy"
) )
const ( const (
torAddr = "socks5://127.0.0.1:9050" redisAddr = "127.0.0.1:6379"
userAgent = "Mozilla/5.0 (X11; Linux x86_64…) Gecko/20100101 Firefox/68.0" torAddr = "socks5://127.0.0.1:9050"
defaultAgent = "Mozilla/5.0 (X11; Linux x86_64…) Gecko/20100101 Firefox/68.0"
) )
var cli *http.Client var cli *http.Client
var red *redis.Client
func init() { func init() {
proxyUrl, err := url.Parse(torAddr) proxyUrl, err := url.Parse(torAddr)
@ -34,6 +37,13 @@ func init() {
cli = &http.Client{ cli = &http.Client{
Transport: transport, Transport: transport,
} }
red = redis.NewClient(&redis.Options{
Addr: redisAddr,
})
if pong := red.Ping().Val(); pong != "PONG" {
log.Fatalf("unexpected response from redis PING conmmand : %s", pong)
}
} }
func GetDocumentFromURL(url string) (*goquery.Document, error) { func GetDocumentFromURL(url string) (*goquery.Document, error) {
@ -41,7 +51,11 @@ func GetDocumentFromURL(url string) (*goquery.Document, error) {
if err != nil { if err != nil {
return nil, fmt.Errorf("error while building request: %s", err) return nil, fmt.Errorf("error while building request: %s", err)
} }
req.Header.Set("User-Agent", userAgent) agent := red.SRandMember("agents").Val()
if agent == "" {
agent = defaultAgent
}
req.Header.Set("User-Agent", agent)
resp, err := cli.Do(req) resp, err := cli.Do(req)
if err != nil { if err != nil {

BIN
scraper

Binary file not shown.

View File

@ -32,7 +32,7 @@ func Sanitize(s string) (t string) {
} else if int(c) >= int('0') && int(c) <= int('9') { } else if int(c) >= int('0') && int(c) <= int('9') {
t += string(c) t += string(c)
} else if int(c) >= int('A') && int(c) <= int('Z') { } else if int(c) >= int('A') && int(c) <= int('Z') {
t += string(int(c) - int('A') + int('a')) t += string(rune(int(c) - int('A') + int('a')))
} else if v, ok := symbols[c]; ok { } else if v, ok := symbols[c]; ok {
t += v t += v
} else { } else {