diff --git a/go.mod b/go.mod index fac0912..f467407 100644 --- a/go.mod +++ b/go.mod @@ -4,6 +4,7 @@ go 1.12 require ( github.com/PuerkitoBio/goquery v1.6.0 + github.com/go-redis/redis v6.15.9+incompatible github.com/lib/pq v1.8.0 github.com/mmcdole/gofeed v1.1.0 golang.org/x/net v0.0.0-20201009032441-dbdefad45b89 diff --git a/go.sum b/go.sum index c816ae4..ae2e6a8 100644 --- a/go.sum +++ b/go.sum @@ -7,6 +7,8 @@ github.com/andybalholm/cascadia v1.1.0/go.mod h1:GsXiBklL0woXo1j/WYWtSYYC4ouU9Pq github.com/cpuguy83/go-md2man/v2 v2.0.0-20190314233015-f79a8a8ca69d/go.mod h1:maD7wRr/U5Z6m/iR4s+kqSMx2CaBsrgA7czyZG/E6dU= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/go-redis/redis v6.15.9+incompatible h1:K0pv1D7EQUjfyoMql+r/jZqCLizCGKFlFgcHWWmHQjg= +github.com/go-redis/redis v6.15.9+incompatible/go.mod h1:NAIEuMOZ/fxfXJIrKDQDz8wamY7mA7PouImQ2Jvg6kA= github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= github.com/json-iterator/go v1.1.10 h1:Kz6Cvnvv2wGdaG/V8yMvfkmNiXq9Ya2KUv4rouJJr68= github.com/json-iterator/go v1.1.10/go.mod h1:KdQUCv79m/52Kvf8AW2vK1V8akMuk1QjK/uOdHXbAo4= diff --git a/main.go b/main.go index 94165c6..b7fc5b9 100644 --- a/main.go +++ b/main.go @@ -1,13 +1,13 @@ package main import ( + "1bet.fr/scraper/utils" "flag" "fmt" + "github.com/lib/pq" "log" "sync" - "github.com/lib/pq" - "1bet.fr/scraper/match" "1bet.fr/scraper/news" "1bet.fr/scraper/postgres" @@ -84,38 +84,59 @@ func updateSchedule() { } waitGroup := sync.WaitGroup{} - sourceChannel := make(chan *match.Source) + leagueChannel := make(chan *match.League) for i := 0; i < nbProcesses; i++ { waitGroup.Add(1) - go func(sc chan *match.Source, wg *sync.WaitGroup) { + go func(lc chan *match.League, wg *sync.WaitGroup) { defer wg.Done() - for s := range sc { - matches, err := s.GetMatches() + for l := range lc { + sources, err := 
l.ListSources() if err != nil { - log.Fatal(err) + l.Error = utils.StringPointer("list sources error") + l.Trace = utils.StringPointer(fmt.Sprint(err)) + if updated, err := postgres.UpdateLeague(l); err != nil { + log.Fatalf("error while updating league : %s", err) + } else if updated != 1 { + log.Fatalf("error while updating league : %d league(s) updated", updated) + } + break } - - for _, m := range matches { - fmt.Println(m) - //if err = postgres.InsertMatch(m); err != nil { - // log.Fatal(err) - //} + for _, s := range sources { + log.Printf("[+] Parsing source %s", s.URL) + matches, err := s.GetMatches() + if err != nil { + log.Printf("[-] error while getting matches from league source %s : %s", s.URL, err) + s.League.Error = utils.StringPointer("league source error") + s.League.Trace = utils.StringPointer(fmt.Sprint(err)) + if updated, err := postgres.UpdateLeague(s.League); err != nil { + log.Fatalf("error while updating league : %s", err) + } else if updated != 1 { + log.Fatalf("error while updating league : %d league(s) updated", updated) + } + break + } + for _, m := range matches { + if err := postgres.InsertTeamBySourceName(m.TeamHome, s.League); err != nil { + log.Fatalf("error while saving team home : %s", err) + } + if err := postgres.InsertTeamBySourceName(m.TeamAway, s.League); err != nil { + log.Fatalf("error while saving team away : %s", err) + } + if err := postgres.InsertMatch(m); err != nil { + log.Fatalf("error while saving match : %s", err) + } + log.Printf("New match #%d (%s - %s)", m.Id, m.TeamHome.Name, m.TeamAway.Name) + } } - continue } - }(sourceChannel, &waitGroup) + }(leagueChannel, &waitGroup) } for _, league := range leagues { - sources, err := league.ListSources() - if err != nil { - log.Fatal(err) - } - for _, s := range sources { - log.Printf("Adding source %s", s.Url.String()) - sourceChannel <- s - } + leagueChannel <- league } + close(leagueChannel) + waitGroup.Wait() } func main() { diff --git a/match/match.go 
b/match/match.go index 5c2580d..cb1595a 100644 --- a/match/match.go +++ b/match/match.go @@ -2,6 +2,7 @@ package match import ( "fmt" + "log" "net/url" "regexp" "strings" @@ -13,122 +14,206 @@ import ( "1bet.fr/scraper/utils" ) +const ( + _ = iota + GenderMale = iota + //GenderFemale = iota +) + type Sport struct { Id int Name string CleanName string } -const ( - _ = iota - GenderMale = iota - GenderFemale = iota -) +type Country struct { + Id int +} + +type Player struct { + Id int + FullName string + CleanName string + Gender int +} type Team struct { Id int - SportId int - CountryId int + Sport *Sport + Country *Country Name string CleanName string - ShortName string - LongName string - Gender int + ShortName *string + LongName *string + Gender *int - Names interface{} - Url string - Images interface{} + Names *map[string]string + PlayersUrl *string + Images *interface{} - Tags []string - CleanTags []string + Tags *[]string + CleanTags *[]string NewsCount int - Error string - Trace string + Error *string + Trace *string } -const ( - _ = iota - LegFirst = iota - LegSecond = iota - LegReplay = iota -) - -const ( - _ = iota - StatusFirstTime = iota - StatusHalfTime = iota - StatusSecondTime = iota - StatusFirstExtra = iota - StatusHalfExtra = iota - StatusSecondExtra = iota - StatusShootout = iota - StatusWaitScores = iota - StatusOver = iota - StatusPostponed = iota - StatusCancelled = iota -) - -const ( - _ = iota - WinnerHome = iota - WinnerAway = iota - WinnerDraw = iota -) - -const ( - _ = iota - ExtraTimeExtraTime = iota - ExtraTimeShootout = iota -) - type Match struct { Id int - LeagueId int - TeamHomeId int - TeamAwayId int - PlayerHomeId int - PlayerAwayId int + League *League + TeamHome *Team + TeamAway *Team + PlayerHome *Player + PlayerAway *Player - MatchDay int - MatchDayId int - Round string - Leg int + MatchDay *int + Round *string + Leg *int - BaseUrl string - ScoreUrl string - LiveUrl string - TvChannels []string + BaseUrl *string + ScoreUrl 
*string + LiveUrl *string + TvChannels *[]string - Status int - Minute int + MatchDayId *int + Status *int + Minute *string StartDate *time.Time EndDate *time.Time - HomeScore int - AwayScore int - SetsScore int - Winner int - ExtraTime int - ShootoutHome int - ShootoutAway int + HomeScore *int + AwayScore *int + SetsScore *[]interface{} + Winner *int + ExtraTime *int + ShootoutHome *int + ShootoutAway *int - Squad []interface{} - Events []interface{} - Stats []interface{} - Live []interface{} - LastEvent interface{} + Squad *[]interface{} + Events *[]interface{} + Stats *[]interface{} + Live *[]interface{} + LastEvent *interface{} LastEventDate *time.Time - Error string - Trace string + Error *string + Trace *string +} + +func NewMatch(league *League, teamHome *Team, teamAway *Team, playerHome *Player, playerAway *Player, round string, matchDay int, leg int) *Match { + return &Match{ + League: league, + TeamHome: teamHome, + TeamAway: teamAway, + PlayerHome: playerHome, + PlayerAway: playerAway, + MatchDay: utils.IntPointer(matchDay), + Round: utils.StringPointer(round), + Leg: utils.IntPointer(leg), + } +} + +type source struct { + League *League + URL *url.URL + matchDay int + round string + + currentDate *time.Time +} + +func newSource(league *League, scheme string, host string, path string, query string, round string, matchDay int) *source { + return &source{ + League: league, + URL: &url.URL{ + Scheme: scheme, + Host: host, + Path: path, + RawQuery: query, + }, + round: round, + matchDay: matchDay, + } +} + +func (s *source) GetMatches() ([]*Match, error) { + var matches []*Match + + switch s.URL.Host { + case utils.HostMatchendirect: + doc, err := requests.GetDocumentFromURL(s.URL.String()) + if err != nil { + return nil, err + } + doc.Find("#livescore tr").Each(func (i int, row *goquery.Selection) { + headCell := row.Find("th") + if len(headCell.Nodes) == 1 { + curDate, err := time.Parse("Monday 02 January 2006", utils.EnglishDateString(headCell.Text())) + 
if err != nil { + log.Fatalf("unexpected error while parsing date : %s", err) + return + } + s.currentDate = &curDate + } else { + match := NewMatch(s.League, &Team{Gender: s.League.Gender}, &Team{Gender: s.League.Gender}, &Player{}, &Player{}, s.round, s.matchDay, 0) + + startTime := strings.TrimSpace(row.Find("td.lm1").Text()) + if startTime == "-- : --" { + startTime = "00:00" + } + startDate, err := time.Parse( + "2006-01-02 15:04 MST", + s.currentDate.Format("2006-01-02 ") + startTime + " CEST", + ) + if err != nil { + match.Error = utils.StringPointer("parse date error") + match.Trace = utils.StringPointer(fmt.Sprint(err)) + matches = append(matches, match) + return + } + match.StartDate = &startDate + + homeNames := map[string]string{utils.HostMatchendirect: strings.TrimSuffix(strings.TrimSpace(row.Find(".lm3_eq1").Text()), "*")} + match.TeamHome.Names = &homeNames + awayNames := map[string]string{utils.HostMatchendirect: strings.TrimSuffix(strings.TrimSpace(row.Find(".lm3_eq2").Text()), "*")} + match.TeamAway.Names = &awayNames + + basePath, ok := row.Find(".lm3 a").First().Attr("href") + if !ok { + match.Error = utils.StringPointer("unable to find href attr") + match.Trace = utils.StringPointer("unable to find href attribute for element '.lm3 a'") + matches = append(matches, match) + return + } + baseUrl := &url.URL{Scheme: s.URL.Scheme, Host: s.URL.Host, Path: basePath} + match.BaseUrl = utils.StringPointer(baseUrl.String()) + + if match.Round == nil && match.MatchDay == nil && s.League.MatchDays != nil && s.League.MatchesByMatchDay != nil { + if s.League.currentMatchDayId == 0 { + s.League.currentMatchDay ++ + } + curMatchDay := s.League.currentMatchDay + curMatchDayId := s.League.currentMatchDayId + match.MatchDay = &curMatchDay + match.MatchDayId = &curMatchDayId + s.League.currentMatchDayId = (s.League.currentMatchDayId + 1) % *s.League.MatchesByMatchDay + } + matches = append(matches, match) + } + }) + + default: + return nil, fmt.Errorf("unexpected 
source url %s", s.URL.String()) + } + return matches, nil } type League struct { Id int Sport *Sport - CountryId int + Country *Country Name string CleanName string @@ -140,7 +225,6 @@ type League struct { ChannelUrl *string MatchDays *int - CurrentMatchDay *int MatchesByMatchDay *int TeamCount int @@ -158,10 +242,13 @@ type League struct { Error *string Trace *string + + currentMatchDay int + currentMatchDayId int } -func (l *League) ListSources() ([]*Source, error) { - var sources []*Source +func (l *League) ListSources() ([]*source, error) { + var sources []*source if l.ScheduleUrl == nil { return nil, nil @@ -187,19 +274,13 @@ func (l *League) ListSources() ([]*Source, error) { parsedDate := strings.Split(strings.Split(value, "/")[3], "-") year := utils.AtoI(parsedDate[0]) week := utils.AtoI(parsedDate[1]) - if year >= 2020 && week >= 34 { - sources = append(sources, &Source{ - League: l, - Url: &url.URL{ - Scheme: originUrl.Scheme, - Host: originUrl.Host, - Path: value, - }, - }) + if (year == 2020 && week >= 34) || year > 2020 { + sources = append(sources, newSource(l, originUrl.Scheme, originUrl.Host, value, "", "", 0)) } }) + case utils.HostEurosport, utils.HostRugbyrama: - eurosportRegexp := regexp.MustCompile(`(\d)+e\s+Journée`) + curRegexp := regexp.MustCompile(`(\d)+e\s+Journée`) ajaxUrl, ok := doc.Find(".ajax-container").Attr("data-ajax-url") if !ok { @@ -212,74 +293,23 @@ func (l *League) ListSources() ([]*Source, error) { ajaxQuery := ajaxParsedUrl.Query() doc.Find("#results-match-nav .rounds-dropdown__round").Each(func (i int, s *goquery.Selection) { - var round *string - var matchDay *int - - roundStr, _ := s.Attr("data-label") - reMatch := eurosportRegexp.FindStringSubmatch(roundStr) + matchDay := 0 + round, _ := s.Attr("data-label") + reMatch := curRegexp.FindStringSubmatch(round) if reMatch != nil { - mdayInt := utils.AtoI(reMatch[1]) - matchDay = &mdayInt - } else { - round = &roundStr + round = "" + matchDay = utils.AtoI(reMatch[1]) } roundId, 
_ := s.Attr("data-round-id") ajaxQuery.Set("roundid", roundId) - - sources = append(sources, &Source{ - League: l, - Url: &url.URL{ - Scheme: originUrl.Scheme, - Host: originUrl.Host, - Path: ajaxParsedUrl.Path, - RawQuery: ajaxQuery.Encode(), - }, - Round: round, - MatchDay: matchDay, - }) + sources = append(sources, newSource(l, originUrl.Scheme, originUrl.Host, ajaxParsedUrl.Path, ajaxQuery.Encode(), round, matchDay)) }) + default: return nil, fmt.Errorf("unknown source url : %s", *l.ScheduleUrl) } return sources, nil } -type Source struct { - League *League - Url *url.URL - MatchDay *int - Round *string - currentDate *time.Time -} -func (s *Source) GetMatches() ([]*Match, error) { - var matches []*Match - - switch s.Url.Host { - case utils.HostMatchendirect: - doc, err := requests.GetDocumentFromURL(s.Url.String()) - if err != nil { - return nil, err - } - doc.Find("#livescore tr").Each(func (i int, row *goquery.Selection) { - row.Children().Each(func (j int, col *goquery.Selection) { - colspan, ok := col.Attr("colspan") - if ok && colspan == "4" { - currentDate, err := time.Parse("Monday 02 January 2006", utils.EnglishDateString(col.Text())) - if err != nil { - fmt.Println(err) - return - } - s.currentDate = &currentDate - fmt.Println(s.currentDate) - } - }) - }) - case utils.HostEurosport, utils.HostRugbyrama: - - default: - return nil, fmt.Errorf("unexpected source url %s", s.Url.String()) - } - return matches, nil -} diff --git a/match/match_test.go b/match/match_test.go index 5b32988..968c8e4 100644 --- a/match/match_test.go +++ b/match/match_test.go @@ -1,6 +1,7 @@ package match import ( + "1bet.fr/scraper/utils" "net/url" "strings" @@ -21,8 +22,8 @@ func TestLeague_ListSources(t *testing.T) { t.Errorf("no sources found") } for _, s := range sources { - if !strings.HasPrefix(s.Url.String(), "http://www.matchendirect.fr/france/ligue-1/") { - t.Errorf("unexpected source url %s", s.Url) + if !strings.HasPrefix(s.URL.String(), 
"http://www.matchendirect.fr/france/ligue-1/") { + t.Errorf("unexpected source url %s", s.URL) } } @@ -39,8 +40,8 @@ func TestLeague_ListSources(t *testing.T) { t.Errorf("no sources found") } for _, s := range sources { - if !strings.HasPrefix(s.Url.String(), "https://www.eurosport.fr/") { - t.Errorf("unexpected source url %s", s.Url) + if !strings.HasPrefix(s.URL.String(), "https://www.eurosport.fr/") { + t.Errorf("unexpected source url %s", s.URL) } } @@ -57,17 +58,47 @@ func TestLeague_ListSources(t *testing.T) { t.Errorf("no sources found") } for _, s := range sources { - if !strings.HasPrefix(s.Url.String(), "https://www.rugbyrama.fr/") { - t.Errorf("unexpected source url %s", s.Url) + if !strings.HasPrefix(s.URL.String(), "https://www.rugbyrama.fr/") { + t.Errorf("unexpected source url %s", s.URL) } } } func TestSource_GetMatches(t *testing.T) { sourceUrl, _ := url.Parse("https://www.matchendirect.fr/france/ligue-1/2020-37/") - source := &Source{ - League: &League{Id: 1}, - Url: sourceUrl, + league := &League{Id: 1, MatchDays: utils.IntPointer(38), MatchesByMatchDay: utils.IntPointer(10)} + source := newSource(league, sourceUrl.Scheme, sourceUrl.Host, sourceUrl.Path, "", "", 0) + matches, err := source.GetMatches() + if err != nil { + t.Error(err) + } + for _, m := range matches { + if m.League == nil { + t.Error("unexpected nil match.League") + } + if m.StartDate == nil { + t.Error("unexpected nil match.StartDate") + } + if m.TeamHome.Names == nil { + t.Error("unexpected nil match.TeamHome.Names") + } + if m.TeamAway.Names == nil { + t.Error("unexpected nil match.TeamAway.Names") + } + if m.PlayerHome == nil { + t.Error("unexpected nil match.PlayerHome") + } + if m.PlayerAway == nil { + t.Error("unexpected nil match.PlayerAway") + } + if m.MatchDay == nil { + t.Error("unexpected nil match.MatchDay") + } + if m.MatchDayId == nil { + t.Error("unexpected nil match.MatchDayId") + } + if m.Error != nil { + t.Errorf("unexpected not nil match.Error : %s", *m.Error) 
+ } } - source.GetMatches() } diff --git a/news/news.go b/news/news.go index f87e60a..d070f87 100644 --- a/news/news.go +++ b/news/news.go @@ -137,7 +137,8 @@ func (n *News) Feed() error { n.Author = utils.StringPointer(doc.Find(".article__author a").Text()) default: - n.Error = utils.StringPointer("unknown host " + parsedLink.Host) + n.Error = utils.StringPointer("unknown link host") + n.Trace = utils.StringPointer("unknown link host : " + n.Link) } if n.Content == nil { diff --git a/news/news_test.go b/news/news_test.go index ab6c47d..962b1ac 100644 --- a/news/news_test.go +++ b/news/news_test.go @@ -94,7 +94,7 @@ func TestNews_Feed(t *testing.T) { t.Errorf("cleanTags does not contain football") } - t.Logf("testing feed from Foot Mercato") + t.Logf("testing feed from Rugbyrama") n = &News{ Source: &Source{Sport: &Sport{Name: "Rugby", CleanName: "rugby"}}, Link: "https://www.rugbyrama.fr/rugby/top-14/2018-2019/top-14-face-au-racing-92-toulouse-n-aura-pas-de-marge-de-manoeuvre_sto7939622/story.shtml", @@ -136,10 +136,10 @@ func TestSource_ListNews(t *testing.T) { } for _, n := range newsList { if n.Title == "" { - t.Errorf("unexpected empty title") + t.Errorf("unexpected empty news title") } - if n.Image == nil { - t.Errorf("unexpected nil image") + if n.Link == "" { + t.Errorf("unexpected empty news link") } } } diff --git a/postgres/postgres.go b/postgres/postgres.go index dca099e..f30de97 100644 --- a/postgres/postgres.go +++ b/postgres/postgres.go @@ -1,10 +1,13 @@ package postgres import ( + "crypto/sha256" "database/sql" "fmt" "log" + "math" "os" + "strings" "github.com/lib/pq" @@ -46,6 +49,25 @@ func aValue(a *[]string) interface{} { return pq.Array(*a) } +func foreignId(i int) interface{} { + if i == 0 { + return nil + } + return i +} + +func concatWS(a []interface{}, s string) string { + var b []string + for _, x := range a { + if x == nil { + b = append(b, "") + } else { + b = append(b, fmt.Sprint(x)) + } + } + return strings.Join(b, s) +} + var pg 
*Postgres func init() { @@ -131,9 +153,9 @@ func ListLeagues() ([]*match.League, error) { } for rows.Next() { - league := &match.League{Sport: &match.Sport{}} + league := &match.League{Sport: &match.Sport{}, Country: &match.Country{}} if err = rows.Scan( - &league.Id, &league.Sport.Id, &league.CountryId, &league.Name, &league.CleanName, &league.Gender, + &league.Id, &league.Sport.Id, &league.Country.Id, &league.Name, &league.CleanName, &league.Gender, &league.ScheduleUrl, &league.RankingUrl, &league.ChannelUrl, &league.MatchDays, &league.MatchesByMatchDay, pq.Array(&league.Rounds), pq.Array(&league.Groups), &league.Sport.Name, &league.Sport.CleanName, @@ -145,9 +167,22 @@ func ListLeagues() ([]*match.League, error) { return leagues, nil } +func UpdateLeague(l *match.League) (int64, error) { + res, err := pg.psqlConn.Exec(` + UPDATE mainapp_league + SET error = $1, trace = $2 + WHERE id = $3 + `, sValue(l.Error), sValue(l.Trace), l.Id, + ) + if err != nil { + return 0, err + } + return res.RowsAffected() +} + func InsertNews(n *news.News) error { return pg.psqlConn.QueryRow(` - INSERT INTO public.mainapp_news + INSERT INTO mainapp_news (title, clean_title, link, pub_date, description, image, teaser, author, content, redirect, haystack, tags, clean_tags, error, trace, league_id, source_id, team_id) @@ -165,7 +200,7 @@ func InsertNews(n *news.News) error { func UpdateNews(n *news.News) (int64, error) { res, err := pg.psqlConn.Exec(` - UPDATE public.mainapp_news + UPDATE mainapp_news SET title = $1, clean_title = $2, pub_date = $3, link = $4, description = $5, image = $6, teaser = $7, author = $8, content = $9, redirect = $10, haystack = $11, tags = $12, clean_tags = $13, error = $14, trace = $15, @@ -185,7 +220,7 @@ func UpdateNews(n *news.News) (int64, error) { func DeleteNews(n *news.News) (int64, error) { res, err := pg.psqlConn.Exec(` - DELETE FROM public.mainapp_news + DELETE FROM mainapp_news WHERE id = $1 `, n.Id) if err != nil { @@ -193,3 +228,84 @@ func 
DeleteNews(n *news.News) (int64, error) { } return res.RowsAffected() } + +func InsertTeamBySourceName(t *match.Team, l *match.League) error { + var host, name string + + if t.Names == nil { + return fmt.Errorf("no source name given") + } + shortName := "" + for host, name = range *t.Names { + runeName := []rune(strings.ToUpper(name)) + shortName = string(runeName[:int(math.Min(3, float64(len(runeName))))]) + break + } + err := pg.psqlConn.QueryRow("SELECT id, name FROM mainapp_team WHERE names->>$1 = $2", host, name).Scan(&t.Id, &t.Name) + if err != nil { + cleanName := utils.Sanitize(name) + jsonHost := fmt.Sprintf("{\"%s\"}", utils.Sanitize(host)) + return pg.psqlConn.QueryRow(` + INSERT INTO mainapp_team + (sport_id, country_id, name, clean_name, short_name, long_name, gender, + names, tags, clean_tags, news_count) + VALUES + ($1, $2, $3, $4, $5, $6, $7, + jsonb_set('{}', $8, to_jsonb($9::text), true), $10, $11, 0) + ON CONFLICT ON CONSTRAINT custom_unique_team DO UPDATE SET + names = jsonb_set(mainapp_team.names, $12, to_jsonb($13::text), true) + RETURNING id, name + `, l.Sport.Id, l.Country.Id, name, cleanName, shortName, name, iValue(l.Gender), + jsonHost, name, pq.Array([]string{name}), pq.Array([]string{cleanName}), + jsonHost, name).Scan(&t.Id, &t.Name) + } + return nil +} + +func DeleteTeam(t *match.Team) (int64, error) { + res, err := pg.psqlConn.Exec(` + DELETE FROM mainapp_team + WHERE id = $1 + `, t.Id) + if err != nil { + return 0, err + } + return res.RowsAffected() +} + +func InsertMatch(m *match.Match) error { + var arr []interface{} + arr = append(arr, m.League.Id) + arr = append(arr, foreignId(m.TeamHome.Id)) + arr = append(arr, foreignId(m.TeamAway.Id)) + arr = append(arr, foreignId(m.PlayerHome.Id)) + arr = append(arr, foreignId(m.PlayerAway.Id)) + arr = append(arr, sValue(m.Round)) + hash := sha256.New() + hash.Write([]byte(concatWS(arr, "/"))) + sign := fmt.Sprintf("%x", hash.Sum(nil)) + + return pg.psqlConn.QueryRow(` + INSERT INTO 
mainapp_match + (league_id, team_home_id, team_away_id, player_home_id, player_away_id, mday, round, leg, sign, + mday_id, base_url, start_date, error, trace) + VALUES + ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14) + ON CONFLICT ON CONSTRAINT mainapp_match_sign_key DO UPDATE SET + base_url = $11, start_date = $12, error = $13, trace = $14 + RETURNING id + `, m.League.Id, foreignId(m.TeamHome.Id), foreignId(m.TeamAway.Id), foreignId(m.PlayerHome.Id), + foreignId(m.PlayerAway.Id), iValue(m.MatchDay), sValue(m.Round), iValue(m.Leg), sign, + iValue(m.MatchDayId), sValue(m.BaseUrl), m.StartDate, sValue(m.Error), sValue(m.Trace)).Scan(&m.Id) +} + +func DeleteMatch(m *match.Match) (int64, error) { + res, err := pg.psqlConn.Exec(` + DELETE FROM mainapp_match + WHERE id = $1 + `, m.Id) + if err != nil { + return 0, err + } + return res.RowsAffected() +} diff --git a/postgres/postgres_test.go b/postgres/postgres_test.go index dc71c70..218a829 100644 --- a/postgres/postgres_test.go +++ b/postgres/postgres_test.go @@ -1,6 +1,7 @@ package postgres import ( + "1bet.fr/scraper/match" "testing" "time" @@ -8,7 +9,12 @@ import ( "1bet.fr/scraper/utils" ) -var n *news.News +var ( + se *news.Source + ns *news.News + tm *match.Team + mh *match.Match +) func TestConnect(t *testing.T) { return @@ -24,8 +30,20 @@ func TestListLeagues(t *testing.T) { } } +func TestUpdateLeague(t *testing.T) { + league := &match.League{Id: 1} + updated, err := UpdateLeague(league) + if err != nil { + t.Error(err) + } + if updated != 1 { + t.Errorf("unexpected %d updated rows", updated) + } +} + func TestListSources(t *testing.T) { sources, err := ListSources() + se = sources[0] if err != nil { t.Errorf("unexpected error : %s", err) } @@ -38,8 +56,8 @@ func TestInsertNews(t *testing.T) { tags := []string{"Test", "Hello Toto"} cleanTags := []string{"test", "hello-toto"} nowTime := time.Now() - n = &news.News{ - Source: &news.Source{Id: 1, Sport: &news.Sport{Id: 1}}, + ns = &news.News{ + 
Source: se, PubDate: &nowTime, Link: "https://test.com/toto", Title: "Hello toto", @@ -47,30 +65,30 @@ func TestInsertNews(t *testing.T) { Tags: &tags, CleanTags: &cleanTags, } - err := InsertNews(n) + err := InsertNews(ns) if err != nil { t.Error(err) } - if n.Id == 0 { + if ns.Id == 0 { t.Errorf("unexpected value 0 for n.Id") } } func TestUpdateNews(t *testing.T) { content := []string{"toto", "test"} - n.Content = &content - n.Author = utils.StringPointer("T. Toto") - updated, err := UpdateNews(n) + ns.Content = &content + ns.Author = utils.StringPointer("T. Toto") + updated, err := UpdateNews(ns) if err != nil { t.Error(err) } if updated != 1 { - t.Errorf("unexpected %d update rows", updated) + t.Errorf("unexpected %d updated rows", updated) } } func TestDeleteNews(t *testing.T) { - deleted, err := DeleteNews(n) + deleted, err := DeleteNews(ns) if err != nil { t.Error(err) } @@ -79,6 +97,60 @@ func TestDeleteNews(t *testing.T) { } } +func TestInsertTeamBySourceName(t *testing.T) { + teamNames := map[string]string{utils.HostMatchendirect: "Toto"} + league := &match.League{ + Id: 1, + Sport: &match.Sport{Id: 1}, + Country: &match.Country{Id: 1}, + Gender: utils.IntPointer(match.GenderMale), + } + tm = &match.Team{Names: &teamNames} + for _, _ = range []int{0, 1} { + if err := InsertTeamBySourceName(tm, league); err != nil { + t.Errorf("unexpected error : %s", err) + } + if tm.Id == 0 { + t.Error("unexpected zero team.Id") + } + } +} + +func TestInsertMatch(t *testing.T) { + startDate := time.Now() + mh = match.NewMatch(&match.League{Id: 1}, tm, tm, &match.Player{}, &match.Player{}, "", 0, 0) + mh.StartDate = &startDate + mh.BaseUrl = utils.StringPointer("https://test.com/toto") + for _, _ = range []int{0, 1} { + if err := InsertMatch(mh); err != nil { + t.Error(err) + } + if mh.Id == 0 { + t.Errorf("unexpected zero match.Id") + } + } +} + +func TestDeleteMatch(t *testing.T) { + deleted, err := DeleteMatch(mh) + if err != nil { + t.Error(err) + } + if deleted != 1 
{ + t.Errorf("unexpected %d matches deleted", deleted) + } +} + +func TestDeleteTeam(t *testing.T) { + deleted, err := DeleteTeam(tm) + if err != nil { + t.Error(err) + } + if deleted != 1 { + t.Errorf("unexpected %d teams deleted", deleted) + } +} + func TestClose(t *testing.T) { Close() } diff --git a/readme.md b/readme.md index cbfc9ae..d67a2eb 100644 --- a/readme.md +++ b/readme.md @@ -32,7 +32,7 @@ predictions between friends. I decline any responsibility about your eventual usages of this project. -## 2- DEPLOYMENT +## 2- DEPLOYMENT The deployment is very simple as the binary `scraper` can be used directly. diff --git a/requests/requests.go b/requests/requests.go index b0e7857..22e7a7b 100644 --- a/requests/requests.go +++ b/requests/requests.go @@ -2,20 +2,23 @@ package requests import ( "fmt" - "github.com/PuerkitoBio/goquery" "log" "net/http" "net/url" + "github.com/PuerkitoBio/goquery" + "github.com/go-redis/redis" "golang.org/x/net/proxy" ) const ( - torAddr = "socks5://127.0.0.1:9050" - userAgent = "Mozilla/5.0 (X11; Linux x86_64…) Gecko/20100101 Firefox/68.0" + redisAddr = "127.0.0.1:6379" + torAddr = "socks5://127.0.0.1:9050" + defaultAgent = "Mozilla/5.0 (X11; Linux x86_64…) Gecko/20100101 Firefox/68.0" ) var cli *http.Client +var red *redis.Client func init() { proxyUrl, err := url.Parse(torAddr) @@ -34,6 +37,13 @@ func init() { cli = &http.Client{ Transport: transport, } + + red = redis.NewClient(&redis.Options{ + Addr: redisAddr, + }) + if pong := red.Ping().Val(); pong != "PONG" { + log.Fatalf("unexpected response from redis PING conmmand : %s", pong) + } } func GetDocumentFromURL(url string) (*goquery.Document, error) { @@ -41,7 +51,11 @@ func GetDocumentFromURL(url string) (*goquery.Document, error) { if err != nil { return nil, fmt.Errorf("error while building request: %s", err) } - req.Header.Set("User-Agent", userAgent) + agent := red.SRandMember("agents").Val() + if agent == "" { + agent = defaultAgent + } + req.Header.Set("User-Agent", 
agent) resp, err := cli.Do(req) if err != nil { diff --git a/scraper b/scraper index c89bad6..59cfc5a 100755 Binary files a/scraper and b/scraper differ diff --git a/utils/utils.go b/utils/utils.go index f1571d8..a9ee5cd 100644 --- a/utils/utils.go +++ b/utils/utils.go @@ -32,7 +32,7 @@ func Sanitize(s string) (t string) { } else if int(c) >= int('0') && int(c) <= int('9') { t += string(c) } else if int(c) >= int('A') && int(c) <= int('Z') { - t += string(int(c) - int('A') + int('a')) + t += string(rune(int(c) - int('A') + int('a'))) } else if v, ok := symbols[c]; ok { t += v } else {