Improve updateNews function

Start developing updateSchedule function
Use of go modules
This commit is contained in:
Samuel Campos 2020-10-10 19:08:06 +02:00
parent d87bedc8d6
commit 98d3cbe7e2
13 changed files with 1042 additions and 313 deletions

10
go.mod Normal file
View File

@ -0,0 +1,10 @@
module 1bet.fr/scraper
go 1.12
require (
github.com/PuerkitoBio/goquery v1.6.0
github.com/lib/pq v1.8.0
github.com/mmcdole/gofeed v1.1.0
golang.org/x/net v0.0.0-20201009032441-dbdefad45b89
)

47
go.sum Normal file
View File

@ -0,0 +1,47 @@
github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU=
github.com/PuerkitoBio/goquery v1.5.1/go.mod h1:GsLWisAFVj4WgDibEWF4pvYnkVQBpKBKeU+7zCJoLcc=
github.com/PuerkitoBio/goquery v1.6.0 h1:j7taAbelrdcsOlGeMenZxc2AWXD5fieT1/znArdnx94=
github.com/PuerkitoBio/goquery v1.6.0/go.mod h1:GsLWisAFVj4WgDibEWF4pvYnkVQBpKBKeU+7zCJoLcc=
github.com/andybalholm/cascadia v1.1.0 h1:BuuO6sSfQNFRu1LppgbD25Hr2vLYW25JvxHs5zzsLTo=
github.com/andybalholm/cascadia v1.1.0/go.mod h1:GsXiBklL0woXo1j/WYWtSYYC4ouU9PqHO0sqidkEA4Y=
github.com/cpuguy83/go-md2man/v2 v2.0.0-20190314233015-f79a8a8ca69d/go.mod h1:maD7wRr/U5Z6m/iR4s+kqSMx2CaBsrgA7czyZG/E6dU=
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg=
github.com/json-iterator/go v1.1.10 h1:Kz6Cvnvv2wGdaG/V8yMvfkmNiXq9Ya2KUv4rouJJr68=
github.com/json-iterator/go v1.1.10/go.mod h1:KdQUCv79m/52Kvf8AW2vK1V8akMuk1QjK/uOdHXbAo4=
github.com/lib/pq v1.8.0 h1:9xohqzkUwzR4Ga4ivdTcawVS89YSDVxXMa3xJX3cGzg=
github.com/lib/pq v1.8.0/go.mod h1:AlVN5x4E4T544tWzH6hKfbfQvm3HdbOxrmggDNAPY9o=
github.com/mmcdole/gofeed v1.1.0 h1:T2WrGLVJRV04PY2qwhEJLHCt9JiCtBhb6SmC8ZvJH08=
github.com/mmcdole/gofeed v1.1.0/go.mod h1:PPiVwgDXLlz2N83KB4TrIim2lyYM5Zn7ZWH9Pi4oHUk=
github.com/mmcdole/goxpp v0.0.0-20181012175147-0068e33feabf h1:sWGE2v+hO0Nd4yFU/S/mDBM5plIU8v/Qhfz41hkDIAI=
github.com/mmcdole/goxpp v0.0.0-20181012175147-0068e33feabf/go.mod h1:pasqhqstspkosTneA62Nc+2p9SOBBYAPbnmRRWPQ0V8=
github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421 h1:ZqeYNhU3OHLH3mGKHDcjJRFFRrJa6eAM5H+CtDdOsPc=
github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q=
github.com/modern-go/reflect2 v0.0.0-20180701023420-4b7aa43c6742 h1:Esafd1046DLDQ0W1YjYsBW+p8U2u7vzgW2SQVmlNazg=
github.com/modern-go/reflect2 v0.0.0-20180701023420-4b7aa43c6742/go.mod h1:bx2lNnkwVCuqBIxFjflWJWanXIb3RllmbCylyMrvgv0=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/russross/blackfriday/v2 v2.0.1/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM=
github.com/shurcooL/sanitized_anchor_name v1.0.0/go.mod h1:1NzhyTcUVG4SuEtjjoZeVRXNmyL/1OwPU0+IJeTBvfc=
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
github.com/urfave/cli v1.22.3/go.mod h1:Gos4lmkARVdJ6EkW0WaNv/tZAAMe9V7XWyB60NtXRu0=
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto=
golang.org/x/net v0.0.0-20180218175443-cbe0f9307d01/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg=
golang.org/x/net v0.0.0-20200202094626-16171245cfb2 h1:CCH4IOTTfewWjGOlSp+zGcjutRKlBEZQ6wTn8ozI/nI=
golang.org/x/net v0.0.0-20200202094626-16171245cfb2/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
golang.org/x/net v0.0.0-20200301022130-244492dfa37a/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
golang.org/x/net v0.0.0-20201009032441-dbdefad45b89 h1:1GKfLldebiSdhTlt3nalwrb7L40Tixr/0IH+kSbRgmk=
golang.org/x/net v0.0.0-20201009032441-dbdefad45b89/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU=
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
golang.org/x/text v0.3.2/go.mod h1:bEr9sfX3Q8Zfm5fL9x+3itogRgK3+ptLWKqgva+5dAk=
golang.org/x/text v0.3.3 h1:cokOdA+Jmi5PJGXLlLllQSgYigAEfHXJAERHVMaCc2k=
golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=

97
main.go
View File

@ -1,12 +1,16 @@
package main package main
import ( import (
"github.com/lib/pq" "flag"
"fmt"
"log" "log"
"sync" "sync"
"./news" "github.com/lib/pq"
"./postgres"
"1bet.fr/scraper/match"
"1bet.fr/scraper/news"
"1bet.fr/scraper/postgres"
) )
const ( const (
@ -17,18 +21,9 @@ const (
func updateNews() { func updateNews() {
defer postgres.Close() defer postgres.Close()
sports, err := postgres.ListSports()
if err != nil {
log.Fatal(err)
}
mapSports := map[int]*news.Sport{}
for _, sport := range sports {
mapSports[sport.Id] = sport
}
sources, err := postgres.ListSources() sources, err := postgres.ListSources()
if err != nil { if err != nil {
log.Fatal(err) log.Fatalf("error while getting list of sources : %s", err)
} }
waitGroup := sync.WaitGroup{} waitGroup := sync.WaitGroup{}
@ -39,19 +34,18 @@ func updateNews() {
defer wg.Done() defer wg.Done()
for n := range nc { for n := range nc {
if err := n.Feed(); err != nil { if err := n.Feed(); err != nil {
log.Fatal(err) log.Fatalf("error while feeding news : %s", err)
} }
if err := postgres.UpdateNews(n); err != nil { if _, err := postgres.UpdateNews(n); err != nil {
log.Fatal(err) log.Fatalf("error while update news in postgres : %s", err)
} }
} }
}(newsChannel, &waitGroup) }(newsChannel, &waitGroup)
} }
for _, source := range sources { for _, source := range sources {
for sportId, url := range source.Urls { log.Printf("[+] Starting parse of source : %s", source.FeedUrl)
log.Printf("[+] Starting parse of source : %s", url) newsList, err := source.ListNews()
newsList, err := source.ListNews(mapSports[sportId], url)
if err != nil { if err != nil {
log.Fatal(err) log.Fatal(err)
} }
@ -67,21 +61,76 @@ func updateNews() {
log.Printf("Stopping parse of source (unique violation) with %d news added", i) log.Printf("Stopping parse of source (unique violation) with %d news added", i)
break break
} else { } else {
log.Fatalf("error while getting pq.Error object") log.Fatalf("error while inserting news in postgres : %s", err.Message)
} }
} else { } else {
log.Fatal(err) log.Fatalf("error while getting pq.Error object")
} }
} }
log.Printf("Adding news %s", n.Link) log.Printf("Adding news %s", n.Link)
newsChannel <- n newsChannel <- n
} }
} }
}
close(newsChannel) close(newsChannel)
waitGroup.Wait() waitGroup.Wait()
} }
func main() { func updateSchedule() {
updateNews() defer postgres.Close()
leagues, err := postgres.ListLeagues()
if err != nil {
log.Fatal(err)
}
waitGroup := sync.WaitGroup{}
sourceChannel := make(chan *match.Source)
for i := 0; i < nbProcesses; i++ {
waitGroup.Add(1)
go func(sc chan *match.Source, wg *sync.WaitGroup) {
defer wg.Done()
for s := range sc {
matches, err := s.GetMatches()
if err != nil {
log.Fatal(err)
}
for _, m := range matches {
fmt.Println(m)
//if err = postgres.InsertMatch(m); err != nil {
// log.Fatal(err)
//}
}
continue
}
}(sourceChannel, &waitGroup)
}
for _, league := range leagues {
sources, err := league.ListSources()
if err != nil {
log.Fatal(err)
}
for _, s := range sources {
log.Printf("Adding source %s", s.Url.String())
sourceChannel <- s
}
}
}
func main() {
flag.Parse()
args := flag.Args()
if len(args) != 1 {
log.Fatalf("unexpected number of args : len(%s) != 1", args)
}
switch args[0] {
case "news":
updateNews()
case "schedule":
updateSchedule()
default:
log.Fatalf("unexpected arg : %s", args[0])
}
} }

285
match/match.go Normal file
View File

@ -0,0 +1,285 @@
package match
import (
"fmt"
"net/url"
"regexp"
"strings"
"time"
"github.com/PuerkitoBio/goquery"
"1bet.fr/scraper/requests"
"1bet.fr/scraper/utils"
)
// Sport identifies a sport by numeric id, display name and a sanitized
// ("clean") variant of that name.
type Sport struct {
	Id              int
	Name, CleanName string
}
// Gender values. Numbering starts at 1; 0 is deliberately left unassigned.
const (
	GenderMale = iota + 1
	GenderFemale
)
// Team holds the attributes of a team: identifiers, the various forms of its
// name, tagging information and bookkeeping fields.
type Team struct {
	// Identifiers of the team itself and of the sport and country it is
	// attached to.
	Id, SportId, CountryId int

	// Display name, sanitized name and short/long variants.
	Name, CleanName, ShortName, LongName string

	// Gender presumably takes one of the Gender* constants — confirm.
	Gender int
	Names  interface{}
	Url    string
	Images interface{}

	// Tags and their sanitized counterparts.
	Tags, CleanTags []string
	NewsCount       int

	// NOTE(review): looks like the last processing error and its trace —
	// confirm against the code that fills them.
	Error, Trace string
}
// Leg values. Numbering starts at 1; 0 is deliberately left unassigned.
const (
	LegFirst = iota + 1
	LegSecond
	LegReplay
)
// Match status values. Numbering starts at 1; 0 is deliberately left
// unassigned.
const (
	StatusFirstTime = iota + 1
	StatusHalfTime
	StatusSecondTime
	StatusFirstExtra
	StatusHalfExtra
	StatusSecondExtra
	StatusShootout
	StatusWaitScores
	StatusOver
	StatusPostponed
	StatusCancelled
)
// Winner values. Numbering starts at 1; 0 is deliberately left unassigned.
const (
	WinnerHome = iota + 1
	WinnerAway
	WinnerDraw
)
// Extra-time values. Numbering starts at 1; 0 is deliberately left
// unassigned.
const (
	ExtraTimeExtraTime = iota + 1
	ExtraTimeShootout
)
// Match holds everything known about a single match: the participants, its
// place in the competition, the pages it is scraped from, and its current
// state and score.
type Match struct {
	// Identifiers of the match, its league and its participants, followed by
	// the match-day information.
	Id, LeagueId, TeamHomeId, TeamAwayId, PlayerHomeId, PlayerAwayId, MatchDay, MatchDayId int

	Round string
	// Leg presumably takes one of the Leg* constants — confirm.
	Leg int

	BaseUrl, ScoreUrl, LiveUrl string
	TvChannels                 []string

	// Status presumably takes one of the Status* constants — confirm.
	Status, Minute     int
	StartDate, EndDate *time.Time

	// Winner and ExtraTime presumably take the Winner* and ExtraTime*
	// constants respectively — confirm.
	HomeScore, AwayScore, SetsScore, Winner, ExtraTime, ShootoutHome, ShootoutAway int

	Squad, Events, Stats, Live []interface{}
	LastEvent                  interface{}
	LastEventDate              *time.Time

	// NOTE(review): looks like the last processing error and its trace —
	// confirm against the code that fills them.
	Error, Trace string
}
// League holds the attributes of a competition. Pointer-typed fields may be
// nil; ListSources in particular treats a nil ScheduleUrl as "nothing to
// scrape".
type League struct {
	Id        int
	Sport     *Sport
	CountryId int

	// Display name and its sanitized variant.
	Name, CleanName string

	Gender, Degree *int

	// ScheduleUrl is the page ListSources starts from.
	ScheduleUrl, RankingUrl, ChannelUrl *string

	MatchDays, CurrentMatchDay, MatchesByMatchDay *int
	TeamCount                                     int

	Rounds, Groups []string

	Points, Promotions, Images, Schedule interface{}

	// Tags and their sanitized counterparts.
	Tags, CleanTags []string
	NewsCount       int

	// NOTE(review): looks like the last processing error and its trace —
	// confirm against the code that fills them.
	Error, Trace *string
}
// eurosportMatchDayRegexp extracts the match-day number from round labels
// such as "12e Journée". The quantifier sits inside the capture group so that
// every digit is captured, not only the last one.
var eurosportMatchDayRegexp = regexp.MustCompile(`(\d+)e\s+Journée`)

// ListSources fetches the league's schedule page and returns one Source per
// schedule sub-page that has to be scraped.
//
// It returns (nil, nil) when the league has no ScheduleUrl. An error is
// returned when the page cannot be fetched, when the URL cannot be parsed,
// when the eurosport/rugbyrama ajax container is missing, or when the host of
// the schedule URL is not a supported one.
func (l *League) ListSources() ([]*Source, error) {
	// First (year, week) kept from the matchendirect week selector.
	const (
		firstYear = 2020
		firstWeek = 34
	)

	if l.ScheduleUrl == nil {
		return nil, nil
	}

	doc, err := requests.GetDocumentFromURL(*l.ScheduleUrl)
	if err != nil {
		return nil, err
	}

	originUrl, err := url.Parse(*l.ScheduleUrl)
	if err != nil {
		return nil, err
	}

	var sources []*Source

	switch originUrl.Host {
	case utils.HostMatchendirect:
		doc.Find(".fDate option").Each(func(i int, s *goquery.Selection) {
			value, ok := s.Attr("value")
			if !ok {
				return
			}

			// The option value is expected to be a path whose fourth
			// "/"-separated segment is "<year>-<week>". Options that do not
			// have that shape are skipped instead of panicking on an
			// out-of-range index.
			segments := strings.Split(value, "/")
			if len(segments) < 4 {
				return
			}
			parsedDate := strings.Split(segments[3], "-")
			if len(parsedDate) < 2 {
				return
			}
			year := utils.AtoI(parsedDate[0])
			week := utils.AtoI(parsedDate[1])

			// Compare (year, week) as a pair: testing both parts
			// independently would drop the early weeks of every later year.
			if year < firstYear || (year == firstYear && week < firstWeek) {
				return
			}

			sources = append(sources, &Source{
				League: l,
				Url: &url.URL{
					Scheme: originUrl.Scheme,
					Host:   originUrl.Host,
					Path:   value,
				},
			})
		})

	case utils.HostEurosport, utils.HostRugbyrama:
		ajaxUrl, ok := doc.Find(".ajax-container").Attr("data-ajax-url")
		if !ok {
			return nil, fmt.Errorf("ajax-container url not found")
		}

		ajaxParsedUrl, err := url.Parse(ajaxUrl)
		if err != nil {
			return nil, err
		}
		ajaxQuery := ajaxParsedUrl.Query()

		doc.Find("#results-match-nav .rounds-dropdown__round").Each(func(i int, s *goquery.Selection) {
			// Without a round id the ajax URL cannot target any round.
			roundId, ok := s.Attr("data-round-id")
			if !ok {
				return
			}

			// A label of the form "<n>e Journée" yields a match day; any
			// other label is kept verbatim as the round name.
			var round *string
			var matchDay *int
			roundStr, _ := s.Attr("data-label")
			if reMatch := eurosportMatchDayRegexp.FindStringSubmatch(roundStr); reMatch != nil {
				mdayInt := utils.AtoI(reMatch[1])
				matchDay = &mdayInt
			} else {
				round = &roundStr
			}

			// ajaxQuery is reused across rounds; Encode() snapshots it right
			// away, so each Source gets its own query string.
			ajaxQuery.Set("roundid", roundId)
			sources = append(sources, &Source{
				League: l,
				Url: &url.URL{
					Scheme:   originUrl.Scheme,
					Host:     originUrl.Host,
					Path:     ajaxParsedUrl.Path,
					RawQuery: ajaxQuery.Encode(),
				},
				Round:    round,
				MatchDay: matchDay,
			})
		})

	default:
		return nil, fmt.Errorf("unknown source url : %s", *l.ScheduleUrl)
	}

	return sources, nil
}
// Source is one schedule page to scrape for a league.
type Source struct {
// League is the league this source was listed for.
League *League
// Url is the page GetMatches fetches.
Url *url.URL
// MatchDay is set by ListSources for eurosport/rugbyrama rounds whose label
// carries a match-day number; nil otherwise.
MatchDay *int
// Round is set by ListSources to the raw round label when that label does
// not carry a match-day number; nil otherwise.
Round *string
// currentDate is updated by GetMatches with the most recently parsed date
// header of the matchendirect page.
currentDate *time.Time
}
// GetMatches fetches the source page and walks its content.
//
// Only matchendirect pages are processed for now: every cell of the
// "#livescore" table with colspan="4" is parsed as a date and stored in
// s.currentDate. Eurosport and rugbyrama sources are accepted but not fetched.
// No match is built yet, so the returned slice is always nil on success. Any
// other host yields an error.
func (s *Source) GetMatches() ([]*Match, error) {
	var matches []*Match

	host := s.Url.Host
	if host != utils.HostMatchendirect {
		if host == utils.HostEurosport || host == utils.HostRugbyrama {
			return matches, nil
		}
		return nil, fmt.Errorf("unexpected source url %s", s.Url.String())
	}

	page, err := requests.GetDocumentFromURL(s.Url.String())
	if err != nil {
		return nil, err
	}

	// readDate handles a single table cell; cells that are not date headers
	// or whose text cannot be parsed are ignored (the parse error is printed).
	readDate := func(_ int, cell *goquery.Selection) {
		span, found := cell.Attr("colspan")
		if !found || span != "4" {
			return
		}
		parsed, parseErr := time.Parse("Monday 02 January 2006", utils.EnglishDateString(cell.Text()))
		if parseErr != nil {
			fmt.Println(parseErr)
			return
		}
		s.currentDate = &parsed
		fmt.Println(s.currentDate)
	}

	page.Find("#livescore tr").Each(func(_ int, row *goquery.Selection) {
		row.Children().Each(readDate)
	})

	return matches, nil
}

73
match/match_test.go Normal file
View File

@ -0,0 +1,73 @@
package match
import (
"net/url"
"strings"
"testing"
)
// TestLeague_ListSources checks, for each supported site, that ListSources
// succeeds, returns at least one source and only returns URLs under the
// expected prefix. It performs real HTTP requests.
func TestLeague_ListSources(t *testing.T) {
	cases := []struct {
		banner      string
		scheduleUrl string
		prefix      string
	}{
		{
			banner:      "Testing matchendirect.fr sources...",
			scheduleUrl: "http://www.matchendirect.fr/france/ligue-1/",
			prefix:      "http://www.matchendirect.fr/france/ligue-1/",
		},
		{
			banner:      "Testing eurosport.fr sources...",
			scheduleUrl: "https://www.eurosport.fr/tennis/open-d-australie-messieurs/2020/standing.shtml",
			prefix:      "https://www.eurosport.fr/",
		},
		{
			banner:      "Testing rugbyrama.fr sources...",
			scheduleUrl: "https://www.rugbyrama.fr/rugby/top-14/calendar-result.shtml",
			prefix:      "https://www.rugbyrama.fr/",
		},
	}

	for i := range cases {
		c := &cases[i]
		t.Log(c.banner)

		league := League{ScheduleUrl: &c.scheduleUrl}
		sources, err := league.ListSources()
		if err != nil {
			t.Error(err)
		}
		if len(sources) == 0 {
			t.Errorf("no sources found")
		}
		for _, s := range sources {
			if !strings.HasPrefix(s.Url.String(), c.prefix) {
				t.Errorf("unexpected source url %s", s.Url)
			}
		}
	}
}
// TestSource_GetMatches checks that GetMatches succeeds on a matchendirect
// week page. It performs a real HTTP request.
//
// Both the URL parsing error and the GetMatches error are checked so that
// the test can actually fail.
func TestSource_GetMatches(t *testing.T) {
	sourceUrl, err := url.Parse("https://www.matchendirect.fr/france/ligue-1/2020-37/")
	if err != nil {
		t.Fatalf("unexpected error while parsing source url : %s", err)
	}

	source := &Source{
		League: &League{Id: 1},
		Url:    sourceUrl,
	}

	if _, err := source.GetMatches(); err != nil {
		t.Errorf("unexpected error : %s", err)
	}
}

View File

@ -2,16 +2,16 @@ package news
import ( import (
"fmt" "fmt"
"net/url"
"regexp" "regexp"
"strings" "strings"
"time" "time"
"github.com/PuerkitoBio/goquery" "github.com/PuerkitoBio/goquery"
"github.com/mmcdole/gofeed" "github.com/mmcdole/gofeed"
"../requests" "1bet.fr/scraper/requests"
"../utils" "1bet.fr/scraper/utils"
) )
type Sport struct { type Sport struct {
@ -22,9 +22,10 @@ type Sport struct {
type Source struct { type Source struct {
Id int Id int
Sport *Sport
Name string Name string
CleanName string CleanName string
Urls map[int]string FeedUrl string
Error *string Error *string
Trace *string Trace *string
@ -33,105 +34,128 @@ type Source struct {
type News struct { type News struct {
Id int Id int
Source *Source Source *Source
Sport *Sport LeagueId *int
LeagueId int TeamId *int
TeamId int
Title string Title string
CleanTitle string CleanTitle string
PubDate *time.Time
Description string
Link string Link string
Image string PubDate *time.Time
Description *string
Image *string
Teaser string Teaser *string
Author string Author *string
Content []string Content *[]string
Redirect string Redirect *string
Haystack string Haystack *string
Tags []string Tags *[]string
CleanTags []string CleanTags *[]string
Error string Error *string
Trace string Trace *string
} }
func (n *News) Feed() error { func (n *News) Feed() error {
parsedLink, err := url.Parse(n.Link)
if err != nil {
return err
}
doc, err := requests.GetDocumentFromURL(n.Link) doc, err := requests.GetDocumentFromURL(n.Link)
if err != nil { if err != nil {
return err return err
} }
switch n.Source.Name { switch parsedLink.Host {
case "Eurosport": case utils.HostEurosport:
n.Teaser = strings.TrimSpace(doc.Find("h2").Text()) n.Teaser = utils.StringPointer(doc.Find("h2").First().Text())
doc.Find(".article-body .article-s4-rs p").Each(func(i int, s *goquery.Selection) { doc.Find(".article-body .article-s4-rs p").Each(func(i int, s *goquery.Selection) {
n.Content = append(n.Content, s.Text()) n.Content = utils.ArrayPointerAppend(n.Content, s.Text())
}) })
n.Author = strings.TrimSpace(doc.Find(".flex a.caption-s5-fx div.font-bold").Text()) n.Author = utils.StringPointer(doc.Find(".flex a.caption-s5-fx div.font-bold").Text())
doc.Find(".related-topics .atom-tag").Each(func(i int, s *goquery.Selection) { doc.Find(".related-topics .atom-tag").Each(func(i int, s *goquery.Selection) {
tag := strings.TrimSpace(s.Text()) tag := strings.TrimSpace(s.Text())
cleanTag := utils.Sanitize(tag) cleanTag := utils.Sanitize(tag)
if !utils.ArrayContains(n.CleanTags, cleanTag) { if !utils.ArrayPointerContains(n.CleanTags, cleanTag) {
n.Tags = append(n.Tags, tag) n.Tags = utils.ArrayPointerAppend(n.Tags, tag)
n.CleanTags = append(n.CleanTags, cleanTag) n.CleanTags = utils.ArrayPointerAppend(n.CleanTags, cleanTag)
} }
}) })
case "L'équipe":
n.Teaser = strings.TrimSpace(doc.Find("h2.Article__chapo").Text()) case utils.HostRugbyrama:
doc.Find(".Paragraph__content").Each(func(i int, s *goquery.Selection) { n.Teaser = utils.StringPointer(doc.Find("h2.storyfull__teaser").Text())
n.Content = append(n.Content, s.Text()) doc.Find(".storyfull__paragraphs p.storyfull__paragraph").Each(func(i int, s *goquery.Selection) {
n.Content = utils.ArrayPointerAppend(n.Content, s.Text())
}) })
n.Author = strings.TrimSpace(doc.Find(".Author__name").Text()) n.Author = utils.StringPointer(strings.Replace(doc.Find(".storyfull__publisher-author-name").Text(), "Par ", "", 1))
doc.Find(".storyfull__linkentities-infos a.storyfull__linkentities-name").Each(func(i int, s *goquery.Selection) {
tag := strings.TrimSpace(s.Text())
cleanTag := utils.Sanitize(tag)
if !utils.ArrayPointerContains(n.CleanTags, cleanTag) {
n.Tags = utils.ArrayPointerAppend(n.Tags, tag)
n.CleanTags = utils.ArrayPointerAppend(n.CleanTags, cleanTag)
}
})
case utils.HostLequipe:
n.Teaser = utils.StringPointer(doc.Find("h2.Article__chapo").Text())
doc.Find(".Paragraph__content").Each(func(i int, s *goquery.Selection) {
n.Content = utils.ArrayPointerAppend(n.Content, s.Text())
})
n.Author = utils.StringPointer(doc.Find(".Author__name").Text())
doc.Find(".RelatedLinks a.RelatedLinks__link").Each(func(i int, s *goquery.Selection) { doc.Find(".RelatedLinks a.RelatedLinks__link").Each(func(i int, s *goquery.Selection) {
tag := strings.TrimSpace(s.Text()) tag := strings.TrimSpace(s.Text())
cleanTag := utils.Sanitize(tag) cleanTag := utils.Sanitize(tag)
if !utils.ArrayContains(n.CleanTags, cleanTag) { if !utils.ArrayPointerContains(n.CleanTags, cleanTag) {
n.Tags = append(n.Tags, tag) n.Tags = utils.ArrayPointerAppend(n.Tags, tag)
n.CleanTags = append(n.CleanTags, cleanTag) n.CleanTags = utils.ArrayPointerAppend(n.CleanTags, cleanTag)
} }
}) })
case "FFTT":
n.Teaser = strings.TrimSpace(doc.Find(".news-description p").First().Text()) case utils.HostFFTT:
n.Teaser = utils.StringPointer(doc.Find(".news-description p").First().Text())
doc.Find(".news-description p").Each(func(i int, s *goquery.Selection) { doc.Find(".news-description p").Each(func(i int, s *goquery.Selection) {
if i > 0 { if i > 0 {
n.Content = append(n.Content, s.Text()) n.Content = utils.ArrayPointerAppend(n.Content, s.Text())
} }
}) })
doc.Find(".social-shares-large-wrapper a.link").Each(func(i int, s *goquery.Selection) { doc.Find(".social-shares-large-wrapper a.link").Each(func(i int, s *goquery.Selection) {
tag := strings.TrimSpace(s.Text()) tag := strings.TrimSpace(s.Text())
cleanTag := utils.Sanitize(tag) cleanTag := utils.Sanitize(tag)
if !utils.ArrayContains(n.CleanTags, cleanTag) { if !utils.ArrayPointerContains(n.CleanTags, cleanTag) {
n.Tags = append(n.Tags, tag) n.Tags = utils.ArrayPointerAppend(n.Tags, tag)
n.CleanTags = append(n.CleanTags, cleanTag) n.CleanTags = utils.ArrayPointerAppend(n.CleanTags, cleanTag)
} }
}) })
case "Foot Mercato":
n.Teaser = strings.TrimSpace(doc.Find("h2.article__lead").Text()) case utils.HostFootmercato:
n.Teaser = utils.StringPointer(doc.Find("h2.article__lead").Text())
doc.Find(".article__content p").Each(func(i int, s *goquery.Selection) { doc.Find(".article__content p").Each(func(i int, s *goquery.Selection) {
n.Content = append(n.Content, s.Text()) n.Content = utils.ArrayPointerAppend(n.Content, s.Text())
}) })
n.Author = strings.TrimSpace(doc.Find(".article__author a").Text()) n.Author = utils.StringPointer(doc.Find(".article__author a").Text())
default: default:
return fmt.Errorf("unknown source %s", n.Source.Name) n.Error = utils.StringPointer("unknown host " + parsedLink.Host)
} }
if len(n.Content) == 0 {
n.Redirect = n.Link if n.Content == nil {
n.Redirect = utils.StringPointer(n.Link)
} }
if len(n.CleanTags) == 0 { if n.CleanTags == nil {
n.Tags = append(n.Tags, n.Sport.Name) n.Tags = utils.ArrayPointerAppend(n.Tags, n.Source.Sport.Name)
n.CleanTags = append(n.CleanTags, n.Sport.CleanName) n.CleanTags = utils.ArrayPointerAppend(n.CleanTags, n.Source.Sport.CleanName)
} }
n.Haystack = fmt.Sprintf("%s-%s", n.CleanTitle, strings.Join(n.CleanTags, "-")) n.Haystack = utils.StringPointer(fmt.Sprintf("%s-%s", n.CleanTitle, utils.ArrayPointerJoin(n.CleanTags, "-")))
return nil return nil
} }
func (s *Source) ListNews(sport *Sport, url string) ([]*News, error) { func (s *Source) ListNews() ([]*News, error) {
var newsList []*News var newsList []*News
fp := gofeed.NewParser() fp := gofeed.NewParser()
feed, err := fp.ParseURL(url) feed, err := fp.ParseURL(s.FeedUrl)
if err != nil { if err != nil {
return nil, err return nil, err
} }
@ -139,33 +163,32 @@ func (s *Source) ListNews(sport *Sport, url string) ([]*News, error) {
for _, item := range feed.Items { for _, item := range feed.Items {
n := &News{ n := &News{
Source: s, Source: s,
Sport: sport,
Title: item.Title, Title: item.Title,
Description: regexp.MustCompile(`<[^>]*>`).ReplaceAllLiteralString(item.Description, ""), Description: utils.StringPointer(regexp.MustCompile(`<[^>]*>`).ReplaceAllLiteralString(item.Description, "")),
CleanTitle: utils.Sanitize(item.Title), CleanTitle: utils.Sanitize(item.Title),
PubDate: item.PublishedParsed, PubDate: item.PublishedParsed,
Link: item.Link, Link: item.Link,
} }
for _, tags := range item.Categories { for _, tags := range item.Categories {
for _, tag := range strings.Split(tags, ",") { for _, tag := range strings.Split(tags, ",") {
n.Tags = append(n.Tags, strings.TrimSpace(tag)) n.Tags = utils.ArrayPointerAppend(n.Tags, tag)
n.CleanTags = append(n.CleanTags, utils.Sanitize(strings.TrimSpace(tag))) n.CleanTags = utils.ArrayPointerAppend(n.CleanTags, utils.Sanitize(tag))
} }
} }
if item.Image != nil { if item.Image != nil {
n.Image = item.Image.URL n.Image = utils.StringPointer(item.Image.URL)
} else if len(item.Enclosures) > 0 { } else if len(item.Enclosures) > 0 {
n.Image = item.Enclosures[0].URL n.Image = utils.StringPointer(item.Enclosures[0].URL)
} else if s.Name == "Eurosport" { } else {
doc, err := goquery.NewDocumentFromReader(strings.NewReader(item.Description)) doc, err := goquery.NewDocumentFromReader(strings.NewReader(item.Description))
if err == nil { if err == nil {
if src, ok := doc.Find("img").Attr("src"); ok { if src, ok := doc.Find("img").Attr("src"); ok {
n.Image = src n.Image = utils.StringPointer(src)
} }
} }
} }
if item.Author != nil { if item.Author != nil {
n.Author = item.Author.Name n.Author = utils.StringPointer(item.Author.Name)
} }
newsList = append(newsList, n) newsList = append(newsList, n)
} }

View File

@ -4,114 +4,130 @@ import (
"strings" "strings"
"testing" "testing"
"../utils" "1bet.fr/scraper/utils"
) )
type expectedResult struct {
news *News
teaser string
paragraph string
author string
urlTags []string
haystack string
source *Source
sourceUrl string
}
func TestNews_Feed(t *testing.T) { func TestNews_Feed(t *testing.T) {
expList := [4]*expectedResult{ var n *News
{
news: &News{
Source: &Source{Id: 1, Name: "Eurosport"},
Sport: &Sport{Id: 1, Name: "Football", UrlName: "football"},
Link: "https://www.eurosport.fr/football/bundesliga/2020-2021/dortmund-au-tapis-thuram-debloque-son-compteur_sto7905745/story.shtml",
},
teaser: "BUNDESLIGA Le Borussia Dortmund et ses jeunes stars ont chuté",
paragraph: "Etonnante Bundesliga. Dortmund battu, Leipzig tenu en échec samedi,",
author: "Eurosport",
urlTags: []string{"football", "bundesliga"},
},
{
news: &News{
Source: &Source{Id: 2, Name: "L'équipe"},
Sport: &Sport{Id: 1, Name: "Football", UrlName: "football"},
Link: "https://www.lequipe.fr/Football/Actualites/Mitchel-bakker-psg-je-vais-devoir-elever-mon-niveau-de-jeu/1176182",
},
teaser: "Mitchel Bakker, le latéral néerlandais du PSG",
paragraph:  Les absences de Juan Bernat et Layvin Kurzawa",
author: "H. De.",
urlTags: []string{"ligue-1", "paris-sg--fra-", "reims--fra-"},
},
{
news: &News{
Source: &Source{Id: 3, Name: "FFTT"},
Sport: &Sport{Id: 6, Name: "Tennis de Table", UrlName: "tennis-de-table"},
Link: "http://www.fftt.com/site/actualites/2020-09-22/laura-gasnier-page-qui-se-tourne-avec-bleues",
},
teaser: "Après 15 années en équipe de France, Laura Gasnier a décidé",
paragraph: "Elle évoque un choix personnel qui a demandé plusieurs mois de réflexion",
author: "",
urlTags: []string{"equipe-de-france", "gasnier-laura"},
},
{
news: &News{
Source: &Source{Id: 4, Name: "Foot Mercato"},
Sport: &Sport{Id: 1, Name: "Football", UrlName: "football"},
Link: "https://www.footmercato.net/a3190892483125730002-real-madrid-personne-ne-veut-de-luka-jovic",
},
teaser: "Alors que la date de fin du mercato approche considérablement,",
paragraph: "Tic-tac, tic-tac... Le chrono défile, et le Real Madrid",
author: "Max Franco Sanchez",
urlTags: []string{"football"},
},
}
for _, exp := range expList { t.Logf("testing feed from Eurosport")
t.Logf("testing feed from %s", exp.news.Source.Name) n = &News{
if err := exp.news.Feed(); err != nil { Source: &Source{Sport: &Sport{Name: "Football", CleanName: "football"}},
Link: "https://www.eurosport.fr/football/bundesliga/2020-2021/dortmund-au-tapis-thuram-debloque-son-compteur_sto7905745/story.shtml",
}
if err := n.Feed(); err != nil {
t.Errorf("unexpected error : %s", err) t.Errorf("unexpected error : %s", err)
} }
if !strings.HasPrefix(exp.news.Teaser, exp.teaser) { if !strings.HasPrefix(*n.Teaser, "BUNDESLIGA Le Borussia Dortmund et ses jeunes stars ont chuté") {
t.Errorf("unexpected teaser : %s", exp.news.Teaser) t.Errorf("unexpected teaser : %s", *n.Teaser)
} }
if !strings.HasPrefix(exp.news.Content[0], exp.paragraph) { if !strings.HasPrefix((*n.Content)[0], "Etonnante Bundesliga. Dortmund battu, Leipzig tenu en échec samedi,") {
t.Errorf("unexpected content : %s", exp.news.Content[0]) t.Errorf("unexpected content : %s", (*n.Content)[0])
} }
if exp.news.Author != exp.author { if *n.Author != "Eurosport" {
t.Errorf("unexpected author : %s", exp.news.Author) t.Errorf("unexpected author : %s", *n.Author)
} }
for _, urlTag := range exp.urlTags { if !utils.ArrayPointerContains(n.CleanTags, "bundesliga") {
if !utils.ArrayContains(exp.news.UrlTags, urlTag) { t.Errorf("cleanTags does not contain bundesliga")
t.Errorf("urltags does not contain %s", urlTag)
} }
t.Logf("testing feed from L'équipe")
n = &News{
Source: &Source{Sport: &Sport{Name: "Football", CleanName: "football"}},
Link: "https://www.lequipe.fr/Football/Actualites/Mitchel-bakker-psg-je-vais-devoir-elever-mon-niveau-de-jeu/1176182",
} }
if err := n.Feed(); err != nil {
t.Errorf("unexpected error : %s", err)
}
if !strings.HasPrefix(*n.Teaser, "Mitchel Bakker, le latéral néerlandais du PSG") {
t.Errorf("unexpected teaser : %s", *n.Teaser)
}
if !strings.HasPrefix((*n.Content)[0],  Les absences de Juan Bernat et Layvin Kurzawa") {
t.Errorf("unexpected content : %s", (*n.Content)[0])
}
if *n.Author != "H. De." {
t.Errorf("unexpected author : %s", *n.Author)
}
if !utils.ArrayPointerContains(n.CleanTags, "paris-sg--fra-") {
t.Errorf("cleanTags does not contain paris-sg--fra-")
}
t.Logf("testing feed from FFTT")
n = &News{
Source: &Source{Sport: &Sport{Name: "Tennis de Table", CleanName: "tennis-de-table"}},
Link: "http://www.fftt.com/site/actualites/2020-09-22/laura-gasnier-page-qui-se-tourne-avec-bleues",
}
if err := n.Feed(); err != nil {
t.Errorf("unexpected error : %s", err)
}
if !strings.HasPrefix(*n.Teaser, "Après 15 années en équipe de France, Laura Gasnier a décidé") {
t.Errorf("unexpected teaser : %s", *n.Teaser)
}
if !strings.HasPrefix((*n.Content)[0], "Elle évoque un choix personnel qui a demandé plusieurs mois de réflexion") {
t.Errorf("unexpected content : %s", (*n.Content)[0])
}
if n.Author != nil {
t.Errorf("unexpected author : %s", *n.Author)
}
if !utils.ArrayPointerContains(n.CleanTags, "gasnier-laura") {
t.Errorf("cleanTags does not contain gasnier-laura")
}
t.Logf("testing feed from Foot Mercato")
n = &News{
Source: &Source{Sport: &Sport{Name: "Football", CleanName: "football"}},
Link: "https://www.footmercato.net/a3190892483125730002-real-madrid-personne-ne-veut-de-luka-jovic",
}
if err := n.Feed(); err != nil {
t.Errorf("unexpected error : %s", err)
}
if !strings.HasPrefix(*n.Teaser, "Alors que la date de fin du mercato approche considérablement,") {
t.Errorf("unexpected teaser : %s", *n.Teaser)
}
if !strings.HasPrefix((*n.Content)[0], "Tic-tac, tic-tac... Le chrono défile, et le Real Madrid") {
t.Errorf("unexpected content : %s", (*n.Content)[0])
}
if *n.Author != "Max Franco Sanchez" {
t.Errorf("unexpected author : %s", *n.Author)
}
if !utils.ArrayPointerContains(n.CleanTags, "football") {
t.Errorf("cleanTags does not contain football")
}
t.Logf("testing feed from Foot Mercato")
n = &News{
Source: &Source{Sport: &Sport{Name: "Rugby", CleanName: "rugby"}},
Link: "https://www.rugbyrama.fr/rugby/top-14/2018-2019/top-14-face-au-racing-92-toulouse-n-aura-pas-de-marge-de-manoeuvre_sto7939622/story.shtml",
}
if err := n.Feed(); err != nil {
t.Errorf("unexpected error : %s", err)
}
if !strings.HasPrefix(*n.Teaser, "TOP 14 - Opposé au Racing 92 à la Paris la Défense Arena") {
t.Errorf("unexpected teaser : %s", *n.Teaser)
}
if !strings.HasPrefix((*n.Content)[0], "Réaliser et produire le même contenu") {
t.Errorf("unexpected content : %s", (*n.Content)[0])
}
if *n.Author != "Rugbyrama" {
t.Errorf("unexpected author : %s", *n.Author)
}
if !utils.ArrayPointerContains(n.CleanTags, "top-14") {
t.Errorf("cleanTags does not contain football")
} }
} }
func TestSource_ListNews(t *testing.T) { func TestSource_ListNews(t *testing.T) {
expList := []*expectedResult{ links := []string{
{ "http://www.eurosport.fr/football/rss.xml",
source: &Source{Id: 1, Name: "Eurosport"}, "https://www.lequipe.fr/rss/actu_rss_Football.xml",
sourceUrl: "http://www.eurosport.fr/football/rss.xml", "http://www.fftt.com/site/medias/flux/rss_competition.xml",
}, "http://www.footmercato.net/flux-rss",
{
source: &Source{Id: 1, Name: "L'équipe"},
sourceUrl: "https://www.lequipe.fr/rss/actu_rss_Football.xml",
},
{
source: &Source{Id: 1, Name: "FFTT"},
sourceUrl: "http://www.fftt.com/site/medias/flux/rss_competition.xml",
},
{
source: &Source{Id: 1, Name: "Foot Mercato"},
sourceUrl: "http://www.footmercato.net/flux-rss",
},
} }
for _, exp := range expList { for _, link := range links {
t.Logf("testing newsList from %s", exp.source.Name) t.Logf("testing ListNews from %s", link)
newsList, err := exp.source.ListNews(&Sport{Id: 1}, exp.sourceUrl) source := &Source{FeedUrl: link}
newsList, err := source.ListNews()
if err != nil { if err != nil {
t.Errorf("unexpected error : %s", err) t.Errorf("unexpected error : %s", err)
} }
@ -122,8 +138,8 @@ func TestSource_ListNews(t *testing.T) {
if n.Title == "" { if n.Title == "" {
t.Errorf("unexpected empty title") t.Errorf("unexpected empty title")
} }
if n.Image == "" { if n.Image == nil {
t.Errorf("unexpected empty image") t.Errorf("unexpected nil image")
} }
} }
} }

View File

@ -2,15 +2,15 @@ package postgres
import ( import (
"database/sql" "database/sql"
"encoding/json"
"fmt" "fmt"
"log" "log"
"os" "os"
"github.com/lib/pq" "github.com/lib/pq"
"../news" "1bet.fr/scraper/match"
"../utils" "1bet.fr/scraper/news"
"1bet.fr/scraper/utils"
) )
type Postgres struct { type Postgres struct {
@ -25,6 +25,27 @@ type Postgres struct {
isConnected bool isConnected bool
} }
// sValue unwraps an optional string for use as a query argument: a nil
// pointer yields an untyped nil, any other pointer yields the string it
// points to.
func sValue(s *string) interface{} {
	if s != nil {
		return *s
	}
	return nil
}
// iValue unwraps an optional int for use as a query argument: a nil pointer
// yields an untyped nil, any other pointer yields the int it points to.
func iValue(i *int) interface{} {
	if i != nil {
		return *i
	}
	return nil
}
// aValue unwraps an optional string slice for use as a query argument: a nil
// pointer yields an untyped nil, any other pointer yields the slice wrapped
// with pq.Array.
func aValue(a *[]string) interface{} {
	if a != nil {
		return pq.Array(*a)
	}
	return nil
}
var pg *Postgres var pg *Postgres
func init() { func init() {
@ -64,41 +85,26 @@ func Close() {
pg.isConnected = false pg.isConnected = false
} }
func ListSports() ([]*news.Sport, error) {
var sports []*news.Sport
rows, err := pg.psqlConn.Query("SELECT id, name, clean_name FROM public.mainapp_sport")
if err != nil {
return nil, fmt.Errorf("error while querying postgres : %s", err)
}
for rows.Next() {
sport := &news.Sport{}
err = rows.Scan(&sport.Id, &sport.Name, &sport.CleanName)
if err != nil {
return nil, fmt.Errorf("error while scanning row from postgres : %s", err)
}
sports = append(sports, sport)
}
return sports, nil
}
func ListSources() ([]*news.Source, error) { func ListSources() ([]*news.Source, error) {
var sources []*news.Source var sources []*news.Source
rows, err := pg.psqlConn.Query("SELECT id, name, clean_name, urls FROM public.mainapp_source") rows, err := pg.psqlConn.Query(`
SELECT
mainapp_source.id, sport_id, mainapp_source.name, mainapp_source.clean_name, feed_url,
mainapp_sport.name, mainapp_sport.clean_name
FROM
mainapp_source, mainapp_sport
WHERE
mainapp_sport.id = mainapp_source.sport_id
`)
if err != nil { if err != nil {
return nil, fmt.Errorf("error while querying postgres : %s", err) return nil, fmt.Errorf("error while querying postgres : %s", err)
} }
for rows.Next() { for rows.Next() {
source := &news.Source{} source := &news.Source{Sport: &news.Sport{}}
sourceUrls := "" if err = rows.Scan(&source.Id, &source.Sport.Id, &source.Name, &source.CleanName, &source.FeedUrl,
err = rows.Scan(&source.Id, &source.Name, &source.CleanName, &sourceUrls) &source.Sport.Name, &source.Sport.CleanName); err != nil {
if err != nil {
return nil, fmt.Errorf("error while scanning row from postgres : %s", err)
}
if err = json.Unmarshal([]byte(sourceUrls), &source.Urls); err != nil {
return nil, fmt.Errorf("error while scanning row from postgres : %s", err) return nil, fmt.Errorf("error while scanning row from postgres : %s", err)
} }
sources = append(sources, source) sources = append(sources, source)
@ -106,43 +112,84 @@ func ListSources() ([]*news.Source, error) {
return sources, nil return sources, nil
} }
// ListLeagues returns every row of mainapp_league joined with its sport from
// mainapp_sport. Each returned League carries a Sport populated with the
// sport id, name and clean name.
//
// An error is returned when the query fails, when a row cannot be scanned, or
// when iterating over the result set stops early.
func ListLeagues() ([]*match.League, error) {
	rows, err := pg.psqlConn.Query(`
		SELECT
			mainapp_league.id, sport_id, country_id, mainapp_league.name, mainapp_league.clean_name, gender,
			schedule_url, ranking_url, channel_url,
			mdays, matches_by_mday, rounds, groups,
			mainapp_sport.name, mainapp_sport.clean_name
		FROM
			mainapp_league, mainapp_sport
		WHERE
			mainapp_sport.id = mainapp_league.sport_id
	`)
	if err != nil {
		return nil, fmt.Errorf("error while querying postgres : %s", err)
	}
	// Release the result set on every exit path, including a failed scan.
	defer rows.Close()

	var leagues []*match.League
	for rows.Next() {
		league := &match.League{Sport: &match.Sport{}}
		if err = rows.Scan(
			&league.Id, &league.Sport.Id, &league.CountryId, &league.Name, &league.CleanName, &league.Gender,
			&league.ScheduleUrl, &league.RankingUrl, &league.ChannelUrl,
			&league.MatchDays, &league.MatchesByMatchDay, pq.Array(&league.Rounds), pq.Array(&league.Groups),
			&league.Sport.Name, &league.Sport.CleanName,
		); err != nil {
			return nil, fmt.Errorf("error while scanning row from postgres : %s", err)
		}
		leagues = append(leagues, league)
	}
	// Next reports false both at the end of the data and on failure; Err
	// distinguishes the two so a truncated list is never returned as success.
	if err = rows.Err(); err != nil {
		return nil, fmt.Errorf("error while iterating rows from postgres : %s", err)
	}
	return leagues, nil
}
func InsertNews(n *news.News) error { func InsertNews(n *news.News) error {
err := pg.psqlConn.QueryRow(` return pg.psqlConn.QueryRow(`
INSERT INTO public.mainapp_news INSERT INTO public.mainapp_news
(title, clean_title, link, pub_date, description, image, teaser, author, (title, clean_title, link, pub_date, description, image, teaser, author,
content, redirect, haystack, tags, clean_tags, error, trace, content, redirect, haystack, tags, clean_tags, error, trace,
league_id, source_id, sport_id, team_id) league_id, source_id, team_id)
VALUES VALUES
($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17, $18, $19) ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17, $18)
RETURNING RETURNING
id id
`, n.Title, n.CleanTitle, n.Link, n.PubDate, utils.NullableString(n.Description), `, n.Title, n.CleanTitle, n.Link, n.PubDate, sValue(n.Description),
utils.NullableString(n.Image), utils.NullableString(n.Teaser), utils.NullableString(n.Author), sValue(n.Image), sValue(n.Teaser), sValue(n.Author),
pq.Array(n.Content), utils.NullableString(n.Redirect), utils.NullableString(n.Haystack), aValue(n.Content), sValue(n.Redirect), sValue(n.Haystack),
pq.Array(n.Tags), pq.Array(n.CleanTags), utils.NullableString(n.Error), utils.NullableString(n.Trace), aValue(n.Tags), aValue(n.CleanTags), sValue(n.Error), sValue(n.Trace),
utils.NullableInt(n.LeagueId), n.Source.Id, n.Sport.Id, utils.NullableInt(n.TeamId), iValue(n.LeagueId), n.Source.Id, iValue(n.TeamId),
).Scan(&n.Id) ).Scan(&n.Id)
if err != nil {
return err
}
return nil
} }
func UpdateNews(n *news.News) error { func UpdateNews(n *news.News) (int64, error) {
if _, err := pg.psqlConn.Exec(` res, err := pg.psqlConn.Exec(`
UPDATE public.mainapp_news UPDATE public.mainapp_news
SET title = $1, clean_title = $2, pub_date = $3, link = $4, description = $5, SET title = $1, clean_title = $2, pub_date = $3, link = $4, description = $5,
image = $6, teaser = $7, author = $8, content = $9, redirect = $10, image = $6, teaser = $7, author = $8, content = $9, redirect = $10,
haystack = $11, tags = $12, clean_tags = $13, error = $14, trace = $15, haystack = $11, tags = $12, clean_tags = $13, error = $14, trace = $15,
league_id = $16, source_id = $17, sport_id = $18, team_id = $19 league_id = get_matching_league($11, $18), source_id = $16, team_id = $17
WHERE id = $20 WHERE id = $19
`, n.Title, n.CleanTitle, n.PubDate, n.Link, utils.NullableString(n.Description), `, n.Title, n.CleanTitle, n.PubDate, n.Link,sValue(n.Description),
utils.NullableString(n.Image), utils.NullableString(n.Teaser), utils.NullableString(n.Author), sValue(n.Image), sValue(n.Teaser), sValue(n.Author),
pq.Array(n.Content), utils.NullableString(n.Redirect), utils.NullableString(n.Haystack), aValue(n.Content), sValue(n.Redirect), sValue(n.Haystack),
pq.Array(n.Tags), pq.Array(n.CleanTags), utils.NullableString(n.Error), utils.NullableString(n.Trace), aValue(n.Tags), aValue(n.CleanTags), sValue(n.Error), sValue(n.Trace),
utils.NullableInt(n.LeagueId), n.Source.Id, n.Sport.Id, utils.NullableInt(n.TeamId), n.Id, n.Source.Id, iValue(n.TeamId), n.Source.Sport.Id, n.Id,
); err != nil { )
return err if err != nil {
return 0, err
} }
return nil return res.RowsAffected()
}
func DeleteNews(n *news.News) (int64, error) {
res, err := pg.psqlConn.Exec(`
DELETE FROM public.mainapp_news
WHERE id = $1
`, n.Id)
if err != nil {
return 0, err
}
return res.RowsAffected()
} }

View File

@ -2,19 +2,83 @@ package postgres
import ( import (
"testing" "testing"
"time"
"1bet.fr/scraper/news"
"1bet.fr/scraper/utils"
) )
var n *news.News
func TestConnect(t *testing.T) { func TestConnect(t *testing.T) {
return return
} }
// TestListLeagues checks that ListLeagues succeeds and returns at least one
// league.
func TestListLeagues(t *testing.T) {
	got, listErr := ListLeagues()
	if listErr != nil {
		t.Errorf("unexpected error : %s", listErr)
	}
	if count := len(got); count == 0 {
		t.Errorf("no league got from ListLeagues function")
	}
}
func TestListSources(t *testing.T) { func TestListSources(t *testing.T) {
defer Close()
sources, err := ListSources() sources, err := ListSources()
if err != nil { if err != nil {
t.Errorf("unexpected error : %s", err) t.Errorf("unexpected error : %s", err)
} }
if len(sources) == 0 { if len(sources) == 0 {
t.Errorf("no sources got from ListSources function") t.Errorf("no source got from ListSources function")
} }
} }
func TestInsertNews(t *testing.T) {
tags := []string{"Test", "Hello Toto"}
cleanTags := []string{"test", "hello-toto"}
nowTime := time.Now()
n = &news.News{
Source: &news.Source{Id: 1, Sport: &news.Sport{Id: 1}},
PubDate: &nowTime,
Link: "https://test.com/toto",
Title: "Hello toto",
CleanTitle: "hello-toto",
Tags: &tags,
CleanTags: &cleanTags,
}
err := InsertNews(n)
if err != nil {
t.Error(err)
}
if n.Id == 0 {
t.Errorf("unexpected value 0 for n.Id")
}
}
// TestUpdateNews sets content and author on the package-level news n and
// checks that UpdateNews reports exactly one affected row.
//
// n is assigned by TestInsertNews; when this test is run without it (for
// example with -run TestUpdateNews) the test is skipped rather than panicking
// on a nil pointer.
func TestUpdateNews(t *testing.T) {
	if n == nil {
		t.Skip("no news inserted : TestInsertNews must run first")
	}
	content := []string{"toto", "test"}
	n.Content = &content
	n.Author = utils.StringPointer("T. Toto")
	updated, err := UpdateNews(n)
	if err != nil {
		t.Error(err)
	}
	if updated != 1 {
		t.Errorf("unexpected %d update rows", updated)
	}
}
func TestDeleteNews(t *testing.T) {
deleted, err := DeleteNews(n)
if err != nil {
t.Error(err)
}
if deleted != 1 {
t.Errorf("unexpected %d news deleted", deleted)
}
}
// TestClose calls Close and makes no assertion of its own.
// NOTE(review): presumably kept as the last test in the file so the
// connection stays open for the tests above — confirm.
func TestClose(t *testing.T) {
Close()
}

View File

@ -2,11 +2,11 @@ package requests
import ( import (
"fmt" "fmt"
"github.com/PuerkitoBio/goquery"
"log" "log"
"net/http" "net/http"
"net/url" "net/url"
"github.com/PuerkitoBio/goquery"
"golang.org/x/net/proxy" "golang.org/x/net/proxy"
) )

BIN
scraper

Binary file not shown.

View File

@ -3,6 +3,16 @@ package utils
import ( import (
"log" "log"
"strconv" "strconv"
"strings"
)
const (
HostMatchendirect = "www.matchendirect.fr"
HostEurosport = "www.eurosport.fr"
HostRugbyrama = "www.rugbyrama.fr"
HostFFTT = "www.fftt.com"
HostFootmercato = "www.footmercato.net"
HostLequipe = "www.lequipe.fr"
) )
func Sanitize(s string) (t string) { func Sanitize(s string) (t string) {
@ -32,29 +42,6 @@ func Sanitize(s string) (t string) {
return t return t
} }
func NullableString(s string) interface{} {
if len(s) == 0 {
return nil
}
return s
}
func NullableInt(i int) interface{} {
if i == 0 {
return nil
}
return i
}
func ArrayContains(arr []string, val string) bool {
for _, elt := range arr {
if elt == val {
return true
}
}
return false
}
func AtoI(s string) int { func AtoI(s string) int {
res, err := strconv.Atoi(s) res, err := strconv.Atoi(s)
if err != nil { if err != nil {
@ -62,3 +49,87 @@ func AtoI(s string) int {
} }
return res return res
} }
// frenchDateReplacer rewrites lower-case French month and weekday names into
// their capitalised English equivalents in one pass over the input.
var frenchDateReplacer = strings.NewReplacer(
	"janvier", "January",
	"février", "February",
	"mars", "March",
	"avril", "April",
	"mai", "May",
	"juin", "June",
	"juillet", "July",
	"août", "August",
	"septembre", "September",
	"octobre", "October",
	"novembre", "November",
	"décembre", "December",
	"lundi", "Monday",
	"mardi", "Tuesday",
	"mercredi", "Wednesday",
	"jeudi", "Thursday",
	"vendredi", "Friday",
	"samedi", "Saturday",
	"dimanche", "Sunday",
)

// EnglishDateString lower-cases s, trims surrounding whitespace, and replaces
// every French month and weekday name with its English name. All other text
// is returned lower-cased and otherwise unchanged.
//
// Every name found in s is translated, so strings holding several months or
// weekdays (such as a date range) give the same result on every call.
func EnglishDateString(s string) string {
	return frenchDateReplacer.Replace(strings.TrimSpace(strings.ToLower(s)))
}
// StringPointer returns a pointer to s with surrounding whitespace removed,
// or nil when nothing is left after trimming.
//
// Trimming happens before the emptiness check so that whitespace-only input
// is treated like the empty string instead of producing a pointer to "".
func StringPointer(s string) *string {
	s = strings.TrimSpace(s)
	if s == "" {
		return nil
	}
	return &s
}
// IntPointer returns a pointer to a copy of i, or nil when i is zero.
func IntPointer(i int) *int {
	if i != 0 {
		return &i
	}
	return nil
}
// ArrayPointerContains reports whether val is an element of the slice arr
// points to. A nil arr contains nothing.
func ArrayPointerContains(arr *[]string, val string) bool {
	if arr == nil {
		return false
	}
	items := *arr
	for idx := 0; idx < len(items); idx++ {
		if items[idx] == val {
			return true
		}
	}
	return false
}
// ArrayPointerAppend appends v, with surrounding whitespace removed, to the
// slice a points to and returns a pointer to the resulting slice. A nil a is
// treated as an empty slice.
func ArrayPointerAppend(a *[]string, v string) *[]string {
	var base []string
	if a != nil {
		base = *a
	}
	grown := append(base, strings.TrimSpace(v))
	return &grown
}
// ArrayPointerJoin concatenates the elements of the slice a points to,
// separated by sep. A nil a yields the empty string.
func ArrayPointerJoin(a *[]string, sep string) string {
	var parts []string
	if a != nil {
		parts = *a
	}
	return strings.Join(parts, sep)
}

View File

@ -10,32 +10,17 @@ func TestSanitize(t *testing.T) {
} }
} }
func TestNullableString(t *testing.T) {
if res := NullableString("test"); res != "test" {
t.Errorf("unexepected NullableString() answer '%s' != 'test'", res)
}
if res := NullableString(""); res != nil {
t.Errorf("unexepected NullableString() answer '%s' != nil", res)
}
}
func TestNullableInt(t *testing.T) {
if res := NullableInt(3); res != 3 {
t.Errorf("unexepected NullableInt() answer %s != 3", res)
}
if res := NullableInt(0); res != nil {
t.Errorf("unexepected NullableInt() answer %s != nil", res)
}
}
func TestArrayContains(t *testing.T) { func TestArrayContains(t *testing.T) {
if !ArrayContains([]string{"bird", "apple", "ocean", "fork", "anchor"}, "bird") { if ArrayPointerContains(nil, "toto") {
t.Errorf("unexpected ArrayContains() false answer for 'bird'") t.Errorf("unexpected contains true for nil array")
} }
if ArrayContains([]string{"bird", "apple", "ocean", "fork", "anchor"}, "potato") { arr := []string{"bird", "apple", "ocean", "fork", "anchor"}
t.Errorf("unexpected ArrayContains() true answer for 'potato'") if !ArrayPointerContains(&arr, "bird") {
t.Errorf("unexpected contains false")
}
arr = []string{"bird", "apple", "ocean", "fork", "anchor"}
if ArrayPointerContains(&arr, "potato") {
t.Errorf("unexpected contains true")
} }
} }
@ -44,3 +29,62 @@ func TestAtoI(t *testing.T) {
t.Errorf("unexpected answer %d != 3", res) t.Errorf("unexpected answer %d != 3", res)
} }
} }
// TestEnglishDateString checks that a capitalised French date is returned
// with its weekday and month translated to English.
func TestEnglishDateString(t *testing.T) {
	const want = "Wednesday 03 February 2021"
	got := EnglishDateString("Mercredi 03 février 2021")
	if got != want {
		t.Errorf("unexpected date format : %s", got)
	}
}
// TestStringPointer checks that StringPointer maps the empty string to nil
// and a non-empty string to a pointer holding that string.
func TestStringPointer(t *testing.T) {
	if empty := StringPointer(""); empty != nil {
		t.Errorf("unexpected res : %s", *empty)
	}
	switch ptr := StringPointer("toto"); {
	case ptr == nil:
		t.Errorf("unexpected res : nil")
	case *ptr != "toto":
		t.Errorf("unexpected res : %s", *ptr)
	}
}
// TestIntPointer checks that IntPointer maps zero to nil and a non-zero
// value to a pointer holding that value.
func TestIntPointer(t *testing.T) {
	if zero := IntPointer(0); zero != nil {
		t.Errorf("unexpected res : %d", *zero)
	}
	switch ptr := IntPointer(123); {
	case ptr == nil:
		t.Errorf("unexpected res : nil")
	case *ptr != 123:
		t.Errorf("unexpected res : %d", *ptr)
	}
}
// TestArrayPointerAppend appends twice, starting from a nil pointer, and
// checks the length and the newest element after each call.
func TestArrayPointerAppend(t *testing.T) {
	var arr *[]string

	arr = ArrayPointerAppend(arr, "toto")
	switch {
	case arr == nil:
		t.Errorf("unexpected arr : nil")
	case len(*arr) != 1:
		t.Errorf("unexpected arr len : %d", len(*arr))
	case (*arr)[0] != "toto":
		t.Errorf("unexpected arr content : %s", *arr)
	}

	arr = ArrayPointerAppend(arr, "test")
	switch {
	case arr == nil:
		t.Errorf("unexpected arr : nil")
	case len(*arr) != 2:
		t.Errorf("unexpected arr len : %d", len(*arr))
	case (*arr)[1] != "test":
		t.Errorf("unexpected arr content : %s", *arr)
	}
}
// TestArrayPointerJoin checks that a nil pointer joins to the empty string
// and that a two-element slice is joined with the given separator.
func TestArrayPointerJoin(t *testing.T) {
	joined := ArrayPointerJoin(nil, "-")
	if joined != "" {
		t.Errorf("unexpected join result : %s", joined)
	}

	words := []string{"toto", "test"}
	joined = ArrayPointerJoin(&words, "-")
	if joined != "toto-test" {
		t.Errorf("unexpected join result : %s", joined)
	}
}