Initial Commit

This commit is contained in:
Samuel Campos 2020-10-05 10:24:33 +02:00
commit d87bedc8d6
12 changed files with 789 additions and 0 deletions

2
.gitignore vendored Normal file
View File

@ -0,0 +1,2 @@
.idea

87
main.go Normal file
View File

@ -0,0 +1,87 @@
package main
import (
"github.com/lib/pq"
"log"
"sync"
"./news"
"./postgres"
)
const (
	// nbProcesses is the number of concurrent worker goroutines that download
	// and persist full article bodies.
	nbProcesses = 50
	// maxNewsPerSource caps how many items are taken from a single feed URL
	// in one run.
	maxNewsPerSource = 50
)
// updateNews runs one full scraping pass: it loads all sports and sources
// from postgres, parses each source's RSS feeds, inserts new items, and
// pushes them to a pool of workers that fetch the full article content and
// update the stored row.
func updateNews() {
	defer postgres.Close()

	sports, err := postgres.ListSports()
	if err != nil {
		log.Fatal(err)
	}
	// Index sports by id so each source URL (keyed by sport id) can be
	// matched to its Sport.
	mapSports := map[int]*news.Sport{}
	for _, sport := range sports {
		mapSports[sport.Id] = sport
	}

	sources, err := postgres.ListSources()
	if err != nil {
		log.Fatal(err)
	}

	// Worker pool: each worker enriches a news item (Feed) then persists it.
	waitGroup := sync.WaitGroup{}
	newsChannel := make(chan *news.News)
	for i := 0; i < nbProcesses; i++ {
		waitGroup.Add(1)
		go func(nc chan *news.News, wg *sync.WaitGroup) {
			defer wg.Done()
			for n := range nc {
				if err := n.Feed(); err != nil {
					log.Fatal(err)
				}
				if err := postgres.UpdateNews(n); err != nil {
					log.Fatal(err)
				}
			}
		}(newsChannel, &waitGroup)
	}

	for _, source := range sources {
		for sportId, url := range source.Urls {
			log.Printf("[+] Starting parse of source : %s", url)
			newsList, err := source.ListNews(mapSports[sportId], url)
			if err != nil {
				log.Fatal(err)
			}
			for i, n := range newsList {
				if i >= maxNewsPerSource {
					log.Printf("Stopping parse of source with %d news added", i)
					break
				}
				if err := postgres.InsertNews(n); err != nil {
					if pqErr, ok := err.(*pq.Error); ok {
						if pqErr.Code.Name() == "unique_violation" {
							// A duplicate link means we have caught up with a
							// previous run; stop parsing this feed.
							log.Printf("Stopping parse of source (unique violation) with %d news added", i)
							break
						}
						// FIX: the assertion succeeded here, so the old message
						// ("error while getting pq.Error object") was wrong and
						// hid the actual database error.
						log.Fatalf("unexpected postgres error : %s", pqErr)
					}
					log.Fatal(err)
				}
				log.Printf("Adding news %s", n.Link)
				newsChannel <- n
			}
		}
	}
	close(newsChannel)
	waitGroup.Wait()
}
// main performs a single scraping pass over every configured source.
func main() {
	updateNews()
}

173
news/news.go Normal file
View File

@ -0,0 +1,173 @@
package news
import (
"fmt"
"regexp"
"strings"
"time"
"github.com/PuerkitoBio/goquery"
"github.com/mmcdole/gofeed"
"../requests"
"../utils"
)
// Sport mirrors one row of the mainapp_sport table (see postgres.ListSports).
type Sport struct {
	Id        int
	Name      string
	CleanName string // sanitized form of Name (see utils.Sanitize), used as a fallback tag
}
// Source mirrors one row of the mainapp_source table.
type Source struct {
	Id        int
	Name      string
	CleanName string
	// Urls maps a sport id to the RSS feed URL for that sport on this site;
	// stored as JSON in postgres (see postgres.ListSources).
	Urls  map[int]string
	Error *string
	Trace *string
}
// News is one article. The RSS-level fields are filled by Source.ListNews;
// the article-level fields (Teaser, Content, Author, extra Tags, Haystack)
// are filled by News.Feed. Fields map to columns of mainapp_news.
type News struct {
	Id         int
	Source     *Source
	Sport      *Sport
	LeagueId   int // 0 is stored as NULL (see utils.NullableInt)
	TeamId     int // 0 is stored as NULL
	Title      string
	CleanTitle string // sanitized Title (see utils.Sanitize)
	PubDate    *time.Time
	Description string
	Link        string
	Image       string
	Teaser      string
	Author      string
	Content     []string // one entry per article paragraph
	Redirect    string   // set to Link when no paragraphs could be extracted
	Haystack    string   // "<clean title>-<clean tags>", built by Feed
	Tags        []string
	CleanTags   []string // sanitized Tags, kept free of duplicates by Feed
	Error       string
	Trace       string
}
// Feed downloads the article page behind n.Link and fills in what the RSS
// feed did not provide: Teaser, Content (one string per paragraph), Author
// and extra Tags. Selectors are specific to each supported site; an unknown
// Source.Name returns an error.
//
// NOTE(review): the CSS selectors encode each site's markup as of writing and
// will silently match nothing if a site changes its HTML.
func (n *News) Feed() error {
	doc, err := requests.GetDocumentFromURL(n.Link)
	if err != nil {
		return err
	}
	switch n.Source.Name {
	case "Eurosport":
		n.Teaser = strings.TrimSpace(doc.Find("h2").Text())
		doc.Find(".article-body .article-s4-rs p").Each(func(i int, s *goquery.Selection) {
			n.Content = append(n.Content, s.Text())
		})
		n.Author = strings.TrimSpace(doc.Find(".flex a.caption-s5-fx div.font-bold").Text())
		// Related-topic links become tags; duplicates (after sanitizing) are skipped.
		doc.Find(".related-topics .atom-tag").Each(func(i int, s *goquery.Selection) {
			tag := strings.TrimSpace(s.Text())
			cleanTag := utils.Sanitize(tag)
			if !utils.ArrayContains(n.CleanTags, cleanTag) {
				n.Tags = append(n.Tags, tag)
				n.CleanTags = append(n.CleanTags, cleanTag)
			}
		})
	case "L'équipe":
		n.Teaser = strings.TrimSpace(doc.Find("h2.Article__chapo").Text())
		doc.Find(".Paragraph__content").Each(func(i int, s *goquery.Selection) {
			n.Content = append(n.Content, s.Text())
		})
		n.Author = strings.TrimSpace(doc.Find(".Author__name").Text())
		doc.Find(".RelatedLinks a.RelatedLinks__link").Each(func(i int, s *goquery.Selection) {
			tag := strings.TrimSpace(s.Text())
			cleanTag := utils.Sanitize(tag)
			if !utils.ArrayContains(n.CleanTags, cleanTag) {
				n.Tags = append(n.Tags, tag)
				n.CleanTags = append(n.CleanTags, cleanTag)
			}
		})
	case "FFTT":
		// First paragraph is used as the teaser; the rest is the content.
		n.Teaser = strings.TrimSpace(doc.Find(".news-description p").First().Text())
		doc.Find(".news-description p").Each(func(i int, s *goquery.Selection) {
			if i > 0 {
				n.Content = append(n.Content, s.Text())
			}
		})
		doc.Find(".social-shares-large-wrapper a.link").Each(func(i int, s *goquery.Selection) {
			tag := strings.TrimSpace(s.Text())
			cleanTag := utils.Sanitize(tag)
			if !utils.ArrayContains(n.CleanTags, cleanTag) {
				n.Tags = append(n.Tags, tag)
				n.CleanTags = append(n.CleanTags, cleanTag)
			}
		})
	case "Foot Mercato":
		n.Teaser = strings.TrimSpace(doc.Find("h2.article__lead").Text())
		doc.Find(".article__content p").Each(func(i int, s *goquery.Selection) {
			n.Content = append(n.Content, s.Text())
		})
		n.Author = strings.TrimSpace(doc.Find(".article__author a").Text())
	default:
		return fmt.Errorf("unknown source %s", n.Source.Name)
	}
	// No extractable paragraphs: mark the item so consumers link to the
	// original article instead.
	if len(n.Content) == 0 {
		n.Redirect = n.Link
	}
	// Guarantee at least one tag by falling back to the sport itself.
	if len(n.CleanTags) == 0 {
		n.Tags = append(n.Tags, n.Sport.Name)
		n.CleanTags = append(n.CleanTags, n.Sport.CleanName)
	}
	n.Haystack = fmt.Sprintf("%s-%s", n.CleanTitle, strings.Join(n.CleanTags, "-"))
	return nil
}
// stripHTMLTags matches HTML tags in RSS descriptions. Compiled once at
// package level: the original rebuilt this regexp for every feed item.
var stripHTMLTags = regexp.MustCompile(`<[^>]*>`)

// ListNews fetches and parses the RSS feed at url, converting every item into
// a *News attached to source s and the given sport. The image is taken from
// the item itself, from its first enclosure, or - for Eurosport - from an
// <img> embedded in the item's HTML description.
func (s *Source) ListNews(sport *Sport, url string) ([]*News, error) {
	var newsList []*News
	fp := gofeed.NewParser()
	feed, err := fp.ParseURL(url)
	if err != nil {
		return nil, err
	}
	for _, item := range feed.Items {
		n := &News{
			Source:      s,
			Sport:       sport,
			Title:       item.Title,
			Description: stripHTMLTags.ReplaceAllLiteralString(item.Description, ""),
			CleanTitle:  utils.Sanitize(item.Title),
			PubDate:     item.PublishedParsed,
			Link:        item.Link,
		}
		// An RSS category can itself be a comma-separated list of tags.
		for _, tags := range item.Categories {
			for _, tag := range strings.Split(tags, ",") {
				n.Tags = append(n.Tags, strings.TrimSpace(tag))
				n.CleanTags = append(n.CleanTags, utils.Sanitize(strings.TrimSpace(tag)))
			}
		}
		if item.Image != nil {
			n.Image = item.Image.URL
		} else if len(item.Enclosures) > 0 {
			n.Image = item.Enclosures[0].URL
		} else if s.Name == "Eurosport" {
			// Eurosport embeds the illustration as an <img> inside the HTML
			// description; best effort, parse errors are deliberately ignored.
			doc, err := goquery.NewDocumentFromReader(strings.NewReader(item.Description))
			if err == nil {
				if src, ok := doc.Find("img").Attr("src"); ok {
					n.Image = src
				}
			}
		}
		if item.Author != nil {
			n.Author = item.Author.Name
		}
		newsList = append(newsList, n)
	}
	return newsList, nil
}

130
news/news_test.go Normal file
View File

@ -0,0 +1,130 @@
package news
import (
"strings"
"testing"
"../utils"
)
// expectedResult bundles the fixtures shared by both tests in this file:
// TestNews_Feed uses news/teaser/paragraph/author/urlTags, while
// TestSource_ListNews only uses source/sourceUrl.
type expectedResult struct {
	news      *News
	teaser    string // expected prefix of News.Teaser after Feed
	paragraph string // expected prefix of the first content paragraph
	author    string
	urlTags   []string // sanitized tags expected on the parsed news item
	haystack  string
	source    *Source
	sourceUrl string // RSS feed URL passed to ListNews
}
func TestNews_Feed(t *testing.T) {
expList := [4]*expectedResult{
{
news: &News{
Source: &Source{Id: 1, Name: "Eurosport"},
Sport: &Sport{Id: 1, Name: "Football", UrlName: "football"},
Link: "https://www.eurosport.fr/football/bundesliga/2020-2021/dortmund-au-tapis-thuram-debloque-son-compteur_sto7905745/story.shtml",
},
teaser: "BUNDESLIGA Le Borussia Dortmund et ses jeunes stars ont chuté",
paragraph: "Etonnante Bundesliga. Dortmund battu, Leipzig tenu en échec samedi,",
author: "Eurosport",
urlTags: []string{"football", "bundesliga"},
},
{
news: &News{
Source: &Source{Id: 2, Name: "L'équipe"},
Sport: &Sport{Id: 1, Name: "Football", UrlName: "football"},
Link: "https://www.lequipe.fr/Football/Actualites/Mitchel-bakker-psg-je-vais-devoir-elever-mon-niveau-de-jeu/1176182",
},
teaser: "Mitchel Bakker, le latéral néerlandais du PSG",
paragraph:  Les absences de Juan Bernat et Layvin Kurzawa",
author: "H. De.",
urlTags: []string{"ligue-1", "paris-sg--fra-", "reims--fra-"},
},
{
news: &News{
Source: &Source{Id: 3, Name: "FFTT"},
Sport: &Sport{Id: 6, Name: "Tennis de Table", UrlName: "tennis-de-table"},
Link: "http://www.fftt.com/site/actualites/2020-09-22/laura-gasnier-page-qui-se-tourne-avec-bleues",
},
teaser: "Après 15 années en équipe de France, Laura Gasnier a décidé",
paragraph: "Elle évoque un choix personnel qui a demandé plusieurs mois de réflexion",
author: "",
urlTags: []string{"equipe-de-france", "gasnier-laura"},
},
{
news: &News{
Source: &Source{Id: 4, Name: "Foot Mercato"},
Sport: &Sport{Id: 1, Name: "Football", UrlName: "football"},
Link: "https://www.footmercato.net/a3190892483125730002-real-madrid-personne-ne-veut-de-luka-jovic",
},
teaser: "Alors que la date de fin du mercato approche considérablement,",
paragraph: "Tic-tac, tic-tac... Le chrono défile, et le Real Madrid",
author: "Max Franco Sanchez",
urlTags: []string{"football"},
},
}
for _, exp := range expList {
t.Logf("testing feed from %s", exp.news.Source.Name)
if err := exp.news.Feed(); err != nil {
t.Errorf("unexpected error : %s", err)
}
if !strings.HasPrefix(exp.news.Teaser, exp.teaser) {
t.Errorf("unexpected teaser : %s", exp.news.Teaser)
}
if !strings.HasPrefix(exp.news.Content[0], exp.paragraph) {
t.Errorf("unexpected content : %s", exp.news.Content[0])
}
if exp.news.Author != exp.author {
t.Errorf("unexpected author : %s", exp.news.Author)
}
for _, urlTag := range exp.urlTags {
if !utils.ArrayContains(exp.news.UrlTags, urlTag) {
t.Errorf("urltags does not contain %s", urlTag)
}
}
}
}
// TestSource_ListNews parses one live RSS feed per supported source and
// checks that at least one item was produced and that every item carries a
// title and an image. Requires network access.
func TestSource_ListNews(t *testing.T) {
	cases := []*expectedResult{
		{source: &Source{Id: 1, Name: "Eurosport"}, sourceUrl: "http://www.eurosport.fr/football/rss.xml"},
		{source: &Source{Id: 1, Name: "L'équipe"}, sourceUrl: "https://www.lequipe.fr/rss/actu_rss_Football.xml"},
		{source: &Source{Id: 1, Name: "FFTT"}, sourceUrl: "http://www.fftt.com/site/medias/flux/rss_competition.xml"},
		{source: &Source{Id: 1, Name: "Foot Mercato"}, sourceUrl: "http://www.footmercato.net/flux-rss"},
	}
	for _, tc := range cases {
		t.Logf("testing newsList from %s", tc.source.Name)
		newsList, err := tc.source.ListNews(&Sport{Id: 1}, tc.sourceUrl)
		if err != nil {
			t.Errorf("unexpected error : %s", err)
		}
		if len(newsList) == 0 {
			t.Errorf("no news parsed from rss")
		}
		for _, item := range newsList {
			if item.Title == "" {
				t.Errorf("unexpected empty title")
			}
			if item.Image == "" {
				t.Errorf("unexpected empty image")
			}
		}
	}
}

148
postgres/postgres.go Normal file
View File

@ -0,0 +1,148 @@
package postgres
import (
"database/sql"
"encoding/json"
"fmt"
"log"
"os"
"github.com/lib/pq"
"../news"
"../utils"
)
// Postgres wraps the single shared database connection together with the
// settings it was opened from. All fields are populated from environment
// variables in this package's init.
type Postgres struct {
	host        string
	port        int
	username    string
	password    string
	database    string
	psqlInfo    string // DSN built from the fields above
	psqlConn    *sql.DB
	isConnected bool // guards against closing twice (see Close)
}

// pg is the package-wide connection handle shared by every exported function.
var pg *Postgres
// init reads the POSTGRES_* environment variables, opens the connection and
// verifies it with a ping. Any failure aborts the program.
//
// NOTE(review): connecting in init means that merely importing this package
// (tests included) requires a reachable database.
func init() {
	var err error
	pg = &Postgres{
		host:     os.Getenv("POSTGRES_HOST"),
		port:     utils.AtoI(os.Getenv("POSTGRES_PORT")),
		username: os.Getenv("POSTGRES_USERNAME"),
		password: os.Getenv("POSTGRES_PASSWORD"),
		database: os.Getenv("POSTGRES_DATABASE"),
	}
	pg.psqlInfo = fmt.Sprintf(
		"host=%s port=%d user=%s password=%s dbname=%s sslmode=disable",
		pg.host, pg.port, pg.username, pg.password, pg.database,
	)
	pg.psqlConn, err = sql.Open("postgres", pg.psqlInfo)
	if err != nil {
		log.Fatalf("error while opening pg connection : %s", err)
	}
	// sql.Open does not dial; Ping forces an actual connection attempt.
	if err = pg.psqlConn.Ping(); err != nil {
		log.Fatalf("error while pinging pg server : %s", err)
	}
	pg.isConnected = true
}
// Close shuts down the shared connection. Calling it more than once is a
// no-op thanks to the isConnected flag.
func Close() {
	if pg.isConnected {
		if err := pg.psqlConn.Close(); err != nil {
			log.Fatalf("error while closing pg connection : %s", err)
		}
		pg.isConnected = false
	}
}
// ListSports returns every sport stored in mainapp_sport.
func ListSports() ([]*news.Sport, error) {
	rows, err := pg.psqlConn.Query("SELECT id, name, clean_name FROM public.mainapp_sport")
	if err != nil {
		return nil, fmt.Errorf("error while querying postgres : %s", err)
	}
	// FIX: release the rows/connection even on early return (they leaked).
	defer rows.Close()

	var sports []*news.Sport
	for rows.Next() {
		sport := &news.Sport{}
		if err := rows.Scan(&sport.Id, &sport.Name, &sport.CleanName); err != nil {
			return nil, fmt.Errorf("error while scanning row from postgres : %s", err)
		}
		sports = append(sports, sport)
	}
	// FIX: surface errors that ended iteration early (previously ignored).
	if err := rows.Err(); err != nil {
		return nil, fmt.Errorf("error while iterating rows from postgres : %s", err)
	}
	return sports, nil
}
// ListSources returns every source stored in mainapp_source, decoding the
// JSON urls column into Source.Urls (sport id -> feed URL).
func ListSources() ([]*news.Source, error) {
	rows, err := pg.psqlConn.Query("SELECT id, name, clean_name, urls FROM public.mainapp_source")
	if err != nil {
		return nil, fmt.Errorf("error while querying postgres : %s", err)
	}
	// FIX: release the rows/connection even on early return (they leaked).
	defer rows.Close()

	var sources []*news.Source
	for rows.Next() {
		source := &news.Source{}
		sourceUrls := ""
		if err := rows.Scan(&source.Id, &source.Name, &source.CleanName, &sourceUrls); err != nil {
			return nil, fmt.Errorf("error while scanning row from postgres : %s", err)
		}
		if err := json.Unmarshal([]byte(sourceUrls), &source.Urls); err != nil {
			return nil, fmt.Errorf("error while decoding urls from postgres : %s", err)
		}
		sources = append(sources, source)
	}
	// FIX: surface errors that ended iteration early (previously ignored).
	if err := rows.Err(); err != nil {
		return nil, fmt.Errorf("error while iterating rows from postgres : %s", err)
	}
	return sources, nil
}
// InsertNews inserts n as a new row of mainapp_news and writes the generated
// primary key back into n.Id. Empty strings and zero-valued ids are stored
// as NULL (see utils.NullableString / utils.NullableInt).
func InsertNews(n *news.News) error {
	return pg.psqlConn.QueryRow(`
	INSERT INTO public.mainapp_news
	(title, clean_title, link, pub_date, description, image, teaser, author,
	content, redirect, haystack, tags, clean_tags, error, trace,
	league_id, source_id, sport_id, team_id)
	VALUES
	($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17, $18, $19)
	RETURNING
	id
	`, n.Title, n.CleanTitle, n.Link, n.PubDate, utils.NullableString(n.Description),
		utils.NullableString(n.Image), utils.NullableString(n.Teaser), utils.NullableString(n.Author),
		pq.Array(n.Content), utils.NullableString(n.Redirect), utils.NullableString(n.Haystack),
		pq.Array(n.Tags), pq.Array(n.CleanTags), utils.NullableString(n.Error), utils.NullableString(n.Trace),
		utils.NullableInt(n.LeagueId), n.Source.Id, n.Sport.Id, utils.NullableInt(n.TeamId),
	).Scan(&n.Id)
}
// UpdateNews overwrites every column of the mainapp_news row whose id is n.Id.
func UpdateNews(n *news.News) error {
	_, err := pg.psqlConn.Exec(`
	UPDATE public.mainapp_news
	SET title = $1, clean_title = $2, pub_date = $3, link = $4, description = $5,
	image = $6, teaser = $7, author = $8, content = $9, redirect = $10,
	haystack = $11, tags = $12, clean_tags = $13, error = $14, trace = $15,
	league_id = $16, source_id = $17, sport_id = $18, team_id = $19
	WHERE id = $20
	`, n.Title, n.CleanTitle, n.PubDate, n.Link, utils.NullableString(n.Description),
		utils.NullableString(n.Image), utils.NullableString(n.Teaser), utils.NullableString(n.Author),
		pq.Array(n.Content), utils.NullableString(n.Redirect), utils.NullableString(n.Haystack),
		pq.Array(n.Tags), pq.Array(n.CleanTags), utils.NullableString(n.Error), utils.NullableString(n.Trace),
		utils.NullableInt(n.LeagueId), n.Source.Id, n.Sport.Id, utils.NullableInt(n.TeamId), n.Id,
	)
	return err
}

20
postgres/postgres_test.go Normal file
View File

@ -0,0 +1,20 @@
package postgres
import (
"testing"
)
// TestConnect is a placeholder: the connection is established (and fatally
// checked) in the package init, so reaching this test at all means the
// connection succeeded.
func TestConnect(t *testing.T) {}
// TestListSources checks that at least one source row can be read and
// decoded. Requires a reachable, populated database (see package init).
func TestListSources(t *testing.T) {
	defer Close()
	sources, err := ListSources()
	if err != nil {
		// FIX: Fatal instead of Error — sources is meaningless after an
		// error and the length check below would add a misleading failure.
		t.Fatalf("unexpected error : %s", err)
	}
	if len(sources) == 0 {
		t.Errorf("no sources got from ListSources function")
	}
}

40
readme.md Normal file
View File

@ -0,0 +1,40 @@
## 1- INTRODUCTION
Scraper is a project written in Go to scrape different types of sports data, such as:
- all news about sports
- schedules and scores [coming soon]
- team staff [coming soon]
- details about players [coming soon]
- tv schedule [coming soon]
For that, different technologies have been used :
- `Golang` (with `go routines`, `goquery`, `gofeed`, `net/proxy`)
- `PostgreSQL` (used to store data)
- `InfluxDB` (used to store details about each program execution) [coming soon]
Several websites are stored in order to gather different types of data :
- eurosport.fr
- rugbyrama.fr
- fftt.com
- footao.tv [coming soon]
- football.fr [coming soon]
- football365.fr [coming soon]
- football-direct.com [coming soon]
- footmercato.net
- lequipe.fr
- matchendirect.fr [coming soon]
- programme-television.org [coming soon]
- transfermarkt.fr [coming soon]
All of this data is collected for nonprofit purposes for `1bet.fr`, a website for free sports
predictions between friends.
I decline any responsibility for how you may choose to use this project.
## 2- DEPLOYMENT
The deployment is very simple as the binary `scraper` can be used directly.
A PostgreSQL database is needed for this program, as well as some environment variables,
all clearly listed in `postgres.go`.

62
requests/requests.go Normal file
View File

@ -0,0 +1,62 @@
package requests
import (
	"fmt"
	"log"
	"net/http"
	"net/url"
	"time"

	"github.com/PuerkitoBio/goquery"
	"golang.org/x/net/proxy"
)
const (
	// torAddr is the local Tor SOCKS5 proxy all requests are routed through.
	torAddr = "socks5://127.0.0.1:9050"
	// userAgent impersonates a desktop Firefox so scraped sites serve their
	// regular HTML markup.
	userAgent = "Mozilla/5.0 (X11; Linux x86_64…) Gecko/20100101 Firefox/68.0"
)

// cli is the shared HTTP client, wired to the Tor transport in init.
var cli *http.Client
// init builds the package-wide HTTP client that routes every request through
// the local Tor SOCKS5 proxy. The program aborts if the dialer cannot be
// constructed.
func init() {
	proxyUrl, err := url.Parse(torAddr)
	if err != nil {
		log.Fatalf("error while parsing torAddr %s : %s", torAddr, err)
	}
	dialer, err := proxy.FromURL(proxyUrl, proxy.Direct)
	if err != nil {
		log.Fatalf("error while creating dialer : %s", err)
	}
	transport := &http.Transport{
		Dial: dialer.Dial,
	}
	cli = &http.Client{
		Transport: transport,
		// FIX: the client previously had no timeout, so a single stalled
		// site could hang a scraper worker forever.
		Timeout: 30 * time.Second,
	}
}
// GetDocumentFromURL GETs url through the Tor-proxied client (with the spoofed
// User-Agent) and parses the response body into a goquery document.
func GetDocumentFromURL(url string) (*goquery.Document, error) {
	req, err := http.NewRequest("GET", url, nil)
	if err != nil {
		return nil, fmt.Errorf("error while building request: %s", err)
	}
	req.Header.Set("User-Agent", userAgent)
	resp, err := cli.Do(req)
	if err != nil {
		return nil, fmt.Errorf("error while sending request: %s", err)
	}
	defer func() {
		// FIX: a body-close failure is not worth killing the whole process
		// (this used log.Fatalf); just record it.
		if err := resp.Body.Close(); err != nil {
			log.Printf("error while closing body for %s : %s", url, err)
		}
	}()
	// Don't hand error pages (404, 503, ...) to the parser as if they were
	// real articles.
	if resp.StatusCode < 200 || resp.StatusCode > 299 {
		return nil, fmt.Errorf("unexpected status %d for %s", resp.StatusCode, url)
	}
	doc, err := goquery.NewDocumentFromReader(resp.Body)
	if err != nil {
		return nil, fmt.Errorf("error while parsing response: %s", err)
	}
	return doc, nil
}

17
requests/requests_test.go Normal file
View File

@ -0,0 +1,17 @@
package requests
import (
"testing"
)
// TestGetDocumentFromURL checks both that the request pipeline works and that
// traffic really exits through Tor: check.torproject.org renders its h1 with
// class "off" when the visitor is NOT using Tor.
func TestGetDocumentFromURL(t *testing.T) {
	url := "https://check.torproject.org/"
	doc, err := GetDocumentFromURL(url)
	if err != nil {
		// FIX: Fatal instead of Error — doc is nil after a failure and the
		// Find call below would panic.
		t.Fatalf("unexpected error: %s", err)
	}
	h1 := doc.Find("h1")
	if h1.HasClass("off") {
		t.Errorf("tor is not correctly configured")
	}
}

BIN
scraper Executable file

Binary file not shown.

64
utils/utils.go Normal file
View File

@ -0,0 +1,64 @@
package utils
import (
	"log"
	"strconv"
	"strings"
)
// Sanitize reduces s to lowercase ascii [a-z0-9-]: latin letters with
// diacritics are transliterated to their base letter and every other rune
// becomes '-'. Used to build clean names, tags and titles.
func Sanitize(s string) string {
	// Transliteration table for accented latin letters.
	symbols := map[rune]string{
		// FIX: 'à' was mistakenly mapped to "e"; it transliterates to "a".
		'á': "a", 'Á': "a", 'à': "a", 'À': "a", 'â': "a", 'Â': "a", 'ä': "a", 'Ä': "a", 'ã': "a",
		'ç': "c",
		'é': "e", 'É': "e", 'è': "e", 'È': "e", 'ê': "e", 'Ê': "e", 'ë': "e", 'Ë': "e",
		'í': "i", 'Í': "i", 'ì': "i", 'Ì': "i", 'î': "i", 'Î': "i", 'ï': "i", 'Ï': "i",
		'ñ': "n",
		'ó': "o", 'Ó': "o", 'ò': "o", 'Ò': "o", 'ô': "o", 'Ô': "o", 'ö': "o", 'Ö': "o", 'ø': "o",
		'ú': "u", 'Ú': "u", 'ù': "u", 'Ù': "u", 'û': "u", 'Û': "u", 'ü': "u", 'Ü': "u",
	}
	// strings.Builder avoids the quadratic cost of += in a loop.
	var b strings.Builder
	b.Grow(len(s))
	for _, c := range s {
		switch {
		case c >= 'a' && c <= 'z', c >= '0' && c <= '9':
			b.WriteRune(c)
		case c >= 'A' && c <= 'Z':
			// Lowercase plain ascii letters manually.
			b.WriteRune(c - 'A' + 'a')
		default:
			if v, ok := symbols[c]; ok {
				b.WriteString(v)
			} else {
				b.WriteString("-")
			}
		}
	}
	return b.String()
}
// NullableString maps the empty string to nil (stored as SQL NULL) and passes
// any other value through unchanged.
func NullableString(s string) interface{} {
	if s != "" {
		return s
	}
	return nil
}
// NullableInt maps 0 to nil (stored as SQL NULL) and passes any other value
// through unchanged.
func NullableInt(i int) interface{} {
	if i != 0 {
		return i
	}
	return nil
}
// ArrayContains reports whether val appears in arr.
func ArrayContains(arr []string, val string) bool {
	for i := range arr {
		if arr[i] == val {
			return true
		}
	}
	return false
}
// AtoI converts s to an int, aborting the program when s is not a valid
// integer (used for mandatory environment variables at startup).
func AtoI(s string) int {
	n, err := strconv.Atoi(s)
	if err != nil {
		log.Fatalf("error while converting '%s' to int : %s", s, err)
	}
	return n
}

46
utils/utils_test.go Normal file
View File

@ -0,0 +1,46 @@
package utils
import (
"testing"
)
// TestSanitize covers ascii passthrough, lowercasing, accent transliteration
// and replacement of all other characters with '-'.
func TestSanitize(t *testing.T) {
	const want = "abcdef0123abcdefece------iou"
	res := Sanitize("abcdef0123ABCDEFéçè_ ?.!=îôù")
	if res != want {
		t.Errorf("unexpected Sanitize() answer '%s' != 'abcdef0123abcdefece------iou'", res)
	}
}
// TestNullableString checks both the passthrough case and the empty-to-nil
// (SQL NULL) case.
func TestNullableString(t *testing.T) {
	if res := NullableString("test"); res != "test" {
		// FIX: message typo "unexepected" corrected.
		t.Errorf("unexpected NullableString() answer '%s' != 'test'", res)
	}
	if res := NullableString(""); res != nil {
		t.Errorf("unexpected NullableString() answer '%s' != nil", res)
	}
}
// TestNullableInt checks both the passthrough case and the zero-to-nil
// (SQL NULL) case.
func TestNullableInt(t *testing.T) {
	if res := NullableInt(3); res != 3 {
		// FIX: %s is the wrong verb for an int-valued interface (go vet);
		// %v prints it correctly. Typo "unexepected" also corrected.
		t.Errorf("unexpected NullableInt() answer %v != 3", res)
	}
	if res := NullableInt(0); res != nil {
		t.Errorf("unexpected NullableInt() answer %v != nil", res)
	}
}
// TestArrayContains checks one hit and one miss against the same slice.
func TestArrayContains(t *testing.T) {
	words := []string{"bird", "apple", "ocean", "fork", "anchor"}
	if !ArrayContains(words, "bird") {
		t.Errorf("unexpected ArrayContains() false answer for 'bird'")
	}
	if ArrayContains(words, "potato") {
		t.Errorf("unexpected ArrayContains() true answer for 'potato'")
	}
}
// TestAtoI checks the happy path only: the error path calls log.Fatalf and
// would kill the test process, so it is deliberately not exercised here.
func TestAtoI(t *testing.T) {
	const want = 3
	if res := AtoI("3"); res != want {
		t.Errorf("unexpected answer %d != 3", res)
	}
}