Initial Commit
commit d87bedc8d6
@@ -0,0 +1,2 @@
.idea
@@ -0,0 +1,87 @@
package main

import (
    "github.com/lib/pq"
    "log"
    "sync"

    "./news"
    "./postgres"
)

const (
    nbProcesses      = 50
    maxNewsPerSource = 50
)

// updateNews lists every configured source, inserts the new articles found in
// their RSS feeds, and enriches each article through a pool of worker goroutines.
func updateNews() {
    defer postgres.Close()

    sports, err := postgres.ListSports()
    if err != nil {
        log.Fatal(err)
    }
    mapSports := map[int]*news.Sport{}
    for _, sport := range sports {
        mapSports[sport.Id] = sport
    }

    sources, err := postgres.ListSources()
    if err != nil {
        log.Fatal(err)
    }

    waitGroup := sync.WaitGroup{}
    newsChannel := make(chan *news.News)
    for i := 0; i < nbProcesses; i++ {
        waitGroup.Add(1)
        go func(nc chan *news.News, wg *sync.WaitGroup) {
            defer wg.Done()
            for n := range nc {
                if err := n.Feed(); err != nil {
                    log.Fatal(err)
                }
                if err := postgres.UpdateNews(n); err != nil {
                    log.Fatal(err)
                }
            }
        }(newsChannel, &waitGroup)
    }

    for _, source := range sources {
        for sportId, url := range source.Urls {
            log.Printf("[+] Starting parse of source : %s", url)
            newsList, err := source.ListNews(mapSports[sportId], url)
            if err != nil {
                log.Fatal(err)
            }

            for i, n := range newsList {
                if i >= maxNewsPerSource {
                    log.Printf("Stopping parse of source with %d news added", i)
                    break
                }
                if err := postgres.InsertNews(n); err != nil {
                    if err, ok := err.(*pq.Error); ok {
                        if err.Code.Name() == "unique_violation" {
                            log.Printf("Stopping parse of source (unique violation) with %d news added", i)
                            break
                        } else {
                            log.Fatalf("unexpected postgres error : %s", err)
                        }
                    } else {
                        log.Fatal(err)
                    }
                }
                log.Printf("Adding news %s", n.Link)
                newsChannel <- n
            }
        }
    }
    close(newsChannel)
    waitGroup.Wait()
}

func main() {
    updateNews()
}
@@ -0,0 +1,173 @@
package news

import (
    "fmt"
    "regexp"
    "strings"
    "time"

    "github.com/PuerkitoBio/goquery"

    "github.com/mmcdole/gofeed"

    "../requests"
    "../utils"
)

type Sport struct {
    Id        int
    Name      string
    CleanName string
}

type Source struct {
    Id        int
    Name      string
    CleanName string
    Urls      map[int]string

    Error *string
    Trace *string
}

type News struct {
    Id       int
    Source   *Source
    Sport    *Sport
    LeagueId int
    TeamId   int

    Title       string
    CleanTitle  string
    PubDate     *time.Time
    Description string
    Link        string
    Image       string

    Teaser   string
    Author   string
    Content  []string
    Redirect string

    Haystack  string
    Tags      []string
    CleanTags []string

    Error string
    Trace string
}

// Feed downloads the article pointed to by n.Link and fills in the
// source-specific fields (teaser, content, author, tags).
func (n *News) Feed() error {
    doc, err := requests.GetDocumentFromURL(n.Link)
    if err != nil {
        return err
    }

    switch n.Source.Name {
    case "Eurosport":
        n.Teaser = strings.TrimSpace(doc.Find("h2").Text())
        doc.Find(".article-body .article-s4-rs p").Each(func(i int, s *goquery.Selection) {
            n.Content = append(n.Content, s.Text())
        })
        n.Author = strings.TrimSpace(doc.Find(".flex a.caption-s5-fx div.font-bold").Text())
        doc.Find(".related-topics .atom-tag").Each(func(i int, s *goquery.Selection) {
            tag := strings.TrimSpace(s.Text())
            cleanTag := utils.Sanitize(tag)
            if !utils.ArrayContains(n.CleanTags, cleanTag) {
                n.Tags = append(n.Tags, tag)
                n.CleanTags = append(n.CleanTags, cleanTag)
            }
        })
    case "L'équipe":
        n.Teaser = strings.TrimSpace(doc.Find("h2.Article__chapo").Text())
        doc.Find(".Paragraph__content").Each(func(i int, s *goquery.Selection) {
            n.Content = append(n.Content, s.Text())
        })
        n.Author = strings.TrimSpace(doc.Find(".Author__name").Text())
        doc.Find(".RelatedLinks a.RelatedLinks__link").Each(func(i int, s *goquery.Selection) {
            tag := strings.TrimSpace(s.Text())
            cleanTag := utils.Sanitize(tag)
            if !utils.ArrayContains(n.CleanTags, cleanTag) {
                n.Tags = append(n.Tags, tag)
                n.CleanTags = append(n.CleanTags, cleanTag)
            }
        })
    case "FFTT":
        n.Teaser = strings.TrimSpace(doc.Find(".news-description p").First().Text())
        doc.Find(".news-description p").Each(func(i int, s *goquery.Selection) {
            if i > 0 {
                n.Content = append(n.Content, s.Text())
            }
        })
        doc.Find(".social-shares-large-wrapper a.link").Each(func(i int, s *goquery.Selection) {
            tag := strings.TrimSpace(s.Text())
            cleanTag := utils.Sanitize(tag)
            if !utils.ArrayContains(n.CleanTags, cleanTag) {
                n.Tags = append(n.Tags, tag)
                n.CleanTags = append(n.CleanTags, cleanTag)
            }
        })
    case "Foot Mercato":
        n.Teaser = strings.TrimSpace(doc.Find("h2.article__lead").Text())
        doc.Find(".article__content p").Each(func(i int, s *goquery.Selection) {
            n.Content = append(n.Content, s.Text())
        })
        n.Author = strings.TrimSpace(doc.Find(".article__author a").Text())
    default:
        return fmt.Errorf("unknown source %s", n.Source.Name)
    }
    if len(n.Content) == 0 {
        n.Redirect = n.Link
    }
    if len(n.CleanTags) == 0 {
        n.Tags = append(n.Tags, n.Sport.Name)
        n.CleanTags = append(n.CleanTags, n.Sport.CleanName)
    }
    n.Haystack = fmt.Sprintf("%s-%s", n.CleanTitle, strings.Join(n.CleanTags, "-"))
    return nil
}

// ListNews parses the RSS feed at url and returns one News entry per feed item.
func (s *Source) ListNews(sport *Sport, url string) ([]*News, error) {
    var newsList []*News

    fp := gofeed.NewParser()
    feed, err := fp.ParseURL(url)
    if err != nil {
        return nil, err
    }

    for _, item := range feed.Items {
        n := &News{
            Source:      s,
            Sport:       sport,
            Title:       item.Title,
            Description: regexp.MustCompile(`<[^>]*>`).ReplaceAllLiteralString(item.Description, ""),
            CleanTitle:  utils.Sanitize(item.Title),
            PubDate:     item.PublishedParsed,
            Link:        item.Link,
        }
        for _, tags := range item.Categories {
            for _, tag := range strings.Split(tags, ",") {
                n.Tags = append(n.Tags, strings.TrimSpace(tag))
                n.CleanTags = append(n.CleanTags, utils.Sanitize(strings.TrimSpace(tag)))
            }
        }
        if item.Image != nil {
            n.Image = item.Image.URL
        } else if len(item.Enclosures) > 0 {
            n.Image = item.Enclosures[0].URL
        } else if s.Name == "Eurosport" {
            doc, err := goquery.NewDocumentFromReader(strings.NewReader(item.Description))
            if err == nil {
                if src, ok := doc.Find("img").Attr("src"); ok {
                    n.Image = src
                }
            }
        }
        if item.Author != nil {
            n.Author = item.Author.Name
        }
        newsList = append(newsList, n)
    }
    return newsList, nil
}
@@ -0,0 +1,130 @@
package news

import (
    "strings"
    "testing"

    "../utils"
)

type expectedResult struct {
    news      *News
    teaser    string
    paragraph string
    author    string
    urlTags   []string
    haystack  string

    source    *Source
    sourceUrl string
}

func TestNews_Feed(t *testing.T) {
    expList := [4]*expectedResult{
        {
            news: &News{
                Source: &Source{Id: 1, Name: "Eurosport"},
                Sport:  &Sport{Id: 1, Name: "Football", CleanName: "football"},
                Link:   "https://www.eurosport.fr/football/bundesliga/2020-2021/dortmund-au-tapis-thuram-debloque-son-compteur_sto7905745/story.shtml",
            },
            teaser:    "BUNDESLIGA – Le Borussia Dortmund et ses jeunes stars ont chuté",
            paragraph: "Etonnante Bundesliga. Dortmund battu, Leipzig tenu en échec samedi,",
            author:    "Eurosport",
            urlTags:   []string{"football", "bundesliga"},
        },
        {
            news: &News{
                Source: &Source{Id: 2, Name: "L'équipe"},
                Sport:  &Sport{Id: 1, Name: "Football", CleanName: "football"},
                Link:   "https://www.lequipe.fr/Football/Actualites/Mitchel-bakker-psg-je-vais-devoir-elever-mon-niveau-de-jeu/1176182",
            },
            teaser:    "Mitchel Bakker, le latéral néerlandais du PSG",
            paragraph: "« Les absences de Juan Bernat et Layvin Kurzawa",
            author:    "H. De.",
            urlTags:   []string{"ligue-1", "paris-sg--fra-", "reims--fra-"},
        },
        {
            news: &News{
                Source: &Source{Id: 3, Name: "FFTT"},
                Sport:  &Sport{Id: 6, Name: "Tennis de Table", CleanName: "tennis-de-table"},
                Link:   "http://www.fftt.com/site/actualites/2020-09-22/laura-gasnier-page-qui-se-tourne-avec-bleues",
            },
            teaser:    "Après 15 années en équipe de France, Laura Gasnier a décidé",
            paragraph: "Elle évoque un choix personnel qui a demandé plusieurs mois de réflexion",
            author:    "",
            urlTags:   []string{"equipe-de-france", "gasnier-laura"},
        },
        {
            news: &News{
                Source: &Source{Id: 4, Name: "Foot Mercato"},
                Sport:  &Sport{Id: 1, Name: "Football", CleanName: "football"},
                Link:   "https://www.footmercato.net/a3190892483125730002-real-madrid-personne-ne-veut-de-luka-jovic",
            },
            teaser:    "Alors que la date de fin du mercato approche considérablement,",
            paragraph: "Tic-tac, tic-tac... Le chrono défile, et le Real Madrid",
            author:    "Max Franco Sanchez",
            urlTags:   []string{"football"},
        },
    }

    for _, exp := range expList {
        t.Logf("testing feed from %s", exp.news.Source.Name)
        if err := exp.news.Feed(); err != nil {
            t.Errorf("unexpected error : %s", err)
        }
        if !strings.HasPrefix(exp.news.Teaser, exp.teaser) {
            t.Errorf("unexpected teaser : %s", exp.news.Teaser)
        }
        if !strings.HasPrefix(exp.news.Content[0], exp.paragraph) {
            t.Errorf("unexpected content : %s", exp.news.Content[0])
        }
        if exp.news.Author != exp.author {
            t.Errorf("unexpected author : %s", exp.news.Author)
        }
        for _, urlTag := range exp.urlTags {
            if !utils.ArrayContains(exp.news.CleanTags, urlTag) {
                t.Errorf("clean tags do not contain %s", urlTag)
            }
        }
    }
}

func TestSource_ListNews(t *testing.T) {
    expList := []*expectedResult{
        {
            source:    &Source{Id: 1, Name: "Eurosport"},
            sourceUrl: "http://www.eurosport.fr/football/rss.xml",
        },
        {
            source:    &Source{Id: 1, Name: "L'équipe"},
            sourceUrl: "https://www.lequipe.fr/rss/actu_rss_Football.xml",
        },
        {
            source:    &Source{Id: 1, Name: "FFTT"},
            sourceUrl: "http://www.fftt.com/site/medias/flux/rss_competition.xml",
        },
        {
            source:    &Source{Id: 1, Name: "Foot Mercato"},
            sourceUrl: "http://www.footmercato.net/flux-rss",
        },
    }

    for _, exp := range expList {
        t.Logf("testing newsList from %s", exp.source.Name)
        newsList, err := exp.source.ListNews(&Sport{Id: 1}, exp.sourceUrl)
        if err != nil {
            t.Errorf("unexpected error : %s", err)
        }
        if len(newsList) == 0 {
            t.Errorf("no news parsed from rss")
        }
        for _, n := range newsList {
            if n.Title == "" {
                t.Errorf("unexpected empty title")
            }
            if n.Image == "" {
                t.Errorf("unexpected empty image")
            }
        }
    }
}
@@ -0,0 +1,148 @@
package postgres

import (
    "database/sql"
    "encoding/json"
    "fmt"
    "log"
    "os"

    "github.com/lib/pq"

    "../news"
    "../utils"
)

type Postgres struct {
    host     string
    port     int
    username string
    password string
    database string

    psqlInfo    string
    psqlConn    *sql.DB
    isConnected bool
}

var pg *Postgres

// init opens the shared connection from the POSTGRES_* environment variables
// and aborts the program if the server cannot be reached.
func init() {
    var err error

    pg = &Postgres{
        host:     os.Getenv("POSTGRES_HOST"),
        port:     utils.AtoI(os.Getenv("POSTGRES_PORT")),
        username: os.Getenv("POSTGRES_USERNAME"),
        password: os.Getenv("POSTGRES_PASSWORD"),
        database: os.Getenv("POSTGRES_DATABASE"),
    }

    pg.psqlInfo = fmt.Sprintf(
        "host=%s port=%d user=%s password=%s dbname=%s sslmode=disable",
        pg.host, pg.port, pg.username, pg.password, pg.database,
    )

    pg.psqlConn, err = sql.Open("postgres", pg.psqlInfo)
    if err != nil {
        log.Fatalf("error while opening pg connection : %s", err)
    }

    if err = pg.psqlConn.Ping(); err != nil {
        log.Fatalf("error while pinging pg server : %s", err)
    }
    pg.isConnected = true
}

// Close closes the shared connection opened in init.
func Close() {
    if !pg.isConnected {
        return
    }
    if err := pg.psqlConn.Close(); err != nil {
        log.Fatalf("error while closing pg connection : %s", err)
    }
    pg.isConnected = false
}

// ListSports returns every sport stored in public.mainapp_sport.
func ListSports() ([]*news.Sport, error) {
    var sports []*news.Sport

    rows, err := pg.psqlConn.Query("SELECT id, name, clean_name FROM public.mainapp_sport")
    if err != nil {
        return nil, fmt.Errorf("error while querying postgres : %s", err)
    }

    for rows.Next() {
        sport := &news.Sport{}
        err = rows.Scan(&sport.Id, &sport.Name, &sport.CleanName)
        if err != nil {
            return nil, fmt.Errorf("error while scanning row from postgres : %s", err)
        }
        sports = append(sports, sport)
    }
    return sports, nil
}

// ListSources returns every source stored in public.mainapp_source,
// with its per-sport feed URLs decoded from the JSON column.
func ListSources() ([]*news.Source, error) {
    var sources []*news.Source

    rows, err := pg.psqlConn.Query("SELECT id, name, clean_name, urls FROM public.mainapp_source")
    if err != nil {
        return nil, fmt.Errorf("error while querying postgres : %s", err)
    }

    for rows.Next() {
        source := &news.Source{}
        sourceUrls := ""
        err = rows.Scan(&source.Id, &source.Name, &source.CleanName, &sourceUrls)
        if err != nil {
            return nil, fmt.Errorf("error while scanning row from postgres : %s", err)
        }
        if err = json.Unmarshal([]byte(sourceUrls), &source.Urls); err != nil {
            return nil, fmt.Errorf("error while unmarshalling source urls : %s", err)
        }
        sources = append(sources, source)
    }
    return sources, nil
}

// InsertNews inserts n into public.mainapp_news and stores the generated id in n.Id.
func InsertNews(n *news.News) error {
    err := pg.psqlConn.QueryRow(`
        INSERT INTO public.mainapp_news
            (title, clean_title, link, pub_date, description, image, teaser, author,
            content, redirect, haystack, tags, clean_tags, error, trace,
            league_id, source_id, sport_id, team_id)
        VALUES
            ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17, $18, $19)
        RETURNING
            id
    `, n.Title, n.CleanTitle, n.Link, n.PubDate, utils.NullableString(n.Description),
        utils.NullableString(n.Image), utils.NullableString(n.Teaser), utils.NullableString(n.Author),
        pq.Array(n.Content), utils.NullableString(n.Redirect), utils.NullableString(n.Haystack),
        pq.Array(n.Tags), pq.Array(n.CleanTags), utils.NullableString(n.Error), utils.NullableString(n.Trace),
        utils.NullableInt(n.LeagueId), n.Source.Id, n.Sport.Id, utils.NullableInt(n.TeamId),
    ).Scan(&n.Id)
    if err != nil {
        return err
    }
    return nil
}

// UpdateNews overwrites the row matching n.Id with the current content of n.
func UpdateNews(n *news.News) error {
    if _, err := pg.psqlConn.Exec(`
        UPDATE public.mainapp_news
        SET title = $1, clean_title = $2, pub_date = $3, link = $4, description = $5,
            image = $6, teaser = $7, author = $8, content = $9, redirect = $10,
            haystack = $11, tags = $12, clean_tags = $13, error = $14, trace = $15,
            league_id = $16, source_id = $17, sport_id = $18, team_id = $19
        WHERE id = $20
    `, n.Title, n.CleanTitle, n.PubDate, n.Link, utils.NullableString(n.Description),
        utils.NullableString(n.Image), utils.NullableString(n.Teaser), utils.NullableString(n.Author),
        pq.Array(n.Content), utils.NullableString(n.Redirect), utils.NullableString(n.Haystack),
        pq.Array(n.Tags), pq.Array(n.CleanTags), utils.NullableString(n.Error), utils.NullableString(n.Trace),
        utils.NullableInt(n.LeagueId), n.Source.Id, n.Sport.Id, utils.NullableInt(n.TeamId), n.Id,
    ); err != nil {
        return err
    }
    return nil
}
@@ -0,0 +1,20 @@
package postgres

import (
    "testing"
)

// TestConnect only has to be reached: the connection is opened and
// fatally checked in the package init function.
func TestConnect(t *testing.T) {
    return
}

func TestListSources(t *testing.T) {
    defer Close()
    sources, err := ListSources()
    if err != nil {
        t.Errorf("unexpected error : %s", err)
    }
    if len(sources) == 0 {
        t.Errorf("no sources returned by ListSources")
    }
}
@@ -0,0 +1,40 @@
## 1- INTRODUCTION

Scraper is a project written in Golang that scrapes different types of sports data, such as:
- sports news
- schedules and scores [coming soon]
- team staff [coming soon]
- player details [coming soon]
- TV schedules [coming soon]

Several technologies are used for this:
- `Golang` (with `go routines`, `goquery`, `gofeed`, `net/proxy`)
- `PostgreSQL` (used to store the data)
- `InfluxDB` (used to store details about each program execution) [coming soon]

Several websites are scraped to gather these different types of data:
- eurosport.fr
- rugbyrama.fr
- fftt.com
- footao.tv [coming soon]
- football.fr [coming soon]
- football365.fr [coming soon]
- football-direct.com [coming soon]
- footmercato.net
- lequipe.fr
- matchendirect.fr [coming soon]
- programme-television.org [coming soon]
- transfermarkt.fr [coming soon]

All of this data is collected for nonprofit purposes for `1bet.fr`, a website for free sports
predictions between friends.

I decline any responsibility for how you may use this project.


## 2- DEPLOYMENT

Deployment is straightforward: the `scraper` binary can be run directly.

The program needs a PostgreSQL database as well as a few environment variables,
all clearly listed in `postgres.go`.
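As a reference, a minimal environment might look like the sketch below; the variable names come from `postgres.go`, while the values are placeholders you should adapt to your own PostgreSQL instance:

```sh
# Placeholder values – adjust to your own setup before running ./scraper
export POSTGRES_HOST=localhost
export POSTGRES_PORT=5432
export POSTGRES_USERNAME=scraper
export POSTGRES_PASSWORD=changeme
export POSTGRES_DATABASE=onebet
```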
@@ -0,0 +1,62 @@
package requests

import (
    "fmt"
    "log"
    "net/http"
    "net/url"

    "github.com/PuerkitoBio/goquery"
    "golang.org/x/net/proxy"
)

const (
    torAddr   = "socks5://127.0.0.1:9050"
    userAgent = "Mozilla/5.0 (X11; Linux x86_64…) Gecko/20100101 Firefox/68.0"
)

var cli *http.Client

func init() {
    proxyUrl, err := url.Parse(torAddr)
    if err != nil {
        log.Fatalf("error while parsing torAddr %s : %s", torAddr, err)
    }

    dialer, err := proxy.FromURL(proxyUrl, proxy.Direct)
    if err != nil {
        log.Fatalf("error while creating dialer : %s", err)
    }

    transport := &http.Transport{
        Dial: dialer.Dial,
    }
    cli = &http.Client{
        Transport: transport,
    }
}

func GetDocumentFromURL(url string) (*goquery.Document, error) {
    req, err := http.NewRequest("GET", url, nil)
    if err != nil {
        return nil, fmt.Errorf("error while building request: %s", err)
    }
    req.Header.Set("User-Agent", userAgent)

    resp, err := cli.Do(req)
    if err != nil {
        return nil, fmt.Errorf("error while sending request: %s", err)
    }

    defer func() {
        if err := resp.Body.Close(); err != nil {
            log.Fatalf("error while closing body for %s : %s", url, err)
        }
    }()

    doc, err := goquery.NewDocumentFromReader(resp.Body)
    if err != nil {
        return nil, fmt.Errorf("error while parsing response: %s", err)
    }
    return doc, nil
}
@@ -0,0 +1,17 @@
package requests

import (
    "testing"
)

func TestGetDocumentFromURL(t *testing.T) {
    url := "https://check.torproject.org/"
    doc, err := GetDocumentFromURL(url)
    if err != nil {
        t.Fatalf("unexpected error: %s", err)
    }
    h1 := doc.Find("h1")
    if h1.HasClass("off") {
        t.Errorf("tor is not correctly configured")
    }
}
@@ -0,0 +1,64 @@
package utils

import (
    "log"
    "strconv"
)

// Sanitize lowercases s, transliterates common accented characters and
// replaces every other character with '-'.
func Sanitize(s string) (t string) {
    symbols := map[rune]string{
        'á': "a", 'Á': "a", 'à': "a", 'À': "a", 'â': "a", 'Â': "a", 'ä': "a", 'Ä': "a", 'ã': "a",
        'ç': "c",
        'é': "e", 'É': "e", 'è': "e", 'È': "e", 'ê': "e", 'Ê': "e", 'ë': "e", 'Ë': "e",
        'í': "i", 'Í': "i", 'ì': "i", 'Ì': "i", 'î': "i", 'Î': "i", 'ï': "i", 'Ï': "i",
        'ñ': "n",
        'ó': "o", 'Ó': "o", 'ò': "o", 'Ò': "o", 'ô': "o", 'Ô': "o", 'ö': "o", 'Ö': "o", 'ø': "o",
        'ú': "u", 'Ú': "u", 'ù': "u", 'Ù': "u", 'û': "u", 'Û': "u", 'ü': "u", 'Ü': "u",
    }

    for _, c := range s {
        if c >= 'a' && c <= 'z' {
            t += string(c)
        } else if c >= '0' && c <= '9' {
            t += string(c)
        } else if c >= 'A' && c <= 'Z' {
            t += string(c - 'A' + 'a')
        } else if v, ok := symbols[c]; ok {
            t += v
        } else {
            t += "-"
        }
    }
    return t
}

// NullableString returns nil for an empty string so it can be stored as a SQL NULL.
func NullableString(s string) interface{} {
    if len(s) == 0 {
        return nil
    }
    return s
}

// NullableInt returns nil for 0 so it can be stored as a SQL NULL.
func NullableInt(i int) interface{} {
    if i == 0 {
        return nil
    }
    return i
}

// ArrayContains reports whether val is present in arr.
func ArrayContains(arr []string, val string) bool {
    for _, elt := range arr {
        if elt == val {
            return true
        }
    }
    return false
}

// AtoI converts s to an int and aborts the program if the conversion fails.
func AtoI(s string) int {
    res, err := strconv.Atoi(s)
    if err != nil {
        log.Fatalf("error while converting '%s' to int : %s", s, err)
    }
    return res
}
@@ -0,0 +1,46 @@
package utils

import (
    "testing"
)

func TestSanitize(t *testing.T) {
    if res := Sanitize("abcdef0123ABCDEFéçè_ ?.!=îôù"); res != "abcdef0123abcdefece------iou" {
        t.Errorf("unexpected Sanitize() answer '%s' != 'abcdef0123abcdefece------iou'", res)
    }
}

func TestNullableString(t *testing.T) {
    if res := NullableString("test"); res != "test" {
        t.Errorf("unexpected NullableString() answer '%s' != 'test'", res)
    }

    if res := NullableString(""); res != nil {
        t.Errorf("unexpected NullableString() answer '%s' != nil", res)
    }
}

func TestNullableInt(t *testing.T) {
    if res := NullableInt(3); res != 3 {
        t.Errorf("unexpected NullableInt() answer %v != 3", res)
    }

    if res := NullableInt(0); res != nil {
        t.Errorf("unexpected NullableInt() answer %v != nil", res)
    }
}

func TestArrayContains(t *testing.T) {
    if !ArrayContains([]string{"bird", "apple", "ocean", "fork", "anchor"}, "bird") {
        t.Errorf("unexpected ArrayContains() false answer for 'bird'")
    }
    if ArrayContains([]string{"bird", "apple", "ocean", "fork", "anchor"}, "potato") {
        t.Errorf("unexpected ArrayContains() true answer for 'potato'")
    }
}

func TestAtoI(t *testing.T) {
    if res := AtoI("3"); res != 3 {
        t.Errorf("unexpected answer %d != 3", res)
    }
}