scraper/news/news.go

198 lines
5.6 KiB
Go
Raw Permalink Normal View History

2020-10-05 08:24:33 +00:00
package news
import (
"fmt"
"net/url"
2020-10-05 08:24:33 +00:00
"regexp"
"strings"
"time"
"github.com/PuerkitoBio/goquery"
"github.com/mmcdole/gofeed"
"1bet.fr/scraper/requests"
"1bet.fr/scraper/utils"
2020-10-05 08:24:33 +00:00
)
// Sport identifies a sport by a numeric id and a display name.
//
// CleanName is used by News.Feed as the fallback entry of News.CleanTags,
// next to values produced by utils.Sanitize; it is presumably the sanitized
// form of Name (confirm against wherever Sport values are built).
type Sport struct {
	Id              int
	Name, CleanName string
}
type Source struct {
Id int
Sport *Sport
2020-10-05 08:24:33 +00:00
Name string
CleanName string
FeedUrl string
2020-10-05 08:24:33 +00:00
Error *string
Trace *string
}
type News struct {
Id int
Source *Source
LeagueId *int
TeamId *int
2020-10-05 08:24:33 +00:00
Title string
CleanTitle string
Link string
PubDate *time.Time
Description *string
Image *string
2020-10-05 08:24:33 +00:00
Teaser *string
Author *string
Content *[]string
Redirect *string
2020-10-05 08:24:33 +00:00
Haystack *string
Tags *[]string
CleanTags *[]string
2020-10-05 08:24:33 +00:00
Error *string
Trace *string
2020-10-05 08:24:33 +00:00
}
func (n *News) Feed() error {
parsedLink, err := url.Parse(n.Link)
if err != nil {
return err
}
2020-10-05 08:24:33 +00:00
doc, err := requests.GetDocumentFromURL(n.Link)
if err != nil {
return err
}
switch parsedLink.Host {
case utils.HostEurosport:
n.Teaser = utils.StringPointer(doc.Find("h2").First().Text())
2020-10-05 08:24:33 +00:00
doc.Find(".article-body .article-s4-rs p").Each(func(i int, s *goquery.Selection) {
n.Content = utils.ArrayPointerAppend(n.Content, s.Text())
2020-10-05 08:24:33 +00:00
})
n.Author = utils.StringPointer(doc.Find(".flex a.caption-s5-fx div.font-bold").Text())
2020-10-05 08:24:33 +00:00
doc.Find(".related-topics .atom-tag").Each(func(i int, s *goquery.Selection) {
tag := strings.TrimSpace(s.Text())
cleanTag := utils.Sanitize(tag)
if !utils.ArrayPointerContains(n.CleanTags, cleanTag) {
n.Tags = utils.ArrayPointerAppend(n.Tags, tag)
n.CleanTags = utils.ArrayPointerAppend(n.CleanTags, cleanTag)
2020-10-05 08:24:33 +00:00
}
})
case utils.HostRugbyrama:
n.Teaser = utils.StringPointer(doc.Find("h2.storyfull__teaser").Text())
doc.Find(".storyfull__paragraphs p.storyfull__paragraph").Each(func(i int, s *goquery.Selection) {
n.Content = utils.ArrayPointerAppend(n.Content, s.Text())
})
n.Author = utils.StringPointer(strings.Replace(doc.Find(".storyfull__publisher-author-name").Text(), "Par ", "", 1))
doc.Find(".storyfull__linkentities-infos a.storyfull__linkentities-name").Each(func(i int, s *goquery.Selection) {
tag := strings.TrimSpace(s.Text())
cleanTag := utils.Sanitize(tag)
if !utils.ArrayPointerContains(n.CleanTags, cleanTag) {
n.Tags = utils.ArrayPointerAppend(n.Tags, tag)
n.CleanTags = utils.ArrayPointerAppend(n.CleanTags, cleanTag)
}
})
case utils.HostLequipe:
n.Teaser = utils.StringPointer(doc.Find("h2.Article__chapo").Text())
2020-10-05 08:24:33 +00:00
doc.Find(".Paragraph__content").Each(func(i int, s *goquery.Selection) {
n.Content = utils.ArrayPointerAppend(n.Content, s.Text())
2020-10-05 08:24:33 +00:00
})
n.Author = utils.StringPointer(doc.Find(".Author__name").Text())
2020-10-05 08:24:33 +00:00
doc.Find(".RelatedLinks a.RelatedLinks__link").Each(func(i int, s *goquery.Selection) {
tag := strings.TrimSpace(s.Text())
cleanTag := utils.Sanitize(tag)
if !utils.ArrayPointerContains(n.CleanTags, cleanTag) {
n.Tags = utils.ArrayPointerAppend(n.Tags, tag)
n.CleanTags = utils.ArrayPointerAppend(n.CleanTags, cleanTag)
2020-10-05 08:24:33 +00:00
}
})
case utils.HostFFTT:
n.Teaser = utils.StringPointer(doc.Find(".news-description p").First().Text())
2020-10-05 08:24:33 +00:00
doc.Find(".news-description p").Each(func(i int, s *goquery.Selection) {
if i > 0 {
n.Content = utils.ArrayPointerAppend(n.Content, s.Text())
2020-10-05 08:24:33 +00:00
}
})
doc.Find(".social-shares-large-wrapper a.link").Each(func(i int, s *goquery.Selection) {
tag := strings.TrimSpace(s.Text())
cleanTag := utils.Sanitize(tag)
if !utils.ArrayPointerContains(n.CleanTags, cleanTag) {
n.Tags = utils.ArrayPointerAppend(n.Tags, tag)
n.CleanTags = utils.ArrayPointerAppend(n.CleanTags, cleanTag)
2020-10-05 08:24:33 +00:00
}
})
case utils.HostFootmercato:
n.Teaser = utils.StringPointer(doc.Find("h2.article__lead").Text())
2020-10-05 08:24:33 +00:00
doc.Find(".article__content p").Each(func(i int, s *goquery.Selection) {
n.Content = utils.ArrayPointerAppend(n.Content, s.Text())
2020-10-05 08:24:33 +00:00
})
n.Author = utils.StringPointer(doc.Find(".article__author a").Text())
2020-10-05 08:24:33 +00:00
default:
2020-10-19 09:26:23 +00:00
n.Error = utils.StringPointer("unknown link host")
n.Trace = utils.StringPointer("unknown link host : " + n.Link)
2020-10-05 08:24:33 +00:00
}
if n.Content == nil {
n.Redirect = utils.StringPointer(n.Link)
2020-10-05 08:24:33 +00:00
}
if n.CleanTags == nil {
n.Tags = utils.ArrayPointerAppend(n.Tags, n.Source.Sport.Name)
n.CleanTags = utils.ArrayPointerAppend(n.CleanTags, n.Source.Sport.CleanName)
2020-10-05 08:24:33 +00:00
}
n.Haystack = utils.StringPointer(fmt.Sprintf("%s-%s", n.CleanTitle, utils.ArrayPointerJoin(n.CleanTags, "-")))
2020-10-05 08:24:33 +00:00
return nil
}
func (s *Source) ListNews() ([]*News, error) {
2020-10-05 08:24:33 +00:00
var newsList []*News
fp := gofeed.NewParser()
feed, err := fp.ParseURL(s.FeedUrl)
2020-10-05 08:24:33 +00:00
if err != nil {
return nil, err
}
for _, item := range feed.Items {
n := &News{
Source: s,
Title: item.Title,
Description: utils.StringPointer(regexp.MustCompile(`<[^>]*>`).ReplaceAllLiteralString(item.Description, "")),
CleanTitle: utils.Sanitize(item.Title),
2020-10-05 08:24:33 +00:00
PubDate: item.PublishedParsed,
Link: item.Link,
}
for _, tags := range item.Categories {
for _, tag := range strings.Split(tags, ",") {
n.Tags = utils.ArrayPointerAppend(n.Tags, tag)
n.CleanTags = utils.ArrayPointerAppend(n.CleanTags, utils.Sanitize(tag))
2020-10-05 08:24:33 +00:00
}
}
if item.Image != nil {
n.Image = utils.StringPointer(item.Image.URL)
2020-10-05 08:24:33 +00:00
} else if len(item.Enclosures) > 0 {
n.Image = utils.StringPointer(item.Enclosures[0].URL)
} else {
2020-10-05 08:24:33 +00:00
doc, err := goquery.NewDocumentFromReader(strings.NewReader(item.Description))
if err == nil {
if src, ok := doc.Find("img").Attr("src"); ok {
n.Image = utils.StringPointer(src)
2020-10-05 08:24:33 +00:00
}
}
}
if item.Author != nil {
n.Author = utils.StringPointer(item.Author.Name)
2020-10-05 08:24:33 +00:00
}
newsList = append(newsList, n)
}
return newsList, nil
}