scraper/news/news.go

174 lines
4.3 KiB
Go
Raw Normal View History

2020-10-05 08:24:33 +00:00
package news
import (
"fmt"
"regexp"
"strings"
"time"
"github.com/PuerkitoBio/goquery"
"github.com/mmcdole/gofeed"
"../requests"
"../utils"
)
// Sport describes a sport that news items are listed under.
type Sport struct {
	// Id is the sport's numeric identifier.
	Id int

	// Name is the sport's display name and CleanName its cleaned-up
	// counterpart. News.Feed uses them as the fallback entries of News.Tags
	// and News.CleanTags respectively when an article has no tag at all.
	Name, CleanName string
}
// Source describes a news provider whose feeds are scraped.
type Source struct {
	// Id is the source's numeric identifier.
	Id int

	// Name is the provider's display name; News.Feed selects its page
	// extraction rules by this value. CleanName is the cleaned-up counterpart.
	Name, CleanName string

	// Urls holds the feed URLs of this source.
	// NOTE(review): the meaning of the int key is not visible in this file
	// (presumably a sport id) - confirm against the caller.
	Urls map[int]string

	// Error and Trace are optional diagnostics; nil means unset.
	// NOTE(review): nothing in this file writes them - confirm who does.
	Error, Trace *string
}
// News is one article found in a source's feed. Source.ListNews fills in the
// fields available from the feed item; Feed then adds what can only be read
// from the article page itself.
type News struct {
	// Id is the article's numeric identifier; it is not assigned in this file.
	Id int

	// Source is the provider the article came from and Sport the sport it was
	// listed under.
	Source *Source
	Sport  *Sport

	// NOTE(review): LeagueId and TeamId are never written in this file -
	// confirm where they are resolved.
	LeagueId, TeamId int

	// Title is the feed item's title and CleanTitle its utils.Sanitize form.
	Title, CleanTitle string

	// PubDate is the parsed publication date of the feed item; it may be nil.
	PubDate *time.Time

	// Description is the feed item's description with HTML tags removed, Link
	// the article URL and Image an illustration URL. Teaser is read from the
	// article page by Feed. Author comes from the feed item or the page.
	Description, Link, Image, Teaser, Author string

	// Content holds the text of the article's paragraphs, collected by Feed.
	Content []string

	// Redirect is set to Link by Feed when no content could be extracted.
	// Haystack is CleanTitle followed by every clean tag, all joined by "-".
	Redirect, Haystack string

	// Tags are the article's tags as displayed; CleanTags holds their
	// utils.Sanitize form.
	Tags, CleanTags []string

	// NOTE(review): Error and Trace are never written in this file -
	// presumably filled in by the caller when processing fails.
	Error, Trace string
}
// Feed downloads the article page at n.Link and completes n with what the
// page provides: Teaser, Content, Author and page-level tags. Which CSS
// selectors are used depends on n.Source.Name.
//
// Once the page has been processed:
//   - Redirect is set to Link when no content paragraph was found;
//   - the sport's Name/CleanName become the only tag when no tag is known
//     (skipped when n.Sport is nil);
//   - Haystack is rebuilt as CleanTitle and all CleanTags joined by "-".
//
// An Author already present (for instance from the feed item) is kept when the
// page does not yield a non-empty author.
//
// It returns an error when n.Source is nil, when the page cannot be fetched
// (the underlying error is wrapped), or when the source has no extraction
// rules.
func (n *News) Feed() error {
	if n.Source == nil {
		return fmt.Errorf("news %q has no source", n.Link)
	}

	doc, err := requests.GetDocumentFromURL(n.Link)
	if err != nil {
		return fmt.Errorf("fetching article %q: %w", n.Link, err)
	}

	switch n.Source.Name {
	case "Eurosport":
		n.Teaser = strings.TrimSpace(doc.Find("h2").Text())
		n.addParagraphs(doc.Find(".article-body .article-s4-rs p"), 0)
		n.setAuthor(doc.Find(".flex a.caption-s5-fx div.font-bold"))
		n.addPageTags(doc.Find(".related-topics .atom-tag"))
	case "L'équipe":
		n.Teaser = strings.TrimSpace(doc.Find("h2.Article__chapo").Text())
		n.addParagraphs(doc.Find(".Paragraph__content"), 0)
		n.setAuthor(doc.Find(".Author__name"))
		n.addPageTags(doc.Find(".RelatedLinks a.RelatedLinks__link"))
	case "FFTT":
		n.Teaser = strings.TrimSpace(doc.Find(".news-description p").First().Text())
		// The first paragraph is used as the teaser, so the content starts
		// at the second one.
		n.addParagraphs(doc.Find(".news-description p"), 1)
		n.addPageTags(doc.Find(".social-shares-large-wrapper a.link"))
	case "Foot Mercato":
		n.Teaser = strings.TrimSpace(doc.Find("h2.article__lead").Text())
		n.addParagraphs(doc.Find(".article__content p"), 0)
		n.setAuthor(doc.Find(".article__author a"))
	default:
		return fmt.Errorf("unknown source %s", n.Source.Name)
	}

	// Without any extracted content the article can only be read at its
	// original location.
	if len(n.Content) == 0 {
		n.Redirect = n.Link
	}

	// Fall back to the sport itself so that every article has at least one tag.
	if len(n.CleanTags) == 0 && n.Sport != nil {
		n.Tags = append(n.Tags, n.Sport.Name)
		n.CleanTags = append(n.CleanTags, n.Sport.CleanName)
	}

	n.Haystack = fmt.Sprintf("%s-%s", n.CleanTitle, strings.Join(n.CleanTags, "-"))
	return nil
}

// addParagraphs appends the text of each element of paragraphs to n.Content,
// ignoring the first skip elements.
func (n *News) addParagraphs(paragraphs *goquery.Selection, skip int) {
	paragraphs.Each(func(i int, p *goquery.Selection) {
		if i >= skip {
			n.Content = append(n.Content, p.Text())
		}
	})
}

// setAuthor stores the trimmed text of sel in n.Author, leaving the current
// value untouched when that text is empty.
func (n *News) setAuthor(sel *goquery.Selection) {
	if author := strings.TrimSpace(sel.Text()); author != "" {
		n.Author = author
	}
}

// addPageTags appends the trimmed text of each element to n.Tags and its
// utils.Sanitize form to n.CleanTags. Elements without text are skipped, as
// are tags whose sanitized form is already present in n.CleanTags.
func (n *News) addPageTags(elements *goquery.Selection) {
	elements.Each(func(_ int, el *goquery.Selection) {
		tag := strings.TrimSpace(el.Text())
		if tag == "" {
			return
		}
		cleanTag := utils.Sanitize(tag)
		if utils.ArrayContains(n.CleanTags, cleanTag) {
			return
		}
		n.Tags = append(n.Tags, tag)
		n.CleanTags = append(n.CleanTags, cleanTag)
	})
}
// htmlTagPattern matches a single HTML tag ("<" up to the next ">"). It is
// compiled once here rather than once per feed item.
var htmlTagPattern = regexp.MustCompile(`<[^>]*>`)

// ListNews downloads and parses the feed at url and returns one News per feed
// item, each attached to the receiver and to sport.
//
// For every item it copies the title, link and parsed publication date,
// strips HTML tags from the description, and turns the item's categories
// (each possibly a comma-separated list) into Tags/CleanTags, skipping empty
// and duplicate entries. The image is taken from the item's image, else from
// its first enclosure, else - for the "Eurosport" source only - from the
// first <img> found in the item's HTML description.
//
// The result is nil when the feed has no items. A feed that cannot be fetched
// or parsed yields a nil slice and an error wrapping the parser's error.
func (s *Source) ListNews(sport *Sport, url string) ([]*News, error) {
	feed, err := gofeed.NewParser().ParseURL(url)
	if err != nil {
		return nil, fmt.Errorf("parsing feed %q: %w", url, err)
	}

	var newsList []*News
	for _, item := range feed.Items {
		n := &News{
			Source:      s,
			Sport:       sport,
			Title:       item.Title,
			Description: htmlTagPattern.ReplaceAllLiteralString(item.Description, ""),
			CleanTitle:  utils.Sanitize(item.Title),
			PubDate:     item.PublishedParsed,
			Link:        item.Link,
		}

		for _, category := range item.Categories {
			for _, raw := range strings.Split(category, ",") {
				tag := strings.TrimSpace(raw)
				// An empty category or a stray comma would otherwise
				// produce an empty tag.
				if tag == "" {
					continue
				}
				cleanTag := utils.Sanitize(tag)
				if utils.ArrayContains(n.CleanTags, cleanTag) {
					continue
				}
				n.Tags = append(n.Tags, tag)
				n.CleanTags = append(n.CleanTags, cleanTag)
			}
		}

		switch {
		case item.Image != nil:
			n.Image = item.Image.URL
		case len(item.Enclosures) > 0:
			n.Image = item.Enclosures[0].URL
		case s.Name == "Eurosport":
			// Best effort: look for an <img> inside the HTML description.
			// A description that cannot be parsed simply leaves the item
			// without an image.
			doc, docErr := goquery.NewDocumentFromReader(strings.NewReader(item.Description))
			if docErr == nil {
				if src, ok := doc.Find("img").Attr("src"); ok {
					n.Image = src
				}
			}
		}

		if item.Author != nil {
			n.Author = item.Author.Name
		}

		newsList = append(newsList, n)
	}
	return newsList, nil
}