Files
kurious/internal/common/client/sravni/client.go
2024-03-24 16:51:39 +03:00

537 lines
14 KiB
Go
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

package sravni
import (
"context"
"encoding/json"
"fmt"
"io"
"log/slog"
"strconv"
"strings"
"time"
"git.loyso.art/frx/kurious/internal/common/errors"
"git.loyso.art/frx/kurious/internal/common/xslices"
"git.loyso.art/frx/kurious/pkg/xdefault"
"github.com/go-resty/resty/v2"
"golang.org/x/net/html"
"golang.org/x/net/html/atom"
"golang.org/x/time/rate"
)
const (
// baseURL is the base URL configured on the HTTP client in NewClient; the
// main page state is requested relative to it.
baseURL = "https://www.sravni.ru/kursy"
)
// Client is the set of operations offered by the sravni courses client.
//
//go:generate mockery --name Client
type Client interface {
// GetMainPageState returns the main page state known to the client.
GetMainPageState() (*PageState, error)
// ListEducationalProducts returns the educational products selected by
// params.
ListEducationalProducts(
ctx context.Context,
params ListEducationProductsParams,
) (result ListEducationProductsResponse, err error)
// ListEducationalProductsFilterCount returns per-filter counters for the
// products selected by params.
ListEducationalProductsFilterCount(
ctx context.Context,
params ListEducationProductsParams,
) (result ProductsFilterCount, err error)
}
// NewClient builds a client for the sravni courses API.
//
// It loads the main page state once, caches it, and derives the sets of valid
// learning types and course thematics from the dictionaries found in that
// state. When log is nil, slog.Default() is used. debug toggles resty's debug
// mode.
//
// An error is returned when the main page cannot be fetched or parsed, or
// when no page state could be extracted from it (wrapping ErrClientNotInited).
func NewClient(ctx context.Context, log *slog.Logger, debug bool) (c *client, err error) {
	// Guard against a nil logger: calling With on it would panic.
	if log == nil {
		log = slog.Default()
	}

	c = &client{
		log:     log.With(slog.String("client", "sravni")),
		limiter: rate.NewLimiter(rate.Every(time.Millisecond*100), 1),
		http: resty.New().
			SetBaseURL(baseURL).
			SetDebug(debug),
	}

	c.cachedMainPageInfo, err = c.getMainPageState(ctx)
	if err != nil {
		return nil, err
	}
	// parsePageState reports a missing state as (nil, nil); without this check
	// the dictionary lookup below would dereference a nil pointer.
	if c.cachedMainPageInfo == nil {
		return nil, fmt.Errorf("main page state was not found: %w", ErrClientNotInited)
	}

	getQuerySet := func(fields []Field) querySet {
		items := xslices.Map(fields, func(f Field) string {
			return f.Value
		})
		return newQuerySet(items...)
	}

	dicts := c.cachedMainPageInfo.Props.InitialReduxState.Dictionaries.Data
	c.validLearningTypes = getQuerySet(dicts.LearningType.Fields)
	c.validCourseThematics = getQuerySet(dicts.CourseThematics.Fields)

	return c, nil
}
// client is the resty-backed implementation of Client.
type client struct {
// log is the logger used for request and parsing diagnostics.
log *slog.Logger
// http is the resty client used for all outgoing requests.
http *resty.Client
// cachedMainPageInfo is the main page state loaded once by NewClient.
cachedMainPageInfo *PageState
// validLearningTypes is the set of learning type values accepted by the
// request methods.
validLearningTypes querySet
// validCourseThematics is the set of course thematic values accepted by
// ListEducationalProductsFilterCount.
validCourseThematics querySet
// limiter is waited on before each API request.
limiter *rate.Limiter
}
// GetMainPageState returns a clone of the cached main page state.
//
// It returns ErrClientNotInited when the client holds no cached state, in
// line with the other request methods of the client.
func (c *client) GetMainPageState() (*PageState, error) {
	if err := c.checkClientInited(); err != nil {
		return nil, err
	}

	return c.cachedMainPageInfo.Clone(), nil
}
// ListEducationProductsParams selects and paginates educational products.
type ListEducationProductsParams struct {
// LearningType must be one of the learning types known to the client.
LearningType string
// CoursesThematics narrows the selection to the given thematics.
CoursesThematics []string
// CourseGraphics, CourseLevels, CourseFormats, CourseDurations and
// CourseTypes are forwarded as filters by
// ListEducationalProductsFilterCount.
CourseGraphics []string
CourseLevels []string
CourseFormats []string
CourseDurations []string
CourseTypes []string
// NOTE(review): SortBy is not read by the methods visible in this file.
SortBy string
// Limit and Offset control pagination in ListEducationalProducts.
Limit int
Offset int
}
// stringifiedBool is a boolean carried as its textual form.
type stringifiedBool string

// AsStringifiedBool converts b into its stringifiedBool representation
// ("true" or "false").
func AsStringifiedBool(b bool) stringifiedBool {
	formatted := strconv.FormatBool(b)
	return stringifiedBool(formatted)
}
// FilterLevel is a course difficulty level filter value.
type FilterLevel string
const (
FilterLevelJunior FilterLevel = "levelJuniorNew"
FilterLevelMiddle FilterLevel = "levelMiddleNew"
FilterLevelChildren FilterLevel = "levelChildNew"
)
// FilterTime is a course duration filter value.
type FilterTime string
const (
FilterTimeLessMonth FilterTime = "1" // less than a month
FilterTimeFrom1To3Month FilterTime = "2" // from one to three months
FilterTimeFrom3To6 FilterTime = "3" // from three to six months
FilterTimeFrom6To12 FilterTime = "4" // from six to twelve months
FilterTimeFrom12 FilterTime = "5" // from twelve months
)
// FilterFormat is a form-of-study filter value.
type FilterFormat string
const (
FilterFormatRecord FilterFormat = "formatRecordNew"
FilterFormatOnline FilterFormat = "formatOnlineNew"
FilterFormatOffline FilterFormat = "formatOfflineNew"
)
// FilterGraphic is a course completion schedule filter value.
type FilterGraphic string
const (
FilterGraphicTimeLength FilterGraphic = "courseTimeLengthNew"
FilterGraphicTerm FilterGraphic = "courseTimeTermNew"
)
// listEducationProductsRequest is the JSON body sent by
// ListEducationalProducts.
type listEducationProductsRequest struct {
Fingerprint string `json:"fingerPrint,omitempty"`
ProductName string `json:"productName,omitempty"`
Location string `json:"location"`
OfferTypes []string `json:"offerTypes"`
IsMix bool `json:"isMix"`
MixRepeated bool `json:"mixRepeated"`
// Fields lists the product fields requested in the response.
Fields []string `json:"fields"`
// Filters
LearningType []string `json:"learningtype"`
CoursesThematics []string `json:"coursesThematics"`
Organizations []string `json:"organizations,omitempty"` // list of organization ids
DictionatyFormatFilterNew []FilterFormat `json:"dictionaryFormatFilterNew,omitempty"`
DictionaryTimeFilter []FilterTime `json:"dictionaryTimeFilter,omitempty"`
DictionaryGraphicFilterNew []FilterGraphic `json:"dictionaryGraphicFilterNew,omitempty"`
DictionatyLevelFilterNew []FilterLevel `json:"dictionaryLevelFilterNew,omitempty"`
// Options
SubMentor []stringifiedBool `json:"sub-mentor,omitempty"` // courses with a mentor
SubTimeFree []stringifiedBool `json:"sub-timeFree,omitempty"` // courses with a trial period
SubJobGarantSub []stringifiedBool `json:"sub-jobGarantsub,omitempty"` // courses with a job guarantee
SubPriceFree []stringifiedBool `json:"sub-priceFree,omitempty"` // free courses only
SubInstallment []stringifiedBool `json:"sub-installment,omitempty"` // courses payable on credit
SubIsCourseProfession []stringifiedBool `json:"sub-isCourseProfession,omitempty"` // learn a profession from scratch
DevelopSkills []stringifiedBool `json:"developSkills,omitempty"` // develop skills
NotSubIsWebinar string `json:"not-sub-isWebinar,omitempty"`
NotB2B string `json:"not-b2b,omitempty"`
AdvertisingOnly bool `json:"advertisingOnly,omitempty"`
// Pagination and sorting
Limit int `json:"limit"`
Offset int `json:"offset"`
SortProperty string `json:"sortProperty"`
SortDirection string `json:"sortDirection"`
}
// ListEducationProductsResponse is the decoded response of
// ListEducationalProducts.
type ListEducationProductsResponse struct {
// Items are the returned courses.
Items []Course `json:"items"`
// Organizations maps a string key to an organization
// (presumably the organization id — TODO confirm).
Organizations map[string]Organization `json:"organizations"`
TotalCount int `json:"totalCount"`
TotalCountAdv int `json:"totalCountAdv"`
}
// ListEducationalProducts requests a page of educational products.
//
// Of params, only LearningType, CoursesThematics, Limit and Offset are
// applied; Limit falls back to defaultLimit through xdefault.WithFallback.
// NOTE(review): params.SortBy is not applied, sorting is fixed to
// defaultSortProp/defaultSortDirection.
//
// Errors: ErrClientNotInited when the client has no cached page state, a
// validation error for an unknown learning type, the limiter error when ctx
// ends while waiting, a wrapped transport error, or a wrapped
// errors.ErrUnexpectedStatus for an error status code.
func (c *client) ListEducationalProducts(
	ctx context.Context,
	params ListEducationProductsParams,
) (result ListEducationProductsResponse, err error) {
	const urlPath = "/v1/education/products"
	const defaultLimit = 1
	const defaultSortProp = "advertising.position"
	const defaultSortDirection = "asc"
	// TODO: find out should it be settable
	const productName = "learning-courses"

	if err = c.checkClientInited(); err != nil {
		return result, err
	}
	if !c.validLearningTypes.hasValue(params.LearningType) {
		return result, errors.NewValidationError("learning_type", "unknown value")
	}

	reqParams := listEducationProductsRequest{
		LearningType:     valueAsArray(params.LearningType),
		CoursesThematics: params.CoursesThematics,
		ProductName:      productName,
		Fields:           defaultProductFields,
		SortProperty:     defaultSortProp, // maybe sort by price?
		SortDirection:    defaultSortDirection,
		NotSubIsWebinar:  strconv.FormatBool(true),
		NotB2B:           strconv.FormatBool(true),
		IsMix:            false,      // not sure why, but for better parsing
		MixRepeated:      true,       // looks like this option should force to exclude duplicates
		AdvertisingOnly:  false,      // If true, it will show only paid items.
		Location:         "",         // TODO: get and fill location?
		Fingerprint:      "",         // not sure it should be set.
		OfferTypes:       []string{}, // for more precise filter but not needed.
		Limit:            xdefault.WithFallback(params.Limit, defaultLimit),
		Offset:           params.Offset,
	}

	if err = c.limiter.Wait(ctx); err != nil {
		return result, fmt.Errorf("waiting for limit: %w", err)
	}

	// Attach ctx to the request so cancellation and deadlines also cover the
	// HTTP call, not only the limiter wait.
	resp, err := c.http.R().
		SetContext(ctx).
		SetBody(reqParams).
		SetResult(&result).
		EnableTrace().
		Post(c.makeEducationURL(urlPath))
	if err != nil {
		return result, fmt.Errorf("making request: %w", err)
	}
	if resp.IsError() {
		return result, fmt.Errorf("bad status code %d: %w", resp.StatusCode(), errors.ErrUnexpectedStatus)
	}

	return result, nil
}
// educationProductFilterCountRequest is the JSON body sent by
// ListEducationalProductsFilterCount.
type educationProductFilterCountRequest struct {
Filters educationProductFilter `json:"filters"`
}
// educationProductFilter is the set of filters carried in the "filters"
// field of the filter-count request.
type educationProductFilter struct {
AdvertisingOnly bool `json:"advertisingOnly"`
Location string `json:"location"`
LearningTypes []string `json:"learningTypes"`
CoursesThematics []string `json:"coursesThematics"`
CourseGraphics []string `json:"courseGraphics"`
CourseLevels []string `json:"courseLevels"`
CourseFormats []string `json:"courseFormats"`
CourseDurations []string `json:"courseDurations"`
CourseTypes []string `json:"courseTypes"`
}
// boolableDict maps an integer key to a counter
// (presumably keys 0 and 1 stand for false and true — TODO confirm).
type boolableDict map[int]int
// nameableDict maps a string key to a counter.
type nameableDict map[string]int
// ProductsFilterCount holds the counters returned by
// ListEducationalProductsFilterCount, one dictionary per filter.
type ProductsFilterCount struct {
IsCourseProfession boolableDict `json:"isCourseProfession"` // 0: count, 1: count eq to false + true
CourseLevels nameableDict `json:"courseLevels"`
CourseGraphics nameableDict `json:"courseGraphics"`
OrganizationIDs nameableDict `json:"organizationIds"`
HasTrialPeriod boolableDict `json:"hasTrialPeriod"`
HasMentor boolableDict `json:"hasMentor"`
HasJobGuarantee boolableDict `json:"hasJobGuarantee"`
CourseFormats nameableDict `json:"courseFormats"`
CourseDurations nameableDict `json:"courseDurations"`
CoursesThematics nameableDict `json:"coursesThematics"`
LearningTypes nameableDict `json:"learningTypes"`
}
// ListEducationalProductsFilterCount requests per-filter counters for the
// products selected by params.
//
// params.LearningType and every entry of params.CoursesThematics are checked
// against the values known to the client before the request is made; the
// remaining filter slices are forwarded unchanged.
//
// Errors: ErrClientNotInited when the client has no cached page state, a
// validation error for an unknown learning type or thematic, the limiter
// error when ctx ends while waiting, a wrapped transport error, or a wrapped
// errors.ErrUnexpectedStatus for an error status code.
func (c *client) ListEducationalProductsFilterCount(
	ctx context.Context,
	params ListEducationProductsParams,
) (result ProductsFilterCount, err error) {
	const urlPath = "/v2/education/products/filter/count"

	if err = c.checkClientInited(); err != nil {
		return result, err
	}
	if !c.validLearningTypes.hasValue(params.LearningType) {
		return result, errors.NewValidationError("learning_type", "unknown value")
	}
	for _, ct := range params.CoursesThematics {
		if !c.validCourseThematics.hasValue(ct) {
			return result, errors.NewValidationError("courses_thematics", "unknown value "+ct)
		}
	}

	reqParams := educationProductFilterCountRequest{
		Filters: educationProductFilter{
			AdvertisingOnly:  false,
			Location:         "",
			LearningTypes:    valueAsArray(params.LearningType),
			CoursesThematics: params.CoursesThematics,
			CourseGraphics:   params.CourseGraphics,
			CourseLevels:     params.CourseLevels,
			CourseFormats:    params.CourseFormats,
			CourseDurations:  params.CourseDurations,
			CourseTypes:      params.CourseTypes,
		},
	}

	if err = c.limiter.Wait(ctx); err != nil {
		return result, fmt.Errorf("waiting for limit: %w", err)
	}

	// Attach ctx to the request so cancellation and deadlines also cover the
	// HTTP call, not only the limiter wait.
	var respData DataContainer[ProductsFilterCount]
	resp, err := c.http.R().
		SetContext(ctx).
		SetBody(reqParams).
		SetResult(&respData).
		EnableTrace().
		Post(c.makeEducationURL(urlPath))
	if err != nil {
		return result, fmt.Errorf("making request: %w", err)
	}
	if resp.IsError() {
		return result, fmt.Errorf("bad status code %d: %w", resp.StatusCode(), errors.ErrUnexpectedStatus)
	}

	return respData.Data, nil
}
// makeEducationURL prefixes path with the education URL taken from the cached
// page state's runtime config. It yields an empty string when no state is
// cached.
func (c *client) makeEducationURL(path string) string {
	state := c.cachedMainPageInfo
	if state != nil {
		return state.RuntimeConfig.EducationURL + path
	}

	return ""
}
// checkClientInited reports ErrClientNotInited when the client has no cached
// main page state, and nil otherwise.
func (c *client) checkClientInited() error {
	if c.cachedMainPageInfo != nil {
		return nil
	}

	return ErrClientNotInited
}
// getMainPageState downloads the main page ("/" relative to the client's base
// URL) and extracts the page state from it.
//
// The response is requested unparsed so the HTML can be streamed into the
// parser; this function therefore owns the raw body and closes it before
// returning. An error is returned when the request fails, when the status
// code is an error status (wrapping errors.ErrUnexpectedStatus), or when the
// page cannot be parsed.
func (c *client) getMainPageState(ctx context.Context) (*PageState, error) {
	// maxLoggedBody bounds how much of an error response is copied to the log.
	const maxLoggedBody = 4 << 10

	ctxLogger := restyCtxLogger{
		ctx: ctx,
		log: c.log,
	}

	req := c.http.R().
		SetContext(ctx).
		SetLogger(ctxLogger).
		SetDoNotParseResponse(true).
		EnableTrace()

	resp, err := req.Get("/")
	if err != nil {
		return nil, fmt.Errorf("getting request: %w", err)
	}

	// With SetDoNotParseResponse the body is not consumed by resty, so it has
	// to be closed here to release the connection.
	body := resp.RawBody()
	if body == nil {
		return nil, fmt.Errorf("getting request: %w", io.ErrUnexpectedEOF)
	}
	defer body.Close()

	if resp.IsError() {
		// resp.Body() is empty for unparsed responses, so read a bounded part
		// of the raw body instead. The read error is ignored on purpose: the
		// snippet is only used for diagnostics.
		snippet, _ := io.ReadAll(io.LimitReader(body, maxLoggedBody))
		c.log.ErrorContext(ctx, "unable to proceed request", slog.String("body", string(snippet)))
		return nil, fmt.Errorf("got %d, but expected success: %w", resp.StatusCode(), errors.ErrUnexpectedStatus)
	}

	traceInfo := resp.Request.TraceInfo()
	c.log.InfoContext(ctx, "request proceeded", slog.Any("trace", traceInfo))

	return c.parsePageState(ctx, body)
}
// parsePageState parses body as HTML, locates the script element whose id is
// "__NEXT_DATA__" and decodes its JSON content into a PageState.
//
// It returns (nil, nil) when no such script element is found, and an error
// when the HTML cannot be parsed or the script content is not valid JSON
// (including an empty script).
func (c *client) parsePageState(ctx context.Context, body io.Reader) (*PageState, error) {
	page, err := html.Parse(body)
	if err != nil {
		return nil, fmt.Errorf("parsing body: %w", err)
	}

	c.log.DebugContext(ctx, "finding page state")

	var (
		foundHtml bool
		foundBody bool
	)
	// findFunc is the predicate handed to findNode: found marks the wanted
	// node, deeper asks for the node's subtree to be inspected
	// (presumably how findNode interprets it — TODO confirm).
	findFunc := func(node *html.Node) (found, deeper bool) {
		if node == nil {
			c.log.DebugContext(ctx, "node is null, skipping")
			return false, false
		}
		// The first element node encountered is treated as the html root.
		if !foundHtml && node.Type == html.ElementNode {
			c.log.DebugContext(ctx, "found html node")
			foundHtml = true
			return false, true
		}
		if foundHtml && !foundBody && node.DataAtom == atom.Body {
			c.log.DebugContext(ctx, "found body node")
			foundBody = true
			return false, true
		}
		if foundHtml && foundBody && node.DataAtom == atom.Script {
			for _, attr := range node.Attr {
				if attr.Key == "id" && attr.Val == "__NEXT_DATA__" {
					c.log.DebugContext(ctx, "found script node with next_data")
					return true, false
				}
			}
		}
		return false, false
	}

	nextData := findNode(page, findFunc)
	if nextData == nil {
		return nil, nil
	}

	// A script element without content has no child node. Decode an empty
	// payload in that case so the caller gets a decoding error instead of a
	// nil pointer dereference.
	var payload string
	if nextData.FirstChild != nil {
		payload = nextData.FirstChild.Data
	}

	var out PageState
	if err = json.NewDecoder(strings.NewReader(payload)).Decode(&out); err != nil {
		return nil, fmt.Errorf("decoding html data: %w", err)
	}

	return &out, nil
}
// educationProductFields is the set of product field names that may be
// requested from the products endpoint. Each name is listed once.
var educationProductFields = newQuerySet(
	"id",
	"name",
	"organization",
	"advertising",
	"discount",
	"link",
	"learningtype",
	"dateStart",
	"timeStart",
	"timeAllHour",
	"timeAllDay",
	"timeAllMonth",
	"isTermApproximately",
	"dictionaryFormatFilterNew",
	"dictionaryLevelFilterNew",
	"price",
	"priceAll",
	"priceInstallment",
	"courseImage",
	"withoutDiscountPrice",
)
// defaultProductFields is the field list sent with every product listing
// request. Each name must be present in educationProductFields, otherwise
// package initialization panics. Each name is listed once so that no field is
// requested twice.
var defaultProductFields = must(educationProductFields.exactSubset(
	"id",
	"name",
	"organization",
	"advertising",
	"discount",
	"link",
	"learningtype",
	"dateStart",
	"timeStart",
	"timeAllHour",
	"timeAllDay",
	"timeAllMonth",
	"price",
	"priceAll",
	"priceInstallment",
	"courseImage",
	"withoutDiscountPrice",
))
// must unwraps a (value, error) pair: it hands back t when err is nil and
// panics with the error's message otherwise.
func must[T any](t T, err error) T {
	if err == nil {
		return t
	}

	panic(err.Error())
}
// querySet is an insertion-ordered set of strings.
type querySet struct {
	// values keeps the distinct members in the order they were first added.
	values []string
	// mappedValues indexes the members for membership checks.
	mappedValues map[string]struct{}
}

// Values returns a copy of the members in insertion order. Modifying the
// returned slice does not affect the set.
func (qs querySet) Values() []string {
	out := make([]string, len(qs.values))
	copy(out, qs.values)
	return out
}

// hasValue reports whether value is a member of the set.
func (qs querySet) hasValue(value string) bool {
	_, ok := qs.mappedValues[value]
	return ok
}

// exactSubset returns values unchanged (same order) when every one of them is
// a member of the set, and an error naming the first value that is not.
func (qs querySet) exactSubset(values ...string) ([]string, error) {
	out := make([]string, 0, len(values))
	for _, value := range values {
		if !qs.hasValue(value) {
			return nil, fmt.Errorf("value %s was not found in set", value)
		}
		out = append(out, value)
	}
	return out, nil
}

// newQuerySet builds a querySet from values. Repeated values are stored once,
// at the position of their first occurrence, so Values never contains
// duplicates.
func newQuerySet(values ...string) querySet {
	qs := querySet{
		values:       make([]string, 0, len(values)),
		mappedValues: make(map[string]struct{}, len(values)),
	}
	for _, v := range values {
		if _, seen := qs.mappedValues[v]; seen {
			continue
		}
		qs.mappedValues[v] = struct{}{}
		qs.values = append(qs.values, v)
	}
	return qs
}
// valueAsArray wraps a non-empty value into a single-element slice; an empty
// value yields a nil slice.
func valueAsArray(value string) []string {
	if value != "" {
		return []string{value}
	}

	return nil
}