FedP2P/webseed/client.go

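// Package webseed implements the client side of fetching torrent data over
// HTTP from web seeds.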
package webseed

import (
	"bytes"
	"context"
	"errors"
	"fmt"
	"io"
	"log"
	"net/http"
	"strings"

	"github.com/RoaringBitmap/roaring"

	"github.com/anacrolix/torrent/common"
	"github.com/anacrolix/torrent/metainfo"
	"github.com/anacrolix/torrent/segments"
)

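// RequestSpec identifies a contiguous byte extent of torrent data to fetch
// from the webseed.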
type RequestSpec = segments.Extent

type requestPartResult struct {
	resp *http.Response
	err  error
}

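// requestPart is a single HTTP request covering one file's slice of a larger
// requested extent.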
type requestPart struct {
	req    *http.Request
	e      segments.Extent
	result chan requestPartResult
	start  func()
	// Wrap http response bodies for such things as download rate limiting.
	responseBodyWrapper ResponseBodyWrapper
}

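// Request is a handle to an in-flight webseed request. Its outcome is
// delivered exactly once on Result.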
type Request struct {
	cancel func()
	Result chan RequestResult
}

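// Cancel aborts the request, cancelling any outstanding HTTP requests for its
// parts.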
func (r Request) Cancel() {
	r.cancel()
}

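// Client fetches torrent data from a single webseed URL.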
type Client struct {
	HttpClient *http.Client
	Url        string
	fileIndex  segments.Index
	info       *metainfo.Info
	// The pieces we can request with the Url. We're more likely to ban/block at the file level,
	// given that's how requests are mapped to webseeds, but the torrent.Client works at the
	// piece level. We can map our file-level adjustments to the pieces here. This probably needs
	// to be private in the future, if Client ever starts removing pieces.
	Pieces              roaring.Bitmap
	ResponseBodyWrapper ResponseBodyWrapper
	PathEscaper         PathEscaper
}

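// ResponseBodyWrapper wraps webseed HTTP response bodies, for such things as
// download rate limiting.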
type ResponseBodyWrapper func(io.Reader) io.Reader
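
// SetInfo associates the torrent's info dictionary with the Client, building
// the file index used to map byte extents to files and marking all pieces as
// available. For multi-file torrents the URL must end in a slash; otherwise
// the webseed is treated as non-conforming and no pieces are made available.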
func (me *Client) SetInfo(info *metainfo.Info) {
	if !strings.HasSuffix(me.Url, "/") && info.IsDir() {
		// In my experience, this is a non-conforming webseed. For example the
		// http://ia600500.us.archive.org/1/items URLs in archive.org torrents.
		return
	}
	me.fileIndex = segments.NewIndex(common.LengthIterFromUpvertedFiles(info.UpvertedFiles()))
	me.info = info
	me.Pieces.AddRange(0, uint64(info.NumPieces()))
}

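// RequestResult carries the bytes read for a Request, or the error that
// terminated it.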
type RequestResult struct {
	Bytes []byte
	Err   error
}

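// NewRequest starts fetching the byte extent r from the webseed. The extent is
// split into per-file HTTP range requests, which are issued sequentially; the
// concatenated bytes (or the first error) are delivered on the returned
// Request's Result channel.
//
// A rough usage sketch (the URL and extent values are illustrative only):
//
//	c := Client{HttpClient: http.DefaultClient, Url: "https://example.com/seed/"}
//	c.SetInfo(info) // *metainfo.Info obtained elsewhere
//	req := c.NewRequest(RequestSpec{Start: 0, Length: 1 << 14})
//	res := <-req.Result
//	if res.Err != nil {
//		// handle error
//	}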
func (ws *Client) NewRequest(r RequestSpec) Request {
	ctx, cancel := context.WithCancel(context.Background())
	var requestParts []requestPart
	if !ws.fileIndex.Locate(r, func(i int, e segments.Extent) bool {
		req, err := newRequest(
			ws.Url, i, ws.info, e.Start, e.Length,
			ws.PathEscaper,
		)
		if err != nil {
			panic(err)
		}
		req = req.WithContext(ctx)
		part := requestPart{
			req:                 req,
			result:              make(chan requestPartResult, 1),
			e:                   e,
			responseBodyWrapper: ws.ResponseBodyWrapper,
		}
		part.start = func() {
			go func() {
				resp, err := ws.HttpClient.Do(req)
				part.result <- requestPartResult{
					resp: resp,
					err:  err,
				}
			}()
		}
		requestParts = append(requestParts, part)
		return true
	}) {
		panic("request out of file bounds")
	}
	req := Request{
		cancel: cancel,
		Result: make(chan RequestResult, 1),
	}
	go func() {
		b, err := readRequestPartResponses(ctx, requestParts)
		req.Result <- RequestResult{
			Bytes: b,
			Err:   err,
		}
	}()
	return req
}

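// ErrBadResponse indicates the webseed returned an HTTP response that the
// client cannot use.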
type ErrBadResponse struct {
	Msg      string
	Response *http.Response
}

func (me ErrBadResponse) Error() string {
	return me.Msg
}

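// recvPartResult waits for a single part's HTTP response and copies exactly
// that part's extent into buf, tolerating servers that answer a range request
// for a small file with 200 OK.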
func recvPartResult(ctx context.Context, buf io.Writer, part requestPart) error {
	result := <-part.result
	// Make sure there are no further results coming; it should be a one-shot channel.
	close(part.result)
	if result.err != nil {
		return result.err
	}
	defer result.resp.Body.Close()
	var body io.Reader = result.resp.Body
	if part.responseBodyWrapper != nil {
		body = part.responseBodyWrapper(body)
	}
	// Prevent further accidental use
	result.resp.Body = nil
	if ctx.Err() != nil {
		return ctx.Err()
	}
	switch result.resp.StatusCode {
	case http.StatusPartialContent:
		copied, err := io.Copy(buf, body)
		if err != nil {
			return err
		}
		if copied != part.e.Length {
			return fmt.Errorf("got %v bytes, expected %v", copied, part.e.Length)
		}
		return nil
	case http.StatusOK:
		// This number is based on
		// https://archive.org/download/BloodyPitOfHorror/BloodyPitOfHorror.asr.srt. It seems that
		// archive.org might be using a webserver implementation that refuses to do partial
		// responses to small files.
		if part.e.Start < 48<<10 {
			if part.e.Start != 0 {
				log.Printf("resp status ok but requested range [url=%q, range=%q]",
					part.req.URL,
					part.req.Header.Get("Range"))
			}
			// Instead of discarding, we could try receiving all the chunks present in the response
			// body. I don't know how one would handle multiple chunk requests resulting in an OK
			// response for the same file. The request algorithm might need to be smarter for that.
			discarded, _ := io.CopyN(io.Discard, body, part.e.Start)
			if discarded != 0 {
				log.Printf("discarded %v bytes in webseed request response part", discarded)
			}
			_, err := io.CopyN(buf, body, part.e.Length)
			return err
		} else {
			return ErrBadResponse{"resp status ok but requested range", result.resp}
		}
	case http.StatusServiceUnavailable:
		return ErrTooFast
	default:
		return ErrBadResponse{
			fmt.Sprintf("unhandled response status code (%v)", result.resp.StatusCode),
			result.resp,
		}
	}
}

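// ErrTooFast is returned when the webseed responds with 503 Service
// Unavailable, suggesting we are making requests too fast.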
var ErrTooFast = errors.New("making requests too fast")
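
// readRequestPartResponses starts each part in turn and concatenates the
// responses into a single buffer, stopping at the first error.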
func readRequestPartResponses(ctx context.Context, parts []requestPart) (_ []byte, err error) {
2020-06-01 16:25:45 +08:00
var buf bytes.Buffer
for _, part := range parts {
part.start()
err = recvPartResult(ctx, &buf, part)
if err != nil {
err = fmt.Errorf("reading %q at %q: %w", part.req.URL, part.req.Header.Get("Range"), err)
break
}
}
return buf.Bytes(), err
2020-05-31 19:00:44 +08:00
}