From 64341d9c807391e538bd2d2c0c0698a8e008fe93 Mon Sep 17 00:00:00 2001 From: John Sebastian Peterson Date: Tue, 6 Jan 2015 19:50:10 +0100 Subject: [PATCH] Add option to download non-HTML links directly after their parent page when downloading recursively because it's more likely to download temporary links before they expire because it's more similar to the browsing experience --- doc/wget.texi | 10 ++++++++++ src/init.c | 21 +++++++++++++++++++++ src/main.c | 3 +++ src/options.h | 4 ++++ src/recur.c | 28 +++++++++++++++++++++------- 5 files changed, 59 insertions(+), 7 deletions(-) diff --git a/doc/wget.texi b/doc/wget.texi index d9ed17d68e..2f729d6910 100644 --- a/doc/wget.texi +++ b/doc/wget.texi @@ -1916,6 +1916,10 @@ case. Turn on recursive retrieving. @xref{Recursive Download}, for more details. The default maximum depth is 5. +@item --queue-type=@var{queuetype} +Specify the queue type (@pxref{Recursive Download}). Accepted values are @samp{fifo} +(the default) and @samp{browser}. + @item -l @var{depth} @itemx --level=@var{depth} Specify recursion maximum depth level @var{depth} (@pxref{Recursive @@ -2296,6 +2300,12 @@ documents linked by them, and so on. In other words, Wget first downloads the documents at depth 1, then those at depth 2, and so on until the specified maximum depth. +The @dfn{queue type} is FIFO (default) or browser. FIFO downloads the +first enqueued files first. Browser downloads non-HTML links directly +after their parent page. If the parent page contains temporary links, +this can prevent those links from expiring before they're downloaded. Pages +sometimes use temporary links to prevent direct links to files. + The maximum @dfn{depth} to which the retrieval may descend is specified with the @samp{-l} option. The default maximum depth is five layers. 
diff --git a/src/init.c b/src/init.c index 569b25b2bd..ad72441a08 100644 --- a/src/init.c +++ b/src/init.c @@ -104,6 +104,7 @@ CMD_DECLARE (cmd_spec_htmlify); CMD_DECLARE (cmd_spec_mirror); CMD_DECLARE (cmd_spec_prefer_family); CMD_DECLARE (cmd_spec_progress); +CMD_DECLARE (cmd_spec_queue_type); CMD_DECLARE (cmd_spec_recursive); CMD_DECLARE (cmd_spec_regex_type); CMD_DECLARE (cmd_spec_restrict_file_names); @@ -247,6 +248,7 @@ static const struct { { "proxypasswd", &opt.proxy_passwd, cmd_string }, /* deprecated */ { "proxypassword", &opt.proxy_passwd, cmd_string }, { "proxyuser", &opt.proxy_user, cmd_string }, + { "queuetype", &opt.queue_type, cmd_spec_queue_type }, { "quiet", &opt.quiet, cmd_boolean }, { "quota", &opt.quota, cmd_bytes_sum }, #ifdef HAVE_SSL @@ -403,6 +405,8 @@ defaults (void) opt.restrict_files_nonascii = false; opt.restrict_files_case = restrict_no_case_restriction; + opt.queue_type = queue_type_fifo; + opt.regex_type = regex_type_posix; opt.max_redirect = 20; @@ -1441,6 +1445,23 @@ cmd_spec_recursive (const char *com, const char *val, void *place_ignored _GL_UN return true; } +/* Validate --queue-type and set the choice. VAL is decoded against the fifo and browser choices; when decoding fails an Invalid value message naming COM and VAL is printed to stderr. opt.queue_type is assigned in both cases and the decode_string result is returned. NOTE(review): on failure the stored value is presumably queue_type_fifo, the initializer of the local; confirm that decode_string leaves its output untouched when it fails. */ + +static bool +cmd_spec_queue_type (const char *com, const char *val, void *place_ignored _GL_UNUSED) +{ + static const struct decode_item choices[] = { + { "fifo", queue_type_fifo }, + { "browser", queue_type_browser }, + }; + int queue_type = queue_type_fifo; + int ok = decode_string (val, choices, countof (choices), &queue_type); + if (!ok) + fprintf (stderr, _("%s: %s: Invalid value %s.\n"), exec_name, com, quote (val)); + opt.queue_type = queue_type; + return ok; +} + /* Validate --regex-type and set the choice. 
*/ static bool diff --git a/src/main.c b/src/main.c index 6feb1403cc..048ffeb550 100644 --- a/src/main.c +++ b/src/main.c @@ -272,6 +272,7 @@ static struct cmdline_option option_data[] = { "proxy-passwd", 0, OPT_VALUE, "proxypassword", -1 }, /* deprecated */ { "proxy-password", 0, OPT_VALUE, "proxypassword", -1 }, { "proxy-user", 0, OPT_VALUE, "proxyuser", -1 }, + { "queue-type", 0, OPT_VALUE, "queuetype", -1 }, { "quiet", 'q', OPT_BOOLEAN, "quiet", -1 }, { "quota", 'Q', OPT_VALUE, "quota", -1 }, { "random-file", 0, OPT_VALUE, "randomfile", -1 }, @@ -736,6 +737,8 @@ WARC options:\n"), Recursive download:\n"), N_("\ -r, --recursive specify recursive download\n"), + N_("\ + --queue-type=TYPE queue type (fifo|browser).\n"), N_("\ -l, --level=NUMBER maximum recursion depth (inf or 0 for infinite)\n"), N_("\ diff --git a/src/options.h b/src/options.h index b99512650a..5a4435dedb 100644 --- a/src/options.h +++ b/src/options.h @@ -46,6 +46,10 @@ struct options bool relative_only; /* Follow only relative links. */ bool no_parent; /* Restrict access to the parent directory. */ + enum { + queue_type_fifo, + queue_type_browser + } queue_type; /* Recursion queue type */ int reclevel; /* Maximum level of recursion */ bool dirstruct; /* Do we build the directory structure as we go along? */ diff --git a/src/recur.c b/src/recur.c index b6b9dc6a9c..6f00dbd321 100644 --- a/src/recur.c +++ b/src/recur.c @@ -90,13 +90,16 @@ url_queue_delete (struct url_queue *queue) /* Enqueue a URL in the queue. The queue is FIFO: the items will be retrieved ("dequeued") from the queue in the order they were placed - into it. */ + into it. Or browser: Non-HTML links are retrieved directly after + their parent page. 
*/ static void url_enqueue (struct url_queue *queue, struct iri *i, const char *url, const char *referer, int depth, bool html_allowed, bool css_allowed) { + int append = opt.queue_type == queue_type_fifo || html_allowed; + struct queue_element *qel = xnew (struct queue_element); qel->iri = i; qel->url = url; @@ -110,20 +113,31 @@ url_enqueue (struct url_queue *queue, struct iri *i, if (queue->count > queue->maxcount) queue->maxcount = queue->count; - DEBUGP (("Enqueuing %s at depth %d\n", + DEBUGP (("%s %s at depth %d\n", append ? "Appending" : "Prepending", quotearg_n_style (0, escape_quoting_style, url), depth)); DEBUGP (("Queue count %d, maxcount %d.\n", queue->count, queue->maxcount)); if (i) - DEBUGP (("[IRI Enqueuing %s with %s\n", quote_n (0, url), - i->uri_encoding ? quote_n (1, i->uri_encoding) : "None")); + DEBUGP (("[IRI %s %s with %s\n", append ? "Appending" : "Prepending", + quote_n (0, url), i->uri_encoding ? quote_n (1, i->uri_encoding) : "None")); - if (queue->tail) - queue->tail->next = qel; - queue->tail = qel; + if (append) + { + if (queue->tail) + queue->tail->next = qel; + queue->tail = qel; + } + else + { + if (queue->head) + qel->next = queue->head; + queue->head = qel; + } if (!queue->head) queue->head = queue->tail; + if (!queue->tail) + queue->tail = queue->head; } /* Take a URL out of the queue. Return true if this operation