Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix(cfscrape): Fix for challenge where part of formula is stored in a div #5462

Merged
merged 3 commits into from
May 15, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
260 changes: 197 additions & 63 deletions lib/cfscrape/__init__.py
Original file line number Diff line number Diff line change
@@ -1,30 +1,43 @@
import json
import logging
import random
import re
import ssl
import subprocess
import copy
import time
import os
from base64 import b64encode
from collections import OrderedDict

from requests.sessions import Session
from requests.compat import urlparse, urlunparse
from requests.exceptions import RequestException

try:
from urlparse import urlparse
except ImportError:
from urllib.parse import urlparse

__version__ = "1.9.7"
__version__ = "2.0.3"

DEFAULT_USER_AGENTS = [
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/65.0.3325.181 Chrome/65.0.3325.181 Safari/537.36",
"Mozilla/5.0 (Linux; Android 7.0; Moto G (5) Build/NPPS25.137-93-8) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.137 Mobile Safari/537.36",
"Mozilla/5.0 (iPhone; CPU iPhone OS 7_0_4 like Mac OS X) AppleWebKit/537.51.1 (KHTML, like Gecko) Version/7.0 Mobile/11B554a Safari/9537.53",
"Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:60.0) Gecko/20100101 Firefox/60.0",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:59.0) Gecko/20100101 Firefox/59.0",
"Mozilla/5.0 (Windows NT 6.3; Win64; x64; rv:57.0) Gecko/20100101 Firefox/57.0"
]
USER_AGENTS_PATH = os.path.join(os.path.dirname(__file__), "user_agents.json")

DEFAULT_USER_AGENT = random.choice(DEFAULT_USER_AGENTS)
with open(USER_AGENTS_PATH) as f:
user_agents = json.load(f)

DEFAULT_USER_AGENT = random.choice(user_agents)

DEFAULT_HEADERS = OrderedDict(
(
("Host", None),
("Connection", "keep-alive"),
("Upgrade-Insecure-Requests", "1"),
("User-Agent", DEFAULT_USER_AGENT),
(
"Accept",
"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
),
("Accept-Language", "en-US,en;q=0.9"),
("Accept-Encoding", "gzip, deflate"),
)
)

BUG_REPORT = """\
Cloudflare may have changed their technique, or there may be a bug in the script.
Expand All @@ -43,32 +56,69 @@
https://github.com/Anorov/cloudflare-scrape/issues\
"""


class CloudflareError(RequestException):
    """Base error for failures caused by Cloudflare's anti-bot protection."""

    pass


class CloudflareCaptchaError(CloudflareError):
    """Raised when Cloudflare serves a captcha page, which cfscrape cannot solve."""

    pass


class CloudflareScraper(Session):
def __init__(self, *args, **kwargs):
    """Initialize the scraper session.

    Keyword-only extras (popped before delegating to ``Session``):
        delay: seconds to wait before answering an IUAM challenge.
            ``None`` (the default) means derive the delay from the
            challenge page itself, falling back to 8 seconds.
        headers: custom headers; defaults to ``DEFAULT_HEADERS``.
    """
    # NOTE: span reconstructed from a diff rendering — the removed lines
    # (old "delay=8" default and the post-super User-Agent override) were
    # interleaved here and have been dropped in favor of the final version.
    self.delay = kwargs.pop("delay", None)

    # Use headers with a random User-Agent if no custom headers have been set
    headers = OrderedDict(kwargs.pop("headers", DEFAULT_HEADERS))

    # Set the User-Agent header if it was not provided
    headers.setdefault("User-Agent", DEFAULT_USER_AGENT)

    super(CloudflareScraper, self).__init__(*args, **kwargs)

    # Assign after super().__init__ so an OrderedDict is used, preserving
    # header order (Cloudflare inspects header ordering).
    self.headers = headers

def is_cloudflare_challenge(self, resp):
@staticmethod
def is_cloudflare_iuam_challenge(resp):
return (
resp.status_code == 503
resp.status_code in (503, 429)
and resp.headers.get("Server", "").startswith("cloudflare")
and b"jschl_vc" in resp.content
and b"jschl_answer" in resp.content
)

@staticmethod
def is_cloudflare_captcha_challenge(resp):
return (
resp.status_code == 403
and resp.headers.get("Server", "").startswith("cloudflare")
and b"/cdn-cgi/l/chk_captcha" in resp.content
)

def request(self, method, url, *args, **kwargs):
    """Issue a request, transparently handling Cloudflare protection.

    Raises CloudflareCaptchaError for captcha pages; solves IUAM JS
    challenge pages and returns the post-challenge response instead.
    """
    # NOTE: reconstructed from diff residue — the superseded single
    # is_cloudflare_challenge() check was interleaved here and dropped.
    resp = super(CloudflareScraper, self).request(method, url, *args, **kwargs)

    # Check if Cloudflare captcha challenge is presented
    if self.is_cloudflare_captcha_challenge(resp):
        self.handle_captcha_challenge(resp, url)

    # Check if Cloudflare anti-bot "I'm Under Attack Mode" is enabled
    if self.is_cloudflare_iuam_challenge(resp):
        resp = self.solve_cf_challenge(resp, **kwargs)

    return resp

def handle_captcha_challenge(self, resp, url):
    """Raise CloudflareCaptchaError for a captcha page (never returns)."""
    netloc = urlparse(url).netloc
    error = (
        "Cloudflare captcha challenge presented for %s (cfscrape cannot solve captchas)"
        % netloc
    )
    # 0x10101000 is OpenSSL 1.1.1's version number in OPENSSL_VERSION_NUMBER
    # encoding; older builds get the upgrade hint appended to the message.
    if ssl.OPENSSL_VERSION_NUMBER < 0x10101000:
        error += ". Your OpenSSL version is lower than 1.1.1. Please upgrade your OpenSSL library and recompile Python."

    raise CloudflareCaptchaError(error, response=resp)

def solve_cf_challenge(self, resp, **original_kwargs):
start_time = time.time()

Expand All @@ -78,68 +128,136 @@ def solve_cf_challenge(self, resp, **original_kwargs):
submit_url = "%s://%s/cdn-cgi/l/chk_jschl" % (parsed_url.scheme, domain)

cloudflare_kwargs = copy.deepcopy(original_kwargs)
params = cloudflare_kwargs.setdefault("params", {})

headers = cloudflare_kwargs.setdefault("headers", {})
headers["Referer"] = resp.url

try:
params["jschl_vc"] = re.search(r'name="jschl_vc" value="(\w+)"', body).group(1)
params["pass"] = re.search(r'name="pass" value="(.+?)"', body).group(1)
params["s"] = re.search(r'name="s"\svalue="(?P<s_value>[^"]+)', body).group('s_value')
params = cloudflare_kwargs["params"] = OrderedDict(
re.findall(r'name="(s|jschl_vc|pass)"(?: [^<>]*)? value="(.+?)"', body)
)

for k in ("jschl_vc", "pass"):
if k not in params:
raise ValueError("%s is missing from challenge form" % k)
except Exception as e:
# Something is wrong with the page.
# This may indicate Cloudflare has changed their anti-bot
# technique. If you see this and are running the latest version,
# please open a GitHub issue so I can update the code accordingly.
raise ValueError("Unable to parse Cloudflare anti-bots page: %s %s" % (e.message, BUG_REPORT))
raise ValueError(
"Unable to parse Cloudflare anti-bot IUAM page: %s %s"
% (e.message, BUG_REPORT)
)

# Solve the Javascript challenge
params["jschl_answer"] = self.solve_challenge(body, domain)
answer, delay = self.solve_challenge(body, domain)
params["jschl_answer"] = answer

# Requests transforms any request into a GET after a redirect,
# so the redirect has to be handled manually here to allow for
# performing other types of requests even as the first request.
method = resp.request.method
cloudflare_kwargs["allow_redirects"] = False

end_time = time.time()
time.sleep(self.delay - (end_time - start_time)) # Cloudflare requires a delay before solving the challenge
# Cloudflare requires a delay before solving the challenge
time.sleep(max(delay - (time.time() - start_time), 0))

# Send the challenge response and handle the redirect manually
redirect = self.request(method, submit_url, **cloudflare_kwargs)

redirect_location = urlparse(redirect.headers["Location"])

if not redirect_location.netloc:
redirect_url = "%s://%s%s" % (parsed_url.scheme, domain, redirect_location.path)
redirect_url = urlunparse(
(
parsed_url.scheme,
domain,
redirect_location.path,
redirect_location.params,
redirect_location.query,
redirect_location.fragment,
)
)
return self.request(method, redirect_url, **original_kwargs)
return self.request(method, redirect.headers["Location"], **original_kwargs)

def solve_challenge(self, body, domain):
try:
js = re.search(r"setTimeout\(function\(\){\s+(var "
"s,t,o,p,b,r,e,a,k,i,n,g,f.+?\r?\n[\s\S]+?a\.value =.+?)\r?\n", body).group(1)
challenge, ms = re.search(
r"setTimeout\(function\(\){\s*(var "
r"s,t,o,p,b,r,e,a,k,i,n,g,f.+?\r?\n[\s\S]+?a\.value\s*=.+?)\r?\n"
r"(?:[^{<>]*},\s*(\d{4,}))?",
body,
).groups()

# The challenge requires `document.getElementById` to get this content.
# Future proofing would require escaping newlines and double quotes
innerHTML = re.search(r"<div(?: [^<>]*)? id=\"cf-dn.*?\">([^<>]*)", body)
innerHTML = innerHTML.group(1) if innerHTML else ""

# Prefix the challenge with a fake document object.
# Interpolate the domain, div contents, and JS challenge.
# The `a.value` to be returned is tacked onto the end.
challenge = """
var document = {
createElement: function () {
return { firstChild: { href: "http://%s/" } }
},
getElementById: function () {
return {"innerHTML": "%s"};
}
};

%s; a.value
""" % (
domain,
innerHTML,
challenge,
)
# Encode the challenge for security while preserving quotes and spacing.
challenge = b64encode(challenge.encode("utf-8")).decode("ascii")
# Use the provided delay, parsed delay, or default to 8 secs
delay = self.delay or (float(ms) / float(1000) if ms else 8)
except Exception:
raise ValueError("Unable to identify Cloudflare IUAM Javascript on website. %s" % BUG_REPORT)

js = re.sub(r"a\.value = (.+ \+ t\.length(\).toFixed\(10\))?).+", r"\1", js)
js = re.sub(r"\s{3,}[a-z](?: = |\.).+", "", js).replace("t.length", str(len(domain)))

# Strip characters that could be used to exit the string context
# These characters are not currently used in Cloudflare's arithmetic snippet
js = re.sub(r"[\n\\']", "", js)

if "toFixed" not in js:
raise ValueError("Error parsing Cloudflare IUAM Javascript challenge. %s" % BUG_REPORT)
raise ValueError(
"Unable to identify Cloudflare IUAM Javascript on website. %s"
% BUG_REPORT
)

# Use vm.runInNewContext to safely evaluate code
# The sandboxed code cannot use the Node.js standard library
js = "console.log(require('vm').runInNewContext('%s', Object.create(null), {timeout: 5000}));" % js
js = (
"""\
var atob = Object.setPrototypeOf(function (str) {\
try {\
return Buffer.from("" + str, "base64").toString("binary");\
} catch (e) {}\
}, null);\
var challenge = atob("%s");\
var context = Object.setPrototypeOf({ atob: atob }, null);\
var options = {\
filename: "iuam-challenge.js",\
contextOrigin: "cloudflare:iuam-challenge.js",\
contextCodeGeneration: { strings: true, wasm: false },\
timeout: 5000\
};\
process.stdout.write(String(\
require("vm").runInNewContext(challenge, context, options)\
));\
"""
% challenge
)

try:
result = subprocess.check_output(["node", "-e", js]).strip()
result = subprocess.check_output(
["node", "-e", js], stdin=subprocess.PIPE, stderr=subprocess.PIPE
)
except OSError as e:
if e.errno == 2:
raise EnvironmentError("Missing Node.js runtime. Node is required and must be in the PATH (check with `node -v`). Your Node binary may be called `nodejs` rather than `node`, in which case you may need to run `apt-get install nodejs-legacy` on some Debian-based systems. (Please read the cfscrape"
" README's Dependencies section: https://github.com/Anorov/cloudflare-scrape#dependencies.")
raise EnvironmentError(
"Missing Node.js runtime. Node is required and must be in the PATH (check with `node -v`). Your Node binary may be called `nodejs` rather than `node`, in which case you may need to run `apt-get install nodejs-legacy` on some Debian-based systems. (Please read the cfscrape"
" README's Dependencies section: https://github.com/Anorov/cloudflare-scrape#dependencies."
)
raise
except Exception:
logging.error("Error executing Cloudflare IUAM Javascript. %s" % BUG_REPORT)
Expand All @@ -148,9 +266,11 @@ def solve_challenge(self, body, domain):
try:
float(result)
except Exception:
raise ValueError("Cloudflare IUAM challenge returned unexpected answer. %s" % BUG_REPORT)
raise ValueError(
"Cloudflare IUAM challenge returned unexpected answer. %s" % BUG_REPORT
)

return result
return result, delay

@classmethod
def create_scraper(cls, sess=None, **kwargs):
Expand All @@ -160,15 +280,23 @@ def create_scraper(cls, sess=None, **kwargs):
scraper = cls(**kwargs)

if sess:
attrs = ["auth", "cert", "cookies", "headers", "hooks", "params", "proxies", "data"]
attrs = [
"auth",
"cert",
"cookies",
"headers",
"hooks",
"params",
"proxies",
"data",
]
for attr in attrs:
val = getattr(sess, attr, None)
if val:
setattr(scraper, attr, val)

return scraper


## Functions for integrating cloudflare-scrape with other applications and scripts

@classmethod
Expand All @@ -180,7 +308,7 @@ def get_tokens(cls, url, user_agent=None, **kwargs):
try:
resp = scraper.get(url, **kwargs)
resp.raise_for_status()
except Exception as e:
except Exception:
logging.error("'%s' returned an error. Could not collect tokens." % url)
raise

Expand All @@ -192,14 +320,19 @@ def get_tokens(cls, url, user_agent=None, **kwargs):
cookie_domain = d
break
else:
raise ValueError("Unable to find Cloudflare cookies. Does the site actually have Cloudflare IUAM (\"I'm Under Attack Mode\") enabled?")
raise ValueError(
'Unable to find Cloudflare cookies. Does the site actually have Cloudflare IUAM ("I\'m Under Attack Mode") enabled?'
)

return ({
"__cfduid": scraper.cookies.get("__cfduid", "", domain=cookie_domain),
"cf_clearance": scraper.cookies.get("cf_clearance", "", domain=cookie_domain)
},
scraper.headers["User-Agent"]
)
return (
{
"__cfduid": scraper.cookies.get("__cfduid", "", domain=cookie_domain),
"cf_clearance": scraper.cookies.get(
"cf_clearance", "", domain=cookie_domain
),
},
scraper.headers["User-Agent"],
)

@classmethod
def get_cookie_string(cls, url, user_agent=None, **kwargs):
Expand All @@ -209,6 +342,7 @@ def get_cookie_string(cls, url, user_agent=None, **kwargs):
tokens, user_agent = cls.get_tokens(url, user_agent=user_agent, **kwargs)
return "; ".join("=".join(pair) for pair in tokens.items()), user_agent


# Module-level convenience aliases for the classmethod entry points.
# (The scraped diff showed a duplicated get_cookie_string line — residue of
# a trailing-newline fix — which is collapsed to a single assignment here.)
create_scraper = CloudflareScraper.create_scraper
get_tokens = CloudflareScraper.get_tokens
get_cookie_string = CloudflareScraper.get_cookie_string
Loading