From be6b4f555c9ba6f01584de3d7de99d8f79ab226a Mon Sep 17 00:00:00 2001 From: Valeriia Klestova Date: Wed, 6 Mar 2024 19:22:22 +0100 Subject: [PATCH 1/9] add env var for namespace and deployment name to k8s yaml (#22) --- kubernetes.yaml | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/kubernetes.yaml b/kubernetes.yaml index 1e49254..92bf89a 100644 --- a/kubernetes.yaml +++ b/kubernetes.yaml @@ -22,6 +22,15 @@ spec: - containerPort: 6800 name: http protocol: TCP + env: + - name: MY_POD_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: MY_DEPLOYMENT_NAME + valueFrom: + fieldRef: + fieldPath: metadata.labels['app.kubernetes.io/name'] readinessProbe: failureThreshold: 3 httpGet: From 48f15c77fa0dbffae8ff63b6b724701fc06b1825 Mon Sep 17 00:00:00 2001 From: Valeriia Klestova Date: Wed, 6 Mar 2024 19:22:59 +0100 Subject: [PATCH 2/9] add middleware to enrich responses with node_name (#13) --- scrapyd_k8s/api.py | 12 +++++++++++- scrapyd_k8s/launcher/docker.py | 5 +++++ scrapyd_k8s/launcher/k8s.py | 5 +++++ 3 files changed, 21 insertions(+), 1 deletion(-) diff --git a/scrapyd_k8s/api.py b/scrapyd_k8s/api.py index 6f8e576..8ca2568 100644 --- a/scrapyd_k8s/api.py +++ b/scrapyd_k8s/api.py @@ -1,7 +1,8 @@ #!/usr/bin/env python3 +import json import uuid -from flask import Flask, request +from flask import Flask, request, Response, jsonify from flask_basicauth import BasicAuth from markupsafe import escape from natsort import natsort_keygen, ns @@ -14,6 +15,15 @@ launcher = (config.launcher_cls())(config) scrapyd_config = config.scrapyd() +# middleware that adds "node_name" to each response if it is a JSON +@app.after_request +def after_request(response: Response): + if response.is_json: + data = response.json + data["node_name"] = config.scrapyd().get("node_name", launcher.get_node_name()) + response.data = jsonify(data).data + return response + @app.get("/") def home(): return "

scrapyd-k8s

" diff --git a/scrapyd_k8s/launcher/docker.py b/scrapyd_k8s/launcher/docker.py index 15e22d1..13166c6 100644 --- a/scrapyd_k8s/launcher/docker.py +++ b/scrapyd_k8s/launcher/docker.py @@ -1,4 +1,6 @@ import re +import os + import docker from ..utils import native_stringify_dict @@ -18,6 +20,9 @@ class Docker: def __init__(self, config): self._docker = docker.from_env() + def get_node_name(self): + return os.getenv('HOSTNAME') + def listjobs(self, project_id=None): label = self.LABEL_PROJECT + ('=%s'%(project_id) if project_id else '') jobs = self._docker.containers.list(all=True, filters={ 'label': label }) diff --git a/scrapyd_k8s/launcher/k8s.py b/scrapyd_k8s/launcher/k8s.py index baad26f..2eb40d2 100644 --- a/scrapyd_k8s/launcher/k8s.py +++ b/scrapyd_k8s/launcher/k8s.py @@ -1,3 +1,5 @@ +import os + import kubernetes import kubernetes.stream from signal import Signals @@ -23,6 +25,9 @@ def __init__(self, config): self._k8s = kubernetes.client.CoreV1Api() self._k8s_batch = kubernetes.client.BatchV1Api() + def get_node_name(self): + return "%s.%s" % (os.getenv('MY_DEPLOYMENT_NAME'), os.getenv('MY_POD_NAMESPACE')) + def listjobs(self, project=None): label = self.LABEL_PROJECT + ('=%s'%(project) if project else '') jobs = self._k8s_batch.list_namespaced_job(namespace=self._namespace, label_selector=label) From acdb7ddc21fd06512a862cd80c649545eabb060c Mon Sep 17 00:00:00 2001 From: Valeriia Klestova Date: Wed, 6 Mar 2024 19:36:36 +0100 Subject: [PATCH 3/9] delete unused import --- scrapyd_k8s/api.py | 1 - 1 file changed, 1 deletion(-) diff --git a/scrapyd_k8s/api.py b/scrapyd_k8s/api.py index 8ca2568..0676b71 100644 --- a/scrapyd_k8s/api.py +++ b/scrapyd_k8s/api.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 -import json import uuid from flask import Flask, request, Response, jsonify From dc0a8d9fab2440b9ec8bd6d66b9e1a95476222b6 Mon Sep 17 00:00:00 2001 From: Valeriia Klestova Date: Wed, 6 Mar 2024 19:54:06 +0100 Subject: [PATCH 4/9] adjust tests --- test_api.py | 43 +++++++++++++++++++++++++++++++++++-------- 1 file changed, 35 insertions(+), 8 deletions(-) diff --git a/test_api.py b/test_api.py index 8da1c4d..97d8ef3 100644 --- a/test_api.py +++ b/test_api.py @@ -38,13 +38,20 @@ def test_daemonstatus_ok(): def test_listprojects_ok(): response = requests.get(BASE_URL + '/listprojects.json') assert_response_ok(response) - assert response.json() == { 'status': 'ok', 'projects': AVAIL_PROJECTS } + + json = response.json() + assert json['projects'] == AVAIL_PROJECTS + assert 'node_name' in json def test_listversions_ok(): response = requests.get(BASE_URL + '/listversions.json?project=' + RUN_PROJECT) assert_response_ok(response) - assert response.json() == { 'status': 'ok', 'versions': AVAIL_VERSIONS } + + json = response.json() + assert json['versions'] == AVAIL_VERSIONS + assert 'node_name' in json + def test_listversions_project_missing(): response = requests.get(BASE_URL + '/listversions.json') @@ -58,21 +65,31 @@ def test_listversions_project_not_found(): def test_listspiders_ok(): response = requests.get(BASE_URL + '/listspiders.json?project=' + RUN_PROJECT + '&_version=' + RUN_VERSION) assert_response_ok(response) - assert response.json() == { 'status': 'ok', 'spiders': AVAIL_SPIDERS } + + json = response.json() + assert json['spiders'] == AVAIL_SPIDERS + assert 'node_name' in json + def test_listspiders_ok_without_version(): response = requests.get(BASE_URL + '/listspiders.json?project=' + RUN_PROJECT) assert_response_ok(response) - assert response.json() == { 'status': 'ok', 'spiders': AVAIL_SPIDERS } + + json = response.json() + assert json['spiders'] == AVAIL_SPIDERS + assert 'node_name' in json + def test_listspiders_project_missing(): response = requests.get(BASE_URL + '/listspiders.json') assert_response_error(response, 400) + def test_listspiders_project_not_found(): response = requests.get(BASE_URL + '/listspiders.json?project=' + 'nonexistant' + '&_version=' + RUN_VERSION) assert_response_error(response, 404) + def test_listspiders_version_not_found(): response = requests.get(BASE_URL + '/listspiders.json?project=' + RUN_PROJECT + '&_version=' + 'nonexistant') assert_response_error(response, 404) @@ -82,10 +99,12 @@ def test_schedule_project_missing(): response = requests.post(BASE_URL + '/schedule.json', data={}) assert_response_error(response, 400) + def test_schedule_project_not_found(): response = requests.post(BASE_URL + '/schedule.json', data={ 'project': 'nonexistant' }) assert_response_error(response, 400) + def test_schedule_spider_missing(): response = requests.post(BASE_URL + '/schedule.json', data={ 'project': RUN_PROJECT }) assert_response_error(response, 400) @@ -98,8 +117,8 @@ def test_cancel_project_missing(): response = requests.post(BASE_URL + '/cancel.json', data={}) assert_response_error(response, 400) -# we don't test cancelling a spider from a project not in the config file +# we don't test cancelling a spider from a project not in the config file def test_cancel_jobid_missing(): response = requests.post(BASE_URL + '/cancel.json', data={ 'project': RUN_PROJECT }) assert_response_error(response, 400) @@ -113,14 +132,15 @@ def test_scenario_regular_ok(): 'setting': 'STATIC_SLEEP=%d' % STATIC_SLEEP }) + def test_scenario_regular_ok_without_version(): scenario_regular({ 'project': RUN_PROJECT, 'spider': RUN_SPIDER, 'setting': 'STATIC_SLEEP=%d' % STATIC_SLEEP }) -# TODO test_scenario_cancel_scheduled_ok (needs a way to make sure a job is not running yet) +# TODO test_scenario_cancel_scheduled_ok (needs a way to make sure a job is not running yet) def test_scenario_cancel_running_finished_ok(): assert_listjobs() # schedule a new job and wait until it is running @@ -138,7 +158,11 @@ def test_scenario_cancel_running_finished_ok(): 'project': RUN_PROJECT, 'job': jobid, 'signal': 'KILL' }) assert_response_ok(response) - assert response.json() == { 'status': 'ok', 'prevstate': 'running' } + + json = response.json() + assert json['prevstate'] == 'running' + assert 'node_name' in json + # wait until the job has stopped listjobs_wait(jobid, 'finished') jobinfo = assert_listjobs(finished=jobid) @@ -146,7 +170,10 @@ def test_scenario_cancel_running_finished_ok(): # then cancel it again, though nothing would happen response = requests.post(BASE_URL + '/cancel.json', data={ 'project': RUN_PROJECT, 'job': jobid }) assert_response_ok(response) - assert response.json() == { 'status': 'ok', 'prevstate': 'finished' } + + json = response.json() + assert json['prevstate'] == 'finished' + assert 'node_name' in json def scenario_regular(schedule_args): From dff86a62e5d0faf5e4c005ae64bc208d0a526cc6 Mon Sep 17 00:00:00 2001 From: Valeriia Klestova Date: Tue, 26 Mar 2024 23:09:51 +0100 Subject: [PATCH 5/9] add default as a fallback for k8s; delete extra empty lines --- scrapyd_k8s/api.py | 20 +++++++++++--------- scrapyd_k8s/launcher/docker.py | 6 +++++- scrapyd_k8s/launcher/k8s.py | 4 ++++ test_api.py | 17 ----------------- 4 files changed, 20 insertions(+), 27 deletions(-) diff --git a/scrapyd_k8s/api.py b/scrapyd_k8s/api.py index 0676b71..780e535 100644 --- a/scrapyd_k8s/api.py +++ b/scrapyd_k8s/api.py @@ -14,14 +14,6 @@ launcher = (config.launcher_cls())(config) scrapyd_config = config.scrapyd() -# middleware that adds "node_name" to each response if it is a JSON -@app.after_request -def after_request(response: Response): - if response.is_json: - data = response.json - data["node_name"] = config.scrapyd().get("node_name", launcher.get_node_name()) - response.data = jsonify(data).data - return response @app.get("/") def home(): @@ -133,4 +125,14 @@ def run(): enable_authentication(app, config_username, config_password) # run server - app.run(host=host, port=port) \ No newline at end of file + app.run(host=host, port=port) + + +# middleware that adds "node_name" to each response if it is a JSON +@app.after_request +def after_request(response: Response): + if response.is_json: + data = response.json + data["node_name"] = config.scrapyd().get("node_name", launcher.get_node_name()) + response.data = jsonify(data).data + return response diff --git a/scrapyd_k8s/launcher/docker.py b/scrapyd_k8s/launcher/docker.py index 13166c6..7b458a3 100644 --- a/scrapyd_k8s/launcher/docker.py +++ b/scrapyd_k8s/launcher/docker.py @@ -21,7 +21,11 @@ def __init__(self, config): self._docker = docker.from_env() def get_node_name(self): - return os.getenv('HOSTNAME') + hostname = os.getenv('HOSTNAME') + if hostname in [None, "", "null"]: + hostname = os.popen("hostname").read().strip() + return hostname + def listjobs(self, project_id=None): label = self.LABEL_PROJECT + ('=%s'%(project_id) if project_id else '') diff --git a/scrapyd_k8s/launcher/k8s.py b/scrapyd_k8s/launcher/k8s.py index 2eb40d2..ed2a8b3 100644 --- a/scrapyd_k8s/launcher/k8s.py +++ b/scrapyd_k8s/launcher/k8s.py @@ -26,6 +26,10 @@ def __init__(self, config): self._k8s_batch = kubernetes.client.BatchV1Api() def get_node_name(self): + deployment_name = os.getenv('MY_DEPLOYMENT_NAME') + pod_name = os.getenv('MY_POD_NAMESPACE') + if not deployment_name and not pod_name: + return "default" return "%s.%s" % (os.getenv('MY_DEPLOYMENT_NAME'), os.getenv('MY_POD_NAMESPACE')) def listjobs(self, project=None): diff --git a/test_api.py b/test_api.py index 97d8ef3..3dc4c89 100644 --- a/test_api.py +++ b/test_api.py @@ -15,7 +15,6 @@ MAX_WAIT = int(os.getenv('TEST_MAX_WAIT', '6')) STATIC_SLEEP = float(os.getenv('TEST_STATIC_SLEEP', '2')) - def test_root_ok(): response = requests.get(BASE_URL) assert response.status_code == 200 @@ -23,12 +22,10 @@ def test_root_ok(): assert 'scrapyd-k8s' in response.text assert '' in response.text - def test_healthz_ok(): response = requests.get(BASE_URL + '/healthz') assert response.status_code == 200 - def test_daemonstatus_ok(): response = requests.get(BASE_URL + '/daemonstatus.json') assert_response_ok(response) @@ -52,7 +49,6 @@ def test_listversions_ok(): assert json['versions'] == AVAIL_VERSIONS assert 'node_name' in json - def test_listversions_project_missing(): response = requests.get(BASE_URL + '/listversions.json') assert_response_error(response, 400) @@ -79,32 +75,26 @@ def test_listspiders_ok_without_version(): assert json['spiders'] == AVAIL_SPIDERS assert 'node_name' in json - def test_listspiders_project_missing(): response = requests.get(BASE_URL + '/listspiders.json') assert_response_error(response, 400) - def test_listspiders_project_not_found(): response = requests.get(BASE_URL + '/listspiders.json?project=' + 'nonexistant' + '&_version=' + RUN_VERSION) assert_response_error(response, 404) - def test_listspiders_version_not_found(): response = requests.get(BASE_URL + '/listspiders.json?project=' + RUN_PROJECT + '&_version=' + 'nonexistant') assert_response_error(response, 404) - def test_schedule_project_missing(): response = requests.post(BASE_URL + '/schedule.json', data={}) assert_response_error(response, 400) - def test_schedule_project_not_found(): response = requests.post(BASE_URL + '/schedule.json', data={ 'project': 'nonexistant' }) assert_response_error(response, 400) - def test_schedule_spider_missing(): response = requests.post(BASE_URL + '/schedule.json', data={ 'project': RUN_PROJECT }) assert_response_error(response, 400) @@ -112,34 +102,28 @@ def test_schedule_spider_missing(): # scheduling a non-existing spider will try to start it, so no error # scheduling a non-existing project version will try to start it, so no error - def test_cancel_project_missing(): response = requests.post(BASE_URL + '/cancel.json', data={}) assert_response_error(response, 400) - # we don't test cancelling a spider from a project not in the config file def test_cancel_jobid_missing(): response = requests.post(BASE_URL + '/cancel.json', data={ 'project': RUN_PROJECT }) assert_response_error(response, 400) # TODO test cancel with invalid signal (currently returns server error, could be improved) - - def test_scenario_regular_ok(): scenario_regular({ 'project': RUN_PROJECT, '_version': RUN_VERSION, 'spider': RUN_SPIDER, 'setting': 'STATIC_SLEEP=%d' % STATIC_SLEEP }) - def test_scenario_regular_ok_without_version(): scenario_regular({ 'project': RUN_PROJECT, 'spider': RUN_SPIDER, 'setting': 'STATIC_SLEEP=%d' % STATIC_SLEEP }) - # TODO test_scenario_cancel_scheduled_ok (needs a way to make sure a job is not running yet) def test_scenario_cancel_running_finished_ok(): assert_listjobs() @@ -175,7 +159,6 @@ def test_scenario_cancel_running_finished_ok(): assert json['prevstate'] == 'finished' assert 'node_name' in json - def scenario_regular(schedule_args): assert_listjobs() # schedule a job From c5f3acb7e6dd2be87ac4c9e17a55f46e95e654e2 Mon Sep 17 00:00:00 2001 From: Valeriia Klestova Date: Tue, 9 Apr 2024 18:14:52 +0200 Subject: [PATCH 6/9] refactor code according to PR comments: relocate middleware in the file, rewrite function to retrieve deployment name and pod namespace, use socket.gethostname instead of sys popopen --- scrapyd_k8s/api.py | 19 +++++++++---------- scrapyd_k8s/launcher/docker.py | 3 ++- scrapyd_k8s/launcher/k8s.py | 8 +++----- 3 files changed, 14 insertions(+), 16 deletions(-) diff --git a/scrapyd_k8s/api.py b/scrapyd_k8s/api.py index 780e535..e0348e6 100644 --- a/scrapyd_k8s/api.py +++ b/scrapyd_k8s/api.py @@ -103,6 +103,15 @@ def api_listjobs(): # TODO perhaps remove state from jobs return { 'status': 'ok', 'pending': pending, 'running': running, 'finished': finished } +# middleware that adds "node_name" to each response if it is a JSON +@app.after_request +def after_request(response: Response): + if response.is_json: + data = response.json + data["node_name"] = config.scrapyd().get("node_name", launcher.get_node_name()) + response.data = jsonify(data).data + return response + def error(msg, status=200): return { 'status': 'error', 'message': msg }, status @@ -126,13 +135,3 @@ def run(): # run server app.run(host=host, port=port) - - -# middleware that adds "node_name" to each response if it is a JSON -@app.after_request -def after_request(response: Response): - if response.is_json: - data = response.json - data["node_name"] = config.scrapyd().get("node_name", launcher.get_node_name()) - response.data = jsonify(data).data - return response diff --git a/scrapyd_k8s/launcher/docker.py b/scrapyd_k8s/launcher/docker.py index 7b458a3..36433a4 100644 --- a/scrapyd_k8s/launcher/docker.py +++ b/scrapyd_k8s/launcher/docker.py @@ -1,5 +1,6 @@ import re import os +import socket import docker from ..utils import native_stringify_dict @@ -23,7 +24,7 @@ def __init__(self, config): def get_node_name(self): hostname = os.getenv('HOSTNAME') if hostname in [None, "", "null"]: - hostname = os.popen("hostname").read().strip() + hostname = socket.gethostname() return hostname diff --git a/scrapyd_k8s/launcher/k8s.py b/scrapyd_k8s/launcher/k8s.py index ed2a8b3..3c969b7 100644 --- a/scrapyd_k8s/launcher/k8s.py +++ b/scrapyd_k8s/launcher/k8s.py @@ -26,11 +26,9 @@ def __init__(self, config): self._k8s_batch = kubernetes.client.BatchV1Api() def get_node_name(self): - deployment_name = os.getenv('MY_DEPLOYMENT_NAME') - pod_name = os.getenv('MY_POD_NAMESPACE') - if not deployment_name and not pod_name: - return "default" - return "%s.%s" % (os.getenv('MY_DEPLOYMENT_NAME'), os.getenv('MY_POD_NAMESPACE')) + deployment = os.getenv('MY_DEPLOYMENT_NAME', 'default') + namespace = os.getenv('MY_NAMESPACE') + return ".".join([n for n in [namespace, deployment] if n]) def listjobs(self, project=None): label = self.LABEL_PROJECT + ('=%s'%(project) if project else '') From 405f1e7771870d21a3ddeca34442a3f946cc443b Mon Sep 17 00:00:00 2001 From: Valeriia Klestova Date: Wed, 10 Apr 2024 10:02:43 +0200 Subject: [PATCH 7/9] use gethostname; delete if statement to check if hostname is null --- scrapyd_k8s/launcher/docker.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/scrapyd_k8s/launcher/docker.py b/scrapyd_k8s/launcher/docker.py index 36433a4..8e958a9 100644 --- a/scrapyd_k8s/launcher/docker.py +++ b/scrapyd_k8s/launcher/docker.py @@ -22,9 +22,7 @@ def __init__(self, config): self._docker = docker.from_env() def get_node_name(self): - hostname = os.getenv('HOSTNAME') - if hostname in [None, "", "null"]: - hostname = socket.gethostname() + hostname = socket.gethostname() return hostname From b84696e05093ff3ce33ebf45e9d13f085c11e56f Mon Sep 17 00:00:00 2001 From: Valeriia Klestova Date: Wed, 10 Apr 2024 10:52:41 +0200 Subject: [PATCH 8/9] delete whitespace --- scrapyd_k8s/launcher/docker.py | 1 - test_api.py | 5 +---- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/scrapyd_k8s/launcher/docker.py b/scrapyd_k8s/launcher/docker.py index 8e958a9..a21cd5b 100644 --- a/scrapyd_k8s/launcher/docker.py +++ b/scrapyd_k8s/launcher/docker.py @@ -25,7 +25,6 @@ def get_node_name(self): hostname = socket.gethostname() return hostname - def listjobs(self, project_id=None): label = self.LABEL_PROJECT + ('=%s'%(project_id) if project_id else '') jobs = self._docker.containers.list(all=True, filters={ 'label': label }) diff --git a/test_api.py b/test_api.py index 3dc4c89..9ea514a 100644 --- a/test_api.py +++ b/test_api.py @@ -31,7 +31,6 @@ def test_daemonstatus_ok(): assert_response_ok(response) # TODO assert response.json() == { 'status': 'ok', ... } - def test_listprojects_ok(): response = requests.get(BASE_URL + '/listprojects.json') assert_response_ok(response) @@ -40,7 +39,6 @@ def test_listprojects_ok(): assert json['projects'] == AVAIL_PROJECTS assert 'node_name' in json - def test_listversions_ok(): response = requests.get(BASE_URL + '/listversions.json?project=' + RUN_PROJECT) assert_response_ok(response) @@ -57,7 +55,6 @@ def test_listversions_project_not_found(): response = requests.get(BASE_URL + '/listversions.json?project=' + 'nonexistant') assert_response_error(response, 404) - def test_listspiders_ok(): response = requests.get(BASE_URL + '/listspiders.json?project=' + RUN_PROJECT + '&_version=' + RUN_VERSION) assert_response_ok(response) @@ -66,7 +63,6 @@ def test_listspiders_ok(): assert json['spiders'] == AVAIL_SPIDERS assert 'node_name' in json - def test_listspiders_ok_without_version(): response = requests.get(BASE_URL + '/listspiders.json?project=' + RUN_PROJECT) assert_response_ok(response) @@ -107,6 +103,7 @@ def test_cancel_project_missing(): assert_response_error(response, 400) # we don't test cancelling a spider from a project not in the config file + def test_cancel_jobid_missing(): response = requests.post(BASE_URL + '/cancel.json', data={ 'project': RUN_PROJECT }) assert_response_error(response, 400) From d7f348041174cd1c4692d84702710f6d83baa454 Mon Sep 17 00:00:00 2001 From: Valeriia Klestova Date: Wed, 10 Apr 2024 10:54:00 +0200 Subject: [PATCH 9/9] return gethostname directly --- scrapyd_k8s/launcher/docker.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/scrapyd_k8s/launcher/docker.py b/scrapyd_k8s/launcher/docker.py index a21cd5b..65a7375 100644 --- a/scrapyd_k8s/launcher/docker.py +++ b/scrapyd_k8s/launcher/docker.py @@ -22,8 +22,7 @@ def __init__(self, config): self._docker = docker.from_env() def get_node_name(self): - hostname = socket.gethostname() - return hostname + return socket.gethostname() def listjobs(self, project_id=None): label = self.LABEL_PROJECT + ('=%s'%(project_id) if project_id else '')