diff --git a/Dockerfile b/Dockerfile
index f30c61c0..ac7333d5 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,4 +1,4 @@
-# VERSION 1.7.1.3-7
+# VERSION 1.8.0
 # AUTHOR: Matthieu "Puckel_" Roisil
 # DESCRIPTION: Basic Airflow container
 # BUILD: docker build --rm -t puckel/docker-airflow .
@@ -12,8 +12,8 @@ ENV DEBIAN_FRONTEND noninteractive
 ENV TERM linux
 
 # Airflow
-ARG AIRFLOW_VERSION=1.7.1.3
-ENV AIRFLOW_HOME /usr/local/airflow
+ARG AIRFLOW_VERSION=1.8.0
+ARG AIRFLOW_HOME=/usr/local/airflow
 
 # Define en_US.
 ENV LANGUAGE en_US.UTF-8
@@ -21,7 +21,7 @@ ENV LANG en_US.UTF-8
 ENV LC_ALL en_US.UTF-8
 ENV LC_CTYPE en_US.UTF-8
 ENV LC_MESSAGES en_US.UTF-8
-ENV LC_ALL en_US.UTF-8
+ENV LC_ALL en_US.UTF-8
 
 RUN set -ex \
     && buildDeps=' \
@@ -34,24 +34,24 @@ RUN set -ex \
         libblas-dev \
         liblapack-dev \
         libpq-dev \
+        git \
     ' \
-    && echo "deb http://http.debian.net/debian jessie-backports main" >/etc/apt/sources.list.d/backports.list \
     && apt-get update -yqq \
     && apt-get install -yqq --no-install-recommends \
         $buildDeps \
         python-pip \
+        python-requests \
        apt-utils \
        curl \
        netcat \
        locales \
-    && apt-get install -yqq -t jessie-backports python-requests \
    && sed -i 's/^# en_US.UTF-8 UTF-8$/en_US.UTF-8 UTF-8/g' /etc/locale.gen \
    && locale-gen \
    && update-locale LANG=en_US.UTF-8 LC_ALL=en_US.UTF-8 \
    && useradd -ms /bin/bash -d ${AIRFLOW_HOME} airflow \
    && python -m pip install -U pip \
    && pip install Cython \
-    && pip install pytz==2015.7 \
+    && pip install pytz \
    && pip install pyOpenSSL \
    && pip install ndg-httpsclient \
    && pip install pyasn1 \
diff --git a/README.md b/README.md
index 0a33efe6..ee0dc3e1 100644
--- a/README.md
+++ b/README.md
@@ -63,7 +63,7 @@ Check [Airflow Documentation](https://pythonhosted.org/airflow/)
 
 ## Install custom python package
 
-- Create a file "requirements.txt" with the dedired python modules
+- Create a file "requirements.txt" with the desired python modules
 - Mount this file as a volume `-v $(pwd)/requirements.txt:/requirements.txt`
 - The entrypoint.sh script execute the pip install command (with --user option)
 
@@ -82,10 +82,6 @@ Easy scaling using docker-compose:
 
 This can be used to scale to a multi node setup using docker swarm.
 
-## Links
-
-- Airflow on Kubernetes [kube-airflow](https://github.com/mumoshu/kube-airflow)
-
 # Wanna help?
 
 Fork, improve and PR. ;-)
diff --git a/circle.yml b/circle.yml
index 227db635..282f47e8 100644
--- a/circle.yml
+++ b/circle.yml
@@ -12,4 +12,4 @@ test:
   pre:
     - sleep 5
   override:
-    - docker run puckel/docker-airflow version |grep '1.7.1.3'
+    - docker run puckel/docker-airflow version |grep '1.8.0'
diff --git a/config/airflow.cfg b/config/airflow.cfg
index 1b7dbd02..6d313f03 100644
--- a/config/airflow.cfg
+++ b/config/airflow.cfg
@@ -4,9 +4,11 @@ airflow_home = /usr/local/airflow
 
 # The folder where your airflow pipelines live, most likely a
 # subfolder in a code repository
+# This path must be absolute
 dags_folder = /usr/local/airflow/dags
 
-# The folder where airflow should store its log files. This location
+# The folder where airflow should store its log files
+# This path must be absolute
 base_log_folder = /usr/local/airflow/logs
 
 # Airflow can store logs remotely in AWS S3 or Google Cloud Storage. Users
@@ -17,8 +19,8 @@ remote_base_log_folder =
 remote_log_conn_id =
 # Use server-side encryption for logs stored in S3
 encrypt_s3_logs = False
-# deprecated option for remote log storage, use remote_base_log_folder instead!
-# s3_log_folder =
+# DEPRECATED option for remote log storage, use remote_base_log_folder instead!
+s3_log_folder =
 
 # The executor class that airflow should use. Choices include
 # SequentialExecutor, LocalExecutor, CeleryExecutor
@@ -73,10 +75,39 @@ donot_pickle = False
 # How long before timing out a python file import while filling the DagBag
 dagbag_import_timeout = 30
 
+# The class to use for running task instances in a subprocess
+task_runner = BashTaskRunner
+
+# If set, tasks without a `run_as_user` argument will be run with this user
+# Can be used to de-elevate a sudo user running Airflow when executing tasks
+default_impersonation =
+
+# What security module to use (for example kerberos):
+security =
+
+# Turn unit test mode on (overwrites many configuration options with test
+# values at runtime)
+unit_test_mode = False
+
+[cli]
+# In what way should the cli access the API. The LocalClient will use the
+# database directly, while the json_client will use the api running on the
+# webserver
+api_client = airflow.api.client.local_client
+endpoint_url = http://localhost:8080
+
+[api]
+# How to authenticate users of the API
+auth_backend = airflow.api.auth.backend.default
+
 [operators]
 # The default owner assigned to each new operator, unless
 # provided explicitly or passed via `default_args`
 default_owner = Airflow
+default_cpus = 1
+default_ram = 512
+default_disk = 512
+default_gpus = 0
 
 [webserver]
 # The base url of your website as airflow cannot guess what domain or
@@ -90,9 +121,22 @@ web_server_host = 0.0.0.0
 # The port on which to run the web server
 web_server_port = 8080
 
-# The time the gunicorn webserver waits before timing out on a worker
+# Paths to the SSL certificate and key for the web server. When both are
+# provided SSL will be enabled. This does not change the web server port.
+web_server_ssl_cert =
+web_server_ssl_key =
+
+# Number of seconds the gunicorn webserver waits before timing out on a worker
 web_server_worker_timeout = 120
 
+# Number of workers to refresh at a time. When set to 0, worker refresh is
+# disabled. When nonzero, airflow periodically refreshes webserver workers by
+# bringing up new ones and killing old ones.
+worker_refresh_batch_size = 1
+
+# Number of seconds to wait before refreshing a batch of workers.
+worker_refresh_interval = 30
+
 # Secret key used to run your flask app
 secret_key = temporary_key
 
@@ -103,30 +147,58 @@ workers = 4
 # sync (default), eventlet, gevent
 worker_class = sync
 
+# Log files for the gunicorn webserver. '-' means log to stderr.
+access_logfile = -
+error_logfile = -
+
 # Expose the configuration file in the web server
-expose_config = true
+expose_config = True
 
 # Set to true to turn on authentication:
-# https://pythonhosted.org/airflow/security.html#web-authentication
+# http://pythonhosted.org/airflow/security.html#web-authentication
 authenticate = False
 
 # Filter the list of dags by owner name (requires authentication to be enabled)
 filter_by_owner = False
 
+# Filtering mode. Choices include user (default) and ldapgroup.
+# Ldap group filtering requires using the ldap backend
+#
+# Note that the ldap server needs the "memberOf" overlay to be set up
+# in order to user the ldapgroup mode.
+owner_mode = user
+
+# Default DAG orientation. Valid values are:
+# LR (Left->Right), TB (Top->Bottom), RL (Right->Left), BT (Bottom->Top)
+dag_orientation = LR
+
+# Puts the webserver in demonstration mode; blurs the names of Operators for
+# privacy.
+demo_mode = False
+
+# The amount of time (in secs) webserver will wait for initial handshake
+# while fetching logs from other worker machine
+log_fetch_timeout_sec = 5
+
+# By default, the webserver shows paused DAGs. Flip this to hide paused
+# DAGs by default
+hide_paused_dags_by_default = False
+
 [email]
 email_backend = airflow.utils.email.send_email_smtp
 
 [smtp]
 # If you want airflow to send emails on retries, failure, and you want to use
-# the airflow.utils.email.send_email_smtp function, you have to configure an smtp
-# server here
+# the airflow.utils.email.send_email_smtp function, you have to configure an
+# smtp server here
 smtp_host = localhost
 smtp_starttls = True
 smtp_ssl = False
-smtp_user = airflow
+# Uncomment and set the user/pass settings if you want to use SMTP AUTH
+# smtp_user = airflow
+# smtp_password = airflow
 smtp_port = 25
-smtp_password = airflow
-smtp_mail_from = airflow@airflow.local
+smtp_mail_from = airflow@airflow.com
 
 [celery]
 # This section only applies if you are using the CeleryExecutor in
@@ -154,10 +226,13 @@ worker_log_server_port = 8793
 broker_url = redis://redis:6379/1
 
 # Another key Celery setting
-celery_result_backend = redis://redis:6379/1
+celery_result_backend = db+postgresql://airflow:airflow@postgres/airflow
 
 # Celery Flower is a sweet UI for Celery. Airflow has a shortcut to start
-# it `airflow flower`. This defines the port that Celery Flower runs on
+# it `airflow flower`. This defines the IP that Celery Flower runs on
+flower_host = 0.0.0.0
+
+# This defines the port that Celery Flower runs on
 flower_port = 5555
 
 # Default queue that tasks get assigned to and that worker listen on.
@@ -174,17 +249,46 @@ job_heartbeat_sec = 5
 # how often the scheduler should run (in seconds).
 scheduler_heartbeat_sec = 5
 
+# after how much time should the scheduler terminate in seconds
+# -1 indicates to run continuously (see also num_runs)
+run_duration = -1
+
+# after how much time a new DAGs should be picked up from the filesystem
+min_file_process_interval = 0
+
+dag_dir_list_interval = 300
+
+# How often should stats be printed to the logs
+print_stats_interval = 30
+
+child_process_log_directory = /usr/local/airflow/logs/scheduler
+
+# Local task jobs periodically heartbeat to the DB. If the job has
+# not heartbeat in this many seconds, the scheduler will mark the
+# associated task instance as failed and will re-schedule the task.
+scheduler_zombie_task_threshold = 300
+
+# Turn off scheduler catchup by setting this to False.
+# Default behavior is unchanged and
+# Command Line Backfills still work, but the scheduler
+# will not do scheduler catchup if this is False,
+# however it can be set on a per DAG basis in the
+# DAG definition (catchup)
+catchup_by_default = True
+
 # Statsd (https://github.com/etsy/statsd) integration settings
-# statsd_on = False
-# statsd_host = localhost
-# statsd_port = 8125
-# statsd_prefix = airflow
+statsd_on = False
+statsd_host = localhost
+statsd_port = 8125
+statsd_prefix = airflow
 
 # The scheduler can run multiple threads in parallel to schedule dags.
 # This defines how many threads will run. However airflow will never
 # use more threads than the amount of cpu cores available.
 max_threads = 2
 
+authenticate = False
+
 [mesos]
 # Mesos master address which MesosExecutor will connect to.
 master = localhost:5050
@@ -221,3 +325,18 @@ authenticate = False
 # Mesos credentials, if authentication is enabled
 # default_principal = admin
 # default_secret = admin
+
+[kerberos]
+ccache = /tmp/airflow_krb5_ccache
+# gets augmented with fqdn
+principal = airflow
+reinit_frequency = 3600
+kinit_path = kinit
+keytab = airflow.keytab
+
+[github_enterprise]
+api_rev = v3
+
+[admin]
+# UI to hide sensitive variable fields when set to True
+hide_sensitive_variable_fields = True
diff --git a/docker-compose-CeleryExecutor.yml b/docker-compose-CeleryExecutor.yml
index 725f54d1..aff20970 100644
--- a/docker-compose-CeleryExecutor.yml
+++ b/docker-compose-CeleryExecutor.yml
@@ -11,23 +11,26 @@ services:
             - POSTGRES_DB=airflow
 
     webserver:
-        image: puckel/docker-airflow:1.7.1.3-7
+        image: puckel/docker-airflow:1.8.0
         restart: always
         depends_on:
             - postgres
             - redis
         environment:
-            # - LOAD_EX=n
+            - LOAD_EX=y
             - FERNET_KEY=46BKJoQYlPPOexq0OhDZnIlNepKFf87WFwLbfzqDDho=
             - EXECUTOR=Celery
+            # - POSTGRES_USER=airflow
+            # - POSTGRES_PASSWORD=airflow
+            # - POSTGRES_DB=airflow
         # volumes:
-            # - /localpath/to/dags:/usr/local/airflow/dags
+            # - ~/docker-airflow/dags:/usr/local/airflow/dags
         ports:
             - "8080:8080"
         command: webserver
 
     flower:
-        image: puckel/docker-airflow:1.7.1.3-7
+        image: puckel/docker-airflow:1.8.0
         restart: always
         depends_on:
             - redis
@@ -38,26 +41,32 @@ services:
         command: flower
 
     scheduler:
-        image: puckel/docker-airflow:1.7.1.3-7
+        image: puckel/docker-airflow:1.8.0
         restart: always
         depends_on:
             - webserver
         # volumes:
-            # - /localpath/to/dags:/usr/local/airflow/dags
+            # - ~/docker-airflow/dags:/usr/local/airflow/dags
         environment:
-            # - LOAD_EX=n
+            - LOAD_EX=y
            - FERNET_KEY=46BKJoQYlPPOexq0OhDZnIlNepKFf87WFwLbfzqDDho=
            - EXECUTOR=Celery
-        command: scheduler -n 5
+            # - POSTGRES_USER=airflow
+            # - POSTGRES_PASSWORD=airflow
+            # - POSTGRES_DB=airflow
+        command: scheduler
 
     worker:
-        image: puckel/docker-airflow:1.7.1.3-7
+        image: puckel/docker-airflow:1.8.0
        restart: always
        depends_on:
            - scheduler
        # volumes:
-            # - /localpath/to/dags:/usr/local/airflow/dags
+            # - ~/docker-airflow/dags:/usr/local/airflow/dags
        environment:
            - FERNET_KEY=46BKJoQYlPPOexq0OhDZnIlNepKFf87WFwLbfzqDDho=
            - EXECUTOR=Celery
+            # - POSTGRES_USER=airflow
+            # - POSTGRES_PASSWORD=airflow
+            # - POSTGRES_DB=airflow
        command: worker
diff --git a/docker-compose-LocalExecutor.yml b/docker-compose-LocalExecutor.yml
index fbc1dbd7..a085c065 100644
--- a/docker-compose-LocalExecutor.yml
+++ b/docker-compose-LocalExecutor.yml
@@ -8,30 +8,15 @@ services:
             - POSTGRES_DB=airflow
 
     webserver:
-        image: puckel/docker-airflow:1.7.1.3-7
+        image: puckel/docker-airflow:1.8.0
         restart: always
         depends_on:
             - postgres
         environment:
-            # - LOAD_EX=n
+            - LOAD_EX=y
             - EXECUTOR=Local
-            - FERNET_KEY=46BKJoQYlPPOexq0OhDZnIlNepKFf87WFwLbfzqDDho=
         # volumes:
-            # - /localpath/to/dags:/usr/local/airflow/dags
+            # - ~/github/docker-airflow/dags:/usr/local/airflow/dags
         ports:
             - "8080:8080"
         command: webserver
-
-    scheduler:
-        image: puckel/docker-airflow:1.7.1.3-7
-        restart: always
-        depends_on:
-            - webserver
-        # volumes:
-            # - ./requirements.txt:/requirements.txt:ro
-            # - /localpath/to/dags:/usr/local/airflow/dags
-        environment:
-            # - LOAD_EX=n
-            - FERNET_KEY=46BKJoQYlPPOexq0OhDZnIlNepKFf87WFwLbfzqDDho=
-            - EXECUTOR=Local
-        command: scheduler -n 5
diff --git a/script/entrypoint.sh b/script/entrypoint.sh
index c32e96a5..e1c741d6 100755
--- a/script/entrypoint.sh
+++ b/script/entrypoint.sh
@@ -2,15 +2,21 @@
 
 AIRFLOW_HOME="/usr/local/airflow"
 CMD="airflow"
-TRY_LOOP="10"
-POSTGRES_HOST="postgres"
-POSTGRES_PORT="5432"
-REDIS_HOST="redis"
-REDIS_PORT="6379"
-: ${FERNET_KEY:=$(python -c "from cryptography.fernet import Fernet; FERNET_KEY = Fernet.generate_key().decode(); print FERNET_KEY")}
+TRY_LOOP="20"
+
+: ${REDIS_HOST:="redis"}
+: ${REDIS_PORT:="6379"}
+
+: ${POSTGRES_HOST:="postgres"}
+: ${POSTGRES_PORT:="5432"}
+: ${POSTGRES_USER:="airflow"}
+: ${POSTGRES_PASSWORD:="airflow"}
+: ${POSTGRES_DB:="airflow"}
+
+: ${FERNET_KEY:=$(python -c "from cryptography.fernet import Fernet; FERNET_KEY = Fernet.generate_key().decode(); print(FERNET_KEY)")}
 
 # Load DAGs exemples (default: Yes)
-if [ "x$LOAD_EX" = "xn" ]; then
+if [ "$LOAD_EX" = "n" ]; then
   sed -i "s/load_examples = True/load_examples = False/" "$AIRFLOW_HOME"/airflow.cfg
 fi
 
@@ -19,31 +25,29 @@ if [ -e "/requirements.txt" ]; then
   $(which pip) install --user -r /requirements.txt
 fi
 
-# Generate Fernet key
+# Update airflow config - Fernet key
 sed -i "s|\$FERNET_KEY|$FERNET_KEY|" "$AIRFLOW_HOME"/airflow.cfg
 
-# wait for DB
+# Wait for Postresql
 if [ "$1" = "webserver" ] || [ "$1" = "worker" ] || [ "$1" = "scheduler" ] ; then
   i=0
   while ! nc -z $POSTGRES_HOST $POSTGRES_PORT >/dev/null 2>&1 < /dev/null; do
     i=$((i+1))
-    if [ $i -ge $TRY_LOOP ]; then
-      echo "$(date) - ${POSTGRES_HOST}:${POSTGRES_PORT} still not reachable, giving up"
-      exit 1
+    if [ "$1" = "webserver" ]; then
+      echo "$(date) - waiting for ${POSTGRES_HOST}:${POSTGRES_PORT}... $i/$TRY_LOOP"
+      if [ $i -ge $TRY_LOOP ]; then
+        echo "$(date) - ${POSTGRES_HOST}:${POSTGRES_PORT} still not reachable, giving up"
+        exit 1
+      fi
     fi
-    echo "$(date) - waiting for ${POSTGRES_HOST}:${POSTGRES_PORT}... $i/$TRY_LOOP"
     sleep 10
   done
-  if [ "$1" = "webserver" ]; then
-    echo "Initialize database..."
-    $CMD initdb
-  fi
-  sleep 5
 fi
 
-# If we use docker-compose, we use Celery.
-if [ "x$EXECUTOR" = "xCelery" ]
+# Update configuration depending the type of Executor
+if [ "$EXECUTOR" = "Celery" ]
 then
+  # Wait for Redis
   if [ "$1" = "webserver" ] || [ "$1" = "worker" ] || [ "$1" = "scheduler" ] || [ "$1" = "flower" ] ; then
     j=0
     while ! nc -z $REDIS_HOST $REDIS_PORT >/dev/null 2>&1 < /dev/null; do
@@ -56,14 +60,31 @@ then
       sleep 5
     done
   fi
-  exec $CMD "$@"
-elif [ "x$EXECUTOR" = "xLocal" ]
+  sed -i "s#celery_result_backend = db+postgresql://airflow:airflow@postgres/airflow#celery_result_backend = db+postgresql://$POSTGRES_USER:$POSTGRES_PASSWORD@$POSTGRES_HOST:$POSTGRES_PORT/$POSTGRES_DB#" "$AIRFLOW_HOME"/airflow.cfg
+  sed -i "s#sql_alchemy_conn = postgresql+psycopg2://airflow:airflow@postgres/airflow#sql_alchemy_conn = postgresql+psycopg2://$POSTGRES_USER:$POSTGRES_PASSWORD@$POSTGRES_HOST:$POSTGRES_PORT/$POSTGRES_DB#" "$AIRFLOW_HOME"/airflow.cfg
+  sed -i "s#broker_url = redis://redis:6379/1#broker_url = redis://$REDIS_HOST:$REDIS_PORT/1#" "$AIRFLOW_HOME"/airflow.cfg
+  if [ "$1" = "webserver" ]; then
+    echo "Initialize database..."
+    $CMD initdb
+    exec $CMD webserver
+  else
+    sleep 10
+    exec $CMD "$@"
+  fi
+elif [ "$EXECUTOR" = "Local" ]
 then
   sed -i "s/executor = CeleryExecutor/executor = LocalExecutor/" "$AIRFLOW_HOME"/airflow.cfg
-  exec $CMD "$@"
+  sed -i "s#sql_alchemy_conn = postgresql+psycopg2://airflow:airflow@postgres/airflow#sql_alchemy_conn = postgresql+psycopg2://$POSTGRES_USER:$POSTGRES_PASSWORD@$POSTGRES_HOST:$POSTGRES_PORT/$POSTGRES_DB#" "$AIRFLOW_HOME"/airflow.cfg
+  sed -i "s#broker_url = redis://redis:6379/1#broker_url = redis://$REDIS_HOST:$REDIS_PORT/1#" "$AIRFLOW_HOME"/airflow.cfg
+  echo "Initialize database..."
+  $CMD initdb
+  exec $CMD webserver &
+  exec $CMD scheduler
+# By default we use SequentialExecutor
 else
   if [ "$1" = "version" ]; then
     exec $CMD version
+    exit
   fi
   sed -i "s/executor = CeleryExecutor/executor = SequentialExecutor/" "$AIRFLOW_HOME"/airflow.cfg
   sed -i "s#sql_alchemy_conn = postgresql+psycopg2://airflow:airflow@postgres/airflow#sql_alchemy_conn = sqlite:////usr/local/airflow/airflow.db#" "$AIRFLOW_HOME"/airflow.cfg