Commit

Merge pull request #74 from puckel/v1.8.0
v1.8.0
puckel committed Mar 21, 2017
2 parents aa44b8e + d178d0b commit a2a55ef
Showing 7 changed files with 211 additions and 81 deletions.
14 changes: 7 additions & 7 deletions Dockerfile
@@ -1,4 +1,4 @@
# VERSION 1.7.1.3-7
# VERSION 1.8.0
# AUTHOR: Matthieu "Puckel_" Roisil
# DESCRIPTION: Basic Airflow container
# BUILD: docker build --rm -t puckel/docker-airflow .
@@ -12,16 +12,16 @@ ENV DEBIAN_FRONTEND noninteractive
ENV TERM linux

# Airflow
ARG AIRFLOW_VERSION=1.7.1.3
ENV AIRFLOW_HOME /usr/local/airflow
ARG AIRFLOW_VERSION=1.8.0
ARG AIRFLOW_HOME=/usr/local/airflow

# Define en_US.
ENV LANGUAGE en_US.UTF-8
ENV LANG en_US.UTF-8
ENV LC_ALL en_US.UTF-8
ENV LC_CTYPE en_US.UTF-8
ENV LC_MESSAGES en_US.UTF-8
ENV LC_ALL en_US.UTF-8
ENV LC_ALL en_US.UTF-8

RUN set -ex \
&& buildDeps=' \
@@ -34,24 +34,24 @@ RUN set -ex \
libblas-dev \
liblapack-dev \
libpq-dev \
git \
' \
&& echo "deb http://http.debian.net/debian jessie-backports main" >/etc/apt/sources.list.d/backports.list \
&& apt-get update -yqq \
&& apt-get install -yqq --no-install-recommends \
$buildDeps \
python-pip \
python-requests \
apt-utils \
curl \
netcat \
locales \
&& apt-get install -yqq -t jessie-backports python-requests \
&& sed -i 's/^# en_US.UTF-8 UTF-8$/en_US.UTF-8 UTF-8/g' /etc/locale.gen \
&& locale-gen \
&& update-locale LANG=en_US.UTF-8 LC_ALL=en_US.UTF-8 \
&& useradd -ms /bin/bash -d ${AIRFLOW_HOME} airflow \
&& python -m pip install -U pip \
&& pip install Cython \
&& pip install pytz==2015.7 \
&& pip install pytz \
&& pip install pyOpenSSL \
&& pip install ndg-httpsclient \
&& pip install pyasn1 \
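
Because AIRFLOW_VERSION and AIRFLOW_HOME are now build arguments (ARG) rather than plain ENV settings, they can be overridden at build time without editing the Dockerfile. A minimal sketch, assuming you build from the repository root; the values shown simply repeat the defaults from the Dockerfile:

    # Build the image, optionally overriding the new build arguments
    docker build --rm \
        --build-arg AIRFLOW_VERSION=1.8.0 \
        --build-arg AIRFLOW_HOME=/usr/local/airflow \
        -t puckel/docker-airflow .
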
6 changes: 1 addition & 5 deletions README.md
@@ -63,7 +63,7 @@ Check [Airflow Documentation](https://pythonhosted.org/airflow/)

## Install custom python package

- Create a file "requirements.txt" with the dedired python modules
- Create a file "requirements.txt" with the desired python modules
- Mount this file as a volume `-v $(pwd)/requirements.txt:/requirements.txt`
- The entrypoint.sh script executes the pip install command (with the --user option); see the example below

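A minimal sketch of that workflow; the requirements.txt content and the detached webserver invocation are illustrative, while the image tag and port match the rest of this release:

    # Example requirements.txt with one extra package (contents are illustrative)
    echo "requests" > requirements.txt

    # Mount the file; entrypoint.sh then installs it with pip install --user
    docker run -d \
        -v $(pwd)/requirements.txt:/requirements.txt \
        -p 8080:8080 \
        puckel/docker-airflow:1.8.0 webserver
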
@@ -82,10 +82,6 @@ Easy scaling using docker-compose:

This can be used to scale to a multi-node setup using Docker Swarm (see the example below).

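A hedged example of that scaling, assuming the docker-compose `scale` subcommand available at the time and an arbitrary worker count:

    # Start the Celery stack, then run three workers instead of one
    docker-compose -f docker-compose-CeleryExecutor.yml up -d
    docker-compose -f docker-compose-CeleryExecutor.yml scale worker=3
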
## Links

- Airflow on Kubernetes [kube-airflow](https://github.com/mumoshu/kube-airflow)

# Wanna help?

Fork, improve and PR. ;-)
2 changes: 1 addition & 1 deletion circle.yml
@@ -12,4 +12,4 @@ test:
pre:
- sleep 5
override:
- docker run puckel/docker-airflow version |grep '1.7.1.3'
- docker run puckel/docker-airflow version |grep '1.8.0'
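
The same smoke test can be run locally once the image is built; a sketch reusing the BUILD command from the Dockerfile header together with the check above:

    # Build locally, then confirm the bundled Airflow version
    docker build --rm -t puckel/docker-airflow .
    docker run puckel/docker-airflow version | grep '1.8.0'
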
153 changes: 136 additions & 17 deletions config/airflow.cfg
@@ -4,9 +4,11 @@ airflow_home = /usr/local/airflow

# The folder where your airflow pipelines live, most likely a
# subfolder in a code repository
# This path must be absolute
dags_folder = /usr/local/airflow/dags

# The folder where airflow should store its log files. This location
# The folder where airflow should store its log files
# This path must be absolute
base_log_folder = /usr/local/airflow/logs

# Airflow can store logs remotely in AWS S3 or Google Cloud Storage. Users
@@ -17,8 +19,8 @@ remote_base_log_folder =
remote_log_conn_id =
# Use server-side encryption for logs stored in S3
encrypt_s3_logs = False
# deprecated option for remote log storage, use remote_base_log_folder instead!
# s3_log_folder =
# DEPRECATED option for remote log storage, use remote_base_log_folder instead!
s3_log_folder =

# The executor class that airflow should use. Choices include
# SequentialExecutor, LocalExecutor, CeleryExecutor
@@ -73,10 +75,39 @@ donot_pickle = False
# How long before timing out a python file import while filling the DagBag
dagbag_import_timeout = 30

# The class to use for running task instances in a subprocess
task_runner = BashTaskRunner

# If set, tasks without a `run_as_user` argument will be run with this user
# Can be used to de-elevate a sudo user running Airflow when executing tasks
default_impersonation =

# What security module to use (for example kerberos):
security =

# Turn unit test mode on (overwrites many configuration options with test
# values at runtime)
unit_test_mode = False

[cli]
# In what way should the cli access the API. The LocalClient will use the
# database directly, while the json_client will use the api running on the
# webserver
api_client = airflow.api.client.local_client
endpoint_url = http://localhost:8080

[api]
# How to authenticate users of the API
auth_backend = airflow.api.auth.backend.default

[operators]
# The default owner assigned to each new operator, unless
# provided explicitly or passed via `default_args`
default_owner = Airflow
default_cpus = 1
default_ram = 512
default_disk = 512
default_gpus = 0

[webserver]
# The base url of your website as airflow cannot guess what domain or
@@ -90,9 +121,22 @@ web_server_host = 0.0.0.0
# The port on which to run the web server
web_server_port = 8080

# The time the gunicorn webserver waits before timing out on a worker
# Paths to the SSL certificate and key for the web server. When both are
# provided SSL will be enabled. This does not change the web server port.
web_server_ssl_cert =
web_server_ssl_key =

# Number of seconds the gunicorn webserver waits before timing out on a worker
web_server_worker_timeout = 120

# Number of workers to refresh at a time. When set to 0, worker refresh is
# disabled. When nonzero, airflow periodically refreshes webserver workers by
# bringing up new ones and killing old ones.
worker_refresh_batch_size = 1

# Number of seconds to wait before refreshing a batch of workers.
worker_refresh_interval = 30

# Secret key used to run your flask app
secret_key = temporary_key

@@ -103,30 +147,58 @@ workers = 4
# sync (default), eventlet, gevent
worker_class = sync

# Log files for the gunicorn webserver. '-' means log to stderr.
access_logfile = -
error_logfile = -

# Expose the configuration file in the web server
expose_config = true
expose_config = True

# Set to true to turn on authentication:
# https://pythonhosted.org/airflow/security.html#web-authentication
# http://pythonhosted.org/airflow/security.html#web-authentication
authenticate = False

# Filter the list of dags by owner name (requires authentication to be enabled)
filter_by_owner = False

# Filtering mode. Choices include user (default) and ldapgroup.
# Ldap group filtering requires using the ldap backend
#
# Note that the ldap server needs the "memberOf" overlay to be set up
# in order to use the ldapgroup mode.
owner_mode = user

# Default DAG orientation. Valid values are:
# LR (Left->Right), TB (Top->Bottom), RL (Right->Left), BT (Bottom->Top)
dag_orientation = LR

# Puts the webserver in demonstration mode; blurs the names of Operators for
# privacy.
demo_mode = False

# The amount of time (in secs) webserver will wait for initial handshake
# while fetching logs from other worker machine
log_fetch_timeout_sec = 5

# By default, the webserver shows paused DAGs. Flip this to hide paused
# DAGs by default
hide_paused_dags_by_default = False

[email]
email_backend = airflow.utils.email.send_email_smtp

[smtp]
# If you want airflow to send emails on retries, failure, and you want to use
# the airflow.utils.email.send_email_smtp function, you have to configure an smtp
# server here
# the airflow.utils.email.send_email_smtp function, you have to configure an
# smtp server here
smtp_host = localhost
smtp_starttls = True
smtp_ssl = False
smtp_user = airflow
# Uncomment and set the user/pass settings if you want to use SMTP AUTH
# smtp_user = airflow
# smtp_password = airflow
smtp_port = 25
smtp_password = airflow
smtp_mail_from = airflow@airflow.local
smtp_mail_from = airflow@airflow.com

[celery]
# This section only applies if you are using the CeleryExecutor in
@@ -154,10 +226,13 @@ worker_log_server_port = 8793
broker_url = redis://redis:6379/1

# Another key Celery setting
celery_result_backend = redis://redis:6379/1
celery_result_backend = db+postgresql://airflow:airflow@postgres/airflow

# Celery Flower is a sweet UI for Celery. Airflow has a shortcut to start
# it `airflow flower`. This defines the port that Celery Flower runs on
# it `airflow flower`. This defines the IP that Celery Flower runs on
flower_host = 0.0.0.0

# This defines the port that Celery Flower runs on
flower_port = 5555

# Default queue that tasks get assigned to and that worker listen on.
@@ -174,17 +249,46 @@ job_heartbeat_sec = 5
# how often the scheduler should run (in seconds).
scheduler_heartbeat_sec = 5

# after how much time should the scheduler terminate in seconds
# -1 indicates to run continuously (see also num_runs)
run_duration = -1

# after how much time new DAGs should be picked up from the filesystem
min_file_process_interval = 0

dag_dir_list_interval = 300

# How often should stats be printed to the logs
print_stats_interval = 30

child_process_log_directory = /usr/local/airflow/logs/scheduler

# Local task jobs periodically heartbeat to the DB. If the job has
# not heartbeat in this many seconds, the scheduler will mark the
# associated task instance as failed and will re-schedule the task.
scheduler_zombie_task_threshold = 300

# Turn off scheduler catchup by setting this to False.
# Default behavior is unchanged and
# Command Line Backfills still work, but the scheduler
# will not do scheduler catchup if this is False,
# however it can be set on a per DAG basis in the
# DAG definition (catchup)
catchup_by_default = True

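As a hedged aside, this flag (like any setting in this file) can also be flipped per container via Airflow's AIRFLOW__<SECTION>__<KEY> environment-variable override convention, assuming that convention applies to this release; the invocation below is illustrative:

    # Disable catchup for every DAG scheduled by this container (illustrative)
    docker run -d \
        -e AIRFLOW__SCHEDULER__CATCHUP_BY_DEFAULT=False \
        puckel/docker-airflow:1.8.0 scheduler
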
# Statsd (https://github.com/etsy/statsd) integration settings
# statsd_on = False
# statsd_host = localhost
# statsd_port = 8125
# statsd_prefix = airflow
statsd_on = False
statsd_host = localhost
statsd_port = 8125
statsd_prefix = airflow

# The scheduler can run multiple threads in parallel to schedule dags.
# This defines how many threads will run. However airflow will never
# use more threads than the amount of cpu cores available.
max_threads = 2

authenticate = False

[mesos]
# Mesos master address which MesosExecutor will connect to.
master = localhost:5050
@@ -221,3 +325,18 @@ authenticate = False
# Mesos credentials, if authentication is enabled
# default_principal = admin
# default_secret = admin

[kerberos]
ccache = /tmp/airflow_krb5_ccache
# gets augmented with fqdn
principal = airflow
reinit_frequency = 3600
kinit_path = kinit
keytab = airflow.keytab

[github_enterprise]
api_rev = v3

[admin]
# UI to hide sensitive variable fields when set to True
hide_sensitive_variable_fields = True
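
Since these settings ship baked into the image, a quick way to experiment with them is to mount an edited copy over the packaged file; a sketch assuming the file lives at ${AIRFLOW_HOME}/airflow.cfg inside the container (Airflow's default location):

    # Run with a locally modified configuration instead of the baked-in one
    docker run -d \
        -v $(pwd)/config/airflow.cfg:/usr/local/airflow/airflow.cfg \
        -p 8080:8080 \
        puckel/docker-airflow:1.8.0 webserver
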
29 changes: 19 additions & 10 deletions docker-compose-CeleryExecutor.yml
@@ -11,23 +11,26 @@ services:
- POSTGRES_DB=airflow

webserver:
image: puckel/docker-airflow:1.7.1.3-7
image: puckel/docker-airflow:1.8.0
restart: always
depends_on:
- postgres
- redis
environment:
# - LOAD_EX=n
- LOAD_EX=y
- FERNET_KEY=46BKJoQYlPPOexq0OhDZnIlNepKFf87WFwLbfzqDDho=
- EXECUTOR=Celery
# - POSTGRES_USER=airflow
# - POSTGRES_PASSWORD=airflow
# - POSTGRES_DB=airflow
# volumes:
# - /localpath/to/dags:/usr/local/airflow/dags
# - ~/docker-airflow/dags:/usr/local/airflow/dags
ports:
- "8080:8080"
command: webserver

flower:
image: puckel/docker-airflow:1.7.1.3-7
image: puckel/docker-airflow:1.8.0
restart: always
depends_on:
- redis
@@ -38,26 +41,32 @@ scheduler:
command: flower

scheduler:
image: puckel/docker-airflow:1.7.1.3-7
image: puckel/docker-airflow:1.8.0
restart: always
depends_on:
- webserver
# volumes:
# - /localpath/to/dags:/usr/local/airflow/dags
# - ~/docker-airflow/dags:/usr/local/airflow/dags
environment:
# - LOAD_EX=n
- LOAD_EX=y
- FERNET_KEY=46BKJoQYlPPOexq0OhDZnIlNepKFf87WFwLbfzqDDho=
- EXECUTOR=Celery
command: scheduler -n 5
# - POSTGRES_USER=airflow
# - POSTGRES_PASSWORD=airflow
# - POSTGRES_DB=airflow
command: scheduler

worker:
image: puckel/docker-airflow:1.7.1.3-7
image: puckel/docker-airflow:1.8.0
restart: always
depends_on:
- scheduler
# volumes:
# - /localpath/to/dags:/usr/local/airflow/dags
# - ~/docker-airflow/dags:/usr/local/airflow/dags
environment:
- FERNET_KEY=46BKJoQYlPPOexq0OhDZnIlNepKFf87WFwLbfzqDDho=
- EXECUTOR=Celery
# - POSTGRES_USER=airflow
# - POSTGRES_PASSWORD=airflow
# - POSTGRES_DB=airflow
command: worker
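
The FERNET_KEY value repeated in the three services above is a shared sample committed to the repository; a hedged way to generate your own, assuming the cryptography package is installed on the host:

    # Print a fresh Fernet key to paste into the FERNET_KEY entries above
    python -c "from cryptography.fernet import Fernet; print(Fernet.generate_key().decode())"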