From 759c56aba35eead63835829a72a147d27d0e6fca Mon Sep 17 00:00:00 2001
From: Tatyana Bysheva
Date: Tue, 28 Nov 2023 12:13:30 +0000
Subject: [PATCH] Improvements to GitHub CI

---
 .github/workflows/build_docs.yml |  33 ++++++
 .github/workflows/main.yml       |  19 ++--
 replay/utils/time.py             | 188 +++++++++++++++++++------------
 tests/models/test_word2vec.py    |   1 +
 4 files changed, 157 insertions(+), 84 deletions(-)
 create mode 100644 .github/workflows/build_docs.yml

diff --git a/.github/workflows/build_docs.yml b/.github/workflows/build_docs.yml
new file mode 100644
index 000000000..a65cac798
--- /dev/null
+++ b/.github/workflows/build_docs.yml
@@ -0,0 +1,33 @@
+name: Build Docs
+
+# Controls when the workflow will run
+on:
+  # Triggers the workflow on pushes to main and on pull requests targeting main or refactoring
+  push:
+    branches: [main]
+  pull_request:
+    branches: [main, refactoring]
+
+  # Allows you to run this workflow manually from the Actions tab
+  workflow_dispatch:
+
+# A workflow run is made up of one or more jobs that can run sequentially or in parallel
+jobs:
+  build_docs:
+    runs-on: ubuntu-20.04
+    steps:
+      - uses: actions/checkout@v4
+      - uses: actions/setup-python@v4
+        with:
+          python-version: "3.9"
+      - name: Install full package
+        run: |
+          python -m venv venv
+          . ./venv/bin/activate
+          pip install --upgrade pip wheel poetry==1.5.1 lightfm
+          poetry cache clear pypi --all
+          ./poetry_wrapper.sh --experimental install --all-extras
+      - name: Build docs
+        run: |
+          . ./venv/bin/activate
+          make -C docs clean html
diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index 875edc76c..7b7326e9c 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -2,7 +2,9 @@ name: CI
 
 # Controls when the workflow will run
 on:
-  # Triggers the workflow on pull request events but only for the main branch
+  # Triggers the workflow on pushes to main and on pull requests targeting main or refactoring
+  push:
+    branches: [main]
   pull_request:
     branches: [main, refactoring]
 
@@ -14,25 +16,22 @@ jobs:
   run_tests:
     runs-on: ubuntu-20.04
     strategy:
+      fail-fast: false
       matrix:
-        python-version: ["3.7", "3.8", "3.9"]
+        python-version: ["3.9"]
     steps:
-      - uses: actions/checkout@v2
-      - uses: actions/setup-python@v2
+      - uses: actions/checkout@v4
+      - uses: actions/setup-python@v4
         with:
           python-version: ${{ matrix.python-version }}
-      - name: Install package
+      - name: Install full package
         run: |
           python -m venv venv
           . ./venv/bin/activate
           pip install --upgrade pip wheel poetry==1.5.1 lightfm
           poetry cache clear pypi --all
-          poetry install
-      - name: Build docs
-        run: |
-          . ./venv/bin/activate
-          make -C docs clean html
+          ./poetry_wrapper.sh --experimental install --all-extras
       - name: pycodestyle
         run: |
           . ./venv/bin/activate
diff --git a/replay/utils/time.py b/replay/utils/time.py
index 27e3aad2b..588bec952 100644
--- a/replay/utils/time.py
+++ b/replay/utils/time.py
@@ -26,10 +26,12 @@ def get_item_recency(
     :return: DataFrame with item weights

     >>> import pandas as pd
-    >>> d = {}
-    >>> d["item_idx"] = [1, 1, 2, 3, 3]
-    >>> d["timestamp"] = ["2099-03-19", "2099-03-20", "2099-03-22", "2099-03-27", "2099-03-25"]
-    >>> d["relevance"] = [1, 1, 1, 1, 1]
+    >>> from pyspark.sql.functions import round
+    >>> d = {
+    ...     "item_idx": [1, 1, 2, 3, 3],
+    ...     "timestamp": ["2099-03-19", "2099-03-20", "2099-03-22", "2099-03-27", "2099-03-25"],
+    ...     "relevance": [1, 1, 1, 1, 1],
+    ... }
     >>> df = pd.DataFrame(d)
     >>> df
        item_idx   timestamp  relevance
     0         1  2099-03-19          1
     1         1  2099-03-20          1
     2         2  2099-03-22          1
     3         3  2099-03-27          1
     4         3  2099-03-25          1
@@ -47,38 +49,53 @@

     Power smoothing falls quickly in the beginning but decays slowly afterwards as ``age^c``.

-    >>> get_item_recency(df, kind="power").orderBy("item_idx").show()
-    +--------+-------------------+------------------+
-    |item_idx|          timestamp|         relevance|
-    +--------+-------------------+------------------+
-    |       1|2099-03-19 12:00:00|0.6632341020947187|
-    |       2|2099-03-22 00:00:00|0.7203662792445817|
-    |       3|2099-03-26 00:00:00|               1.0|
-    +--------+-------------------+------------------+
+    >>> (
+    ...     get_item_recency(df, kind="power")
+    ...     .select("item_idx", "timestamp", round("relevance", 4).alias("relevance"))
+    ...     .orderBy("item_idx")
+    ...     .show()
+    ... )
+    +--------+-------------------+---------+
+    |item_idx|          timestamp|relevance|
+    +--------+-------------------+---------+
+    |       1|2099-03-19 12:00:00|   0.6632|
+    |       2|2099-03-22 00:00:00|   0.7204|
+    |       3|2099-03-26 00:00:00|      1.0|
+    +--------+-------------------+---------+

     Exponential smoothing is the other way around. Old objects decay more quickly as ``c^age``.

-    >>> get_item_recency(df, kind="exp").orderBy("item_idx").show()
-    +--------+-------------------+------------------+
-    |item_idx|          timestamp|         relevance|
-    +--------+-------------------+------------------+
-    |       1|2099-03-19 12:00:00|0.8605514372443304|
-    |       2|2099-03-22 00:00:00| 0.911722488558217|
-    |       3|2099-03-26 00:00:00|               1.0|
-    +--------+-------------------+------------------+
+    >>> (
+    ...     get_item_recency(df, kind="exp")
+    ...     .select("item_idx", "timestamp", round("relevance", 4).alias("relevance"))
+    ...     .orderBy("item_idx")
+    ...     .show()
+    ... )
+    +--------+-------------------+---------+
+    |item_idx|          timestamp|relevance|
+    +--------+-------------------+---------+
+    |       1|2099-03-19 12:00:00|   0.8606|
+    |       2|2099-03-22 00:00:00|   0.9117|
+    |       3|2099-03-26 00:00:00|      1.0|
+    +--------+-------------------+---------+

     The last type is linear smoothing: ``1 - c*age``.

-    >>> get_item_recency(df, kind="linear").orderBy("item_idx").show()
-    +--------+-------------------+------------------+
-    |item_idx|          timestamp|         relevance|
-    +--------+-------------------+------------------+
-    |       1|2099-03-19 12:00:00|0.8916666666666666|
-    |       2|2099-03-22 00:00:00|0.9333333333333333|
-    |       3|2099-03-26 00:00:00|               1.0|
-    +--------+-------------------+------------------+
+    >>> (
+    ...     get_item_recency(df, kind="linear")
+    ...     .select("item_idx", "timestamp", round("relevance", 4).alias("relevance"))
+    ...     .orderBy("item_idx")
+    ...     .show()
+    ... )
+    +--------+-------------------+---------+
+    |item_idx|          timestamp|relevance|
+    +--------+-------------------+---------+
+    |       1|2099-03-19 12:00:00|   0.8917|
+    |       2|2099-03-22 00:00:00|   0.9333|
+    |       3|2099-03-26 00:00:00|      1.0|
+    +--------+-------------------+---------+

     This function **does not** take relevance values of interactions into account.
@@ -115,10 +132,12 @@ def smoothe_time(
     :return: modified DataFrame

     >>> import pandas as pd
-    >>> d = {}
-    >>> d["item_idx"] = [1, 1, 2, 3, 3]
-    >>> d["timestamp"] = ["2099-03-19", "2099-03-20", "2099-03-22", "2099-03-27", "2099-03-25"]
-    >>> d["relevance"] = [1, 1, 1, 1, 1]
+    >>> from pyspark.sql.functions import round
+    >>> d = {
+    ...     "item_idx": [1, 1, 2, 3, 3],
+    ...     "timestamp": ["2099-03-19", "2099-03-20", "2099-03-22", "2099-03-27", "2099-03-25"],
+    ...     "relevance": [1, 1, 1, 1, 1],
+    ... }
     >>> df = pd.DataFrame(d)
     >>> df
        item_idx   timestamp  relevance
     0         1  2099-03-19          1
     1         1  2099-03-20          1
     2         2  2099-03-22          1
@@ -130,67 +149,88 @@

     Power smoothing falls quickly in the beginning but decays slowly afterwards as ``age^c``.

-    >>> smoothe_time(df, kind="power").orderBy("timestamp").show()
-    +--------+-------------------+------------------+
-    |item_idx|          timestamp|         relevance|
-    +--------+-------------------+------------------+
-    |       1|2099-03-19 00:00:00|0.6390430306850825|
-    |       1|2099-03-20 00:00:00| 0.654567945027101|
-    |       2|2099-03-22 00:00:00|0.6940913454809814|
-    |       3|2099-03-25 00:00:00|0.7994016704292545|
-    |       3|2099-03-27 00:00:00|               1.0|
-    +--------+-------------------+------------------+
+    >>> (
+    ...     smoothe_time(df, kind="power")
+    ...     .select("item_idx", "timestamp", round("relevance", 4).alias("relevance"))
+    ...     .orderBy("timestamp")
+    ...     .show()
+    ... )
+    +--------+-------------------+---------+
+    |item_idx|          timestamp|relevance|
+    +--------+-------------------+---------+
+    |       1|2099-03-19 00:00:00|    0.639|
+    |       1|2099-03-20 00:00:00|   0.6546|
+    |       2|2099-03-22 00:00:00|   0.6941|
+    |       3|2099-03-25 00:00:00|   0.7994|
+    |       3|2099-03-27 00:00:00|      1.0|
+    +--------+-------------------+---------+

     Exponential smoothing is the other way around. Old objects decay more quickly as ``c^age``.

-    >>> smoothe_time(df, kind="exp").orderBy("timestamp").show()
-    +--------+-------------------+------------------+
-    |item_idx|          timestamp|         relevance|
-    +--------+-------------------+------------------+
-    |       1|2099-03-19 00:00:00|0.8312378961427882|
-    |       1|2099-03-20 00:00:00| 0.850667160950856|
-    |       2|2099-03-22 00:00:00|0.8908987181403396|
-    |       3|2099-03-25 00:00:00|0.9548416039104167|
-    |       3|2099-03-27 00:00:00|               1.0|
-    +--------+-------------------+------------------+
+    >>> (
+    ...     smoothe_time(df, kind="exp")
+    ...     .select("item_idx", "timestamp", round("relevance", 4).alias("relevance"))
+    ...     .orderBy("timestamp")
+    ...     .show()
+    ... )
+    +--------+-------------------+---------+
+    |item_idx|          timestamp|relevance|
+    +--------+-------------------+---------+
+    |       1|2099-03-19 00:00:00|   0.8312|
+    |       1|2099-03-20 00:00:00|   0.8507|
+    |       2|2099-03-22 00:00:00|   0.8909|
+    |       3|2099-03-25 00:00:00|   0.9548|
+    |       3|2099-03-27 00:00:00|      1.0|
+    +--------+-------------------+---------+

     The last type is linear smoothing: ``1 - c*age``.

-    >>> smoothe_time(df, kind="linear").orderBy("timestamp").show()
-    +--------+-------------------+------------------+
-    |item_idx|          timestamp|         relevance|
-    +--------+-------------------+------------------+
-    |       1|2099-03-19 00:00:00|0.8666666666666667|
-    |       1|2099-03-20 00:00:00|0.8833333333333333|
-    |       2|2099-03-22 00:00:00|0.9166666666666666|
-    |       3|2099-03-25 00:00:00|0.9666666666666667|
-    |       3|2099-03-27 00:00:00|               1.0|
-    +--------+-------------------+------------------+
+    >>> (
+    ...     smoothe_time(df, kind="linear")
+    ...     .select("item_idx", "timestamp", round("relevance", 4).alias("relevance"))
+    ...     .orderBy("timestamp")
+    ...     .show()
+    ... )
+    +--------+-------------------+---------+
+    |item_idx|          timestamp|relevance|
+    +--------+-------------------+---------+
+    |       1|2099-03-19 00:00:00|   0.8667|
+    |       1|2099-03-20 00:00:00|   0.8833|
+    |       2|2099-03-22 00:00:00|   0.9167|
+    |       3|2099-03-25 00:00:00|   0.9667|
+    |       3|2099-03-27 00:00:00|      1.0|
+    +--------+-------------------+---------+

     These examples use a constant relevance of 1, so the resulting weight equals the time-dependent weight alone. In general, the returned value is the original relevance multiplied by this weight.

-    >>> d = {}
-    >>> d["item_idx"] = [1, 2, 3]
-    >>> d["timestamp"] = ["2099-03-19", "2099-03-20", "2099-03-22"]
-    >>> d["relevance"] = [10, 3, 0.1]
+    >>> d = {
+    ...     "item_idx": [1, 2, 3],
+    ...     "timestamp": ["2099-03-19", "2099-03-20", "2099-03-22"],
+    ...     "relevance": [10, 3, 0.1],
+    ... }
     >>> df = pd.DataFrame(d)
     >>> df
        item_idx   timestamp  relevance
     0         1  2099-03-19       10.0
     1         2  2099-03-20        3.0
     2         3  2099-03-22        0.1
-    >>> smoothe_time(df).orderBy("timestamp").show()
-    +--------+-------------------+-----------------+
-    |item_idx|          timestamp|        relevance|
-    +--------+-------------------+-----------------+
-    |       1|2099-03-19 00:00:00|9.330329915368075|
-    |       2|2099-03-20 00:00:00| 2.86452481173125|
-    |       3|2099-03-22 00:00:00|              0.1|
-    +--------+-------------------+-----------------+
+    >>> (
+    ...     smoothe_time(df)
+    ...     .select("item_idx", "timestamp", round("relevance", 4).alias("relevance"))
+    ...     .orderBy("timestamp")
+    ...     .show()
+    ... )
+    +--------+-------------------+---------+
+    |item_idx|          timestamp|relevance|
+    +--------+-------------------+---------+
+    |       1|2099-03-19 00:00:00|   9.3303|
+    |       2|2099-03-20 00:00:00|   2.8645|
+    |       3|2099-03-22 00:00:00|      0.1|
+    +--------+-------------------+---------+
     """
     log = convert2spark(log)
diff --git a/tests/models/test_word2vec.py b/tests/models/test_word2vec.py
index 64288c399..2b144353c 100644
--- a/tests/models/test_word2vec.py
+++ b/tests/models/test_word2vec.py
@@ -82,6 +82,7 @@ def test_fit(log, model):
     assert np.allclose(
         vectors,
         [[1, 5.33072205e-04], [0, 1.54904364e-01], [3, 2.13002899e-01]],
+        atol=1e-04,
     )
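
The three ``kind`` options exercised by the doctests above are plain decay curves over interaction age. The sketch below is a minimal NumPy re-implementation inferred from the doctest outputs, not taken from replay/utils/time.py itself; in particular, the parameter name ``decay`` and the 30-day half-life normalization are assumptions reverse-engineered from the rounded values in the get_item_recency tables (item ages of 6.5, 4.0 and 0.0 days relative to the newest item), which the sketch reproduces.

    import numpy as np

    def time_weights(age_days, kind="exp", decay=30.0):
        """Decay weight for interactions ``age_days`` older than the newest one.

        Age 0 maps to weight 1.0; each curve reaches roughly 0.5 around
        ``decay`` days (an assumed half-life, inferred from the doctests,
        not from the library source).
        """
        age = np.asarray(age_days, dtype=float)
        if kind == "power":   # slow polynomial decay: (age + 1) ** c, c < 0
            return (age + 1.0) ** (np.log(0.5) / np.log(decay))
        if kind == "exp":     # geometric decay: c ** age
            return 0.5 ** (age / decay)
        if kind == "linear":  # straight line: 1 - c * age
            return 1.0 - 0.5 * age / decay
        raise ValueError(f"unknown kind: {kind}")

    # Item ages from the first doctest, in days before the newest item
    # (item 3, whose averaged timestamp is 2099-03-26): 6.5, 4.0 and 0.0.
    ages = [6.5, 4.0, 0.0]
    print(time_weights(ages, "power"))   # ~[0.6632 0.7204 1.    ]
    print(time_weights(ages, "exp"))     # ~[0.8606 0.9117 1.    ]
    print(time_weights(ages, "linear"))  # ~[0.8917 0.9333 1.    ]

Under this reading, power smoothing drops fastest at small ages but keeps a long tail, exponential smoothing decays geometrically, and linear smoothing falls at a constant rate, which matches the ordering of the three tables.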