From b663c97f5709ad48ab3945504586f743f0a418fd Mon Sep 17 00:00:00 2001 From: Vincent Emonet Date: Fri, 19 Apr 2024 13:06:19 +0200 Subject: [PATCH] feat: Add functions `compress_or_standardize`, `expand_or_standardize`, `is_uri`, `is_curie`. Add git cliff to generate `CHANGELOG.md` and release notes --- .github/workflows/build.yml | 62 ++++++--- .github/workflows/test.yml | 8 +- cliff.toml | 72 ++++++++++ js/src/api.rs | 28 ++++ js/tests/curies.test.ts | 29 +++- lib/docs/docs/architecture.md | 55 ++++---- lib/docs/docs/contributing.md | 25 ++-- lib/docs/docs/index.md | 26 ++-- lib/docs/docs/javascript.md | 6 +- lib/docs/docs/python-devtools.md | 140 ++++++++++++++++++++ lib/docs/docs/python.md | 136 ++++++++++++++----- lib/docs/docs/reconciliation.md | 205 +++++++++++++++++++++++++++++ lib/docs/docs/rust.md | 12 +- lib/docs/includes/abbreviations.md | 3 + lib/docs/mkdocs.yml | 53 ++++---- lib/src/api.rs | 108 ++++++++++++++- lib/src/error.rs | 2 +- lib/tests/curies_test.rs | 8 +- python/requirements.txt | 2 + python/src/api.rs | 65 ++++++--- scripts/bump.sh | 16 --- scripts/docs.sh | 2 +- scripts/install-dev.sh | 4 +- scripts/release.sh | 34 +++++ 24 files changed, 925 insertions(+), 176 deletions(-) create mode 100644 cliff.toml create mode 100644 lib/docs/docs/python-devtools.md create mode 100644 lib/docs/docs/reconciliation.md delete mode 100755 scripts/bump.sh create mode 100755 scripts/release.sh diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 01aad09..84e2115 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -1,9 +1,6 @@ name: Build on: workflow_dispatch: - release: - types: - - published push: tags: - "v*.*.*" @@ -12,7 +9,6 @@ concurrency: group: ${{ github.workflow }}-${{ github.ref }} cancel-in-progress: true -# cf. 
https://github.com/oxigraph/oxigraph/blob/main/.github/workflows/artifacts.yml jobs: npm_tarball: @@ -46,20 +42,7 @@ jobs: working-directory: ./js env: NODE_AUTH_TOKEN: ${{ secrets.NPM_TOKEN }} - if: github.event_name == 'release' - - publish_crates: - name: πŸ“¦οΈ Publish crates to crates.io - if: github.event_name == 'release' - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v4 - - run: rustup update - - run: cargo login $CRATES_IO_TOKEN - env: - CRATES_IO_TOKEN: ${{ secrets.CRATES_IO_TOKEN }} - - run: cargo publish - working-directory: ./lib + if: startsWith(github.ref, 'refs/tags/') # Inspired by pydantic: https://github.com/pydantic/pydantic-core/blob/main/.github/workflows/ci.yml build_wheels: @@ -165,9 +148,48 @@ jobs: path: python/dist - name: Publish to PyPI - if: github.event_name == 'release' - # if: startsWith(github.ref, 'refs/tags/') + if: startsWith(github.ref, 'refs/tags/') run: twine upload python/dist/* env: TWINE_USERNAME: __token__ TWINE_PASSWORD: ${{ secrets.PYPI_TOKEN }} + + + publish_crates: + name: πŸ“¦οΈ Publish crates to crates.io + if: startsWith(github.ref, 'refs/tags/') + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - run: rustup update + - run: cargo login $CRATES_IO_TOKEN + env: + CRATES_IO_TOKEN: ${{ secrets.CRATES_IO_TOKEN }} + - run: cargo publish + working-directory: ./lib + + generate-changelog: + name: πŸ”οΈ Generate changelog for GitHub release + runs-on: ubuntu-latest + outputs: + release_body: ${{ steps.git-cliff.outputs.content }} + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Generate a changelog + uses: orhun/git-cliff-action@main + id: git-cliff + with: + config: cliff.toml + args: -vv --latest --strip header + env: + OUTPUT: CHANGES.md + + - name: Release + uses: softprops/action-gh-release@v1 + with: + body_path: CHANGES.md + if: startsWith(github.ref, 'refs/tags/') diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 
2338726..5c6f1f1 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -31,7 +31,7 @@ jobs: RUST_BACKTRACE: 1 cov-rust: - name: β˜‚οΈ Coverage Rust + name: β˜‚οΈ Test with coverage Rust runs-on: ubuntu-latest container: image: xd009642/tarpaulin:develop-nightly @@ -42,10 +42,10 @@ jobs: run: bash ./scripts/cov.sh - name: Upload to codecov.io - uses: codecov/codecov-action@v3 + uses: codecov/codecov-action@v4 with: - fail_ci_if_error: false - # token: ${{secrets.CODECOV_TOKEN}} + token: ${{secrets.CODECOV_TOKEN}} + # fail_ci_if_error: false test-js: diff --git a/cliff.toml b/cliff.toml new file mode 100644 index 0000000..ddfa05f --- /dev/null +++ b/cliff.toml @@ -0,0 +1,72 @@ +[git] +# NOTE: Add an exclamation mark in your commit message prefix to indicate a BREAKING change +# e.g. `feat!: changed things` or `feat(python)!: changed things` +# More details about the standard at https://www.conventionalcommits.org +commit_parsers = [ + { message = "^feat", group = "⛰️ Features" }, + { message = "^fix", group = "πŸ› Bug Fixes" }, + { message = "^doc", group = "πŸ“š Documentation" }, + { message = "^perf|DataOriented", group = "⚑ Performance" }, + { message = "^refactor", group = "🚜 Refactor" }, + { message = "^style|Formatting", group = "🎨 Styling" }, + { message = "^test", group = "πŸ§ͺ Testing" }, + { message = "^ci", group = "βš™οΈ Continuous Integration" }, + { message = "^chore\\(release\\): prepare for", skip = true }, + { message = "^chore\\(deps\\)", skip = true }, + { message = "^chore\\(pr\\)", skip = true }, + { message = "^chore\\(pull\\)", skip = true }, + { message = "^chore", group = "πŸ› οΈ Miscellaneous Tasks" }, + { body = ".*security", group = "πŸ›‘οΈ Security" }, + { message = "^revert", group = "◀️ Revert" }, +] +conventional_commits = true +filter_unconventional = false +split_commits = false +# Protect breaking changes from being skipped due to matching a skipping commit_parser +protect_breaking_commits = false 
+filter_commits = false +tag_pattern = "v?[0-9].*" + +# Sort the tags topologically +topo_order = false +sort_commits = "oldest" + +# skip_tags = "0.1.0-beta.1" # regex for skipping tags +# ignore_tags = "" # regex for ignoring tags +# limit_commits = 42 # limit the number of commits included in the changelog. +# regex for preprocessing the commit messages: +# commit_preprocessors = [ +# # { pattern = '\((\w+\s)?#([0-9]+)\)', replace = "([#${2}](/issues/${2}))"}, # replace issue numbers +# ] + + +[changelog] +header = """ +# πŸ“œ Changelog\n +All notable changes to this project will be documented in this file.\n +""" +# Template for the changelog: https://keats.github.io/tera/docs +body = """ +{% if version %}\ + {% if previous.version %}\ + ## [{{ version | trim_start_matches(pat="v") }}](/compare/{{ previous.version }}..{{ version }}) - {{ timestamp | date(format="%Y-%m-%d") }} + {% else %}\ + ## [{{ version | trim_start_matches(pat="v") }}](/tree/{{ version }}) - {{ timestamp | date(format="%Y-%m-%d") }} + {% endif %}\ +{% else %}\ + ## [unreleased] +{% endif %}\ +{% for group, commits in commits | group_by(attribute="group") %} + ### {{ group | upper_first }} + {% for commit in commits %} + - {% if commit.breaking %}[**breaking**] {% endif %}{{ commit.message | upper_first | trim }} - ([{{ commit.id | truncate(length=7, end="") }}](/commit/{{ commit.id }}))\ + {% endfor %} +{% endfor %}\n +""" +trim = true +footer = """ + +""" +postprocessors = [ + { pattern = '', replace = "https://github.com/vemonet/nanopub-rs" }, +] diff --git a/js/src/api.rs b/js/src/api.rs index f69833b..35cc894 100644 --- a/js/src/api.rs +++ b/js/src/api.rs @@ -206,6 +206,34 @@ impl ConverterJs { .map_err(|e| JsValue::from_str(&e.to_string())) } + /// Checks if a given string is a valid URI according to the current `Converter` + #[wasm_bindgen(js_name = isUri)] + pub fn is_uri(&self, uri: String) -> bool { + self.converter.is_uri(&uri) + } + + /// Checks if a given string is a valid CURIE 
according to the current `Converter` + #[wasm_bindgen(js_name = isCurie)] + pub fn is_curie(&self, curie: String) -> bool { + self.converter.is_curie(&curie) + } + + /// Attempts to compress a URI to a CURIE, or standardize it if it's already a CURIE. + #[wasm_bindgen(js_name = compressOrStandardize)] + pub fn compress_or_standardize(&self, input: String) -> Result { + self.converter + .compress_or_standardize(&input) + .map_err(|e| JsValue::from_str(&e.to_string())) + } + + /// Attempts to expand a CURIE to a URI, or standardize it if it's already a URI. + #[wasm_bindgen(js_name = expandOrStandardize)] + pub fn expand_or_standardize(&self, input: String) -> Result { + self.converter + .expand_or_standardize(&input) + .map_err(|e| JsValue::from_str(&e.to_string())) + } + #[wasm_bindgen(js_name = getPrefixes)] pub fn get_prefixes(&self, include_synonyms: Option) -> Vec { self.converter diff --git a/js/tests/curies.test.ts b/js/tests/curies.test.ts index ad350dd..0417704 100644 --- a/js/tests/curies.test.ts +++ b/js/tests/curies.test.ts @@ -1,8 +1,8 @@ import {describe, expect, test} from '@jest/globals'; import {Record, Converter, getOboConverter, getBioregistryConverter, getMonarchConverter, getGoConverter} from "../pkg/node"; +// NOTE: `await init()` only needed in browser environment describe('Tests for the curies npm package', () => { - // NOTE: `await init()` only needed in browser environment test('from empty converter', async () => { const converter = new Converter(); @@ -110,16 +110,43 @@ describe('Tests for the curies npm package', () => { expect(converter.expand("doid:1234")).toBe("http://purl.obolibrary.org/obo/DOID_1234"); }); + test('compress/expand or standardize', async () => { + const converter = await Converter.fromExtendedPrefixMap(`[{ + "prefix": "CHEBI", + "prefix_synonyms": ["chebi"], + "uri_prefix": "http://purl.obolibrary.org/obo/CHEBI_", + "uri_prefix_synonyms": ["https://identifiers.org/chebi:"] + }]`); + 
expect(converter.expandOrStandardize("CHEBI:138488")).toBe("http://purl.obolibrary.org/obo/CHEBI_138488"); + expect(converter.expandOrStandardize("chebi:138488")).toBe("http://purl.obolibrary.org/obo/CHEBI_138488"); + expect(converter.expandOrStandardize("http://purl.obolibrary.org/obo/CHEBI_138488")).toBe("http://purl.obolibrary.org/obo/CHEBI_138488"); + expect(converter.expandOrStandardize("https://identifiers.org/chebi:138488")).toBe("http://purl.obolibrary.org/obo/CHEBI_138488"); + + expect(converter.compressOrStandardize("http://purl.obolibrary.org/obo/CHEBI_138488")).toBe("CHEBI:138488"); + expect(converter.compressOrStandardize("https://identifiers.org/chebi:138488")).toBe("CHEBI:138488"); + expect(converter.compressOrStandardize("CHEBI:138488")).toBe("CHEBI:138488"); + expect(converter.compressOrStandardize("chebi:138488")).toBe("CHEBI:138488"); + }); + test('get OBO converter', async () => { const converter = await getOboConverter(); expect(converter.compress("http://purl.obolibrary.org/obo/DOID_1234")).toBe("DOID:1234"); expect(converter.expand("DOID:1234")).toBe("http://purl.obolibrary.org/obo/DOID_1234"); + + expect(converter.isCurie("GO:1234567")).toBe(true); + expect(converter.isCurie("http://purl.obolibrary.org/obo/GO_1234567")).toBe(false); + expect(converter.isCurie("pdb:2gc4")).toBe(false); + + expect(converter.isUri("http://purl.obolibrary.org/obo/GO_1234567")).toBe(true); + expect(converter.isUri("GO:1234567")).toBe(false); + expect(converter.isUri("http://proteopedia.org/wiki/index.php/2gc4")).toBe(false); }); test('get Bioregistry converter', async () => { const converter = await getBioregistryConverter(); expect(converter.compress("http://purl.obolibrary.org/obo/DOID_1234")).toBe("doid:1234"); expect(converter.expand("doid:1234")).toBe("http://purl.obolibrary.org/obo/DOID_1234"); + expect(converter.standardizePrefix("gomf")).toBe("go"); expect(converter.standardizeCurie("gomf:0032571")).toBe("go:0032571"); 
expect(converter.standardizeUri("http://amigo.geneontology.org/amigo/term/GO:0032571")).toBe("http://purl.obolibrary.org/obo/GO_0032571"); diff --git a/lib/docs/docs/architecture.md b/lib/docs/docs/architecture.md index ce32740..eb03874 100644 --- a/lib/docs/docs/architecture.md +++ b/lib/docs/docs/architecture.md @@ -30,32 +30,35 @@ curies.rs/ List of features available per language binding, based on features defined in [curies.readthedocs.io](https://curies.readthedocs.io) -| Feature | Rust (core) | Python | JS | R | -| ------------------------------------------------ | ----------- | ------ | ---- | ---- | -| compress | βœ… | βœ… | βœ… | βœ… | -| expand | βœ… | βœ… | βœ… | βœ… | -| compress_list | βœ… | βœ… | βœ… | | -| expand_list | βœ… | βœ… | βœ… | | -| standardize (prefix, curie, uri) | βœ… | βœ… | βœ… | | -| chain converters | βœ… | βœ… | βœ… | | -| Record object and converter.add_record() | βœ… | βœ… | βœ… | | -| converter.add_prefix(prefix, ns) | βœ… | βœ… | βœ… | | -| converter.get_prefixes() and .get_uri_prefixes() | βœ… | βœ… | βœ… | | -| Load from prefix map | βœ… | βœ… | βœ… | | -| Load from extended prefix map | βœ… | βœ… | βœ… | | -| Load from JSON-LD context | βœ… | βœ… | βœ… | | -| Load from SHACL prefix definition | βœ… | βœ… | βœ… | | -| Load OBO converter | βœ… | βœ… | βœ… | | -| Load GO converter | βœ… | βœ… | βœ… | | -| Load Bioregistry converter | βœ… | βœ… | βœ… | βœ… | -| Load Monarch converter | βœ… | βœ… | βœ… | | -| Write converter to prefix map | βœ… | βœ… | βœ… | | -| Write converter to extended prefix map | βœ… | βœ… | βœ… | | -| Write converter to JSON-LD | βœ… | βœ… | βœ… | | -| Write converter to SHACL | βœ… | βœ… | βœ… | | -| Prefixes discovery | | | | | - -## βš οΈβ€‹ Differences between rust core and language bindings +| Feature | Rust (core) | Python | JS | R | +| ----------------------------------------------------- | ----------- | ------ | ---- | ---- | +| compress | βœ… | βœ… | βœ… | βœ… | +| expand | βœ… | βœ… | βœ… | βœ… 
| +| compress_list | βœ… | βœ… | βœ… | | +| expand_list | βœ… | βœ… | βœ… | | +| standardize (prefix, curie, uri) | βœ… | βœ… | βœ… | | +| is_uri() and is_curie() | βœ… | βœ… | βœ… | | +| expand_or_standardize() and compress_or_standardize() | βœ… | βœ… | βœ… | | +| chain converters | βœ… | βœ… | βœ… | | +| Record object and converter.add_record() | βœ… | βœ… | βœ… | | +| converter.add_prefix(prefix, ns) | βœ… | βœ… | βœ… | | +| converter.get_prefixes() and .get_uri_prefixes() | βœ… | βœ… | βœ… | | +| Load from prefix map | βœ… | βœ… | βœ… | | +| Load from extended prefix map | βœ… | βœ… | βœ… | | +| Load from JSON-LD context | βœ… | βœ… | βœ… | | +| Load from SHACL prefix definition | βœ… | βœ… | βœ… | | +| Load OBO converter | βœ… | βœ… | βœ… | | +| Load GO converter | βœ… | βœ… | βœ… | | +| Load Bioregistry converter | βœ… | βœ… | βœ… | βœ… | +| Load Monarch converter | βœ… | βœ… | βœ… | | +| Write converter to prefix map | βœ… | βœ… | βœ… | | +| Write converter to extended prefix map | βœ… | βœ… | βœ… | | +| Write converter to JSON-LD | βœ… | βœ… | βœ… | | +| Write converter to SHACL | βœ… | βœ… | βœ… | | +| .get_subconverter() | | | | | +| Prefixes discovery | | | | | + +## βš οΈβ€‹ Differences between Rust core and language bindings 1. The **functions to Load** prefix map, extended prefix map and JSON-LD can take `HashMap` as input in rust. But for JS and python, we currently need to pass it as `String` (we need to figure out how to pass arbitrary objects). You can pass either a URL or a JSON object as string, the lib will automatically retrieve the content of the URL if it is one. The original python lib was taking directly JSON objects for all loaders, apart from SHACL which takes a URL (which was not convenient when wanting to provide a local SHACL file) 2. In rust **chain()** is a static function taking a list of converters, `chained = Converter::chain([conv1, conv2])`. 
In JS and python we cannot easily pass a list of complex objects like converters, so chain is a normal function that takes 1 converter to chain: `chained = conv1.chain(conv2)` diff --git a/lib/docs/docs/contributing.md b/lib/docs/docs/contributing.md index 0126edd..5d943ff 100644 --- a/lib/docs/docs/contributing.md +++ b/lib/docs/docs/contributing.md @@ -31,7 +31,9 @@ Install development dependencies: ./scripts/install-dev.sh ``` -> If you are using VSCode we strongly recommend to install the [`rust-lang.rust-analyzer`](https://marketplace.visualstudio.com/items?itemName=rust-lang.rust-analyzer) extension. +!!! tip "VSCode extension" + + If you are using VSCode, we strongly recommend installing the [`rust-lang.rust-analyzer`](https://marketplace.visualstudio.com/items?itemName=rust-lang.rust-analyzer) extension. ## 🧪 Run tests @@ -116,7 +118,10 @@ The first time you will need to add the `--install` flag to install dependencies ./scripts/test-r.sh --install ``` -> You can force `rextendr` to re-build the bindings by making a change to one of the docstring `///` in the `/r/rust/src` code +!!! info "Force build" + + You can force `rextendr` to re-build the bindings by making a change to one of the docstrings (`///`) in the `/r/rust/src` code. + ## 🧹 Format and lint @@ -155,16 +160,16 @@ cargo update cargo outdated ``` -## 🏷️ New release +## 🏷️ Publish a new release -Publishing artifacts will be done by the `build.yml` workflow, make sure you have set the following tokens as secrets for this repository: `PYPI_TOKEN`, `NPM_TOKEN`, `CRATES_IO_TOKEN`, `CODECOV_TOKEN` +Building and publishing artifacts will be done by the [`build.yml`](https://github.com/biopragmatics/curies.rs/actions/workflows/build.yml) GitHub Actions workflow. Make sure you have set the following tokens as secrets on GitHub for this repository: `PYPI_TOKEN`, `NPM_TOKEN`, `CRATES_IO_TOKEN`, `CODECOV_TOKEN`. -1. 
Bump the version in the `Cargo.toml` file in folders `lib`, `python`, and `js`: +To release a new version, run the release script providing the new version following [semantic versioning](https://semver.org), it will bump the version in the `Cargo.toml` files, generate the changelog from commit messages, create a new tag, and push to GitHub: - ```bash - ./scripts/bump.sh 0.1.2 - ``` +```bash +./scripts/release.sh 0.1.2 +``` -2. Commit, push, and **create a new release on GitHub**. +!!! success "Automated release" -3. The `build.yml` workflow will automatically build artifacts (pip wheel, npm package), add them to the new release, and publish to public registries (crates.io, PyPI, NPM). + The `build.yml` workflow will automatically build artifacts (binaries, pip wheels, npm package), create a new release on GitHub, and add the generated artifacts to the new release. diff --git a/lib/docs/docs/index.md b/lib/docs/docs/index.md index 29775d4..813a91e 100644 --- a/lib/docs/docs/index.md +++ b/lib/docs/docs/index.md @@ -3,31 +3,41 @@ [![crates.io](https://img.shields.io/crates/v/curies.svg)](https://crates.io/crates/curies) [![PyPI](https://img.shields.io/pypi/v/curies-rs)](https://pypi.org/project/curies-rs/) [![npm](https://img.shields.io/npm/v/@biopragmatics/curies)](https://www.npmjs.com/package/@biopragmatics/curies) +[![Tests](https://github.com/biopragmatics/curies.rs/actions/workflows/test.yml/badge.svg)](https://github.com/biopragmatics/curies.rs/actions/workflows/test.yml) +[![Build](https://github.com/biopragmatics/curies.rs/actions/workflows/build.yml/badge.svg)](https://github.com/biopragmatics/curies.rs/actions/workflows/build.yml) -A cross-platform Rust library for idiomatic conversion between URIs and compact URIs (CURIEs). +A cross-platform library for idiomatic conversion between URIs and compact URIs (CURIEs). -Whether you're a developer looking to work with CURIEs (e.g. 
expand or compress) in your application, or a researcher seeking an efficient way to handle CURIEs, `curies` offers a suite of tools tailored to meet your needs. +Uniform resource identifiers (URIs) and compact URIs (CURIEs) have become the predominant syntaxes for identifying concepts in linked data applications. Therefore, efficient, faultless, and idiomatic conversion between them is a crucial low-level utility whose need is ubiquitous across many codebases. -## ✨ CURIEs management +[`curies`](https://curies.readthedocs.io/en/latest/api.html#module-curies) fills this need. This cross-platform package can be used by a variety of people: -- πŸ“₯ **Import converters** from JSON prefix maps or JSON-LD context, with helper functions for popular converters, such as `get_obo_converter()`, or create a custom converter programmatically. -- πŸ”— **Expand CURIEs** from their compressed form to URIs. -- πŸ—œοΈ **Compress URIs** to CURIEs. +1. **Data Scientist** - someone who consumes and modifies data to suit an analysis or application. For example, they might want to convert tabular data containing CURIEs into IRIs, translate into RDF, then query with SPARQL. +2. **Curator** - someone who creates data. For example, an ontologist may want to curate using CURIEs but have their toolchain 1) validate the syntax and semantics and 2) convert to IRIs for their data persistence +3. **Data Consumer** - someone who consumes data. This kind of user likely won’t interact with [`curies`](https://curies.readthedocs.io/en/latest/api.html#module-curies) directly, but will likely use tools that build on top of it. For example, someone using the Bioregistry resolution service uses this package’s expansion utilities indirectly. +4. **Software Developer** - someone who develops tools to support data creators, data consumers, and other software developers. 
For example, a software developer might want to make their toolchain more generic for loading, merging, and outputting prefix maps and extended prefix maps. -Example: +For many users, expansion (CURIE to URI) and contraction (URI to CURIE) are the two most important tools. Example: | CURIE | URI | | ----------- | ------------------------------------------------------------ | | `doid:1234` | [http://purl.obolibrary.org/obo/DOID_1234](http://purl.obolibrary.org/obo/DOID_1234) | +## ✨ Features + +- πŸ“₯ **Import converters** from JSON prefix maps or JSON-LD context, with helper functions for popular converters, such as `get_obo_converter()`, or create a custom converter programmatically. +- πŸ”— **Expand CURIEs** from their compressed form to URIs. +- πŸ—œοΈ **Compress URIs** to CURIEs. +- 🧩 **Standardize** prefixes, CURIEs, or URIs. + ## πŸ“¦οΈ Packaged for multiple interfaces This library is packaged for easy use across various interfaces and languages: - πŸ¦€ **Rust developers**: available as a Rust crate `curies` - 🐍 **Python programmers**: available as a Python pip package `curies-rs` -- 🌐 **Web developers**: available as a NPM package `@biopragmatics/curies`, compiled to [WebAssembly](https://webassembly.org/), for browser integrations with JavaScript, or NodeJS. +- 🌐 **JavaScript web developers**: available as a NPM package `@biopragmatics/curies`, compiled to [WebAssembly](https://webassembly.org/), for browser integrations with JavaScript, or NodeJS. - πŸ“ˆ **R data scientists**: soon available as a R package `curies` ## βš”οΈ Cross-platform support diff --git a/lib/docs/docs/javascript.md b/lib/docs/docs/javascript.md index db513bf..04f16b0 100644 --- a/lib/docs/docs/javascript.md +++ b/lib/docs/docs/javascript.md @@ -4,7 +4,7 @@ You can easily work with CURIEs in the browser or NodeJS, from JavaScript or TypeScript, with the [`@biopragmatics/curies`](https://www.npmjs.com/package/@biopragmatics/curies) NPM package. 
-## 📥️ Install +## 📥️ Installation Install the `npm` package (use `yarn` or `pnpm` if you prefer) to use it from your favorite framework: @@ -78,6 +78,10 @@ async function main() { main(); ``` +!!! example "More examples" + + Check out the [`curies.test.ts` file](https://github.com/biopragmatics/curies.rs/blob/main/js/tests/curies.test.ts) for more code examples. + ## 🦊 Use it in a browser When using in a client browser you will need to initialize the wasm binary with `await init()`, after that you can use the same functions as in the NodeJS environments. diff --git a/lib/docs/docs/python-devtools.md b/lib/docs/docs/python-devtools.md new file mode 100644 index 0000000..cf1a2db --- /dev/null +++ b/lib/docs/docs/python-devtools.md @@ -0,0 +1,140 @@ +# 🧰 Tools for Developers and Semantic Engineers + +## 🪄 Working with strings that might be a URI or a CURIE + +Sometimes, it’s not clear if a string is a CURIE or a URI. While the [SafeCURIE syntax](https://www.w3.org/TR/2010/NOTE-curie-20101216/#P_safe_curie) is intended to address this, it’s often overlooked. + +### ☑️ CURIE and URI Checks + +The first way to handle this ambiguity is to be able to check if the string is a CURIE or a URI. Therefore, each `Converter` comes with functions for checking if a string is a CURIE (`converter.is_curie()`) or a URI (`converter.is_uri()`) under its definition. 
+ +```python +from curies_rs import get_obo_converter + +converter = get_obo_converter() + +assert converter.is_curie("GO:1234567") +assert not converter.is_curie("http://purl.obolibrary.org/obo/GO_1234567") +# This is a valid CURIE, but not under this converter's definition +assert not converter.is_curie("pdb:2gc4") + +assert converter.is_uri("http://purl.obolibrary.org/obo/GO_1234567") +assert not converter.is_uri("GO:1234567") +# This is a valid URI, but not under this converter's definition +assert not converter.is_uri("http://proteopedia.org/wiki/index.php/2gc4") +``` + +### πŸ—œοΈ Standardized Expansion and Compression + +The `converter.expand_or_standardize()` function extends the CURIE expansion function to handle the situation where you might get passed a CURIE or a URI. If it’s a CURIE, expansions happen with the normal rules. If it’s a URI, it tries to standardize it. + +```python +from curies_rs import Converter + +converter = Converter.from_extended_prefix_map("""[{ + "prefix": "CHEBI", + "prefix_synonyms": ["chebi"], + "uri_prefix": "http://purl.obolibrary.org/obo/CHEBI_", + "uri_prefix_synonyms": ["https://identifiers.org/chebi:"] +}]""") + +# Expand CURIEs +assert converter.expand_or_standardize("CHEBI:138488") == 'http://purl.obolibrary.org/obo/CHEBI_138488' +assert converter.expand_or_standardize("chebi:138488") == 'http://purl.obolibrary.org/obo/CHEBI_138488' + +# standardize URIs +assert converter.expand_or_standardize("http://purl.obolibrary.org/obo/CHEBI_138488") == 'http://purl.obolibrary.org/obo/CHEBI_138488' +assert converter.expand_or_standardize("https://identifiers.org/chebi:138488") == 'http://purl.obolibrary.org/obo/CHEBI_138488' + +# Handle cases that aren't valid w.r.t. 
the converter +try: + converter.expand_or_standardize("missing:0000000") + converter.expand_or_standardize("https://example.com/missing:0000000") +except Exception as e: + print(e) +``` + +A similar workflow is implemented in `converter.compress_or_standardize()` for compressing URIs where a CURIE might get passed. + +```python +from curies_rs import Converter + +converter = Converter.from_extended_prefix_map("""[{ + "prefix": "CHEBI", + "prefix_synonyms": ["chebi"], + "uri_prefix": "http://purl.obolibrary.org/obo/CHEBI_", + "uri_prefix_synonyms": ["https://identifiers.org/chebi:"] +}]""") + +# Compress URIs +assert converter.compress_or_standardize("http://purl.obolibrary.org/obo/CHEBI_138488") == 'CHEBI:138488' +assert converter.compress_or_standardize("https://identifiers.org/chebi:138488") == 'CHEBI:138488' + +# standardize CURIEs +assert converter.compress_or_standardize("CHEBI:138488") == 'CHEBI:138488' +assert converter.compress_or_standardize("chebi:138488") == 'CHEBI:138488' + +# Handle cases that aren't valid w.r.t. the converter +try: + converter.compress_or_standardize("missing:0000000") + converter.compress_or_standardize("https://example.com/missing:0000000") +except Exception as e: + print(e) + print(type(e)) +``` + +## 🚚 Bulk operations + +You can use the `expand_list()` and `compress_list()` functions to process many URIs or CURIEs at once. + +For example, to create a new `URI` column in a pandas dataframe from a `CURIE` column: + +```python +import pandas as pd +from curies_rs import get_bioregistry_converter + +converter = get_bioregistry_converter() +df = pd.DataFrame({'CURIE': ['doid:1234', 'doid:5678', 'doid:91011']}) + +# Expand the list of CURIEs to URIs +df['URI'] = converter.expand_list(df['CURIE']) +print(df) +``` + +## 🧩 Integrating with [`rdflib`](https://rdflib.readthedocs.io/en/stable/apidocs/rdflib.html#module-rdflib) + +RDFLib is a pure Python package for manipulating RDF data. 
The following example shows how to bind the extended prefix map from a `Converter` to a graph ([`rdflib.Graph`](https://rdflib.readthedocs.io/en/stable/apidocs/rdflib.html#rdflib.Graph)). + +```python +import curies_rs, rdflib, rdflib.namespace, json + +converter = curies_rs.get_obo_converter() +g = rdflib.Graph() + +for prefix, uri_prefix in json.loads(converter.write_prefix_map()).items(): + g.bind(prefix, rdflib.Namespace(uri_prefix)) +``` + +A more flexible approach is to instantiate a namespace manager ([`rdflib.namespace.NamespaceManager`](https://rdflib.readthedocs.io/en/stable/apidocs/rdflib.namespace.html#rdflib.namespace.NamespaceManager)) and bind directly to that. + +```python +import curies_rs, rdflib, json + +converter = curies_rs.get_obo_converter() +namespace_manager = rdflib.namespace.NamespaceManager(rdflib.Graph()) + +for prefix, uri_prefix in json.loads(converter.write_prefix_map()).items(): + namespace_manager.bind(prefix, rdflib.Namespace(uri_prefix)) +``` + +URI references for use in RDFLib’s graph class can be constructed from CURIEs using a combination of `converter.expand()` and [`rdflib.URIRef`](https://rdflib.readthedocs.io/en/stable/apidocs/rdflib.html#rdflib.URIRef). + +```python +import curies_rs, rdflib + +converter = curies_rs.get_obo_converter() + +uri_ref = rdflib.URIRef(converter.expand("CHEBI:138488")) +``` + + diff --git a/lib/docs/docs/python.md b/lib/docs/docs/python.md index 922ae2e..bf7797b 100644 --- a/lib/docs/docs/python.md +++ b/lib/docs/docs/python.md @@ -1,10 +1,11 @@ # 🐍 Use from Python [![PyPI](https://img.shields.io/pypi/v/curies-rs)](https://pypi.org/project/curies-rs/) +[![PyPI - Python Version](https://img.shields.io/pypi/pyversions/curies-rs.svg?logo=python&label=Python&logoColor=silver)](https://pypi.org/project/curies-rs) You can easily work with `curies` from Python. 
-## πŸ“₯️ Install +## πŸ“₯️ Installation Install the `pip` package: @@ -34,21 +35,22 @@ uris = converter.expand_list(["DOID:1234", "doid:1235"]) assert converter.standardize_prefix("gomf") == "go" assert converter.standardize_curie("gomf:0032571") == "go:0032571" assert converter.standardize_uri("http://amigo.geneontology.org/amigo/term/GO:0032571") == "http://purl.obolibrary.org/obo/GO_0032571" - -# Get the list of prefixes or URI prefixes, argument include_synonyms default to False -prefixes_without_syn = converter.get_prefixes() -uri_prefixes_with_syn = converter.get_uri_prefixes(True) ``` -## πŸŒ€ Load a converter +## πŸŒ€ Loading a Context + +There are several ways to load a context with this package, including: -There are many ways to load a CURIE/URI converter. +1. pre-defined contexts +2. contexts encoded in the standard prefix map format +3. contexts encoded in the standard JSON-LD context format +4. contexts encoded in the extended prefix map format -### πŸ“¦ Import a predefined converter +### πŸ“¦ Loading a predefined context Easiest way to get started is to simply use one of the function available to import a converter from popular namespaces registries: -#### [Bioregistry](https://bioregistry.io/) converter +**[Bioregistry](https://bioregistry.io/) converter** ```python from curies_rs import get_bioregistry_converter @@ -56,7 +58,7 @@ from curies_rs import get_bioregistry_converter converter = get_bioregistry_converter() ``` -#### [OBO](http://obofoundry.org/) converter +**[OBO](http://obofoundry.org/) converter** ```python from curies_rs import get_obo_converter @@ -64,7 +66,7 @@ from curies_rs import get_obo_converter converter = get_obo_converter() ``` -#### [GO](https://geneontology.org/) converter +**[GO](https://geneontology.org/) converter** ```python from curies_rs import get_go_converter @@ -72,7 +74,7 @@ from curies_rs import get_go_converter converter = get_go_converter() ``` -#### [Monarch Initiative](https://monarchinitiative.org/) 
converter +**[Monarch Initiative](https://monarchinitiative.org/) converter** ```python from curies_rs import get_monarch_converter @@ -80,16 +82,7 @@ from curies_rs import get_monarch_converter converter = get_monarch_converter() ``` -### πŸ“‚ Load from file - -Converter can be loaded from a prefix map, an extended prefix map (which enables to provide more information for each prefix), or a JSON-LD context. - -!!! tip "Support URL" - - For each `Converter.from_` function you can either provide the file content, or the URL to the file as string. - - -#### Load from extended prefix map +### πŸ—ΊοΈ Loading Extended Prefix Maps Enable to provide prefix/URI synonyms and ID RegEx pattern for each record: @@ -126,7 +119,11 @@ extended_pm = """[ converter = Converter.from_extended_prefix_map(extended_pm) ``` -#### Load from prefix map +!!! tip "Support URL" + + For all `Converter.from_` functions you can either provide the file content, or the URL to the file as string. + +### πŸ“ Loading Prefix Maps A simple dictionary without synonyms information: @@ -141,7 +138,7 @@ prefix_map = """{ converter = Converter.from_prefix_map(prefix_map) ``` -#### Load from JSON-LD context +### πŸ“„ Loading JSON-LD contexts ```python from curies_rs import Converter @@ -164,7 +161,7 @@ from curies_rs import Converter converter = Converter.from_jsonld("https://purl.obolibrary.org/meta/obo_context.jsonld") ``` -#### Load from SHACL prefixes definition +### πŸ”— Loading SHACL prefixes definitions ```python from curies_rs import Converter @@ -181,24 +178,94 @@ shacl = """@prefix sh: . conv = Converter.from_shacl(shacl) ``` -### πŸ› οΈ Build the converter programmatically +## πŸ”Ž Introspecting on a Context + +After loading a context, it’s possible to get certain information out of the converter. 
For example, if you want to get all of the CURIE prefixes from the converter, you can use `converter.get_prefixes()`: + +```python +from curies_rs import get_bioregistry_converter + +converter = get_bioregistry_converter() -Create an empty `Converter`, and populate it with `Record`: +prefixes = converter.get_prefixes() +assert 'chebi' in prefixes +assert 'CHEBIID' not in prefixes, "No synonyms are included by default" + +prefixes = converter.get_prefixes(include_synonyms=True) +assert 'chebi' in prefixes +assert 'CHEBIID' in prefixes +``` + +Similarly, the URI prefixes can be extracted with `Converter.get_uri_prefixes()` like in: + +```python +from curies_rs import get_bioregistry_converter + +converter = get_bioregistry_converter() + +uri_prefixes = converter.get_uri_prefixes() +assert 'http://purl.obolibrary.org/obo/CHEBI_' in uri_prefixes +assert 'https://bioregistry.io/chebi:' not in uri_prefixes, "No synonyms are included by default" + +uri_prefixes = converter.get_uri_prefixes(include_synonyms=True) +assert 'http://purl.obolibrary.org/obo/CHEBI_' in uri_prefixes +assert 'https://bioregistry.io/chebi:' in uri_prefixes +``` + +It’s also possible to get a bijective prefix map, i.e., a dictionary from primary CURIE prefixes to primary URI prefixes. This is useful for compatibility with legacy systems which assume simple prefix maps. This can be done by parsing the string returned by `converter.write_prefix_map()` with `json.loads`, like in the following: + +```python +import json +from curies_rs import get_bioregistry_converter + +converter = get_bioregistry_converter() + +prefix_map = json.loads(converter.write_prefix_map()) +assert prefix_map['chebi'] == 'http://purl.obolibrary.org/obo/CHEBI_' +``` + +## 🛠️ Modifying a Context + +### 🔨 Incremental Converters + +New data can be added to an existing converter with either `converter.add_prefix()` or `converter.add_record()`.
For example, a CURIE and URI prefix for HGNC can be added to the OBO Foundry converter with the following: + +```python +from curies_rs import get_obo_converter + +converter = get_obo_converter() +converter.add_prefix("hgnc", "https://bioregistry.io/hgnc:") +``` + +Similarly, an empty converter can be instantiated with `Converter()` and populated one record at a time (note this currently does not allow for adding synonyms separately): ```python from curies_rs import Converter, Record rec1 = Record("doid", "http://purl.obolibrary.org/obo/DOID_", ["DOID"], ["https://identifiers.org/doid/"]) -print(rec1.dict()) +# print(rec1.dict()) converter = Converter() converter.add_record(rec1) -converter.add_prefix("obo", "http://purl.obolibrary.org/obo/") ``` -### ⛓️ Chain converters +A more flexible version of this operation first involves constructing a `Record` object: -Chain together multiple converters: +```python +from curies_rs import get_obo_converter, Record + +converter = get_obo_converter() +record = Record(prefix="hgnc", uri_prefix="https://bioregistry.io/hgnc:") +converter.add_record(record) +``` + +By default, both of these operations will fail if the new content conflicts with existing content. If desired, the `merge` argument can be set to true to enable merging. Further, checking for conflicts and merging can be made to be case insensitive by setting `case_sensitive` to false. + +Such a merging strategy is the basis for wholesale merging of converters, described below. + +### ⛓️ Chaining and merging + +Chaining multiple converters together prioritizes them based on the order given. Therefore, if two prefix maps having the same prefix but different URI prefixes are given, the first is retained.
The second is retained as a synonym ```python from curies_rs import get_obo_converter, get_go_converter, get_monarch_converter @@ -208,12 +275,13 @@ converter = ( .chain(get_go_converter()) .chain(get_monarch_converter()) ) -print(len(converter)) ``` -## βœ’οΈ Serialize a converter + + +## βœ’οΈ Writing a Context -Output the converter prefix map as a string in different serialization format: +Write the converter prefix map as a string in different serialization format: ```python from curies_rs import get_bioregistry_converter diff --git a/lib/docs/docs/reconciliation.md b/lib/docs/docs/reconciliation.md new file mode 100644 index 0000000..8f16e70 --- /dev/null +++ b/lib/docs/docs/reconciliation.md @@ -0,0 +1,205 @@ +# 🀝 Reconciliation + +Reconciliation is the high-level process of modifying an (extended) prefix map with domain-specific rules. This is important as it allows for building on existing (extended) prefix maps without having to start from scratch. Further, storing the rules to transform an existing prefix map allows for high-level discussion about the differences and their reasons. + +As a specific example, the [Bioregistry](https://bioregistry.io/) uses `snomedct` as a preferred prefix for the Systematized Nomenclature of Medicine - Clinical Terms (SNOMED-CT). The OBO Foundry community prefers to use `SCTID` as the preferred prefix for this resource. Rather than maintaining a different extended prefix map than the Bioregistry, the OBO Foundry community could enumerate its preferred modifications to the base (extended) prefix map, then create its prefix map by transforming the Bioregistry's. + +Similarly, a consumer of the OBO Foundry prefix map who's implementing a resolver might want to override the URI prefix associated with the [Ontology of Vaccine Adverse Events (OVAE)](https://bioregistry.io/registry/ovae) to point towards the Ontology Lookup Service instead of the default OntoBee. 
+ +There are two operations that are useful for transforming an existing (extended) prefix map: + +1. **Remapping** is when a given CURIE prefix or URI prefix is replaced with another. See `curies.remap_curie_prefixes` and `curies.remap_uri_prefixes`. +2. **Rewiring** is when the correspondence between a CURIE prefix and URI prefix is updated. See `curies.rewire`. + +Throughout this document, we're going to use the following extended prefix map as an example to illustrate how these operations work from a high level. + +```json +[ + {"prefix": "a", "uri_prefix": "https://example.org/a/", "prefix_synonyms": ["a1"]}, + {"prefix": "b", "uri_prefix": "https://example.org/b/"} +] +``` + +## πŸ”„ CURIE Prefix Remapping + +CURIE prefix remapping is configured by a dictionary from existing CURIE prefixes to new CURIE prefixes. The following rules are applied for each pair of old/new prefixes: + +### 1. New prefix exists + +If the new prefix appears as a prefix synonym in the record corresponding to the old prefix, they are swapped. This means applying the CURIE prefix remapping `{"a": "a1"}` results in the following + +```json +[ + {"prefix": "a1", "uri_prefix": "https://example.org/a/", "prefix_synonyms": ["a"]}, + {"prefix": "b", "uri_prefix": "https://example.org/b/"} +] +``` + +If the new prefix appears as a preferred prefix or prefix synonym for any other record, one of two things can happen: + +1. Do nothing (lenient) +2. Raise an exception (strict) + +This means applying the CURIE prefix remapping `{"a": "b"}` results in either no change or an exception being raised. + +### 2. New prefix doesn't exist, old prefix exists + +If the old prefix appears in a record in the extended prefix map as a preferred prefix: + +1. Replace the record's preferred prefix with the new prefix +2. 
Add the record's old preferred prefix to the record's prefix synonyms + +This means applying the CURIE prefix remapping `{"a": "c"}` results in the following + +```json +[ + {"prefix": "c", "uri_prefix": "https://example.org/a/", "prefix_synonyms": ["a", "a1"]}, + {"prefix": "b", "uri_prefix": "https://example.org/b/"} +] +``` + +Similarly, if the old prefix appears in a record in the extended prefix map as a prefix synonym, do the same. This means applying the CURIE prefix remapping `{"a1": "c"}` results in the following + +```json +[ + {"prefix": "c", "uri_prefix": "https://example.org/a/", "prefix_synonyms": ["a", "a1"]}, + {"prefix": "b", "uri_prefix": "https://example.org/b/"} +] +``` + +### 3. New prefix doesn't exist, old prefix doesn't exist + +If neither the old prefix nor new prefix appear in the extended prefix maps, one of two things can happen: + +1. Do nothing (lenient) +2. Raise an exception (strict) + +## πŸ” Transitive CURIE Prefix Remapping + +There's a special case of CURIE prefix remapping where one prefix is supposed to overwrite another. For example, in the Bioregistry, the [Gene Expression Omnibus](https://www.ncbi.nlm.nih.gov/geo/) is given the prefix `geo` and the [Geographical Entity Ontology](https://obofoundry.org/ontology/geo) is given the prefix `geogeo`. OBO Foundry users will want to rename the Gene Expression Omnibus record to something else like `ncbi.geo` and rename `geogeo` to `geo`. Taken by themselves, these two operations would not accomplish the desired results: + +1. Remapping with `{"geo": "ncbi.geo"}` would retain `geo` as a CURIE prefix synonym +2. Remapping with `{"geogeo": "geo"}` would not change the mapping as `geo` is already part of a different record. + +The `curies.remap_curie_prefixes` implements special logic to identify scenarios where two (or more) remappings are dependent (we're calling these *transitive remappings*) and apply them in the expected way. + + + +!!! 
note "Not overwriting" + +    This is not the same as an "overwrite" which would delete the original `geo` record. This package expects that you give a new CURIE prefix to all "overwritten" records such that no records are lost. + +!!! warning "Transitive remapping" + + Primary prefixes must be used when doing transitive remappings. Handling synonyms proved to be too complex. Therefore, if you use a CURIE prefix remapping like in the following, you will get an exception. + + ```python + converter = Converter([ + Record( + prefix="geo", + prefix_synonyms=["ggg"], + uri_prefix="https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=", + ), + Record(prefix="geogeo", uri_prefix="http://purl.obolibrary.org/obo/GEO_"), + ]) + curie_remapping = {"ggg": "ncbi.geo", "geogeo": "geo"} + ``` + +## ⛓️‍💥 URI Prefix Remapping + +URI prefix remapping is configured by a mapping from existing URI prefixes to new URI prefixes. The rules work exactly the same as with CURIE prefix remapping, but for the `curies.Record.uri_prefix` and `curies.Record.uri_prefix_synonyms` fields. + +## 🔀 Rewiring + +Rewiring is configured by a dictionary from existing CURIE prefixes to new URI prefixes. The following rules are applied for each pair of CURIE prefix/URI prefix: + +### CURIE prefix exists, URI prefix doesn't exist + +If the CURIE prefix appears as either the preferred prefix or a prefix synonym, do the following + +1. Replace the record's preferred URI prefix with the new URI prefix +2.
Add the record's old preferred URI prefix to the record's URI prefix synonyms + +This means applying the rewiring `{"b": "https://example.org/b_new/"}` results in the following + +```json +[ + {"prefix": "a", "uri_prefix": "https://example.org/a/", "prefix_synonyms": ["a1"]}, + {"prefix": "b", "uri_prefix": "https://example.org/b_new/", "uri_prefix_synonyms": ["https://example.org/b/"]} +] +``` + +### CURIE prefix exists, URI prefix exists + +If the CURIE prefix and URI prefix both appear in the extended prefix map, there are three possibilities. + +1. If they are in the same record and the URI prefix is already the preferred prefix, then nothing needs to be done. This means that the rewiring `{"a": "https://example.org/a/"}` results in no change. + +2. If they are in the same record and the URI prefix is a URI prefix synonym, then the URI prefix synonym is swapped with the preferred URI prefix. This means if we have the following extended prefix map + + ```json + [ + {"prefix": "a", "uri_prefix": "https://example.org/a/", "uri_prefix_synonyms": ["https://example.org/a1/"]} + ] + ``` + + and apply `{"a": "https://example.org/a1/"}`, we get the following result + + ```json + [ + {"prefix": "a", "uri_prefix": "https://example.org/a1/", "uri_prefix_synonyms": ["https://example.org/a/"]} + ] + ``` + +3. If they appear in different records, then either do nothing (lenient) or raise an exception (strict) + +### CURIE prefix doesn't exist, URI prefix doesn't exist + +If the CURIE prefix doesn't appear in the extended prefix map, then nothing is done. Adding fully novel content to the extended prefix map can be done with other operations such as `curies.Converter.add_record` or `curies.chain`. + +!!! note "Ongoing discussion" + + There is discussion whether this case could be extended with the following: if the CURIE prefix doesn't exist in the extended prefix map, then the pair is simply appended.
This means applying the rewiring `{"c": "https://example.org/c/"}` results in the following + + ```json + [ + {"prefix": "a", "uri_prefix": "https://example.org/a/", "prefix_synonyms": ["a1"]}, + {"prefix": "b", "uri_prefix": "https://example.org/b/"}, + {"prefix": "c", "uri_prefix": "https://example.org/c/"} + ] + ``` + + This is not included in the base implementation because it conflates the job of "rewiring" with appending to the extended prefix map. + +### CURIE prefix doesn't exist, URI prefix exists + +If the URI prefix appears as either a preferred URI prefix or as a URI prefix synonym in any record in the extended prefix map, do one of the following: + +1. Do nothing (lenient) +2. Raise an exception (strict) diff --git a/lib/docs/docs/rust.md b/lib/docs/docs/rust.md index 208b0c2..1c76e3d 100644 --- a/lib/docs/docs/rust.md +++ b/lib/docs/docs/rust.md @@ -2,7 +2,13 @@ [![crates.io](https://img.shields.io/crates/v/curies.svg)](https://crates.io/crates/curies) -## 🛠️ General usage +## 📥️ Installation + +```bash +cargo add curies +``` + +## 🚀 Usage You can use the Rust crate to work with CURIEs: import converters, compress URIs, expand CURIEs.
@@ -44,9 +50,9 @@ rt.block_on(async { }).unwrap(); ``` -## 🏗️ Build a converter +## 🛠️ Manipulate converters and records -You can also build a `Converter` programmatically from `Record`: +You can also build a `Converter` programmatically from `Record` structs: ```rust extern crate curies; diff --git a/lib/docs/includes/abbreviations.md b/lib/docs/includes/abbreviations.md index 8e2779c..09c48fd 100644 --- a/lib/docs/includes/abbreviations.md +++ b/lib/docs/includes/abbreviations.md @@ -39,6 +39,7 @@ *[URLs]: Uniform Resource Locators *[URI]: Uniform Resource Identifier *[URIs]: Uniform Resource Identifiers +*[IRIs]: Internationalized Resource Identifiers *[CURIE]: Compact Uniform Resource Identifier *[CURIEs]: Compact Uniform Resource Identifiers *[ID]: Identifier @@ -67,3 +68,5 @@ *[EPMs]: Extended Prefix Maps *[ChEBI]: Chemical Entities of Biological Interest *[NCBI]: The National Center for Biotechnology Information (USA) +*[SNOMED-CT]: Systematized Nomenclature of Medicine - Clinical Terms +*[VSCode]: Visual Studio Code diff --git a/lib/docs/mkdocs.yml b/lib/docs/mkdocs.yml index 843e27b..c1a2980 100644 --- a/lib/docs/mkdocs.yml +++ b/lib/docs/mkdocs.yml @@ -7,6 +7,28 @@ repo_url: https://github.com/biopragmatics/curies.rs edit_uri: "edit/main/docs/" copyright: Copyright © 2024 Charles Tapley Hoyt & Vincent Emonet +# Find icons: https://fontawesome.com/icons/ +# https://squidfunk.github.io/mkdocs-material/reference/icons-emojis/ +nav: + - Docs: + - Introduction: index.md + - Data structures: struct.md + # - Reconciliation: reconciliation.md + - Architecture details: architecture.md + - Contributing: contributing.md + - Rust: + - Use from Rust: rust.md + - Python: + - Use from Python: python.md + - Tools for Developers and Semantic Engineers: python-devtools.md + - JavaScript: + - Use from JavaScript: javascript.md + - Example bare HTML: javascript-example-html.md + - Example JS framework: javascript-example-framework.md + - R: + - Use from R: r.md + # -
Issues: https://github.com/biopragmatics/curies.rs/issues" target="_blank + theme: name: "material" favicon: assets/logo.png @@ -50,25 +72,6 @@ theme: # - navigation.instant # - content.tabs.link -# Find icons: https://fontawesome.com/icons/ -# https://squidfunk.github.io/mkdocs-material/reference/icons-emojis/ -nav: - - Docs: - - Introduction: index.md - - Use from Rust: rust.md - - Data structures: struct.md - - Architecture details: architecture.md - - Contributing: contributing.md - - Python: - - Use from Python: python.md - - JavaScript: - - Use from JavaScript: javascript.md - - Example bare HTML: javascript-example-html.md - - Example JS framework: javascript-example-framework.md - - R: - - Use from R: r.md - # - Issues: https://github.com/biopragmatics/curies.rs/issues" target="_blank - plugins: - search - open-in-new-tab @@ -81,10 +84,6 @@ plugins: show_source: true # custom_templates: templates -watch: - - ../src - - docs - markdown_extensions: - admonition # Supported admonititions: https://squidfunk.github.io/mkdocs-material/reference/admonitions/#supported-types @@ -95,6 +94,9 @@ markdown_extensions: - pymdownx.superfences - pymdownx.tabbed: alternate_style: true + # slugify: !!python/object/apply:pymdownx.slugs.slugify + # kwds: + # case: lower - pymdownx.details - pymdownx.extra - abbr @@ -107,7 +109,6 @@ markdown_extensions: - smarty # - md_in_html - extra_css: - assets/custom.css @@ -125,3 +126,7 @@ extra: link: https://github.com/biopragmatics # - icon: fontawesome/brands/docker # link: https://github.com/biopragmatics/curies.rs/pkgs/container/curies.rs + +watch: + - ../src + - docs diff --git a/lib/src/api.rs b/lib/src/api.rs index 3aabd60..3d795ee 100644 --- a/lib/src/api.rs +++ b/lib/src/api.rs @@ -255,19 +255,25 @@ impl Converter { let shacl_ns = Namespace::new("http://www.w3.org/ns/shacl#")?; // Iterate over triples that match the SHACL prefix and namespace pattern for q_prefix in graph.quads_matching(Any, [shacl_ns.get("prefix")?], Any, Any) 
{ - for q_namespace in + for q_ns in graph.quads_matching([q_prefix?.s()], [shacl_ns.get("namespace")?], Any, Any) { converter.add_prefix( q_prefix? .o() .lexical_form() - .ok_or(CuriesError::InvalidFormat("Term".to_string()))? + .ok_or(CuriesError::InvalidFormat(format!( + "Prefix term in SHACL graph {:?}", + q_prefix?.o() + )))? .as_ref(), - q_namespace? + q_ns? .o() .lexical_form() - .ok_or(CuriesError::InvalidFormat("Term".to_string()))? + .ok_or(CuriesError::InvalidFormat(format!( + "Namespace term in SHACL graph {:?}", + q_ns?.o() + )))? .as_ref(), )?; } @@ -357,9 +363,11 @@ impl Converter { pub fn write_shacl(&self) -> Result { let mut graph = LightGraph::new(); let shacl_ns = Namespace::new("http://www.w3.org/ns/shacl#")?; + let declare_subject = BnodeId::new_unchecked("declareNode".to_string()); for (i, arc_record) in self.records.iter().enumerate() { let record = Arc::clone(arc_record); let subject = BnodeId::new_unchecked(format!("{}", i)); + graph.insert(&declare_subject, shacl_ns.get("declare")?, &subject)?; graph.insert(&subject, shacl_ns.get("prefix")?, record.prefix.as_str())?; graph.insert( &subject, @@ -568,7 +576,9 @@ impl Converter { record .uri_prefix_synonyms .iter() - .find_map(|synonym| uri.strip_prefix(synonym)) + .filter(|synonym| uri.starts_with(&**synonym)) + .max_by_key(|synonym| synonym.len()) // Get longest first + .and_then(|synonym| uri.strip_prefix(synonym)) }) .ok_or_else(|| CuriesError::NotFound(uri.to_string()))?; self.validate_id(id, record)?; @@ -608,6 +618,94 @@ impl Converter { .collect() } + /// Checks if a given string is a valid CURIE according to the current `Converter` + /// + /// # Examples + /// + /// ``` + /// use curies::Converter; + /// + /// let mut converter = Converter::default(); + /// converter.add_prefix("doid", "http://purl.obolibrary.org/obo/DOID_").unwrap(); + /// + /// assert_eq!(converter.is_curie("doid:1234"), true); + /// assert_eq!(converter.is_curie("go:0001"), false); + /// ``` + pub fn 
is_curie(&self, curie: &str) -> bool { + self.expand(curie).is_ok() + } + + /// Checks if a given string is a valid URI according to the current `Converter` + /// + /// # Examples + /// + /// ``` + /// use curies::Converter; + /// + /// let mut converter = Converter::default(); + /// converter.add_prefix("doid", "http://purl.obolibrary.org/obo/DOID_").unwrap(); + /// + /// assert_eq!(converter.is_uri("http://purl.obolibrary.org/obo/DOID_1234"), true); + /// assert_eq!(converter.is_uri("http://purl.obolibrary.org/obo/GO_0001"), false); + /// ``` + pub fn is_uri(&self, uri: &str) -> bool { + self.compress(uri).is_ok() + } + + // TODO: Error for GO because those 2 synonyms are added: http://amigo.geneontology.org/amigo/term/GO: and http://amigo.geneontology.org/amigo/term/ + // And sometime compress picks the shorter one + // So we need to make sure the synonyms are not added if they are already in the trie + + /// Attempts to compress a URI to a CURIE, or standardize it if it's already a CURIE. + /// + /// # Examples + /// + /// ```rust + /// use curies::sources::get_bioregistry_converter; + /// use tokio::runtime; + /// + /// let rt = runtime::Runtime::new().expect("Failed to create Tokio runtime"); + /// let converter = rt.block_on(async { + /// get_bioregistry_converter().await + /// }).expect("Failed to create the converter"); + /// + /// assert_eq!(converter.compress_or_standardize("http://amigo.geneontology.org/amigo/term/GO:0032571").unwrap(), "go:0032571".to_string()); + /// assert_eq!(converter.compress_or_standardize("gomf:0032571").unwrap(), "go:0032571".to_string()); + /// assert!(converter.compress_or_standardize("http://purl.obolibrary.org/UNKNOWN_12345").is_err()); + /// ``` + pub fn compress_or_standardize(&self, input: &str) -> Result { + if self.is_curie(input) { + self.standardize_curie(input) + } else { + self.compress(input) + } + } + + /// Attempts to expand a CURIE to a URI, or standardize it if it's already a URI. 
+ /// + /// # Examples + /// + /// ```rust + /// use curies::sources::get_bioregistry_converter; + /// use tokio::runtime; + /// + /// let rt = runtime::Runtime::new().expect("Failed to create Tokio runtime"); + /// let converter = rt.block_on(async { + /// get_bioregistry_converter().await + /// }).expect("Failed to create the converter"); + /// + /// assert_eq!(converter.expand_or_standardize("http://amigo.geneontology.org/amigo/term/GO:0032571").unwrap(), "http://purl.obolibrary.org/obo/GO_0032571".to_string()); + /// assert_eq!(converter.expand_or_standardize("gomf:0032571").unwrap(), "http://purl.obolibrary.org/obo/GO_0032571".to_string()); + /// assert!(converter.expand_or_standardize("http://purl.obolibrary.org/UNKNOWN_12345").is_err()); + /// ``` + pub fn expand_or_standardize(&self, input: &str) -> Result { + if self.is_curie(input) { + Ok(self.expand(input)?) + } else { + Ok(self.standardize_uri(input)?) + } + } + /// Get the standard prefix for a given prefix /// /// # Examples diff --git a/lib/src/error.rs b/lib/src/error.rs index 8e23052..8623bde 100644 --- a/lib/src/error.rs +++ b/lib/src/error.rs @@ -17,9 +17,9 @@ pub enum CuriesError { InvalidFormat(String), DuplicateRecord(String), Utf8(String), + StdIo(String), SerdeJson(String), Reqwest(String), - StdIo(String), } impl Error for CuriesError {} diff --git a/lib/tests/curies_test.rs b/lib/tests/curies_test.rs index aded2fc..b924f79 100644 --- a/lib/tests/curies_test.rs +++ b/lib/tests/curies_test.rs @@ -46,7 +46,7 @@ fn new_empty_converter() -> Result<(), Box> { assert!(converter.write_shacl()?.starts_with("PREFIX")); // println!("{:?}", converter.write_extended_prefix_map()); // println!("{:?}", converter.write_jsonld()); - // println!("{:?}", converter.write_shacl()); + println!("{:?}", converter.write_shacl()); // Find Record by prefix or URI assert_eq!(converter.find_by_prefix("doid")?.prefix, "doid"); @@ -252,5 +252,11 @@ async fn chain_converters() -> Result<(), Box> { ); 
assert!(Converter::chain(vec![]).is_err()); // assert!(converter.delete_record("Wrong").is_err()); + assert!(converter + .expand_or_standardize("http://purl.obolibrary.org/UNKNOWN_12345") + .is_err()); + assert!(converter + .compress_or_standardize("http://purl.obolibrary.org/UNKNOWN_12345") + .is_err()); Ok(()) } diff --git a/python/requirements.txt b/python/requirements.txt index 56bbcc9..bff3aba 100644 --- a/python/requirements.txt +++ b/python/requirements.txt @@ -2,3 +2,5 @@ maturin pre-commit pytest mktestdocs +pandas +rdflib diff --git a/python/src/api.rs b/python/src/api.rs index 276c514..a38d59b 100644 --- a/python/src/api.rs +++ b/python/src/api.rs @@ -12,7 +12,6 @@ use serde::{Deserialize, Serialize}; use tokio::runtime::Runtime; #[pyclass(name = "Record", module = "curies_rs")] -// #[pyclass(extends=Record, name = "Record", module = "curies_rs")] #[derive(Debug, Clone, Serialize, Deserialize)] pub struct RecordPy { record: Record, @@ -21,7 +20,7 @@ pub struct RecordPy { #[pymethods] impl RecordPy { #[new] - #[pyo3(text_signature = "(prefix, uri_prefix, prefix_synonyms, uri_prefix_synonyms)")] + #[pyo3(text_signature = "(prefix, uri_prefix, prefix_synonyms=[], uri_prefix_synonyms=[])")] fn new( prefix: String, uri_prefix: String, @@ -43,7 +42,7 @@ impl RecordPy { } // Return the Record as a python dictionary - #[pyo3(text_signature = "($self)")] + #[pyo3(text_signature = "()")] fn dict(&self, py: Python<'_>) -> PyResult { pythonize(py, &self.record).map_err(|e| { PyErr::new::(format!("Error converting struct Record to dict: {e}")) @@ -158,7 +157,7 @@ impl ConverterPy { } /// Add a record to the `Converter` - #[pyo3(text_signature = "($self, record)")] + #[pyo3(text_signature = "(record)")] fn add_record(&mut self, record: RecordPy) -> PyResult<()> { self.converter .add_record(record.record) @@ -166,7 +165,7 @@ impl ConverterPy { } /// Add a prefix/namespace to the `Converter` - #[pyo3(text_signature = "($self, prefix, namespace)")] + 
#[pyo3(text_signature = "(prefix, namespace)")] fn add_prefix(&mut self, prefix: String, namespace: String) -> PyResult<()> { self.converter .add_prefix(&prefix, &namespace) @@ -174,7 +173,7 @@ impl ConverterPy { } /// Compress a URI - #[pyo3(text_signature = "($self, uri)")] + #[pyo3(text_signature = "(uri)")] fn compress(&self, uri: String) -> PyResult { self.converter .compress(&uri) @@ -182,7 +181,7 @@ impl ConverterPy { } /// Expand a CURIE - #[pyo3(text_signature = "($self, curie)")] + #[pyo3(text_signature = "(curie)")] fn expand(&self, curie: String) -> PyResult { self.converter .expand(&curie) @@ -190,21 +189,21 @@ impl ConverterPy { } /// Expand a list of CURIEs - #[pyo3(text_signature = "($self, curies)")] + #[pyo3(text_signature = "(curies)")] fn expand_list(&self, curies: Vec) -> Vec> { self.converter .expand_list(curies.iter().map(|s| s.as_str()).collect()) } /// Compress a list of URIs - #[pyo3(text_signature = "($self, uris)")] + #[pyo3(text_signature = "(uris)")] fn compress_list(&self, uris: Vec) -> Vec> { self.converter .compress_list(uris.iter().map(|s| s.as_str()).collect()) } /// Standardize prefix - #[pyo3(text_signature = "($self, prefix)")] + #[pyo3(text_signature = "(prefix)")] fn standardize_prefix(&self, prefix: String) -> PyResult { self.converter .standardize_prefix(&prefix) @@ -212,7 +211,7 @@ impl ConverterPy { } /// Standardize a CURIE - #[pyo3(text_signature = "($self, curie)")] + #[pyo3(text_signature = "(curie)")] fn standardize_curie(&self, curie: String) -> PyResult { self.converter .standardize_curie(&curie) @@ -220,27 +219,55 @@ impl ConverterPy { } /// Standardize a URI - #[pyo3(text_signature = "($self, uri)")] + #[pyo3(text_signature = "(uri)")] fn standardize_uri(&self, uri: String) -> PyResult { self.converter .standardize_uri(&uri) .map_err(|e| PyErr::new::(e.to_string())) } - #[pyo3(text_signature = "($self, include_synonyms)")] + /// Checks if a given string is a valid URI according to the current `Converter` + 
#[pyo3(text_signature = "(uri)")] + fn is_uri(&self, uri: String) -> bool { + self.converter.is_uri(&uri) + } + + /// Checks if a given string is a valid CURIE according to the current `Converter` + #[pyo3(text_signature = "(curie)")] + fn is_curie(&self, curie: String) -> bool { + self.converter.is_curie(&curie) + } + + /// Attempts to compress a URI to a CURIE, or standardize it if it's already a CURIE. + #[pyo3(text_signature = "(input)")] + fn compress_or_standardize(&self, input: String) -> PyResult { + self.converter + .compress_or_standardize(&input) + .map_err(|e| PyErr::new::(e.to_string())) + } + + /// Attempts to expand a CURIE to a URI, or standardize it if it's already a URI. + #[pyo3(text_signature = "(input)")] + fn expand_or_standardize(&self, input: String) -> PyResult { + self.converter + .expand_or_standardize(&input) + .map_err(|e| PyErr::new::(e.to_string())) + } + + #[pyo3(text_signature = "(include_synonyms=False)")] fn get_prefixes(&self, include_synonyms: Option) -> Vec { self.converter .get_prefixes(include_synonyms.unwrap_or(false)) } - #[pyo3(text_signature = "($self, include_synonyms)")] + #[pyo3(text_signature = "(include_synonyms=False)")] fn get_uri_prefixes(&self, include_synonyms: Option) -> Vec { self.converter .get_uri_prefixes(include_synonyms.unwrap_or(false)) } /// Chain with another `Converter` - #[pyo3(text_signature = "($self, converter)")] + #[pyo3(text_signature = "(converter)")] fn chain(&self, converter: &ConverterPy) -> PyResult { Converter::chain(vec![self.converter.clone(), converter.converter.clone()]) .map(|converter| ConverterPy { converter }) @@ -248,13 +275,13 @@ impl ConverterPy { } /// Write the `Converter` as a simple prefix map JSON - #[pyo3(text_signature = "($self)")] + #[pyo3(text_signature = "()")] fn write_prefix_map(&self) -> String { format!("{:?}", self.converter.write_prefix_map()) } /// Write the `Converter` as a extended prefix map JSON - #[pyo3(text_signature = "($self)")] + #[pyo3(text_signature 
= "()")] fn write_extended_prefix_map(&self) -> PyResult { Ok((self .converter @@ -264,12 +291,12 @@ impl ConverterPy { } /// Write the `Converter` prefix map as JSON-LD context - #[pyo3(text_signature = "($self)")] + #[pyo3(text_signature = "()")] fn write_jsonld(&self) -> String { format!("{}", self.converter.write_jsonld()) } - #[pyo3(text_signature = "($self)")] + #[pyo3(text_signature = "()")] fn write_shacl(&self) -> PyResult { self.converter .write_shacl() diff --git a/scripts/bump.sh b/scripts/bump.sh deleted file mode 100755 index fb71bd5..0000000 --- a/scripts/bump.sh +++ /dev/null @@ -1,16 +0,0 @@ -#!/usr/bin/env bash -set -e - -# Check if version argument is provided -if [ "$#" -ne 1 ]; then - echo "Usage: $0 " - exit 1 -fi - -new_version=$1 - -sed -i "s/^version = \"[0-9]*\.[0-9]*\.[0-9]*\"\$/version = \"$new_version\"/" "Cargo.toml" -sed -i "s/curies = { version = \"[0-9]*\.[0-9]*\.[0-9]*\"/curies = { version = \"$new_version\"/" "Cargo.toml" -echo "🏷️ Updated version in Cargo.toml" - -gmsg "🏷️ Bump to $new_version" || true diff --git a/scripts/docs.sh b/scripts/docs.sh index 2aedaad..46811d9 100755 --- a/scripts/docs.sh +++ b/scripts/docs.sh @@ -8,7 +8,7 @@ if [ ! 
-d ".venv" ]; then python -m venv .venv fi -echo "Activating virtual environment" +echo "βš™οΈ Activating virtual environment" source .venv/bin/activate pip install -q -r lib/docs/requirements.txt diff --git a/scripts/install-dev.sh b/scripts/install-dev.sh index f008495..518ef21 100755 --- a/scripts/install-dev.sh +++ b/scripts/install-dev.sh @@ -10,7 +10,7 @@ pip install -r lib/docs/requirements.txt if [ "$(expr substr $(uname -s) 1 5)" == "Linux" ]; then echo "Installing Linux specific dependency" - maturin[patchelf] + pip install "maturin[patchelf]" fi # On MacOS you might need to setup the default CRAN mirror @@ -19,6 +19,6 @@ fi rustup update rustup toolchain install nightly # For tarpaulin -cargo install wasm-pack cargo-tarpaulin cargo-deny cargo-outdated +cargo install wasm-pack cargo-tarpaulin cargo-deny cargo-outdated git-cliff pre-commit install diff --git a/scripts/release.sh b/scripts/release.sh new file mode 100755 index 0000000..6507bac --- /dev/null +++ b/scripts/release.sh @@ -0,0 +1,34 @@ +#!/usr/bin/env bash +set -e + +# Script to bump version in Cargo.toml, update CHANGELOG.md and create a new tag + +# Check if version argument is provided +if [ "$#" -ne 1 ]; then + echo "Usage: $0 " + exit 1 +fi +new_version=$1 + +echo "" +echo " πŸ”οΈ Update version in Cargo.toml" +echo "" + + +sed -i "s/^version = \"[0-9]*\.[0-9]*\.[0-9]*\"\$/version = \"$new_version\"/" "Cargo.toml" +sed -i "s/curies = { version = \"[0-9]*\.[0-9]*\.[0-9]*\"/curies = { version = \"$new_version\"/" "Cargo.toml" +echo "πŸ”Ό Updated version in Cargo.toml" + +git cliff -o CHANGELOG.md --tag $new_version +git add Cargo.toml */Cargo.toml CHANGELOG.md +git commit -S -m "chore: Bump version to $new_version" +git push + +echo "" +echo " 🏷️ Create and push tag" +echo "" +git tag -a v$new_version -m "v$new_version" +git push origin v$new_version + +echo "" +echo " πŸŽ‰ Version $new_version released"