Skip to content

Commit

Permalink
Add check for duplicate synonyms when add_record. Add the Converter::…
Browse files Browse the repository at this point in the history
…chain() function to merge multiple converters in 1. Add function to delete a Record. Move the trie_builder out of the Converter struct. Update readme
  • Loading branch information
vemonet committed Dec 19, 2023
1 parent 1e581e4 commit 9158193
Show file tree
Hide file tree
Showing 6 changed files with 176 additions and 54 deletions.
38 changes: 1 addition & 37 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,44 +19,8 @@
<img src="https://codecov.io/gh/biopragmatics/curies.rs/branch/main/graph/badge.svg" alt="Codecov status" />
</a>
</p>
Idiomatic conversion between URIs and compact URIs (CURIEs) in Rust.
Idiomatic conversion between URIs and compact URIs (CURIEs) in Rust, with bindings for Python and for JavaScript (compiled to WebAssembly).

## 📖 Documentation

Check out **[biopragmatics.github.io/curies.rs](https://biopragmatics.github.io/curies.rs)** for more details on how to install and use it.


## 🧑‍💻 Development

### 📥 Install dependencies

[Rust](https://www.rust-lang.org/tools/install), python 3.8+ and npm are required for development.

```bash
rustup update
rustup component add rustfmt clippy
cargo install wasm-pack cargo-tarpaulin mdbook mdbook-admonish
```

> If you are using VSCode we strongly recommend to install the `rust-lang.rust-analyzer` extension.

### 🧪 Run tests

Run tests and display prints:

```shell
cargo test -- --nocapture
```

### 🧹 Format

```shell
cargo fmt
```

### 📖 Documentation

```shell
cargo doc --open
```
2 changes: 2 additions & 0 deletions lib/docs/contributing.md
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,8 @@ curies.rs/

[Rust](https://www.rust-lang.org/tools/install), Python, and Node.js are required for development.

> If you are using VSCode, we strongly recommend installing the `rust-lang.rust-analyzer` extension.

Install development dependencies:

```bash
Expand Down
123 changes: 115 additions & 8 deletions lib/src/api.rs
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,6 @@ pub struct Converter {
records: Vec<Arc<Record>>,
prefix_map: HashMap<String, Arc<Record>>,
uri_map: HashMap<String, Arc<Record>>,
trie_builder: TrieBuilder<u8>,
trie: Trie<u8>,
delimiter: String,
}
Expand All @@ -108,7 +107,6 @@ impl Converter {
records: Vec::new(),
prefix_map: HashMap::new(),
uri_map: HashMap::new(),
trie_builder: TrieBuilder::new(),
trie: TrieBuilder::new().build(),
delimiter: delimiter.to_string(),
}
Expand Down Expand Up @@ -204,7 +202,7 @@ impl Converter {
Ok(converter)
}

/// Add a `Record` to the `Converter`
/// Add a `Record` to the `Converter`.
/// When adding a new record we create a reference to the `Record` (Arc)
/// And we use this reference in the prefix and URI hashmaps
pub fn add_record(&mut self, record: Record) -> Result<(), CuriesError> {
Expand All @@ -215,20 +213,27 @@ impl Converter {
if self.uri_map.contains_key(&rec.uri_prefix) {
return Err(CuriesError::DuplicateRecord(rec.uri_prefix.clone()));
}
// TODO: check if synonyms are unique?
// Check if any of the synonyms are already present in the maps
for prefix in &rec.prefix_synonyms {
if self.prefix_map.contains_key(prefix) {
return Err(CuriesError::DuplicateRecord(prefix.clone()));
}
}
for uri_prefix in &rec.uri_prefix_synonyms {
if self.uri_map.contains_key(uri_prefix) {
return Err(CuriesError::DuplicateRecord(uri_prefix.clone()));
}
}

self.records.push(rec.clone());
self.prefix_map.insert(rec.prefix.clone(), rec.clone());
self.uri_map.insert(rec.uri_prefix.clone(), rec.clone());
self.trie_builder.push(&rec.uri_prefix);
for prefix in &rec.prefix_synonyms {
self.prefix_map.insert(prefix.clone(), rec.clone());
}
for uri_prefix in &rec.uri_prefix_synonyms {
self.uri_map.insert(uri_prefix.clone(), rec.clone());
self.trie_builder.push(uri_prefix);
}
// self.trie = self.trie_builder.build();
Ok(())
}

Expand All @@ -239,7 +244,109 @@ impl Converter {

/// Build the trie used for URI prefix search once all `Records` have been added.
///
/// The trie is rebuilt from scratch from the current list of records: the main
/// `uri_prefix` of every record and each of its `uri_prefix_synonyms` are pushed
/// into a fresh `TrieBuilder`, and the resulting trie replaces `self.trie`.
/// Call this again after adding records, otherwise the trie will not contain them.
pub fn build(&mut self) {
    // A local builder is used on purpose: the `Converter` no longer keeps a
    // long-lived `trie_builder` field, so the trie always reflects `self.records`.
    let mut trie_builder = TrieBuilder::new();
    for record in &self.records {
        trie_builder.push(&record.uri_prefix);
        for uri_prefix in &record.uri_prefix_synonyms {
            trie_builder.push(uri_prefix);
        }
    }
    self.trie = trie_builder.build();
}

/// Chain multiple `Converters` into a single `Converter`. The first `Converter` in the list is used as the base.
///
/// Every record of the following converters is looked up in the base by its prefix,
/// then by its prefix synonyms:
///
/// - if no record matches, the record is added to the base as is;
/// - if a record matches and its `uri_prefix` is different, the first occurrence is kept
///   and the incoming `uri_prefix`, URI prefix synonyms and prefix synonyms are merged
///   into it as synonyms;
/// - if a record matches and has the same `uri_prefix`, the incoming record is ignored.
///
/// The trie of the resulting converter is rebuilt before it is returned.
///
/// # Errors
///
/// Returns `CuriesError::InvalidFormat` if `converters` is empty, and propagates any
/// error raised by `delete_record` or `add_record` while merging (for example when a
/// merged synonym is already registered for another record of the base).
///
/// ```
/// use curies::{sources::{get_go_converter, get_obo_converter}, Converter};
///
/// let rt = tokio::runtime::Runtime::new().expect("Failed to create Tokio runtime");
/// let converter = rt.block_on(async {
///     Converter::chain(vec![
///         get_obo_converter().await.unwrap(),
///         get_go_converter().await.unwrap(),
///     ])
/// }).expect("Failed to create the chained converter");
/// ```
pub fn chain(converters: Vec<Converter>) -> Result<Converter, CuriesError> {
    let mut remaining = converters.into_iter();
    // The first converter is the base; taking it from the iterator avoids shifting
    // the whole vector as `Vec::remove(0)` would.
    let mut base_converter = remaining.next().ok_or_else(|| {
        CuriesError::InvalidFormat("The list of converters is empty".to_string())
    })?;
    for converter in remaining {
        for arc_record in converter.records {
            // Take ownership of the record, falling back to a clone while it is still shared
            let record = Arc::try_unwrap(arc_record).unwrap_or_else(|arc| (*arc).clone());
            // Look for a record already registered in the base under the same prefix,
            // or under one of the incoming prefix synonyms
            let existing = base_converter
                .prefix_map
                .get(&record.prefix)
                .or_else(|| {
                    record
                        .prefix_synonyms
                        .iter()
                        .find_map(|synonym| base_converter.prefix_map.get(synonym))
                })
                .cloned();
            match existing {
                Some(existing_arc) => {
                    if existing_arc.uri_prefix != record.uri_prefix {
                        // The base maps share this record through `Arc`s, so it cannot be
                        // edited in place: build an updated copy and swap it in.
                        let mut updated_record = (*existing_arc).clone();
                        // Merge the incoming URI prefix and synonyms. `record` is owned
                        // and not used afterwards, so its sets are moved, not cloned.
                        updated_record.uri_prefix_synonyms.insert(record.uri_prefix);
                        updated_record
                            .uri_prefix_synonyms
                            .extend(record.uri_prefix_synonyms);
                        updated_record.prefix_synonyms.extend(record.prefix_synonyms);
                        base_converter.delete_record(&updated_record.prefix)?;
                        base_converter.add_record(updated_record)?;
                    }
                }
                // If the prefix does not exist, add the record
                None => base_converter.add_record(record)?,
            }
        }
    }
    base_converter.build();
    Ok(base_converter)
}

/// Delete a `Record` from the `Converter` based on its prefix.
///
/// The record is looked up in the prefix map, which also indexes prefix synonyms,
/// so `prefix` may be the main prefix of the record or one of its synonyms.
/// The record is removed from the list of records and from the prefix and URI maps
/// (main entries and all synonyms), then the trie is rebuilt.
///
/// # Errors
///
/// Returns `CuriesError::NotFound` if no record is registered under `prefix`.
///
/// ```
/// use curies::Converter;
///
/// let mut converter = Converter::default();
/// assert!(converter.delete_record("DOID").is_err());
/// ```
pub fn delete_record(&mut self, prefix: &str) -> Result<(), CuriesError> {
    // Check if the record exists (cloning the `Arc` releases the borrow on the map)
    let record = self
        .prefix_map
        .get(prefix)
        .cloned()
        .ok_or_else(|| CuriesError::NotFound(prefix.to_string()))?;
    // Compare against the main prefix of the record that was found, not the `prefix`
    // argument: when the lookup matched a synonym, comparing with the argument would
    // leave the record in `records` and `build()` would put its URI prefixes back
    // into the trie.
    self.records.retain(|r| r.prefix != record.prefix);
    self.prefix_map.remove(&record.prefix);
    self.uri_map.remove(&record.uri_prefix);
    // Also remove any synonyms from the maps
    for p_synonym in &record.prefix_synonyms {
        self.prefix_map.remove(p_synonym);
    }
    for u_synonym in &record.uri_prefix_synonyms {
        self.uri_map.remove(u_synonym);
    }
    // The trie cannot drop single entries here, so rebuild it from the remaining records
    self.build();
    Ok(())
}

/// Find corresponding CURIE `Record` given a prefix
Expand Down
8 changes: 4 additions & 4 deletions lib/src/sources.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ use crate::{error::CuriesError, Converter};
///
/// ```rust
/// use curies::sources::{get_obo_converter};
/// use tokio::{runtime};
/// use tokio::runtime;
///
/// let rt = runtime::Runtime::new().expect("Failed to create Tokio runtime");
/// let converter = rt.block_on(async {
Expand Down Expand Up @@ -52,7 +52,7 @@ pub async fn get_obo_converter() -> Result<Converter, CuriesError> {
///
/// ```rust
/// use curies::sources::{get_monarch_converter};
/// use tokio::{runtime};
/// use tokio::runtime;
///
/// let rt = runtime::Runtime::new().expect("Failed to create Tokio runtime");
/// let converter = rt.block_on(async {
Expand Down Expand Up @@ -87,7 +87,7 @@ pub async fn get_monarch_converter() -> Result<Converter, CuriesError> {
///
/// ```rust
/// use curies::sources::{get_go_converter};
/// use tokio::{runtime};
/// use tokio::runtime;
///
/// let rt = runtime::Runtime::new().expect("Failed to create Tokio runtime");
/// let converter = rt.block_on(async {
Expand Down Expand Up @@ -116,7 +116,7 @@ pub async fn get_go_converter() -> Result<Converter, CuriesError> {
///
/// ```rust
/// use curies::sources::get_bioregistry_converter;
/// use tokio::{runtime};
/// use tokio::runtime;
///
/// let rt = runtime::Runtime::new().expect("Failed to create Tokio runtime");
/// let converter = rt.block_on(async {
Expand Down
45 changes: 42 additions & 3 deletions lib/tests/curies_test.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
use curies::{Converter, Record};
use curies::{
sources::{get_bioregistry_converter, get_go_converter},
Converter, Record,
};
use serde_json::Value;
use std::{
collections::{HashMap, HashSet},
Expand Down Expand Up @@ -94,6 +97,22 @@ fn new_empty_converter() -> Result<(), Box<dyn std::error::Error>> {
.is_err());
assert!(converter.find_by_uri_prefix("wrong").is_err());
assert!(converter.expand("obo:1234").is_err());
let record3 = Record {
prefix: "wrong".to_string(),
uri_prefix: "http://wrong.org/".to_string(),
prefix_synonyms: HashSet::new(),
uri_prefix_synonyms: HashSet::from(["https://identifiers.org/obo/"].map(String::from)),
pattern: None,
};
assert!(converter.add_record(record3).is_err());
let record4 = Record {
prefix: "wrong".to_string(),
uri_prefix: "http://wrong.org/".to_string(),
prefix_synonyms: HashSet::from(["OBO".to_string()]),
uri_prefix_synonyms: HashSet::new(),
pattern: None,
};
assert!(converter.add_record(record4).is_err());
Ok(())
}

Expand Down Expand Up @@ -143,10 +162,10 @@ async fn from_extended_map_file() -> Result<(), Box<dyn std::error::Error>> {
Converter::from_extended_prefix_map(Path::new("tests/resources/extended_map.json")).await?;
assert_eq!(
converter.expand("doid:1234")?,
"http://purl.obolibrary.org/obo/DOID_1234"
"http://purl.obolibrary.org/obo/SPECIAL_DOID_1234"
);
assert_eq!(
converter.compress("http://purl.obolibrary.org/obo/DOID_1234")?,
converter.compress("https://purl.obolibrary.org/obo/DOID_1234")?,
"doid:1234"
);
assert!(converter
Expand Down Expand Up @@ -192,3 +211,23 @@ async fn from_converter_errors() -> Result<(), Box<dyn std::error::Error>> {
.is_err());
Ok(())
}

/// Chains the Bioregistry converter (used as base) with the local extended prefix map
/// and checks that URI prefixes and prefix synonyms coming from the second converter
/// resolve through the records kept from the base.
#[tokio::test]
async fn chain_converters() -> Result<(), Box<dyn std::error::Error>> {
    let bioregistry = get_bioregistry_converter().await?;
    let extended_map =
        Converter::from_extended_prefix_map(Path::new("tests/resources/extended_map.json"))
            .await?;
    // NOTE(review): `get_go_converter()` was commented out of this list in the original
    // test — confirm whether it should be part of the chain.
    let mut chained = Converter::chain(vec![bioregistry, extended_map])?;

    // The `SPECIAL_DOID_` URI prefix is only declared in the extended map: after
    // chaining it must compress to the `doid` prefix.
    let curie = chained.compress("http://purl.obolibrary.org/obo/SPECIAL_DOID_1234")?;
    assert_eq!(curie, "doid:1234");

    // `specialgo` is a prefix synonym declared in the extended map; it is expected
    // to expand with the `GO_` URI prefix rather than the `SPECIAL_GO_` one.
    let uri = chained.expand("specialgo:1234567")?;
    assert_eq!(uri, "http://purl.obolibrary.org/obo/GO_1234567");

    // Error cases: chaining nothing, and deleting a prefix that is not registered
    let empty_chain = Converter::chain(vec![]);
    assert!(empty_chain.is_err());
    let missing_delete = chained.delete_record("Wrong");
    assert!(missing_delete.is_err());
    Ok(())
}
14 changes: 12 additions & 2 deletions lib/tests/resources/extended_map.json
Original file line number Diff line number Diff line change
Expand Up @@ -50,9 +50,10 @@
"pattern": "^\\d+$",
"prefix": "doid",
"prefix_synonyms": [
"do"
"do",
"DOID"
],
"uri_prefix": "http://purl.obolibrary.org/obo/DOID_",
"uri_prefix": "http://purl.obolibrary.org/obo/SPECIAL_DOID_",
"uri_prefix_synonyms": [
"DO:",
"DOID:",
Expand Down Expand Up @@ -88,5 +89,14 @@
"https://www.ebi.ac.uk/ols/ontologies/doid/terms?iri=http://purl.obolibrary.org/obo/DOID_",
"https://www.ebi.ac.uk/ols4/ontologies/doid/terms?obo_id=DOID:"
]
},
{
"pattern": "^\\d{7}$",
"prefix": "GO",
"prefix_synonyms": [
"go",
"specialgo"
],
"uri_prefix": "http://purl.obolibrary.org/obo/SPECIAL_GO_"
}
]

0 comments on commit 9158193

Please sign in to comment.