diff --git a/.travis.yml b/.travis.yml index 5da2af83242..b16c3bfa872 100644 --- a/.travis.yml +++ b/.travis.yml @@ -20,8 +20,9 @@ addons: env: - TEST_GROUP=platform_dependent # 16-18 minutes - TEST_GROUP=1 # 33-35 minutes - - TEST_GROUP=2 # 30-32 minutes - - TEST_GROUP=3 # ? minutes - under development + - TEST_GROUP=2 # 18-20 minutes + - TEST_GROUP=3 # 20-22 minutes + - TEST_GROUP=4 # 12-14 minutes # Run java tests - JOB_NAME=java_test # 4-11 minutes # Build ROCKSDB_LITE @@ -39,6 +40,8 @@ matrix: env: TEST_GROUP=2 - os: osx env: TEST_GROUP=3 + - os: osx + env: TEST_GROUP=4 - os : osx env: JOB_NAME=cmake-mingw - os : linux @@ -65,9 +68,10 @@ script: - ${CXX} --version - if [ `command -v ccache` ]; then ccache -C; fi - if [ "${TEST_GROUP}" == 'platform_dependent' ]; then OPT=-DTRAVIS V=1 ROCKSDBTESTS_END=db_block_cache_test make -j4 all_but_some_tests check_some; fi - - if [ "${TEST_GROUP}" == '1' ]; then OPT=-DTRAVIS V=1 ROCKSDBTESTS_START=db_block_cache_test ROCKSDBTESTS_END=comparator_db_test make -j4 check_some; fi - - if [ "${TEST_GROUP}" == '2' ]; then OPT=-DTRAVIS V=1 ROCKSDBTESTS_START=comparator_db_test ROCKSDBTESTS_END=write_prepared_transaction_test make -j4 check_some; fi - - if [ "${TEST_GROUP}" == '3' ]; then OPT=-DTRAVIS V=1 ROCKSDBTESTS_START=write_prepared_transaction_test make -j4 check_some; fi + - if [ "${TEST_GROUP}" == '1' ]; then OPT=-DTRAVIS V=1 ROCKSDBTESTS_START=db_block_cache_test ROCKSDBTESTS_END=full_filter_block_test make -j4 check_some; fi + - if [ "${TEST_GROUP}" == '2' ]; then OPT=-DTRAVIS V=1 ROCKSDBTESTS_START=full_filter_block_test ROCKSDBTESTS_END=write_batch_with_index_test make -j4 check_some; fi + - if [ "${TEST_GROUP}" == '3' ]; then OPT=-DTRAVIS V=1 ROCKSDBTESTS_START=write_batch_with_index_test ROCKSDBTESTS_END=write_prepared_transaction_test make -j4 check_some; fi + - if [ "${TEST_GROUP}" == '4' ]; then OPT=-DTRAVIS V=1 ROCKSDBTESTS_START=write_prepared_transaction_test make -j4 check_some; fi - if [ "${JOB_NAME}" == 'java_test' ]; then OPT=-DTRAVIS V=1 make clean jclean && make rocksdbjava jtest; fi - if [ "${JOB_NAME}" == 'lite_build' ]; then OPT="-DTRAVIS -DROCKSDB_LITE" V=1 make -j4 static_lib tools; fi - if [ "${JOB_NAME}" == 'examples' ]; then OPT=-DTRAVIS V=1 make -j4 static_lib; cd examples; make -j4; fi diff --git a/CMakeLists.txt b/CMakeLists.txt index 1395952c9d8..8c74e1db415 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -56,7 +56,7 @@ if(MSVC) include(${CMAKE_CURRENT_SOURCE_DIR}/thirdparty.inc) else() if(CMAKE_SYSTEM_NAME MATCHES "FreeBSD") - # FreeBSD has jemaloc as default malloc + # FreeBSD has jemalloc as default malloc # but it does not have all the jemalloc files in include/... 
set(WITH_JEMALLOC ON) else() @@ -569,12 +569,14 @@ set(SOURCES util/status_message.cc util/string_util.cc util/sync_point.cc + util/sync_point_impl.cc util/testutil.cc util/thread_local.cc util/threadpool_imp.cc util/transaction_test_util.cc util/xxhash.cc utilities/backupable/backupable_db.cc + utilities/blob_db/blob_compaction_filter.cc utilities/blob_db/blob_db.cc utilities/blob_db/blob_db_impl.cc utilities/blob_db/blob_dump_tool.cc @@ -602,6 +604,7 @@ set(SOURCES utilities/leveldb_options/leveldb_options.cc utilities/lua/rocks_lua_compaction_filter.cc utilities/memory/memory_util.cc + utilities/merge_operators/bytesxor.cc utilities/merge_operators/max.cc utilities/merge_operators/put.cc utilities/merge_operators/string_append/stringappend.cc diff --git a/HISTORY.md b/HISTORY.md index 8f28c0cee7b..b6e479d1679 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -1,7 +1,32 @@ # Rocksdb Change Log ## Unreleased ### Public API Change +* Add a BlockBasedTableOption to align uncompressed data blocks on the smaller of block size or page size boundary, to reduce flash reads by avoiding reads spanning 4K pages. + +### New Features + +### Bug Fixes +* Fsync after writing the global seq number to the ingestion file in ExternalSstFileIngestionJob. +* Fix WAL corruption caused by a race condition between the user write thread and FlushWAL when two_write_queue is not set. + +### Java API Changes +* Add `BlockBasedTableConfig.setBlockCache` to allow sharing a block cache across DB instances. + +## 5.13.0 (3/20/2018) ### Public API Change * RocksDBOptionsParser::Parse()'s `ignore_unknown_options` argument will only be effective if the option file shows it is generated using a higher version of RocksDB than the current version. +* Remove CompactionEventListener. + +### New Features +* SstFileManager can now cancel compactions if they will result in max space errors. SstFileManager users can also use SetCompactionBufferSize to specify how much space must be left over during a compaction for auxiliary file functions such as logging and flushing. +* Avoid unnecessarily flushing in `CompactRange()` when the range specified by the user does not overlap unflushed memtables. +* If `ColumnFamilyOptions::max_subcompactions` is set greater than one, we now parallelize large manual level-based compactions. +* Add "rocksdb.live-sst-files-size" DB property to return the total bytes of all SST files belonging to the latest LSM tree. +* NewSstFileManager now takes an argument bytes_max_delete_chunk, with default 64MB. With this argument, a file larger than 64MB will be ftruncated multiple times based on this size. + +### Bug Fixes +* Fix a leak in prepared_section_completed_ where the zeroed entries would not be removed from the map. +* Fix WAL corruption caused by a race condition between the user write thread and the backup/checkpoint thread. ## 5.12.0 (2/14/2018) ### Public API Change @@ -62,7 +87,7 @@ * `BackupableDBOptions::max_valid_backups_to_open == 0` now means no backups will be opened during BackupEngine initialization. Previously this condition disabled limiting backups opened. * `DBOptions::preserve_deletes` is a new option that allows one to specify that the DB should not drop tombstones for regular deletes if they have a sequence number larger than what was set by the new API call `DB::SetPreserveDeletesSequenceNumber(SequenceNumber seqnum)`. Disabled by default.
* API call `DB::SetPreserveDeletesSequenceNumber(SequenceNumber seqnum)` was added; users who wish to preserve deletes are expected to periodically call this function to advance the cutoff seqnum (all deletes made before this seqnum can be dropped by the DB). It is the user's responsibility to advance the seqnum in such a way that tombstones are kept for the desired period of time, yet are eventually processed and don't eat up too much space. -* `ReadOptions::iter_start_seqnum` was added; if set to something > 0 user will see 2 changes in iterators behavior 1) only keys written with sequence larger than this parameter would be returned and 2) the `Slice` returned by iter->key() now points to the the memory that keep User-oriented representation of the internal key, rather than user key. New struct `FullKey` was added to represent internal keys, along with a new helper function `ParseFullKey(const Slice& internal_key, FullKey* result);`. +* `ReadOptions::iter_start_seqnum` was added; if set to something > 0, the user will see two changes in iterator behavior: 1) only keys written with a sequence number larger than this parameter will be returned, and 2) the `Slice` returned by iter->key() now points to memory that keeps a user-oriented representation of the internal key, rather than the user key. New struct `FullKey` was added to represent internal keys, along with a new helper function `ParseFullKey(const Slice& internal_key, FullKey* result);`. * Deprecate the trash_dir param in NewSstFileManager; right now we will rename deleted files to .trash instead of moving them to a trash directory * Allow setting a custom trash/DB size ratio limit in the SstFileManager, after which files that are to be scheduled for deletion are deleted immediately, regardless of any delete ratelimit. * Return an error on write if write_options.sync = true and write_options.disableWAL = true to warn the user of inconsistent options. Previously we would not write to the WAL and would not respect the sync option in this case. diff --git a/INSTALL.md b/INSTALL.md index e4aba5e925d..f0a7d206a42 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -141,6 +141,27 @@ to build a portable binary, add `PORTABLE=1` before your make commands, like thi export JAVA_HOME=/usr/local/openjdk7 gmake rocksdbjava +* **OpenBSD** (6.3/-current): + + * As RocksDB is not yet available in the ports tree, you have to build it yourself: + + * Install the dependencies for RocksDB: + + pkg_add gmake gflags snappy bzip2 lz4 zstd git jdk bash findutils gnuwatch + + * Build RocksDB from source: + + cd ~ + git clone https://github.com/facebook/rocksdb.git + cd rocksdb + gmake static_lib + + * Build RocksJava from source (optional): + + cd rocksdb + export JAVA_HOME=/usr/local/jdk-1.8.0 + export PATH=$PATH:/usr/local/jdk-1.8.0/bin + gmake rocksdbjava * **iOS**: * Run: `TARGET_OS=IOS make static_lib`. When building a project which uses the rocksdb iOS library, make sure to define two important pre-processing macros: `ROCKSDB_LITE` and `IOS_CROSS_COMPILE`.
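A quick way to check the result of any of the builds above, OpenBSD included, is to compile and run a tiny program against the freshly built `librocksdb.a`. The sketch below assumes the repository root as the working directory; the file name, DB path, and exact linker flags are illustrative and vary by platform and by which compression libraries were compiled in.

```cpp
// smoke_test.cc -- hypothetical sanity check for a fresh RocksDB build.
// Build (adjust flags per platform), e.g.:
//   c++ -std=c++11 smoke_test.cc -Iinclude librocksdb.a -lpthread
#include <cassert>
#include <string>

#include "rocksdb/db.h"

int main() {
  rocksdb::Options options;
  options.create_if_missing = true;  // create the test DB on first run

  rocksdb::DB* db = nullptr;
  rocksdb::Status s = rocksdb::DB::Open(options, "/tmp/rocksdb_smoke", &db);
  assert(s.ok());

  // Round-trip a single key/value pair.
  s = db->Put(rocksdb::WriteOptions(), "key", "value");
  assert(s.ok());
  std::string value;
  s = db->Get(rocksdb::ReadOptions(), "key", &value);
  assert(s.ok() && value == "value");

  delete db;  // closes the database
  return 0;
}
```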
diff --git a/Makefile b/Makefile index bcbf5b9e954..c0c530c1f4a 100644 --- a/Makefile +++ b/Makefile @@ -278,6 +278,10 @@ default: all WARNING_FLAGS = -W -Wextra -Wall -Wsign-compare -Wshadow \ -Wno-unused-parameter +ifeq ($(PLATFORM), OS_OPENBSD) + WARNING_FLAGS += -Wno-unused-lambda-capture +endif + ifndef DISABLE_WARNING_AS_ERROR WARNING_FLAGS += -Werror endif @@ -406,7 +410,6 @@ TESTS = \ db_range_del_test \ db_sst_test \ db_tailing_iter_test \ - db_universal_compaction_test \ db_io_failure_test \ db_properties_test \ db_table_properties_test \ @@ -508,6 +511,7 @@ TESTS = \ repair_test \ env_timed_test \ write_prepared_transaction_test \ + db_universal_compaction_test \ PARALLEL_TEST = \ backupable_db_test \ @@ -674,7 +678,7 @@ coverage: COVERAGEFLAGS="-fprofile-arcs -ftest-coverage" LDFLAGS+="-lgcov" $(MAKE) J=1 all check cd coverage && ./coverage_test.sh # Delete intermediate files - find . -type f -regex ".*\.\(\(gcda\)\|\(gcno\)\)" -exec rm {} \; + $(FIND) . -type f -regex ".*\.\(\(gcda\)\|\(gcno\)\)" -exec rm {} \; ifneq (,$(filter check parallel_check,$(MAKECMDGOALS)),) # Use /dev/shm if it has the sticky bit set (otherwise, /tmp), @@ -791,7 +795,7 @@ check_0: | grep -E '$(tests-regexp)' \ | build_tools/gnu_parallel -j$(J) --plain --joblog=LOG $$eta --gnu '{} >& t/log-{/}' -valgrind-blacklist-regexp = InlineSkipTest.ConcurrentInsert|TransactionTest.DeadlockStress|DBCompactionTest.SuggestCompactRangeNoTwoLevel0Compactions|BackupableDBTest.RateLimiting|DBTest.CloseSpeedup|DBTest.ThreadStatusFlush|DBTest.RateLimitingTest|DBTest.EncodeDecompressedBlockSizeTest|FaultInjectionTest.UninstalledCompaction|HarnessTest.Randomized|ExternalSSTFileTest.CompactDuringAddFileRandom|ExternalSSTFileTest.IngestFileWithGlobalSeqnoRandomized +valgrind-blacklist-regexp = InlineSkipTest.ConcurrentInsert|TransactionTest.DeadlockStress|DBCompactionTest.SuggestCompactRangeNoTwoLevel0Compactions|BackupableDBTest.RateLimiting|DBTest.CloseSpeedup|DBTest.ThreadStatusFlush|DBTest.RateLimitingTest|DBTest.EncodeDecompressedBlockSizeTest|FaultInjectionTest.UninstalledCompaction|HarnessTest.Randomized|ExternalSSTFileTest.CompactDuringAddFileRandom|ExternalSSTFileTest.IngestFileWithGlobalSeqnoRandomized|MySQLStyleTransactionTest.TransactionStressTest .PHONY: valgrind_check_0 valgrind_check_0: @@ -821,7 +825,7 @@ CLEAN_FILES += t LOG $(TMPD) # regardless of their duration. As with any use of "watch", hit ^C to # interrupt. watch-log: - watch --interval=0 'sort -k7,7nr -k4,4gr LOG|$(quoted_perl_command)' + $(WATCH) --interval=0 'sort -k7,7nr -k4,4gr LOG|$(quoted_perl_command)' # If J != 1 and GNU parallel is installed, run the tests in parallel, # via the check_0 rule above. Otherwise, run them sequentially. @@ -986,14 +990,14 @@ rocksdb.h rocksdb.cc: build_tools/amalgamate.py Makefile $(LIB_SOURCES) unity.cc clean: rm -f $(BENCHMARKS) $(TOOLS) $(TESTS) $(LIBRARY) $(SHARED) rm -rf $(CLEAN_FILES) ios-x86 ios-arm scan_build_report - find . -name "*.[oda]" -exec rm -f {} \; - find . -type f -regex ".*\.\(\(gcda\)\|\(gcno\)\)" -exec rm {} \; + $(FIND) . -name "*.[oda]" -exec rm -f {} \; + $(FIND) . -type f -regex ".*\.\(\(gcda\)\|\(gcno\)\)" -exec rm {} \; rm -rf bzip2* snappy* zlib* lz4* zstd* cd java; $(MAKE) clean tags: - ctags * -R - cscope -b `find . -name '*.cc'` `find . -name '*.h'` `find . -name '*.c'` + ctags -R . + cscope -b `$(FIND) . -name '*.cc'` `$(FIND) . -name '*.h'` `$(FIND) . 
-name '*.c'` ctags -e -R -o etags * format: @@ -1510,10 +1514,10 @@ uninstall: install-headers: install -d $(INSTALL_PATH)/lib - for header_dir in `find "include/rocksdb" -type d`; do \ + for header_dir in `$(FIND) "include/rocksdb" -type d`; do \ install -d $(INSTALL_PATH)/$$header_dir; \ done - for header in `find "include/rocksdb" -type f -name *.h`; do \ + for header in `$(FIND) "include/rocksdb" -type f -name *.h`; do \ install -C -m 644 $$header $(INSTALL_PATH)/$$header; \ done @@ -1540,6 +1544,12 @@ install: install-static JAVA_INCLUDE = -I$(JAVA_HOME)/include/ -I$(JAVA_HOME)/include/linux ifeq ($(PLATFORM), OS_SOLARIS) ARCH := $(shell isainfo -b) +else ifeq ($(PLATFORM), OS_OPENBSD) + ifneq (,$(filter $(MACHINE), amd64 arm64 sparc64)) + ARCH := 64 + else + ARCH := 32 + endif else ARCH := $(shell getconf LONG_BIT) endif @@ -1570,6 +1580,7 @@ LZ4_DOWNLOAD_BASE ?= https://github.com/lz4/lz4/archive ZSTD_VER ?= 1.3.3 ZSTD_SHA256 ?= a77c47153ee7de02626c5b2a097005786b71688be61e9fb81806a011f90b297b ZSTD_DOWNLOAD_BASE ?= https://github.com/facebook/zstd/archive +CURL_SSL_OPTS ?= --tlsv1 ifeq ($(PLATFORM), OS_MACOSX) ROCKSDBJNILIB = librocksdbjni-osx.jnilib @@ -1598,6 +1609,11 @@ ifeq ($(PLATFORM), OS_AIX) EXTRACT_SOURCES = gunzip < TAR_GZ | tar xvf - SNAPPY_MAKE_TARGET = libsnappy.la endif +ifeq ($(PLATFORM), OS_OPENBSD) + JAVA_INCLUDE = -I$(JAVA_HOME)/include -I$(JAVA_HOME)/include/openbsd + ROCKSDBJNILIB = librocksdbjni-openbsd$(ARCH).so + ROCKSDB_JAR = rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH)-openbsd$(ARCH).jar +endif libz.a: -rm -rf zlib-$(ZLIB_VER) @@ -1625,7 +1641,7 @@ libbz2.a: libsnappy.a: -rm -rf snappy-$(SNAPPY_VER) - curl -O -L ${SNAPPY_DOWNLOAD_BASE}/$(SNAPPY_VER)/snappy-$(SNAPPY_VER).tar.gz + curl -O -L ${CURL_SSL_OPTS} ${SNAPPY_DOWNLOAD_BASE}/$(SNAPPY_VER)/snappy-$(SNAPPY_VER).tar.gz SNAPPY_SHA256_ACTUAL=`$(SHA256_CMD) snappy-$(SNAPPY_VER).tar.gz | cut -d ' ' -f 1`; \ if [ "$(SNAPPY_SHA256)" != "$$SNAPPY_SHA256_ACTUAL" ]; then \ echo snappy-$(SNAPPY_VER).tar.gz checksum mismatch, expected=\"$(SNAPPY_SHA256)\" actual=\"$$SNAPPY_SHA256_ACTUAL\"; \ @@ -1638,7 +1654,7 @@ libsnappy.a: liblz4.a: -rm -rf lz4-$(LZ4_VER) - curl -O -L ${LZ4_DOWNLOAD_BASE}/v$(LZ4_VER).tar.gz + curl -O -L ${CURL_SSL_OPTS} ${LZ4_DOWNLOAD_BASE}/v$(LZ4_VER).tar.gz mv v$(LZ4_VER).tar.gz lz4-$(LZ4_VER).tar.gz LZ4_SHA256_ACTUAL=`$(SHA256_CMD) lz4-$(LZ4_VER).tar.gz | cut -d ' ' -f 1`; \ if [ "$(LZ4_SHA256)" != "$$LZ4_SHA256_ACTUAL" ]; then \ @@ -1651,7 +1667,7 @@ liblz4.a: libzstd.a: -rm -rf zstd-$(ZSTD_VER) - curl -O -L ${ZSTD_DOWNLOAD_BASE}/v$(ZSTD_VER).tar.gz + curl -O -L ${CURL_SSL_OPTS} ${ZSTD_DOWNLOAD_BASE}/v$(ZSTD_VER).tar.gz mv v$(ZSTD_VER).tar.gz zstd-$(ZSTD_VER).tar.gz ZSTD_SHA256_ACTUAL=`$(SHA256_CMD) zstd-$(ZSTD_VER).tar.gz | cut -d ' ' -f 1`; \ if [ "$(ZSTD_SHA256)" != "$$ZSTD_SHA256_ACTUAL" ]; then \ diff --git a/README.md b/README.md index afe8a27dcc3..35b468c682c 100644 --- a/README.md +++ b/README.md @@ -26,3 +26,7 @@ rely on the details of any other header files in this package. Those internal APIs may be changed without warning. Design discussions are conducted in https://www.facebook.com/groups/rocksdb.dev/ + +## License + +RocksDB is dual-licensed under both the GPLv2 (found in the COPYING file in the root directory) and Apache 2.0 License (found in the LICENSE.Apache file in the root directory). You may select, at your option, one of the above-listed licenses. 
diff --git a/ROCKSDB_LITE.md b/ROCKSDB_LITE.md index 41cfbecc2cc..8991b95063a 100644 --- a/ROCKSDB_LITE.md +++ b/ROCKSDB_LITE.md @@ -5,7 +5,7 @@ RocksDBLite is a project focused on mobile use cases, which don't need a lot of Some examples of the features disabled by ROCKSDB_LITE: * compiled-in support for LDB tool * No backupable DB -* No support for replication (which we provide in form of TrasactionalIterator) +* No support for replication (which we provide in the form of TransactionalIterator) * No advanced monitoring tools * No special-purpose memtables that are highly optimized for specific use cases * No Transactions diff --git a/TARGETS b/TARGETS index 2846bb826e3..4123b35824b 100644 --- a/TARGETS +++ b/TARGETS @@ -46,7 +46,10 @@ rocksdb_preprocessor_flags = [ ] rocksdb_arch_preprocessor_flags = { - "x86_64": ["-DHAVE_SSE42"], + "x86_64": [ + "-DHAVE_SSE42", + "-DHAVE_PCLMUL", + ], } build_mode = read_config("fbcode", "build_mode") @@ -212,11 +215,13 @@ cpp_library( "util/status_message.cc", "util/string_util.cc", "util/sync_point.cc", + "util/sync_point_impl.cc", "util/thread_local.cc", "util/threadpool_imp.cc", "util/transaction_test_util.cc", "util/xxhash.cc", "utilities/backupable/backupable_db.cc", + "utilities/blob_db/blob_compaction_filter.cc", "utilities/blob_db/blob_db.cc", "utilities/blob_db/blob_db_impl.cc", "utilities/blob_db/blob_dump_tool.cc", @@ -242,6 +247,7 @@ cpp_library( "utilities/leveldb_options/leveldb_options.cc", "utilities/lua/rocks_lua_compaction_filter.cc", "utilities/memory/memory_util.cc", + "utilities/merge_operators/bytesxor.cc", "utilities/merge_operators/max.cc", "utilities/merge_operators/put.cc", "utilities/merge_operators/string_append/stringappend.cc", diff --git a/Vagrantfile b/Vagrantfile index d7c2991d799..07f2e99fdd3 100644 --- a/Vagrantfile +++ b/Vagrantfile @@ -14,6 +14,11 @@ Vagrant.configure("2") do |config| box.vm.box = "chef/centos-6.5" end + config.vm.define "centos7" do |box| + box.vm.box = "centos/7" + box.vm.provision "shell", path: "build_tools/setup_centos7.sh" + end + config.vm.define "FreeBSD10" do |box| box.vm.guest = :freebsd box.vm.box = "robin/freebsd-10" diff --git a/buckifier/targets_cfg.py b/buckifier/targets_cfg.py index 7aee5bdc9c0..44ef4fd5e70 100644 --- a/buckifier/targets_cfg.py +++ b/buckifier/targets_cfg.py @@ -50,7 +50,10 @@ ] rocksdb_arch_preprocessor_flags = { - "x86_64": ["-DHAVE_SSE42"], + "x86_64": [ + "-DHAVE_SSE42", + "-DHAVE_PCLMUL", + ], } build_mode = read_config("fbcode", "build_mode") diff --git a/build_tools/build_detect_platform b/build_tools/build_detect_platform index 65550ff3030..743082485b4 100755 --- a/build_tools/build_detect_platform +++ b/build_tools/build_detect_platform @@ -16,6 +16,8 @@ # PLATFORM_CXXFLAGS C++ compiler flags. Will contain: # PLATFORM_SHARED_VERSIONED Set to 'true' if platform supports versioned # shared libraries, empty otherwise.
+# FIND Command for the find utility +# WATCH Command for the watch utility # # The PLATFORM_CCFLAGS and PLATFORM_CXXFLAGS might include the following: # @@ -88,6 +90,14 @@ if test -z "$CLANG_ANALYZER"; then CLANG_ANALYZER=$(which clang++ 2> /dev/null) fi +if test -z "$FIND"; then + FIND=find +fi + +if test -z "$WATCH"; then + WATCH=watch +fi + COMMON_FLAGS="$COMMON_FLAGS ${CFLAGS}" CROSS_COMPILE= PLATFORM_CCFLAGS= @@ -154,9 +164,12 @@ case "$TARGET_OS" in ;; OpenBSD) PLATFORM=OS_OPENBSD + CXX=clang++ COMMON_FLAGS="$COMMON_FLAGS -fno-builtin-memcmp -D_REENTRANT -DOS_OPENBSD" PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -pthread" # PORT_FILES=port/openbsd/openbsd_specific.cc + FIND=gfind + WATCH=gnuwatch ;; DragonFly) PLATFORM=OS_DRAGONFLYBSD @@ -485,6 +498,8 @@ if test -z "$PORTABLE"; then elif test -n "`echo $TARGET_ARCHITECTURE | grep ^arm`"; then # TODO: Handle this with appropriate options. COMMON_FLAGS="$COMMON_FLAGS" + elif [ "$TARGET_OS" == IOS ]; then + COMMON_FLAGS="$COMMON_FLAGS" elif [ "$TARGET_OS" != AIX ] && [ "$TARGET_OS" != SunOS ]; then COMMON_FLAGS="$COMMON_FLAGS -march=native " elif test "$USE_SSE"; then @@ -573,6 +588,8 @@ echo "ROCKSDB_PATCH=$ROCKSDB_PATCH" >> "$OUTPUT" echo "CLANG_SCAN_BUILD=$CLANG_SCAN_BUILD" >> "$OUTPUT" echo "CLANG_ANALYZER=$CLANG_ANALYZER" >> "$OUTPUT" echo "PROFILING_FLAGS=$PROFILING_FLAGS" >> "$OUTPUT" +echo "FIND=$FIND" >> "$OUTPUT" +echo "WATCH=$WATCH" >> "$OUTPUT" # This will enable some related identifiers for the preprocessor if test -n "$JEMALLOC"; then echo "JEMALLOC=1" >> "$OUTPUT" diff --git a/build_tools/run_ci_db_test.ps1 b/build_tools/run_ci_db_test.ps1 index e47a477a54d..0f8198b484b 100644 --- a/build_tools/run_ci_db_test.ps1 +++ b/build_tools/run_ci_db_test.ps1 @@ -336,7 +336,7 @@ $InvokeTestAsync = { # Test limiting factor here [int]$count = 0 # Overall status -[bool]$success = $true; +[bool]$script:success = $true; function RunJobs($Suites, $TestCmds, [int]$ConcurrencyVal) { @@ -425,7 +425,7 @@ function RunJobs($Suites, $TestCmds, [int]$ConcurrencyVal) $log_content = @(Get-Content $log) if($completed.State -ne "Completed") { - $success = $false + $script:success = $false Write-Warning $message $log_content | Write-Warning } else { @@ -449,7 +449,7 @@ function RunJobs($Suites, $TestCmds, [int]$ConcurrencyVal) } if(!$pass_found) { - $success = $false; + $script:success = $false; Write-Warning $message $log_content | Write-Warning } else { @@ -473,7 +473,7 @@ New-TimeSpan -Start $StartDate -End $EndDate | } -if(!$success) { +if(!$script:success) { # This does not succeed in killing off jobs quickly # So we simply exit # Remove-Job -Job $jobs -Force diff --git a/build_tools/setup_centos7.sh b/build_tools/setup_centos7.sh new file mode 100755 index 00000000000..c633131de88 --- /dev/null +++ b/build_tools/setup_centos7.sh @@ -0,0 +1,43 @@ +#!/bin/bash +set -e + +ROCKSDB_VERSION="5.10.3" +ZSTD_VERSION="1.1.3" + +echo "This script configures CentOS with everything needed to build and run RocksDB" + +yum update -y && yum install epel-release -y + +yum install -y \ + wget \ + gcc-c++ \ + snappy snappy-devel \ + zlib zlib-devel \ + bzip2 bzip2-devel \ + lz4-devel \ + libasan \ + gflags + +mkdir -pv /usr/local/rocksdb-${ROCKSDB_VERSION} +ln -sfT /usr/local/rocksdb-${ROCKSDB_VERSION} /usr/local/rocksdb + +wget -qO /tmp/zstd-${ZSTD_VERSION}.tar.gz https://github.com/facebook/zstd/archive/v${ZSTD_VERSION}.tar.gz +wget -qO /tmp/rocksdb-${ROCKSDB_VERSION}.tar.gz https://github.com/facebook/rocksdb/archive/v${ROCKSDB_VERSION}.tar.gz + +cd /tmp + +tar xzvf
zstd-${ZSTD_VERSION}.tar.gz +tar xzvf rocksdb-${ROCKSDB_VERSION}.tar.gz -C /usr/local/ + +echo "Installing ZSTD..." +pushd zstd-${ZSTD_VERSION} +make && make install +popd + +echo "Compiling RocksDB..." +cd /usr/local/rocksdb +chown -R vagrant:vagrant /usr/local/rocksdb/ +sudo -u vagrant make static_lib +cd examples/ +sudo -u vagrant make all +sudo -u vagrant ./c_simple_example diff --git a/cache/cache_test.cc b/cache/cache_test.cc index 8e241226d9c..55f9cc6bb63 100644 --- a/cache/cache_test.cc +++ b/cache/cache_test.cc @@ -40,9 +40,9 @@ static int DecodeValue(void* v) { const std::string kLRU = "lru"; const std::string kClock = "clock"; -void dumbDeleter(const Slice& key, void* value) {} +void dumbDeleter(const Slice& /*key*/, void* /*value*/) {} -void eraseDeleter(const Slice& key, void* value) { +void eraseDeleter(const Slice& /*key*/, void* value) { Cache* cache = reinterpret_cast(value); cache->Erase("foo"); } @@ -470,7 +470,7 @@ class Value { }; namespace { -void deleter(const Slice& key, void* value) { +void deleter(const Slice& /*key*/, void* value) { delete static_cast(value); } } // namespace diff --git a/cache/clock_cache.cc b/cache/clock_cache.cc index 7e42714ef14..8c26f7a9cff 100644 --- a/cache/clock_cache.cc +++ b/cache/clock_cache.cc @@ -586,7 +586,7 @@ Status ClockCacheShard::Insert(const Slice& key, uint32_t hash, void* value, size_t charge, void (*deleter)(const Slice& key, void* value), Cache::Handle** out_handle, - Cache::Priority priority) { + Cache::Priority /*priority*/) { CleanupContext context; HashTable::accessor accessor; char* key_data = new char[key.size()]; diff --git a/cache/sharded_cache.cc b/cache/sharded_cache.cc index 9bdea3a08e1..6a0a2228211 100644 --- a/cache/sharded_cache.cc +++ b/cache/sharded_cache.cc @@ -53,7 +53,7 @@ Status ShardedCache::Insert(const Slice& key, void* value, size_t charge, ->Insert(key, hash, value, charge, deleter, handle, priority); } -Cache::Handle* ShardedCache::Lookup(const Slice& key, Statistics* stats) { +Cache::Handle* ShardedCache::Lookup(const Slice& key, Statistics* /*stats*/) { uint32_t hash = HashSlice(key); return GetShard(Shard(hash))->Lookup(key, hash); } diff --git a/db/builder.cc b/db/builder.cc index afb8e44030b..4042d968546 100644 --- a/db/builder.cc +++ b/db/builder.cc @@ -61,10 +61,10 @@ TableBuilder* NewTableBuilder( Status BuildTable( const std::string& dbname, Env* env, const ImmutableCFOptions& ioptions, - const MutableCFOptions& mutable_cf_options, const EnvOptions& env_options, - TableCache* table_cache, InternalIterator* iter, - std::unique_ptr range_del_iter, FileMetaData* meta, - const InternalKeyComparator& internal_comparator, + const MutableCFOptions& /*mutable_cf_options*/, + const EnvOptions& env_options, TableCache* table_cache, + InternalIterator* iter, std::unique_ptr range_del_iter, + FileMetaData* meta, const InternalKeyComparator& internal_comparator, const std::vector>* int_tbl_prop_collector_factories, uint32_t column_family_id, const std::string& column_family_name, diff --git a/db/c.cc b/db/c.cc index 064103ed40a..0f77949d3fa 100644 --- a/db/c.cc +++ b/db/c.cc @@ -32,11 +32,13 @@ #include "rocksdb/universal_compaction.h" #include "rocksdb/utilities/backupable_db.h" #include "rocksdb/utilities/checkpoint.h" +#include "rocksdb/utilities/db_ttl.h" #include "rocksdb/utilities/optimistic_transaction_db.h" #include "rocksdb/utilities/transaction.h" #include "rocksdb/utilities/transaction_db.h" #include "rocksdb/utilities/write_batch_with_index.h" #include "rocksdb/write_batch.h" +#include 
"rocksdb/perf_context.h" #include "utilities/merge_operators.h" using rocksdb::BytewiseComparator; @@ -100,6 +102,8 @@ using rocksdb::OptimisticTransactionDB; using rocksdb::OptimisticTransactionOptions; using rocksdb::Transaction; using rocksdb::Checkpoint; +using rocksdb::PerfLevel; +using rocksdb::PerfContext; using std::shared_ptr; @@ -140,6 +144,7 @@ struct rocksdb_envoptions_t { EnvOptions rep; }; struct rocksdb_ingestexternalfileoptions_t { IngestExternalFileOptions rep; }; struct rocksdb_sstfilewriter_t { SstFileWriter* rep; }; struct rocksdb_ratelimiter_t { RateLimiter* rep; }; +struct rocksdb_perfcontext_t { PerfContext* rep; }; struct rocksdb_pinnableslice_t { PinnableSlice rep; }; @@ -252,7 +257,7 @@ struct rocksdb_comparator_t : public Comparator { // No-ops since the C binding does not support key shortening methods. virtual void FindShortestSeparator(std::string*, const Slice&) const override {} - virtual void FindShortSuccessor(std::string* key) const override {} + virtual void FindShortSuccessor(std::string* /*key*/) const override {} }; struct rocksdb_filterpolicy_t : public FilterPolicy { @@ -367,7 +372,7 @@ struct rocksdb_mergeoperator_t : public MergeOperator { virtual bool PartialMergeMulti(const Slice& key, const std::deque& operand_list, std::string* new_value, - Logger* logger) const override { + Logger* /*logger*/) const override { size_t operand_count = operand_list.size(); std::vector operand_pointers(operand_count); std::vector operand_sizes(operand_count); @@ -477,6 +482,20 @@ rocksdb_t* rocksdb_open( return result; } +rocksdb_t* rocksdb_open_with_ttl( + const rocksdb_options_t* options, + const char* name, + int ttl, + char** errptr) { + rocksdb::DBWithTTL* db; + if (SaveError(errptr, rocksdb::DBWithTTL::Open(options->rep, std::string(name), &db, ttl))) { + return nullptr; + } + rocksdb_t* result = new rocksdb_t; + result->rep = db; + return result; +} + rocksdb_t* rocksdb_open_for_read_only( const rocksdb_options_t* options, const char* name, @@ -2166,8 +2185,8 @@ void rocksdb_options_set_level0_stop_writes_trigger( opt->rep.level0_stop_writes_trigger = n; } -void rocksdb_options_set_max_mem_compaction_level(rocksdb_options_t* opt, - int n) {} +void rocksdb_options_set_max_mem_compaction_level(rocksdb_options_t* /*opt*/, + int /*n*/) {} void rocksdb_options_set_wal_recovery_mode(rocksdb_options_t* opt,int mode) { opt->rep.wal_recovery_mode = static_cast(mode); @@ -2231,8 +2250,8 @@ void rocksdb_options_set_manifest_preallocation_size( } // noop -void rocksdb_options_set_purge_redundant_kvs_while_flush(rocksdb_options_t* opt, - unsigned char v) {} +void rocksdb_options_set_purge_redundant_kvs_while_flush( + rocksdb_options_t* /*opt*/, unsigned char /*v*/) {} void rocksdb_options_set_use_direct_reads(rocksdb_options_t* opt, unsigned char v) { @@ -2402,7 +2421,7 @@ void rocksdb_options_set_table_cache_numshardbits( } void rocksdb_options_set_table_cache_remove_scan_count_limit( - rocksdb_options_t* opt, int v) { + rocksdb_options_t* /*opt*/, int /*v*/) { // this option is deprecated } @@ -2537,6 +2556,176 @@ void rocksdb_ratelimiter_destroy(rocksdb_ratelimiter_t *limiter) { delete limiter; } +void rocksdb_set_perf_level(int v) { + PerfLevel level = static_cast(v); + SetPerfLevel(level); +} + +rocksdb_perfcontext_t* rocksdb_perfcontext_create() { + rocksdb_perfcontext_t* context = new rocksdb_perfcontext_t; + context->rep = rocksdb::get_perf_context(); + return context; +} + +void rocksdb_perfcontext_reset(rocksdb_perfcontext_t* context) { + 
context->rep->Reset(); +} + +char* rocksdb_perfcontext_report(rocksdb_perfcontext_t* context, + unsigned char exclude_zero_counters) { + return strdup(context->rep->ToString(exclude_zero_counters).c_str()); +} + +uint64_t rocksdb_perfcontext_metric(rocksdb_perfcontext_t* context, + int metric) { + PerfContext* rep = context->rep; + switch (metric) { + case rocksdb_user_key_comparison_count: + return rep->user_key_comparison_count; + case rocksdb_block_cache_hit_count: + return rep->block_cache_hit_count; + case rocksdb_block_read_count: + return rep->block_read_count; + case rocksdb_block_read_byte: + return rep->block_read_byte; + case rocksdb_block_read_time: + return rep->block_read_time; + case rocksdb_block_checksum_time: + return rep->block_checksum_time; + case rocksdb_block_decompress_time: + return rep->block_decompress_time; + case rocksdb_get_read_bytes: + return rep->get_read_bytes; + case rocksdb_multiget_read_bytes: + return rep->multiget_read_bytes; + case rocksdb_iter_read_bytes: + return rep->iter_read_bytes; + case rocksdb_internal_key_skipped_count: + return rep->internal_key_skipped_count; + case rocksdb_internal_delete_skipped_count: + return rep->internal_delete_skipped_count; + case rocksdb_internal_recent_skipped_count: + return rep->internal_recent_skipped_count; + case rocksdb_internal_merge_count: + return rep->internal_merge_count; + case rocksdb_get_snapshot_time: + return rep->get_snapshot_time; + case rocksdb_get_from_memtable_time: + return rep->get_from_memtable_time; + case rocksdb_get_from_memtable_count: + return rep->get_from_memtable_count; + case rocksdb_get_post_process_time: + return rep->get_post_process_time; + case rocksdb_get_from_output_files_time: + return rep->get_from_output_files_time; + case rocksdb_seek_on_memtable_time: + return rep->seek_on_memtable_time; + case rocksdb_seek_on_memtable_count: + return rep->seek_on_memtable_count; + case rocksdb_next_on_memtable_count: + return rep->next_on_memtable_count; + case rocksdb_prev_on_memtable_count: + return rep->prev_on_memtable_count; + case rocksdb_seek_child_seek_time: + return rep->seek_child_seek_time; + case rocksdb_seek_child_seek_count: + return rep->seek_child_seek_count; + case rocksdb_seek_min_heap_time: + return rep->seek_min_heap_time; + case rocksdb_seek_max_heap_time: + return rep->seek_max_heap_time; + case rocksdb_seek_internal_seek_time: + return rep->seek_internal_seek_time; + case rocksdb_find_next_user_entry_time: + return rep->find_next_user_entry_time; + case rocksdb_write_wal_time: + return rep->write_wal_time; + case rocksdb_write_memtable_time: + return rep->write_memtable_time; + case rocksdb_write_delay_time: + return rep->write_delay_time; + case rocksdb_write_pre_and_post_process_time: + return rep->write_pre_and_post_process_time; + case rocksdb_db_mutex_lock_nanos: + return rep->db_mutex_lock_nanos; + case rocksdb_db_condition_wait_nanos: + return rep->db_condition_wait_nanos; + case rocksdb_merge_operator_time_nanos: + return rep->merge_operator_time_nanos; + case rocksdb_read_index_block_nanos: + return rep->read_index_block_nanos; + case rocksdb_read_filter_block_nanos: + return rep->read_filter_block_nanos; + case rocksdb_new_table_block_iter_nanos: + return rep->new_table_block_iter_nanos; + case rocksdb_new_table_iterator_nanos: + return rep->new_table_iterator_nanos; + case rocksdb_block_seek_nanos: + return rep->block_seek_nanos; + case rocksdb_find_table_nanos: + return rep->find_table_nanos; + case rocksdb_bloom_memtable_hit_count: + return 
rep->bloom_memtable_hit_count; + case rocksdb_bloom_memtable_miss_count: + return rep->bloom_memtable_miss_count; + case rocksdb_bloom_sst_hit_count: + return rep->bloom_sst_hit_count; + case rocksdb_bloom_sst_miss_count: + return rep->bloom_sst_miss_count; + case rocksdb_key_lock_wait_time: + return rep->key_lock_wait_time; + case rocksdb_key_lock_wait_count: + return rep->key_lock_wait_count; + case rocksdb_env_new_sequential_file_nanos: + return rep->env_new_sequential_file_nanos; + case rocksdb_env_new_random_access_file_nanos: + return rep->env_new_random_access_file_nanos; + case rocksdb_env_new_writable_file_nanos: + return rep->env_new_writable_file_nanos; + case rocksdb_env_reuse_writable_file_nanos: + return rep->env_reuse_writable_file_nanos; + case rocksdb_env_new_random_rw_file_nanos: + return rep->env_new_random_rw_file_nanos; + case rocksdb_env_new_directory_nanos: + return rep->env_new_directory_nanos; + case rocksdb_env_file_exists_nanos: + return rep->env_file_exists_nanos; + case rocksdb_env_get_children_nanos: + return rep->env_get_children_nanos; + case rocksdb_env_get_children_file_attributes_nanos: + return rep->env_get_children_file_attributes_nanos; + case rocksdb_env_delete_file_nanos: + return rep->env_delete_file_nanos; + case rocksdb_env_create_dir_nanos: + return rep->env_create_dir_nanos; + case rocksdb_env_create_dir_if_missing_nanos: + return rep->env_create_dir_if_missing_nanos; + case rocksdb_env_delete_dir_nanos: + return rep->env_delete_dir_nanos; + case rocksdb_env_get_file_size_nanos: + return rep->env_get_file_size_nanos; + case rocksdb_env_get_file_modification_time_nanos: + return rep->env_get_file_modification_time_nanos; + case rocksdb_env_rename_file_nanos: + return rep->env_rename_file_nanos; + case rocksdb_env_link_file_nanos: + return rep->env_link_file_nanos; + case rocksdb_env_lock_file_nanos: + return rep->env_lock_file_nanos; + case rocksdb_env_unlock_file_nanos: + return rep->env_unlock_file_nanos; + case rocksdb_env_new_logger_nanos: + return rep->env_new_logger_nanos; + default: + break; + } + return 0; +} + +void rocksdb_perfcontext_destroy(rocksdb_perfcontext_t* context) { + delete context; +} + /* TODO: DB::OpenForReadOnly @@ -2973,7 +3162,7 @@ rocksdb_sstfilewriter_t* rocksdb_sstfilewriter_create( rocksdb_sstfilewriter_t* rocksdb_sstfilewriter_create_with_comparator( const rocksdb_envoptions_t* env, const rocksdb_options_t* io_options, - const rocksdb_comparator_t* comparator) { + const rocksdb_comparator_t* /*comparator*/) { rocksdb_sstfilewriter_t* writer = new rocksdb_sstfilewriter_t; writer->rep = new SstFileWriter(env->rep, io_options->rep); return writer; @@ -3011,7 +3200,7 @@ void rocksdb_sstfilewriter_delete(rocksdb_sstfilewriter_t* writer, void rocksdb_sstfilewriter_finish(rocksdb_sstfilewriter_t* writer, char** errptr) { - SaveError(errptr, writer->rep->Finish(NULL)); + SaveError(errptr, writer->rep->Finish(nullptr)); } void rocksdb_sstfilewriter_destroy(rocksdb_sstfilewriter_t* writer) { @@ -3793,7 +3982,7 @@ rocksdb_pinnableslice_t* rocksdb_get_pinned( if (!s.IsNotFound()) { SaveError(errptr, s); } - return NULL; + return nullptr; } return v; } @@ -3810,7 +3999,7 @@ rocksdb_pinnableslice_t* rocksdb_get_pinned_cf( if (!s.IsNotFound()) { SaveError(errptr, s); } - return NULL; + return nullptr; } return v; } @@ -3821,7 +4010,7 @@ const char* rocksdb_pinnableslice_value(const rocksdb_pinnableslice_t* v, size_t* vlen) { if (!v) { *vlen = 0; - return NULL; + return nullptr; } *vlen = v->rep.size(); diff --git 
a/db/column_family.cc b/db/column_family.cc index 5824d7b5475..b3e025bee71 100644 --- a/db/column_family.cc +++ b/db/column_family.cc @@ -31,6 +31,7 @@ #include "monitoring/thread_status_util.h" #include "options/options_helper.h" #include "table/block_based_table_factory.h" +#include "table/merging_iterator.h" #include "util/autovector.h" #include "util/compression.h" @@ -53,6 +54,9 @@ ColumnFamilyHandleImpl::~ColumnFamilyHandleImpl() { #endif // ROCKSDB_LITE // Job id == 0 means that this is not our background process, but rather // a user thread + // Need to hold some shared pointers owned by the initial_cf_options + // before the final cleanup finishes. + ColumnFamilyOptions initial_cf_options_copy = cfd_->initial_cf_options(); JobContext job_context(0); mutex_->Lock(); if (cfd_->Unref()) { @@ -382,7 +386,7 @@ ColumnFamilyData::ColumnFamilyData( next_(nullptr), prev_(nullptr), log_number_(0), - flush_reason_(FlushReason::kUnknown), + flush_reason_(FlushReason::kOthers), column_family_set_(column_family_set), pending_flush_(false), pending_compaction_(false), @@ -845,6 +849,10 @@ uint64_t ColumnFamilyData::GetTotalSstFilesSize() const { return VersionSet::GetTotalSstFilesSize(dummy_versions_); } +uint64_t ColumnFamilyData::GetLiveSstFilesSize() const { + return current_->GetSstFilesSize(); +} + MemTable* ColumnFamilyData::ConstructNewMemtable( const MutableCFOptions& mutable_cf_options, SequenceNumber earliest_seq) { return new MemTable(internal_comparator_, ioptions_, mutable_cf_options, @@ -881,6 +889,68 @@ bool ColumnFamilyData::RangeOverlapWithCompaction( smallest_user_key, largest_user_key, level); } +Status ColumnFamilyData::RangesOverlapWithMemtables( + const autovector<Range>& ranges, SuperVersion* super_version, + bool* overlap) { + assert(overlap != nullptr); + *overlap = false; + // Create an InternalIterator over all unflushed memtables + Arena arena; + ReadOptions read_opts; + read_opts.total_order_seek = true; + MergeIteratorBuilder merge_iter_builder(&internal_comparator_, &arena); + merge_iter_builder.AddIterator( + super_version->mem->NewIterator(read_opts, &arena)); + super_version->imm->AddIterators(read_opts, &merge_iter_builder); + ScopedArenaIterator memtable_iter(merge_iter_builder.Finish()); + + std::vector<InternalIterator*> memtable_range_del_iters; + auto* active_range_del_iter = + super_version->mem->NewRangeTombstoneIterator(read_opts); + if (active_range_del_iter != nullptr) { + memtable_range_del_iters.push_back(active_range_del_iter); + } + super_version->imm->AddRangeTombstoneIterators(read_opts, + &memtable_range_del_iters); + RangeDelAggregator range_del_agg(internal_comparator_, {} /* snapshots */, + false /* collapse_deletions */); + Status status; + { + std::unique_ptr<InternalIterator> memtable_range_del_iter( + NewMergingIterator(&internal_comparator_, + memtable_range_del_iters.empty() + ?
nullptr + : &memtable_range_del_iters[0], + static_cast<int>(memtable_range_del_iters.size()))); + status = range_del_agg.AddTombstones(std::move(memtable_range_del_iter)); + } + for (size_t i = 0; i < ranges.size() && status.ok() && !*overlap; ++i) { + auto* vstorage = super_version->current->storage_info(); + auto* ucmp = vstorage->InternalComparator()->user_comparator(); + InternalKey range_start(ranges[i].start, kMaxSequenceNumber, + kValueTypeForSeek); + memtable_iter->Seek(range_start.Encode()); + status = memtable_iter->status(); + ParsedInternalKey seek_result; + if (status.ok()) { + if (memtable_iter->Valid() && + !ParseInternalKey(memtable_iter->key(), &seek_result)) { + status = Status::Corruption("DB have corrupted keys"); + } + } + if (status.ok()) { + if (memtable_iter->Valid() && + ucmp->Compare(seek_result.user_key, ranges[i].limit) <= 0) { + *overlap = true; + } else if (range_del_agg.IsRangeOverlapped(ranges[i].start, + ranges[i].limit)) { + *overlap = true; + } + } + } + return status; +} + const int ColumnFamilyData::kCompactAllLevels = -1; const int ColumnFamilyData::kCompactToBaseLevel = -2; diff --git a/db/column_family.h b/db/column_family.h index e5abb485ee2..84625d9065a 100644 --- a/db/column_family.h +++ b/db/column_family.h @@ -244,6 +244,7 @@ class ColumnFamilyData { void SetCurrent(Version* _current); uint64_t GetNumLiveVersions() const; // REQUIRE: DB mutex held uint64_t GetTotalSstFilesSize() const; // REQUIRE: DB mutex held + uint64_t GetLiveSstFilesSize() const; // REQUIRE: DB mutex held void SetMemtable(MemTable* new_mem) { uint64_t memtable_id = last_memtable_id_.fetch_add(1) + 1; new_mem->SetID(memtable_id); @@ -274,6 +275,16 @@ class ColumnFamilyData { const Slice& largest_user_key, int level) const; + // Check if the passed ranges overlap with any unflushed memtables + // (immutable or mutable). + // + // @param super_version A referenced SuperVersion that will be held for the + // duration of this function. + // + // Thread-safe + Status RangesOverlapWithMemtables(const autovector<Range>& ranges, + SuperVersion* super_version, bool* overlap); + // A flag to tell a manual compaction is to compact all levels together // instead of a specific level.
static const int kCompactAllLevels; @@ -359,6 +370,10 @@ class ColumnFamilyData { bool initialized() const { return initialized_.load(); } + const ColumnFamilyOptions& initial_cf_options() { + return initial_cf_options_; + } + Env::WriteLifeTimeHint CalculateSSTWriteHint(int level); private: diff --git a/db/column_family_test.cc b/db/column_family_test.cc index 94c087aaca5..6d8360dcbee 100644 --- a/db/column_family_test.cc +++ b/db/column_family_test.cc @@ -1168,13 +1168,14 @@ TEST_F(ColumnFamilyTest, MemtableNotSupportSnapshot) { #endif // !ROCKSDB_LITE class TestComparator : public Comparator { - int Compare(const rocksdb::Slice& a, const rocksdb::Slice& b) const override { + int Compare(const rocksdb::Slice& /*a*/, + const rocksdb::Slice& /*b*/) const override { return 0; } const char* Name() const override { return "Test"; } - void FindShortestSeparator(std::string* start, - const rocksdb::Slice& limit) const override {} - void FindShortSuccessor(std::string* key) const override {} + void FindShortestSeparator(std::string* /*start*/, + const rocksdb::Slice& /*limit*/) const override {} + void FindShortSuccessor(std::string* /*key*/) const override {} }; static TestComparator third_comparator; @@ -2790,6 +2791,18 @@ TEST_F(ColumnFamilyTest, CompactionSpeedupTwoColumnFamilies) { ASSERT_EQ(1, dbfull()->TEST_BGCompactionsAllowed()); } +TEST_F(ColumnFamilyTest, CreateAndDestoryOptions) { + std::unique_ptr cfo(new ColumnFamilyOptions()); + ColumnFamilyHandle* cfh; + Open(); + ASSERT_OK(db_->CreateColumnFamily(*(cfo.get()), "yoyo", &cfh)); + cfo.reset(); + ASSERT_OK(db_->Put(WriteOptions(), cfh, "foo", "bar")); + ASSERT_OK(db_->Flush(FlushOptions(), cfh)); + ASSERT_OK(db_->DropColumnFamily(cfh)); + ASSERT_OK(db_->DestroyColumnFamilyHandle(cfh)); +} + #ifndef ROCKSDB_LITE TEST_F(ColumnFamilyTest, FlushCloseWALFiles) { SpecialEnv env(Env::Default()); diff --git a/db/compact_files_test.cc b/db/compact_files_test.cc index 5aad6114f5e..7f150453960 100644 --- a/db/compact_files_test.cc +++ b/db/compact_files_test.cc @@ -37,8 +37,7 @@ class FlushedFileCollector : public EventListener { FlushedFileCollector() {} ~FlushedFileCollector() {} - virtual void OnFlushCompleted( - DB* db, const FlushJobInfo& info) override { + virtual void OnFlushCompleted(DB* /*db*/, const FlushJobInfo& info) override { std::lock_guard lock(mutex_); flushed_files_.push_back(info.file_path); } @@ -257,9 +256,9 @@ TEST_F(CompactFilesTest, CapturingPendingFiles) { TEST_F(CompactFilesTest, CompactionFilterWithGetSv) { class FilterWithGet : public CompactionFilter { public: - virtual bool Filter(int level, const Slice& key, const Slice& value, - std::string* new_value, - bool* value_changed) const override { + virtual bool Filter(int /*level*/, const Slice& /*key*/, + const Slice& /*value*/, std::string* /*new_value*/, + bool* /*value_changed*/) const override { if (db_ == nullptr) { return true; } diff --git a/db/compacted_db_impl.h b/db/compacted_db_impl.h index de32f21e681..736002e1e52 100644 --- a/db/compacted_db_impl.h +++ b/db/compacted_db_impl.h @@ -32,55 +32,56 @@ class CompactedDBImpl : public DBImpl { override; using DBImpl::Put; - virtual Status Put(const WriteOptions& options, - ColumnFamilyHandle* column_family, const Slice& key, - const Slice& value) override { + virtual Status Put(const WriteOptions& /*options*/, + ColumnFamilyHandle* /*column_family*/, + const Slice& /*key*/, const Slice& /*value*/) override { return Status::NotSupported("Not supported in compacted db mode."); } using DBImpl::Merge; - virtual 
Status Merge(const WriteOptions& options, - ColumnFamilyHandle* column_family, const Slice& key, - const Slice& value) override { + virtual Status Merge(const WriteOptions& /*options*/, + ColumnFamilyHandle* /*column_family*/, + const Slice& /*key*/, const Slice& /*value*/) override { return Status::NotSupported("Not supported in compacted db mode."); } using DBImpl::Delete; - virtual Status Delete(const WriteOptions& options, - ColumnFamilyHandle* column_family, - const Slice& key) override { + virtual Status Delete(const WriteOptions& /*options*/, + ColumnFamilyHandle* /*column_family*/, + const Slice& /*key*/) override { return Status::NotSupported("Not supported in compacted db mode."); } - virtual Status Write(const WriteOptions& options, - WriteBatch* updates) override { + virtual Status Write(const WriteOptions& /*options*/, + WriteBatch* /*updates*/) override { return Status::NotSupported("Not supported in compacted db mode."); } using DBImpl::CompactRange; - virtual Status CompactRange(const CompactRangeOptions& options, - ColumnFamilyHandle* column_family, - const Slice* begin, const Slice* end) override { + virtual Status CompactRange(const CompactRangeOptions& /*options*/, + ColumnFamilyHandle* /*column_family*/, + const Slice* /*begin*/, + const Slice* /*end*/) override { return Status::NotSupported("Not supported in compacted db mode."); } virtual Status DisableFileDeletions() override { return Status::NotSupported("Not supported in compacted db mode."); } - virtual Status EnableFileDeletions(bool force) override { + virtual Status EnableFileDeletions(bool /*force*/) override { return Status::NotSupported("Not supported in compacted db mode."); } virtual Status GetLiveFiles(std::vector&, - uint64_t* manifest_file_size, - bool flush_memtable = true) override { + uint64_t* /*manifest_file_size*/, + bool /*flush_memtable*/ = true) override { return Status::NotSupported("Not supported in compacted db mode."); } using DBImpl::Flush; - virtual Status Flush(const FlushOptions& options, - ColumnFamilyHandle* column_family) override { + virtual Status Flush(const FlushOptions& /*options*/, + ColumnFamilyHandle* /*column_family*/) override { return Status::NotSupported("Not supported in compacted db mode."); } using DB::IngestExternalFile; virtual Status IngestExternalFile( - ColumnFamilyHandle* column_family, - const std::vector& external_files, - const IngestExternalFileOptions& ingestion_options) override { + ColumnFamilyHandle* /*column_family*/, + const std::vector& /*external_files*/, + const IngestExternalFileOptions& /*ingestion_options*/) override { return Status::NotSupported("Not supported in compacted db mode."); } diff --git a/db/compaction.cc b/db/compaction.cc index c2785adeeb6..9db41139b51 100644 --- a/db/compaction.cc +++ b/db/compaction.cc @@ -446,7 +446,8 @@ bool Compaction::ShouldFormSubcompactions() const { return false; } if (cfd_->ioptions()->compaction_style == kCompactionStyleLevel) { - return start_level_ == 0 && output_level_ > 0 && !IsOutputLevelEmpty(); + return (start_level_ == 0 || is_manual_compaction_) && output_level_ > 0 && + !IsOutputLevelEmpty(); } else if (cfd_->ioptions()->compaction_style == kCompactionStyleUniversal) { return number_levels_ > 1 && output_level_ > 0; } else { diff --git a/db/compaction_iterator.cc b/db/compaction_iterator.cc index 8a40cd40565..3d62f43a4fc 100644 --- a/db/compaction_iterator.cc +++ b/db/compaction_iterator.cc @@ -12,31 +12,6 @@ namespace rocksdb { -#ifndef ROCKSDB_LITE 
-CompactionEventListener::CompactionListenerValueType fromInternalValueType( - ValueType vt) { - switch (vt) { - case kTypeDeletion: - return CompactionEventListener::CompactionListenerValueType::kDelete; - case kTypeValue: - return CompactionEventListener::CompactionListenerValueType::kValue; - case kTypeMerge: - return CompactionEventListener::CompactionListenerValueType:: - kMergeOperand; - case kTypeSingleDeletion: - return CompactionEventListener::CompactionListenerValueType:: - kSingleDelete; - case kTypeRangeDeletion: - return CompactionEventListener::CompactionListenerValueType::kRangeDelete; - case kTypeBlobIndex: - return CompactionEventListener::CompactionListenerValueType::kBlobIndex; - default: - assert(false); - return CompactionEventListener::CompactionListenerValueType::kInvalid; - } -} -#endif // ROCKSDB_LITE - CompactionIterator::CompactionIterator( InternalIterator* input, const Comparator* cmp, MergeHelper* merge_helper, SequenceNumber last_sequence, std::vector* snapshots, @@ -44,7 +19,6 @@ CompactionIterator::CompactionIterator( const SnapshotChecker* snapshot_checker, Env* env, bool expect_valid_internal_key, RangeDelAggregator* range_del_agg, const Compaction* compaction, const CompactionFilter* compaction_filter, - CompactionEventListener* compaction_listener, const std::atomic* shutting_down, const SequenceNumber preserve_deletes_seqnum) : CompactionIterator( @@ -53,21 +27,18 @@ CompactionIterator::CompactionIterator( expect_valid_internal_key, range_del_agg, std::unique_ptr( compaction ? new CompactionProxy(compaction) : nullptr), - compaction_filter, compaction_listener, shutting_down, - preserve_deletes_seqnum) {} + compaction_filter, shutting_down, preserve_deletes_seqnum) {} CompactionIterator::CompactionIterator( InternalIterator* input, const Comparator* cmp, MergeHelper* merge_helper, - SequenceNumber last_sequence, std::vector* snapshots, + SequenceNumber /*last_sequence*/, std::vector* snapshots, SequenceNumber earliest_write_conflict_snapshot, const SnapshotChecker* snapshot_checker, Env* env, bool expect_valid_internal_key, RangeDelAggregator* range_del_agg, std::unique_ptr compaction, const CompactionFilter* compaction_filter, - CompactionEventListener* compaction_listener, const std::atomic* shutting_down, - const SequenceNumber preserve_deletes_seqnum - ) + const SequenceNumber preserve_deletes_seqnum) : input_(input), cmp_(cmp), merge_helper_(merge_helper), @@ -79,9 +50,6 @@ CompactionIterator::CompactionIterator( range_del_agg_(range_del_agg), compaction_(std::move(compaction)), compaction_filter_(compaction_filter), -#ifndef ROCKSDB_LITE - compaction_listener_(compaction_listener), -#endif // ROCKSDB_LITE shutting_down_(shutting_down), preserve_deletes_seqnum_(preserve_deletes_seqnum), ignore_snapshots_(false), @@ -199,10 +167,13 @@ void CompactionIterator::InvokeFilterIfNeeded(bool* need_skip, CompactionFilter::ValueType value_type = ikey_.type == kTypeValue ? CompactionFilter::ValueType::kValue : CompactionFilter::ValueType::kBlobIndex; + // Hack: pass internal key to BlobIndexCompactionFilter since it needs + // to get sequence number. + Slice& filter_key = ikey_.type == kTypeValue ? ikey_.user_key : key_; { StopWatchNano timer(env_, true); filter = compaction_filter_->FilterV2( - compaction_->level(), ikey_.user_key, value_type, value_, + compaction_->level(), filter_key, value_type, value_, &compaction_filter_value_, compaction_filter_skip_until_.rep()); iter_stats_.total_filter_time += env_ != nullptr ? 
timer.ElapsedNanos() : 0; @@ -293,28 +264,12 @@ void CompactionIterator::NextFromInput() { (snapshot_checker_ == nullptr || snapshot_checker_->IsInSnapshot(ikey_.sequence, kMaxSequenceNumber)); -#ifndef ROCKSDB_LITE - if (compaction_listener_) { - compaction_listener_->OnCompaction(compaction_->level(), ikey_.user_key, - fromInternalValueType(ikey_.type), - value_, ikey_.sequence, true); - } -#endif // !ROCKSDB_LITE - // Apply the compaction filter to the first committed version of the user // key. if (current_key_committed_) { InvokeFilterIfNeeded(&need_skip, &skip_until); } } else { -#ifndef ROCKSDB_LITE - if (compaction_listener_) { - compaction_listener_->OnCompaction(compaction_->level(), ikey_.user_key, - fromInternalValueType(ikey_.type), - value_, ikey_.sequence, false); - } -#endif // ROCKSDB_LITE - // Update the current key to reflect the new sequence number/type without // copying the user key. // TODO(rven): Compaction filter does not process keys in this path diff --git a/db/compaction_iterator.h b/db/compaction_iterator.h index 8222f6d54dc..7732e08ae75 100644 --- a/db/compaction_iterator.h +++ b/db/compaction_iterator.h @@ -20,8 +20,6 @@ namespace rocksdb { -class CompactionEventListener; - class CompactionIterator { public: // A wrapper around Compaction. Has a much smaller interface, only what @@ -32,7 +30,7 @@ class CompactionIterator { : compaction_(compaction) {} virtual ~CompactionProxy() = default; - virtual int level(size_t compaction_input_level = 0) const { + virtual int level(size_t /*compaction_input_level*/ = 0) const { return compaction_->level(); } virtual bool KeyNotExistsBeyondOutputLevel( @@ -69,7 +67,6 @@ class CompactionIterator { RangeDelAggregator* range_del_agg, const Compaction* compaction = nullptr, const CompactionFilter* compaction_filter = nullptr, - CompactionEventListener* compaction_listener = nullptr, const std::atomic* shutting_down = nullptr, const SequenceNumber preserve_deletes_seqnum = 0); @@ -83,7 +80,6 @@ class CompactionIterator { RangeDelAggregator* range_del_agg, std::unique_ptr compaction, const CompactionFilter* compaction_filter = nullptr, - CompactionEventListener* compaction_listener = nullptr, const std::atomic* shutting_down = nullptr, const SequenceNumber preserve_deletes_seqnum = 0); @@ -147,9 +143,6 @@ class CompactionIterator { RangeDelAggregator* range_del_agg_; std::unique_ptr compaction_; const CompactionFilter* compaction_filter_; -#ifndef ROCKSDB_LITE - CompactionEventListener* compaction_listener_; -#endif // !ROCKSDB_LITE const std::atomic* shutting_down_; const SequenceNumber preserve_deletes_seqnum_; bool bottommost_level_; diff --git a/db/compaction_iterator_test.cc b/db/compaction_iterator_test.cc index 223798064c0..1402c358ed9 100644 --- a/db/compaction_iterator_test.cc +++ b/db/compaction_iterator_test.cc @@ -19,15 +19,15 @@ namespace rocksdb { // Expects no merging attempts. 
class NoMergingMergeOp : public MergeOperator { public: - bool FullMergeV2(const MergeOperationInput& merge_in, - MergeOperationOutput* merge_out) const override { + bool FullMergeV2(const MergeOperationInput& /*merge_in*/, + MergeOperationOutput* /*merge_out*/) const override { ADD_FAILURE(); return false; } - bool PartialMergeMulti(const Slice& key, - const std::deque& operand_list, - std::string* new_value, - Logger* logger) const override { + bool PartialMergeMulti(const Slice& /*key*/, + const std::deque& /*operand_list*/, + std::string* /*new_value*/, + Logger* /*logger*/) const override { ADD_FAILURE(); return false; } @@ -126,7 +126,7 @@ class LoggingForwardVectorIterator : public InternalIterator { keys_.begin(); } - virtual void SeekForPrev(const Slice& target) override { assert(false); } + virtual void SeekForPrev(const Slice& /*target*/) override { assert(false); } virtual void Next() override { assert(Valid()); @@ -158,9 +158,12 @@ class FakeCompaction : public CompactionIterator::CompactionProxy { public: FakeCompaction() = default; - virtual int level(size_t compaction_input_level) const override { return 0; } + virtual int level(size_t /*compaction_input_level*/) const override { + return 0; + } virtual bool KeyNotExistsBeyondOutputLevel( - const Slice& user_key, std::vector* level_ptrs) const override { + const Slice& /*user_key*/, + std::vector* /*level_ptrs*/) const override { return is_bottommost_level || key_not_exists_beyond_output_level; } virtual bool bottommost_level() const override { return is_bottommost_level; } @@ -245,7 +248,7 @@ class CompactionIteratorTest : public testing::TestWithParam { iter_.get(), cmp_, merge_helper_.get(), last_sequence, &snapshots_, earliest_write_conflict_snapshot, snapshot_checker_.get(), Env::Default(), false, range_del_agg_.get(), std::move(compaction), - filter, nullptr, &shutting_down_)); + filter, &shutting_down_)); } void AddSnapshot(SequenceNumber snapshot, @@ -365,9 +368,9 @@ TEST_P(CompactionIteratorTest, RangeDeletionWithSnapshots) { TEST_P(CompactionIteratorTest, CompactionFilterSkipUntil) { class Filter : public CompactionFilter { - virtual Decision FilterV2(int level, const Slice& key, ValueType t, + virtual Decision FilterV2(int /*level*/, const Slice& key, ValueType t, const Slice& existing_value, - std::string* new_value, + std::string* /*new_value*/, std::string* skip_until) const override { std::string k = key.ToString(); std::string v = existing_value.ToString(); @@ -548,10 +551,10 @@ TEST_P(CompactionIteratorTest, ShuttingDownInMerge) { TEST_P(CompactionIteratorTest, SingleMergeOperand) { class Filter : public CompactionFilter { - virtual Decision FilterV2(int level, const Slice& key, ValueType t, + virtual Decision FilterV2(int /*level*/, const Slice& key, ValueType t, const Slice& existing_value, - std::string* new_value, - std::string* skip_until) const override { + std::string* /*new_value*/, + std::string* /*skip_until*/) const override { std::string k = key.ToString(); std::string v = existing_value.ToString(); @@ -602,7 +605,7 @@ TEST_P(CompactionIteratorTest, SingleMergeOperand) { bool PartialMergeMulti(const Slice& key, const std::deque& operand_list, std::string* new_value, - Logger* logger) const override { + Logger* /*logger*/) const override { std::string string_key = key.ToString(); EXPECT_TRUE(string_key == "a" || string_key == "b"); diff --git a/db/compaction_job.cc b/db/compaction_job.cc index 440d64879fa..23d4248d261 100644 --- a/db/compaction_job.cc +++ b/db/compaction_job.cc @@ -723,14 +723,16 
@@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) { std::set sample_begin_offsets; if (bottommost_level_ && kSampleBytes > 0) { const size_t kMaxSamples = kSampleBytes >> kSampleLenShift; - const size_t kOutFileLen = mutable_cf_options->MaxFileSizeForLevel( - compact_->compaction->output_level()); + const size_t kOutFileLen = static_cast( + mutable_cf_options->MaxFileSizeForLevel( + compact_->compaction->output_level())); if (kOutFileLen != port::kMaxSizet) { const size_t kOutFileNumSamples = kOutFileLen >> kSampleLenShift; Random64 generator{versions_->NewFileNumber()}; for (size_t i = 0; i < kMaxSamples; ++i) { - sample_begin_offsets.insert(generator.Uniform(kOutFileNumSamples) - << kSampleLenShift); + sample_begin_offsets.insert( + static_cast(generator.Uniform(kOutFileNumSamples)) + << kSampleLenShift); } } } @@ -762,24 +764,13 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) { input->SeekToFirst(); } - // we allow only 1 compaction event listener. Used by blob storage - CompactionEventListener* comp_event_listener = nullptr; -#ifndef ROCKSDB_LITE - for (auto& celitr : cfd->ioptions()->listeners) { - comp_event_listener = celitr->GetCompactionEventListener(); - if (comp_event_listener != nullptr) { - break; - } - } -#endif // ROCKSDB_LITE - Status status; sub_compact->c_iter.reset(new CompactionIterator( input.get(), cfd->user_comparator(), &merge, versions_->LastSequence(), &existing_snapshots_, earliest_write_conflict_snapshot_, snapshot_checker_, env_, false, range_del_agg.get(), - sub_compact->compaction, compaction_filter, comp_event_listener, - shutting_down_, preserve_deletes_seqnum_)); + sub_compact->compaction, compaction_filter, shutting_down_, + preserve_deletes_seqnum_)); auto c_iter = sub_compact->c_iter.get(); c_iter->SeekToFirst(); if (c_iter->Valid() && diff --git a/db/compaction_job_stats_test.cc b/db/compaction_job_stats_test.cc index 9a8372f5785..afcf6d533e9 100644 --- a/db/compaction_job_stats_test.cc +++ b/db/compaction_job_stats_test.cc @@ -426,7 +426,7 @@ class CompactionJobStatsChecker : public EventListener { // Once a compaction completed, this function will verify the returned // CompactionJobInfo with the oldest CompactionJobInfo added earlier // in "expected_stats_" which has not yet being used for verification. - virtual void OnCompactionCompleted(DB *db, const CompactionJobInfo& ci) { + virtual void OnCompactionCompleted(DB* /*db*/, const CompactionJobInfo& ci) { if (verify_next_comp_io_stats_) { ASSERT_GT(ci.stats.file_write_nanos, 0); ASSERT_GT(ci.stats.file_range_sync_nanos, 0); diff --git a/db/compaction_picker.cc b/db/compaction_picker.cc index 0e4d5627a10..9e20b63f71f 100644 --- a/db/compaction_picker.cc +++ b/db/compaction_picker.cc @@ -41,7 +41,7 @@ bool FindIntraL0Compaction(const std::vector& level_files, size_t min_files_to_compact, uint64_t max_compact_bytes_per_del_file, CompactionInputFiles* comp_inputs) { - size_t compact_bytes = level_files[0]->fd.file_size; + size_t compact_bytes = static_cast(level_files[0]->fd.file_size); size_t compact_bytes_per_del_file = port::kMaxSizet; // compaction range will be [0, span_len). 
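The new casts above make a real narrowing explicit: FileDescriptor::file_size and MaxFileSizeForLevel() are 64-bit, while the sampling arithmetic runs in size_t, which is 32 bits on some targets. A minimal sketch of the same idea with a hypothetical checked helper (CheckedDownCast is not part of this patch):

```cpp
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <limits>

// Hypothetical helper (not in RocksDB): assert that a uint64_t value
// actually fits in size_t before narrowing, instead of truncating silently.
inline size_t CheckedDownCast(uint64_t v) {
  assert(v <= std::numeric_limits<size_t>::max());
  return static_cast<size_t>(v);
}

// Usage mirroring the hunk above, where file_size is a 64-bit field:
// size_t compact_bytes = CheckedDownCast would replace the bare static_cast.
```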
size_t span_len; @@ -199,7 +199,7 @@ void CompactionPicker::GetRange(const std::vector& inputs, assert(initialized); } -bool CompactionPicker::ExpandInputsToCleanCut(const std::string& cf_name, +bool CompactionPicker::ExpandInputsToCleanCut(const std::string& /*cf_name*/, VersionStorageInfo* vstorage, CompactionInputFiles* inputs) { // This isn't good compaction @@ -309,7 +309,7 @@ Compaction* CompactionPicker::CompactFiles( Status CompactionPicker::GetCompactionInputsFromFileNumbers( std::vector* input_files, std::unordered_set* input_set, const VersionStorageInfo* vstorage, - const CompactionOptions& compact_options) const { + const CompactionOptions& /*compact_options*/) const { if (input_set->size() == 0U) { return Status::InvalidArgument( "Compaction must include at least one file."); @@ -1612,8 +1612,9 @@ Compaction* FIFOCompactionPicker::PickCompaction( Compaction* FIFOCompactionPicker::CompactRange( const std::string& cf_name, const MutableCFOptions& mutable_cf_options, VersionStorageInfo* vstorage, int input_level, int output_level, - uint32_t output_path_id, const InternalKey* begin, const InternalKey* end, - InternalKey** compaction_end, bool* manual_conflict) { + uint32_t /*output_path_id*/, const InternalKey* /*begin*/, + const InternalKey* /*end*/, InternalKey** compaction_end, + bool* /*manual_conflict*/) { assert(input_level == 0); assert(output_level == 0); *compaction_end = nullptr; diff --git a/db/compaction_picker.h b/db/compaction_picker.h index 3172a68e85f..a6a551881f5 100644 --- a/db/compaction_picker.h +++ b/db/compaction_picker.h @@ -267,27 +267,29 @@ class NullCompactionPicker : public CompactionPicker { virtual ~NullCompactionPicker() {} // Always return "nullptr" - Compaction* PickCompaction(const std::string& cf_name, - const MutableCFOptions& mutable_cf_options, - VersionStorageInfo* vstorage, - LogBuffer* log_buffer) override { + Compaction* PickCompaction(const std::string& /*cf_name*/, + const MutableCFOptions& /*mutable_cf_options*/, + VersionStorageInfo* /*vstorage*/, + LogBuffer* /*log_buffer*/) override { return nullptr; } // Always return "nullptr" - Compaction* CompactRange(const std::string& cf_name, - const MutableCFOptions& mutable_cf_options, - VersionStorageInfo* vstorage, int input_level, - int output_level, uint32_t output_path_id, - const InternalKey* begin, const InternalKey* end, - InternalKey** compaction_end, - bool* manual_conflict) override { + Compaction* CompactRange(const std::string& /*cf_name*/, + const MutableCFOptions& /*mutable_cf_options*/, + VersionStorageInfo* /*vstorage*/, + int /*input_level*/, int /*output_level*/, + uint32_t /*output_path_id*/, + const InternalKey* /*begin*/, + const InternalKey* /*end*/, + InternalKey** /*compaction_end*/, + bool* /*manual_conflict*/) override { return nullptr; } // Always returns false. 
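Most of the churn in these picker and test files is the same mechanical change: unused parameter names are commented out so builds with -Wunused-parameter stay clean while the overridden signatures remain self-documenting. A stand-alone illustration of the pattern (names mirror FakeCompaction above, but this is a sketch, not RocksDB code):

```cpp
#include <cstddef>

struct CompactionProxy {
  virtual int level(size_t compaction_input_level) const = 0;
  virtual ~CompactionProxy() = default;
};

struct FakeProxy : CompactionProxy {
  // Commenting out the name silences -Wunused-parameter; keeping it in a
  // comment preserves the documentation value of the signature.
  int level(size_t /*compaction_input_level*/) const override { return 0; }
};
```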
virtual bool NeedsCompaction( - const VersionStorageInfo* vstorage) const override { + const VersionStorageInfo* /*vstorage*/) const override { return false; } }; diff --git a/db/compaction_picker_test.cc b/db/compaction_picker_test.cc index 4752d34285a..297949070ba 100644 --- a/db/compaction_picker_test.cc +++ b/db/compaction_picker_test.cc @@ -20,7 +20,9 @@ namespace rocksdb { class CountingLogger : public Logger { public: using Logger::Logv; - virtual void Logv(const char* format, va_list ap) override { log_count++; } + virtual void Logv(const char* /*format*/, va_list /*ap*/) override { + log_count++; + } size_t log_count; }; diff --git a/db/compaction_picker_universal.cc b/db/compaction_picker_universal.cc index 960c65d2b95..a0cbdea6ee4 100644 --- a/db/compaction_picker_universal.cc +++ b/db/compaction_picker_universal.cc @@ -204,7 +204,7 @@ void UniversalCompactionPicker::SortedRun::DumpSizeInfo( std::vector UniversalCompactionPicker::CalculateSortedRuns( - const VersionStorageInfo& vstorage, const ImmutableCFOptions& ioptions, + const VersionStorageInfo& vstorage, const ImmutableCFOptions& /*ioptions*/, const MutableCFOptions& mutable_cf_options) { std::vector ret; for (FileMetaData* f : vstorage.LevelFiles(0)) { diff --git a/db/comparator_db_test.cc b/db/comparator_db_test.cc index 28a2a5658e7..83740ffda00 100644 --- a/db/comparator_db_test.cc +++ b/db/comparator_db_test.cc @@ -188,10 +188,10 @@ class DoubleComparator : public Comparator { return -1; } } - virtual void FindShortestSeparator(std::string* start, - const Slice& limit) const override {} + virtual void FindShortestSeparator(std::string* /*start*/, + const Slice& /*limit*/) const override {} - virtual void FindShortSuccessor(std::string* key) const override {} + virtual void FindShortSuccessor(std::string* /*key*/) const override {} }; class HashComparator : public Comparator { @@ -211,10 +211,10 @@ class HashComparator : public Comparator { return -1; } } - virtual void FindShortestSeparator(std::string* start, - const Slice& limit) const override {} + virtual void FindShortestSeparator(std::string* /*start*/, + const Slice& /*limit*/) const override {} - virtual void FindShortSuccessor(std::string* key) const override {} + virtual void FindShortSuccessor(std::string* /*key*/) const override {} }; class TwoStrComparator : public Comparator { @@ -243,10 +243,10 @@ class TwoStrComparator : public Comparator { } return a2.compare(b2); } - virtual void FindShortestSeparator(std::string* start, - const Slice& limit) const override {} + virtual void FindShortestSeparator(std::string* /*start*/, + const Slice& /*limit*/) const override {} - virtual void FindShortSuccessor(std::string* key) const override {} + virtual void FindShortSuccessor(std::string* /*key*/) const override {} }; } // namespace diff --git a/db/db_basic_test.cc b/db/db_basic_test.cc index c5476816778..31742d0bdf2 100644 --- a/db/db_basic_test.cc +++ b/db/db_basic_test.cc @@ -862,8 +862,9 @@ class TestEnv : public EnvWrapper { CloseHelper(); } } - virtual void Logv(const char *format, va_list ap) override { }; - protected: + virtual void Logv(const char* /*format*/, va_list /*ap*/) override{}; + + protected: virtual Status CloseImpl() override { return CloseHelper(); } @@ -879,13 +880,13 @@ class TestEnv : public EnvWrapper { int GetCloseCount() { return close_count; } - virtual Status NewLogger(const std::string& fname, + virtual Status NewLogger(const std::string& /*fname*/, shared_ptr* result) { result->reset(new TestLogger(this)); return Status::OK(); } - 
private: + private: int close_count; }; @@ -895,7 +896,7 @@ TEST_F(DBBasicTest, DBClose) { ASSERT_OK(DestroyDB(dbname, options)); DB* db = nullptr; - TestEnv *env = new TestEnv(); + TestEnv* env = new TestEnv(); options.create_if_missing = true; options.env = env; Status s = DB::Open(options, dbname, &db); diff --git a/db/db_block_cache_test.cc b/db/db_block_cache_test.cc index 82c420c7076..5149a90c467 100644 --- a/db/db_block_cache_test.cc +++ b/db/db_block_cache_test.cc @@ -47,7 +47,7 @@ class DBBlockCacheTest : public DBTestBase { return options; } - void InitTable(const Options& options) { + void InitTable(const Options& /*options*/) { std::string value(kValueSize, 'a'); for (size_t i = 0; i < kNumBlocks; i++) { ASSERT_OK(Put(ToString(i), value.c_str())); diff --git a/db/db_compaction_filter_test.cc b/db/db_compaction_filter_test.cc index 0837d8fee95..c5e8e547c59 100644 --- a/db/db_compaction_filter_test.cc +++ b/db/db_compaction_filter_test.cc @@ -56,9 +56,9 @@ INSTANTIATE_TEST_CASE_P( class KeepFilter : public CompactionFilter { public: - virtual bool Filter(int level, const Slice& key, const Slice& value, - std::string* new_value, bool* value_changed) const - override { + virtual bool Filter(int /*level*/, const Slice& /*key*/, + const Slice& /*value*/, std::string* /*new_value*/, + bool* /*value_changed*/) const override { cfilter_count++; return false; } @@ -68,9 +68,9 @@ class KeepFilter : public CompactionFilter { class DeleteFilter : public CompactionFilter { public: - virtual bool Filter(int level, const Slice& key, const Slice& value, - std::string* new_value, bool* value_changed) const - override { + virtual bool Filter(int /*level*/, const Slice& /*key*/, + const Slice& /*value*/, std::string* /*new_value*/, + bool* /*value_changed*/) const override { cfilter_count++; return true; } @@ -80,9 +80,9 @@ class DeleteFilter : public CompactionFilter { class DeleteISFilter : public CompactionFilter { public: - virtual bool Filter(int level, const Slice& key, const Slice& value, - std::string* new_value, - bool* value_changed) const override { + virtual bool Filter(int /*level*/, const Slice& key, const Slice& /*value*/, + std::string* /*new_value*/, + bool* /*value_changed*/) const override { cfilter_count++; int i = std::stoi(key.ToString()); if (i > 5 && i <= 105) { @@ -100,14 +100,16 @@ class DeleteISFilter : public CompactionFilter { // zero-padded to length 10. 
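The SkipEvenFilter below exercises Decision::kRemoveAndSkipUntil, which lets a compaction filter drop the current key and fast-forward the compaction iterator past a whole key range. A stand-alone sketch of the same mechanism (illustrative only, not part of the patch):

```cpp
#include <string>
#include "rocksdb/compaction_filter.h"
#include "rocksdb/slice.h"

// Drops every key with prefix "tmp/" and tells the compaction iterator to
// seek directly to "tmp0" (the first key ordered after that prefix range).
class DropTmpPrefixFilter : public rocksdb::CompactionFilter {
 public:
  Decision FilterV2(int /*level*/, const rocksdb::Slice& key,
                    ValueType /*value_type*/,
                    const rocksdb::Slice& /*existing_value*/,
                    std::string* /*new_value*/,
                    std::string* skip_until) const override {
    if (key.starts_with("tmp/")) {
      *skip_until = "tmp0";  // skip everything up to (not including) "tmp0"
      return Decision::kRemoveAndSkipUntil;
    }
    return Decision::kKeep;
  }
  const char* Name() const override { return "DropTmpPrefixFilter"; }
};
```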
class SkipEvenFilter : public CompactionFilter { public: - virtual Decision FilterV2(int level, const Slice& key, ValueType value_type, - const Slice& existing_value, std::string* new_value, + virtual Decision FilterV2(int /*level*/, const Slice& key, + ValueType /*value_type*/, + const Slice& /*existing_value*/, + std::string* /*new_value*/, std::string* skip_until) const override { cfilter_count++; int i = std::stoi(key.ToString()); if (i / 10 % 2 == 0) { char key_str[100]; - snprintf(key_str, sizeof(key), "%010d", i / 10 * 10 + 10); + snprintf(key_str, sizeof(key_str), "%010d", i / 10 * 10 + 10); *skip_until = key_str; ++cfilter_skips; return Decision::kRemoveAndSkipUntil; @@ -123,9 +125,9 @@ class SkipEvenFilter : public CompactionFilter { class DelayFilter : public CompactionFilter { public: explicit DelayFilter(DBTestBase* d) : db_test(d) {} - virtual bool Filter(int level, const Slice& key, const Slice& value, - std::string* new_value, - bool* value_changed) const override { + virtual bool Filter(int /*level*/, const Slice& /*key*/, + const Slice& /*value*/, std::string* /*new_value*/, + bool* /*value_changed*/) const override { db_test->env_->addon_time_.fetch_add(1000); return true; } @@ -140,9 +142,9 @@ class ConditionalFilter : public CompactionFilter { public: explicit ConditionalFilter(const std::string* filtered_value) : filtered_value_(filtered_value) {} - virtual bool Filter(int level, const Slice& key, const Slice& value, - std::string* new_value, - bool* value_changed) const override { + virtual bool Filter(int /*level*/, const Slice& /*key*/, const Slice& value, + std::string* /*new_value*/, + bool* /*value_changed*/) const override { return value.ToString() == *filtered_value_; } @@ -156,9 +158,9 @@ class ChangeFilter : public CompactionFilter { public: explicit ChangeFilter() {} - virtual bool Filter(int level, const Slice& key, const Slice& value, - std::string* new_value, bool* value_changed) const - override { + virtual bool Filter(int /*level*/, const Slice& /*key*/, + const Slice& /*value*/, std::string* new_value, + bool* value_changed) const override { assert(new_value != nullptr); *new_value = NEW_VALUE; *value_changed = true; @@ -247,7 +249,7 @@ class DelayFilterFactory : public CompactionFilterFactory { public: explicit DelayFilterFactory(DBTestBase* d) : db_test(d) {} virtual std::unique_ptr CreateCompactionFilter( - const CompactionFilter::Context& context) override { + const CompactionFilter::Context& /*context*/) override { return std::unique_ptr(new DelayFilter(db_test)); } @@ -263,7 +265,7 @@ class ConditionalFilterFactory : public CompactionFilterFactory { : filtered_value_(filtered_value.ToString()) {} virtual std::unique_ptr CreateCompactionFilter( - const CompactionFilter::Context& context) override { + const CompactionFilter::Context& /*context*/) override { return std::unique_ptr( new ConditionalFilter(&filtered_value_)); } @@ -281,7 +283,7 @@ class ChangeFilterFactory : public CompactionFilterFactory { explicit ChangeFilterFactory() {} virtual std::unique_ptr CreateCompactionFilter( - const CompactionFilter::Context& context) override { + const CompactionFilter::Context& /*context*/) override { return std::unique_ptr(new ChangeFilter()); } @@ -765,7 +767,7 @@ TEST_F(DBTestCompactionFilter, CompactionFilterIgnoreSnapshot) { iter->Next(); } ASSERT_EQ(count, 6); - read_options.snapshot = 0; + read_options.snapshot = nullptr; std::unique_ptr iter1(db_->NewIterator(read_options)); iter1->SeekToFirst(); count = 0; diff --git a/db/db_compaction_test.cc 
b/db/db_compaction_test.cc index 3b77228d83b..57beaa40771 100644 --- a/db/db_compaction_test.cc +++ b/db/db_compaction_test.cc @@ -53,7 +53,7 @@ class FlushedFileCollector : public EventListener { FlushedFileCollector() {} ~FlushedFileCollector() {} - virtual void OnFlushCompleted(DB* db, const FlushJobInfo& info) override { + virtual void OnFlushCompleted(DB* /*db*/, const FlushJobInfo& info) override { std::lock_guard lock(mutex_); flushed_files_.push_back(info.file_path); } @@ -2751,6 +2751,7 @@ TEST_P(DBCompactionTestWithParam, ForceBottommostLevelCompaction) { rocksdb::SyncPoint::GetInstance()->EnableProcessing(); Options options = CurrentOptions(); + options.target_file_size_base = 100000000; options.write_buffer_size = 100000000; options.max_subcompactions = max_subcompactions_; DestroyAndReopen(options); @@ -3317,6 +3318,64 @@ TEST_F(DBCompactionTest, CompactRangeSkipFlushAfterDelay) { rocksdb::SyncPoint::GetInstance()->DisableProcessing(); } +TEST_F(DBCompactionTest, CompactRangeFlushOverlappingMemtable) { + // Verify memtable only gets flushed if it contains data overlapping the range + // provided to `CompactRange`. Tests all kinds of overlap/non-overlap. + const int kNumEndpointKeys = 5; + std::string keys[kNumEndpointKeys] = {"a", "b", "c", "d", "e"}; + Options options = CurrentOptions(); + options.disable_auto_compactions = true; + Reopen(options); + + // One extra iteration for nullptr, which means left side of interval is + // unbounded. + for (int i = 0; i <= kNumEndpointKeys; ++i) { + Slice begin; + Slice* begin_ptr; + if (i == 0) { + begin_ptr = nullptr; + } else { + begin = keys[i - 1]; + begin_ptr = &begin; + } + // Start at `i` so right endpoint comes after left endpoint. One extra + // iteration for nullptr, which means right side of interval is unbounded. + for (int j = std::max(0, i - 1); j <= kNumEndpointKeys; ++j) { + Slice end; + Slice* end_ptr; + if (j == kNumEndpointKeys) { + end_ptr = nullptr; + } else { + end = keys[j]; + end_ptr = &end; + } + ASSERT_OK(Put("b", "val")); + ASSERT_OK(Put("d", "val")); + CompactRangeOptions compact_range_opts; + ASSERT_OK(db_->CompactRange(compact_range_opts, begin_ptr, end_ptr)); + + uint64_t get_prop_tmp, num_memtable_entries = 0; + ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kNumEntriesImmMemTables, + &get_prop_tmp)); + num_memtable_entries += get_prop_tmp; + ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kNumEntriesActiveMemTable, + &get_prop_tmp)); + num_memtable_entries += get_prop_tmp; + if (begin_ptr == nullptr || end_ptr == nullptr || + (i <= 4 && j >= 1 && (begin != "c" || end != "c"))) { + // In this case `CompactRange`'s range overlapped in some way with the + // memtable's range, so flush should've happened. Then "b" and "d" won't + // be in the memtable. 
+ ASSERT_EQ(0, num_memtable_entries); + } else { + ASSERT_EQ(2, num_memtable_entries); + // flush anyways to prepare for next iteration + db_->Flush(FlushOptions()); + } + } + } +} + INSTANTIATE_TEST_CASE_P(DBCompactionTestWithParam, DBCompactionTestWithParam, ::testing::Values(std::make_tuple(1, true), std::make_tuple(1, false), diff --git a/db/db_filesnapshot.cc b/db/db_filesnapshot.cc index 1f1a0f4499f..010f9fbe8ff 100644 --- a/db/db_filesnapshot.cc +++ b/db/db_filesnapshot.cc @@ -74,7 +74,7 @@ Status DBImpl::EnableFileDeletions(bool force) { } int DBImpl::IsFileDeletionsEnabled() const { - return disable_delete_obsolete_files_; + return !disable_delete_obsolete_files_; } Status DBImpl::GetLiveFiles(std::vector& ret, diff --git a/db/db_impl.cc b/db/db_impl.cc index f47f2c011b2..f7ba90f5285 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -404,9 +404,7 @@ Status DBImpl::CloseHelper() { return ret; } -Status DBImpl::CloseImpl() { - return CloseHelper(); -} +Status DBImpl::CloseImpl() { return CloseHelper(); } DBImpl::~DBImpl() { if (!closed_) { @@ -675,8 +673,9 @@ Status DBImpl::SetDBOptions( } // return the same level if it cannot be moved -int DBImpl::FindMinimumEmptyLevelFitting(ColumnFamilyData* cfd, - const MutableCFOptions& mutable_cf_options, int level) { +int DBImpl::FindMinimumEmptyLevelFitting( + ColumnFamilyData* cfd, const MutableCFOptions& /*mutable_cf_options*/, + int level) { mutex_.AssertHeld(); const auto* vstorage = cfd->current()->storage_info(); int minimum_level = level; @@ -694,7 +693,7 @@ int DBImpl::FindMinimumEmptyLevelFitting(ColumnFamilyData* cfd, } Status DBImpl::FlushWAL(bool sync) { - { + if (manual_wal_flush_) { // We need to lock log_write_mutex_ since logs_ might change concurrently InstrumentedMutexLock wl(&log_write_mutex_); log::Writer* cur_log_writer = logs_.back().writer; @@ -708,6 +707,9 @@ Status DBImpl::FlushWAL(bool sync) { return s; } } + if (!sync) { + return Status::OK(); + } // sync = true ROCKS_LOG_DEBUG(immutable_db_options_.info_log, "FlushWAL sync=true"); return SyncWAL(); @@ -897,7 +899,7 @@ struct IterState { bool background_purge; }; -static void CleanupIteratorState(void* arg1, void* arg2) { +static void CleanupIteratorState(void* arg1, void* /*arg2*/) { IterState* state = reinterpret_cast(arg1); if (state->super_version->Unref()) { @@ -2146,9 +2148,9 @@ Status DBImpl::DeleteFile(std::string name) { status = versions_->LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(), &edit, &mutex_, directories_.GetDbDir()); if (status.ok()) { - InstallSuperVersionAndScheduleWork( - cfd, &job_context.superversion_context, - *cfd->GetLatestMutableCFOptions()); + InstallSuperVersionAndScheduleWork(cfd, &job_context.superversion_context, + *cfd->GetLatestMutableCFOptions(), + FlushReason::kDeleteFiles); } FindObsoleteFiles(&job_context, false); } // lock released here @@ -2230,9 +2232,9 @@ Status DBImpl::DeleteFilesInRanges(ColumnFamilyHandle* column_family, status = versions_->LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(), &edit, &mutex_, directories_.GetDbDir()); if (status.ok()) { - InstallSuperVersionAndScheduleWork( - cfd, &job_context.superversion_context, - *cfd->GetLatestMutableCFOptions()); + InstallSuperVersionAndScheduleWork(cfd, &job_context.superversion_context, + *cfd->GetLatestMutableCFOptions(), + FlushReason::kDeleteFiles); } for (auto* deleted_file : deleted_files) { deleted_file->being_compacted = false; @@ -2335,31 +2337,31 @@ Status DBImpl::GetDbIdentity(std::string& identity) const { } // Default implementation -- returns 
not supported status -Status DB::CreateColumnFamily(const ColumnFamilyOptions& cf_options, - const std::string& column_family_name, - ColumnFamilyHandle** handle) { +Status DB::CreateColumnFamily(const ColumnFamilyOptions& /*cf_options*/, + const std::string& /*column_family_name*/, + ColumnFamilyHandle** /*handle*/) { return Status::NotSupported(""); } Status DB::CreateColumnFamilies( - const ColumnFamilyOptions& cf_options, - const std::vector& column_family_names, - std::vector* handles) { + const ColumnFamilyOptions& /*cf_options*/, + const std::vector& /*column_family_names*/, + std::vector* /*handles*/) { return Status::NotSupported(""); } Status DB::CreateColumnFamilies( - const std::vector& column_families, - std::vector* handles) { + const std::vector& /*column_families*/, + std::vector* /*handles*/) { return Status::NotSupported(""); } -Status DB::DropColumnFamily(ColumnFamilyHandle* column_family) { +Status DB::DropColumnFamily(ColumnFamilyHandle* /*column_family*/) { return Status::NotSupported(""); } Status DB::DropColumnFamilies( - const std::vector& column_families) { + const std::vector& /*column_families*/) { return Status::NotSupported(""); } @@ -2388,10 +2390,13 @@ Snapshot::~Snapshot() { } Status DestroyDB(const std::string& dbname, const Options& options) { - const ImmutableDBOptions soptions(SanitizeOptions(dbname, options)); + ImmutableDBOptions soptions(SanitizeOptions(dbname, options)); Env* env = soptions.env; std::vector filenames; + // Reset the logger because it holds a handle to the + // log file and prevents cleanup and directory removal + soptions.info_log.reset(); // Ignore error in case directory does not exist env->GetChildren(dbname, &filenames); @@ -2832,7 +2837,7 @@ Status DBImpl::IngestExternalFile( // Figure out if we need to flush the memtable first if (status.ok()) { bool need_flush = false; - status = ingestion_job.NeedsFlush(&need_flush); + status = ingestion_job.NeedsFlush(&need_flush, cfd->GetSuperVersion()); TEST_SYNC_POINT_CALLBACK("DBImpl::IngestExternalFile:NeedFlush", &need_flush); if (status.ok() && need_flush) { @@ -2857,8 +2862,8 @@ Status DBImpl::IngestExternalFile( &mutex_, directories_.GetDbDir()); } if (status.ok()) { - InstallSuperVersionAndScheduleWork(cfd, &sv_context, - *mutable_cf_options); + InstallSuperVersionAndScheduleWork(cfd, &sv_context, *mutable_cf_options, + FlushReason::kExternalFileIngestion); } // Resume writes to the DB diff --git a/db/db_impl.h b/db/db_impl.h index 3ee868b1630..33e44bf4d0e 100644 --- a/db/db_impl.h +++ b/db/db_impl.h @@ -14,7 +14,6 @@ #include #include #include -#include #include #include #include @@ -186,7 +185,9 @@ class DBImpl : public DB { ColumnFamilyHandle* column_family, const std::vector& input_file_names, const int output_level, - const int output_path_id = -1) override; + const int output_path_id = -1, + std::vector* const output_file_names + = nullptr) override; virtual Status PauseBackgroundWork() override; virtual Status ContinueBackgroundWork() override; @@ -222,6 +223,8 @@ class DBImpl : public DB { virtual Status SyncWAL() override; virtual SequenceNumber GetLatestSequenceNumber() const override; + // REQUIRES: joined the main write queue if two_write_queues is disabled, and + // the second write queue otherwise. virtual void SetLastPublishedSequence(SequenceNumber seq); // Returns LastSequence in last_seq_same_as_publish_seq_ // mode and LastAllocatedSequence otherwise. 
This is useful when visibility @@ -379,7 +382,9 @@ class DBImpl : public DB { Status TEST_WaitForFlushMemTable(ColumnFamilyHandle* column_family = nullptr); // Wait for any compaction - Status TEST_WaitForCompact(); + // We add a bool parameter to wait for unscheduled_compactions_ == 0, but this + // is only for the special test of CancelledCompactions + Status TEST_WaitForCompact(bool waitUnscheduled = false); // Return the maximum overlapping data (in bytes) at next level for any // file at a level >= 1. @@ -433,6 +438,8 @@ class DBImpl : public DB { uint64_t TEST_FindMinLogContainingOutstandingPrep(); uint64_t TEST_FindMinPrepLogReferencedByMemTable(); + size_t TEST_PreparedSectionCompletedSize(); + size_t TEST_LogsWithPrepSize(); int TEST_BGCompactionsAllowed() const; int TEST_BGFlushesAllowed() const; @@ -466,7 +473,7 @@ class DBImpl : public DB { bool no_full_scan = false); // Diffs the files listed in filenames and those that do not - // belong to live files are posibly removed. Also, removes all the + // belong to live files are possibly removed. Also, removes all the // files in sst_delete_files and log_delete_files. // It is not necessary to hold the mutex when invoking this method. // If FindObsoleteFiles() was run, we need to also run @@ -549,9 +556,18 @@ class DBImpl : public DB { WriteBatch* batch_; // The seq number of the first key in the batch SequenceNumber seq_; + // Number of sub-batches. A new sub-batch is created if the txn attempts to + // insert a duplicate (key, seq) into the memtable. This is currently used + // in WritePreparedTxn + size_t batch_cnt_; explicit RecoveredTransaction(const uint64_t log, const std::string& name, - WriteBatch* batch, SequenceNumber seq) - : log_number_(log), name_(name), batch_(batch), seq_(seq) {} + WriteBatch* batch, SequenceNumber seq, + size_t batch_cnt) + : log_number_(log), + name_(name), + batch_(batch), + seq_(seq), + batch_cnt_(batch_cnt) {} ~RecoveredTransaction() { delete batch_; } }; @@ -573,9 +589,10 @@ class DBImpl : public DB { } void InsertRecoveredTransaction(const uint64_t log, const std::string& name, - WriteBatch* batch, SequenceNumber seq) { + WriteBatch* batch, SequenceNumber seq, + size_t batch_cnt) { recovered_transactions_[name] = - new RecoveredTransaction(log, name, batch, seq); + new RecoveredTransaction(log, name, batch, seq, batch_cnt); MarkLogAsContainingPrepSection(log); } @@ -720,6 +737,7 @@ class DBImpl : public DB { #endif friend struct SuperVersion; friend class CompactedDBImpl; + friend class DBTest_ConcurrentFlushWAL_Test; #ifndef NDEBUG friend class DBTest2_ReadCallbackTest_Test; friend class WriteCallbackTest_WriteWithCallbackTest_Test; @@ -818,7 +836,8 @@ class DBImpl : public DB { Status ScheduleFlushes(WriteContext* context); - Status SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context); + Status SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context, + FlushReason flush_reason = FlushReason::kOthers); // Force current memtable contents to be flushed. Status FlushMemTable(ColumnFamilyData* cfd, const FlushOptions& options, @@ -858,7 +877,7 @@ class DBImpl : public DB { size_t seq_inc); // Used by WriteImpl to update bg_error_ if paranoid check is enabled. - void WriteCallbackStatusCheck(const Status& status); + void WriteStatusCheck(const Status& status); // Used by WriteImpl to update bg_error_ in case of memtable insert error.
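The batch_cnt_ field added above records how many sub-batches a recovered transaction's write batch splits into; WritePrepared needs this because a duplicate key within a sub-batch forces a new sub-batch, and each sub-batch consumes its own sequence number. One way to picture the counting rule (an illustrative model, not the RocksDB implementation):

```cpp
#include <cstddef>
#include <set>
#include <string>
#include <vector>

// Illustrative only: a key repeated within the current sub-batch starts a
// new one, because a memtable cannot hold two entries with the same
// (key, seq) pair.
size_t CountSubBatches(const std::vector<std::string>& keys) {
  if (keys.empty()) return 0;
  std::set<std::string> current;
  size_t batch_cnt = 1;
  for (const auto& k : keys) {
    if (!current.insert(k).second) {
      ++batch_cnt;        // duplicate seen: open a new sub-batch
      current.clear();
      current.insert(k);  // the duplicate belongs to the next sub-batch
    }
  }
  return batch_cnt;
}
```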
void MemTableInsertStatusCheck(const Status& memtable_insert_status); @@ -868,6 +887,7 @@ Status CompactFilesImpl(const CompactionOptions& compact_options, ColumnFamilyData* cfd, Version* version, const std::vector& input_file_names, + std::vector* const output_file_names, const int output_level, int output_path_id, JobContext* job_context, LogBuffer* log_buffer); @@ -1298,27 +1318,33 @@ class DBImpl : public DB { // Indicate DB was opened successfully bool opened_successfully_; - // minimum log number still containing prepared data. + // REQUIRES: logs_with_prep_mutex_ held + // + // sorted list of log numbers still containing prepared data. // this is used by FindObsoleteFiles to determine which // flushed logs we must keep around because they still - // contain prepared data which has not been flushed or rolled back - std::priority_queue, std::greater> - min_log_with_prep_; + // contain prepared data which has not been committed or rolled back + struct LogCnt { + uint64_t log; // the log number + uint64_t cnt; // number of prepared sections in the log + }; + std::vector logs_with_prep_; + std::mutex logs_with_prep_mutex_; - // to be used in conjunction with min_log_with_prep_. + // REQUIRES: prepared_section_completed_mutex_ held + // + // to be used in conjunction with logs_with_prep_. // once a transaction with data in log L is committed or rolled back - // rather than removing the value from the heap we add that value - // to prepared_section_completed_ which maps LOG -> instance_count - // since a log could contain multiple prepared sections + // rather than updating logs_with_prep_ directly we keep track of that + // in prepared_section_completed_ which maps LOG -> instance_count. This helps + // avoid contention between a commit thread and the prepare threads. // // when trying to determine the minimum log still active we first - // consult min_log_with_prep_. while that root value maps to - // a value > 0 in prepared_section_completed_ we decrement the - // instance_count for that log and pop the root value in - // min_log_with_prep_. This will work the same as a min_heap - // where we are deleteing arbitrary elements and the up heaping. + // consult logs_with_prep_. While the front log's count is matched by + // an equal value in prepared_section_completed_ we erase the log from + // both logs_with_prep_ and prepared_section_completed_. std::unordered_map prepared_section_completed_; - std::mutex prep_heap_mutex_; + std::mutex prepared_section_completed_mutex_; // Callback for compaction to check if a key is visible to a snapshot. // REQUIRES: mutex held @@ -1337,7 +1363,8 @@ class DBImpl : public DB { // state needs flush or compaction. void InstallSuperVersionAndScheduleWork( ColumnFamilyData* cfd, SuperVersionContext* sv_context, - const MutableCFOptions& mutable_cf_options); + const MutableCFOptions& mutable_cf_options, + FlushReason flush_reason = FlushReason::kOthers); #ifndef ROCKSDB_LITE using DB::GetPropertiesOfAllTables; @@ -1367,8 +1394,8 @@ class DBImpl : public DB { return Env::WLTH_SHORT; } - // When set, we use a seprate queue for writes that dont write to memtable. In - // 2PC these are the writes at Prepare phase. + // When set, we use a separate queue for writes that don't write to memtable. + // In 2PC these are the writes at Prepare phase.
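These flags mirror DBOptions::two_write_queues and DBOptions::manual_wal_flush. With manual WAL flush enabled, buffered WAL records reach the filesystem only when the application calls FlushWAL, which the changes earlier in this patch make safe against concurrent writers. A minimal usage sketch (path and asserts are illustrative):

```cpp
#include <cassert>
#include "rocksdb/db.h"

int main() {
  rocksdb::Options options;
  options.create_if_missing = true;
  options.manual_wal_flush = true;  // WAL is buffered until FlushWAL()

  rocksdb::DB* db = nullptr;
  rocksdb::Status s =
      rocksdb::DB::Open(options, "/tmp/manual_wal_demo", &db);
  assert(s.ok());

  s = db->Put(rocksdb::WriteOptions(), "key", "value");  // WAL buffered
  s = db->FlushWAL(/*sync=*/true);  // hand records to the FS and fsync
  assert(s.ok());
  delete db;
  return 0;
}
```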
const bool two_write_queues_; const bool manual_wal_flush_; // Increase the sequence number after writing each batch, whether memtable is diff --git a/db/db_impl_compaction_flush.cc b/db/db_impl_compaction_flush.cc index 6bf5beddccc..ca14b84432d 100644 --- a/db/db_impl_compaction_flush.cc +++ b/db/db_impl_compaction_flush.cc @@ -291,7 +291,17 @@ Status DBImpl::CompactRange(const CompactRangeOptions& options, bool exclusive = options.exclusive_manual_compaction; bool flush_needed = true; - if (!options.allow_write_stall) { + if (begin != nullptr && end != nullptr) { + // TODO(ajkr): We could also optimize away the flush in certain cases where + // one/both sides of the interval are unbounded. But it requires more + // changes to RangesOverlapWithMemtables. + Range range(*begin, *end); + SuperVersion* super_version = cfd->GetReferencedSuperVersion(&mutex_); + cfd->RangesOverlapWithMemtables({range}, super_version, &flush_needed); + CleanupSuperVersion(super_version); + } + + if (!options.allow_write_stall && flush_needed) { InstrumentedMutexLock l(&mutex_); uint64_t orig_active_memtable_id = cfd->mem()->GetID(); WriteStallCondition write_stall_condition = WriteStallCondition::kNormal; @@ -451,7 +461,8 @@ Status DBImpl::CompactFiles( const CompactionOptions& compact_options, ColumnFamilyHandle* column_family, const std::vector& input_file_names, - const int output_level, const int output_path_id) { + const int output_level, const int output_path_id, + std::vector* const output_file_names) { #ifdef ROCKSDB_LITE // not supported in lite version return Status::NotSupported("Not supported in ROCKSDB LITE"); @@ -478,7 +489,7 @@ Status DBImpl::CompactFiles( WaitForIngestFile(); s = CompactFilesImpl(compact_options, cfd, sv->current, - input_file_names, output_level, + input_file_names, output_file_names, output_level, output_path_id, &job_context, &log_buffer); } if (sv->Unref()) { @@ -522,6 +533,7 @@ Status DBImpl::CompactFiles( Status DBImpl::CompactFilesImpl( const CompactionOptions& compact_options, ColumnFamilyData* cfd, Version* version, const std::vector& input_file_names, + std::vector* const output_file_names, const int output_level, int output_path_id, JobContext* job_context, LogBuffer* log_buffer) { mutex_.AssertHeld(); @@ -643,7 +655,7 @@ Status DBImpl::CompactFilesImpl( if (status.ok()) { InstallSuperVersionAndScheduleWork( c->column_family_data(), &job_context->superversion_context, - *c->mutable_cf_options()); + *c->mutable_cf_options(), FlushReason::kManualCompaction); } c->ReleaseCompactionFiles(s); @@ -670,6 +682,14 @@ Status DBImpl::CompactFilesImpl( } } + if (output_file_names != nullptr) { + for (const auto newf : c->edit()->GetNewFiles()) { + (*output_file_names).push_back(TableFileName( + immutable_db_options_.db_paths, newf.second.fd.GetNumber(), + newf.second.fd.GetPathId()) ); + } + } + c.reset(); bg_compaction_scheduled_--; @@ -854,7 +874,7 @@ int DBImpl::NumberLevels(ColumnFamilyHandle* column_family) { return cfh->cfd()->NumberLevels(); } -int DBImpl::MaxMemCompactionLevel(ColumnFamilyHandle* column_family) { +int DBImpl::MaxMemCompactionLevel(ColumnFamilyHandle* /*column_family*/) { return 0; } @@ -871,7 +891,7 @@ Status DBImpl::Flush(const FlushOptions& flush_options, ROCKS_LOG_INFO(immutable_db_options_.info_log, "[%s] Manual flush start.", cfh->GetName().c_str()); Status s = - FlushMemTable(cfh->cfd(), flush_options, FlushReason::kManualCompaction); + FlushMemTable(cfh->cfd(), flush_options, FlushReason::kManualFlush); ROCKS_LOG_INFO(immutable_db_options_.info_log, 
"[%s] Manual flush finished, status: %s\n", cfh->GetName().c_str(), s.ToString().c_str()); @@ -1493,7 +1513,7 @@ void DBImpl::BackgroundCallCompaction(PrepickedCompaction* prepicked_compaction, if (made_progress || (bg_compaction_scheduled_ == 0 && bg_bottom_compaction_scheduled_ == 0) || - HasPendingManualCompaction()) { + HasPendingManualCompaction() || unscheduled_compactions_ == 0) { // signal if // * made_progress -- need to wakeup DelayWrite // * bg_{bottom,}_compaction_scheduled_ == 0 -- need to wakeup ~DBImpl @@ -1556,6 +1576,9 @@ Status DBImpl::BackgroundCompaction(bool* made_progress, // InternalKey manual_end_storage; // InternalKey* manual_end = &manual_end_storage; +#ifndef ROCKSDB_LITE + bool sfm_bookkeeping = false; +#endif // ROCKSDB_LITE if (is_manual) { ManualCompactionState* m = manual_compaction; assert(m->in_progress); @@ -1618,27 +1641,66 @@ Status DBImpl::BackgroundCompaction(bool* made_progress, TEST_SYNC_POINT("DBImpl::BackgroundCompaction():BeforePickCompaction"); c.reset(cfd->PickCompaction(*mutable_cf_options, log_buffer)); TEST_SYNC_POINT("DBImpl::BackgroundCompaction():AfterPickCompaction"); + + bool enough_room = true; if (c != nullptr) { - // update statistics - MeasureTime(stats_, NUM_FILES_IN_SINGLE_COMPACTION, - c->inputs(0)->size()); - // There are three things that can change compaction score: - // 1) When flush or compaction finish. This case is covered by - // InstallSuperVersionAndScheduleWork - // 2) When MutableCFOptions changes. This case is also covered by - // InstallSuperVersionAndScheduleWork, because this is when the new - // options take effect. - // 3) When we Pick a new compaction, we "remove" those files being - // compacted from the calculation, which then influences compaction - // score. Here we check if we need the new compaction even without the - // files that are currently being compacted. If we need another - // compaction, we might be able to execute it in parallel, so we add it - // to the queue and schedule a new thread. - if (cfd->NeedsCompaction()) { - // Yes, we need more compactions! +#ifndef ROCKSDB_LITE + auto sfm = static_cast( + immutable_db_options_.sst_file_manager.get()); + if (sfm) { + enough_room = sfm->EnoughRoomForCompaction(c.get()); + if (enough_room) { + sfm_bookkeeping = true; + } + } +#endif // ROCKSDB_LITE + if (!enough_room) { + // Just in case tests want to change the value of enough_room + TEST_SYNC_POINT_CALLBACK( + "DBImpl::BackgroundCompaction():CancelledCompaction", + &enough_room); + } + if (!enough_room) { + // Then don't do the compaction + c->ReleaseCompactionFiles(status); + c->column_family_data() + ->current() + ->storage_info() + ->ComputeCompactionScore(*(c->immutable_cf_options()), + *(c->mutable_cf_options())); + + ROCKS_LOG_BUFFER(log_buffer, + "Cancelled compaction because not enough room"); AddToCompactionQueue(cfd); ++unscheduled_compactions_; - MaybeScheduleFlushOrCompaction(); + + c.reset(); + // Don't need to sleep here, because BackgroundCallCompaction + // will sleep if !s.ok() + status = Status::CompactionTooLarge(); + RecordTick(stats_, COMPACTION_CANCELLED, 1); + } else { + // update statistics + MeasureTime(stats_, NUM_FILES_IN_SINGLE_COMPACTION, + c->inputs(0)->size()); + // There are three things that can change compaction score: + // 1) When flush or compaction finish. This case is covered by + // InstallSuperVersionAndScheduleWork + // 2) When MutableCFOptions changes. 
This case is also covered by + // InstallSuperVersionAndScheduleWork, because this is when the new + // options take effect. + // 3) When we Pick a new compaction, we "remove" those files being + // compacted from the calculation, which then influences compaction + // score. Here we check if we need the new compaction even without the + // files that are currently being compacted. If we need another + // compaction, we might be able to execute it in parallel, so we add + // it to the queue and schedule a new thread. + if (cfd->NeedsCompaction()) { + // Yes, we need more compactions! + AddToCompactionQueue(cfd); + ++unscheduled_compactions_; + MaybeScheduleFlushOrCompaction(); + } } } } @@ -1665,7 +1727,7 @@ Status DBImpl::BackgroundCompaction(bool* made_progress, &mutex_, directories_.GetDbDir()); InstallSuperVersionAndScheduleWork( c->column_family_data(), &job_context->superversion_context, - *c->mutable_cf_options()); + *c->mutable_cf_options(), FlushReason::kAutoCompaction); ROCKS_LOG_BUFFER(log_buffer, "[%s] Deleted %d files\n", c->column_family_data()->GetName().c_str(), c->num_input_files(0)); @@ -1712,7 +1774,7 @@ Status DBImpl::BackgroundCompaction(bool* made_progress, // Use latest MutableCFOptions InstallSuperVersionAndScheduleWork( c->column_family_data(), &job_context->superversion_context, - *c->mutable_cf_options()); + *c->mutable_cf_options(), FlushReason::kAutoCompaction); VersionStorageInfo::LevelSummaryStorage tmp; c->column_family_data()->internal_stats()->IncBytesMoved(c->output_level(), @@ -1791,13 +1853,23 @@ Status DBImpl::BackgroundCompaction(bool* made_progress, if (status.ok()) { InstallSuperVersionAndScheduleWork( c->column_family_data(), &job_context->superversion_context, - *c->mutable_cf_options()); + *c->mutable_cf_options(), FlushReason::kAutoCompaction); } *made_progress = true; } if (c != nullptr) { c->ReleaseCompactionFiles(status); *made_progress = true; + +#ifndef ROCKSDB_LITE + // Need to make sure SstFileManager does its bookkeeping + auto sfm = static_cast( + immutable_db_options_.sst_file_manager.get()); + if (sfm && sfm_bookkeeping) { + sfm->OnCompactionCompletion(c.get()); + } +#endif // ROCKSDB_LITE + NotifyOnCompactionCompleted( c->column_family_data(), c.get(), status, compaction_job_stats, job_context->job_id); @@ -1805,7 +1877,7 @@ Status DBImpl::BackgroundCompaction(bool* made_progress, // this will unref its input_version and column_family_data c.reset(); - if (status.ok()) { + if (status.ok() || status.IsCompactionTooLarge()) { // Done } else if (status.IsShutdownInProgress()) { // Ignore compaction errors found during shutting down @@ -1972,7 +2044,7 @@ bool DBImpl::MCOverlap(ManualCompactionState* m, ManualCompactionState* m1) { void DBImpl::InstallSuperVersionAndScheduleWork( ColumnFamilyData* cfd, SuperVersionContext* sv_context, - const MutableCFOptions& mutable_cf_options) { + const MutableCFOptions& mutable_cf_options, FlushReason flush_reason) { mutex_.AssertHeld(); // Update max_total_in_memory_state_ @@ -1991,7 +2063,7 @@ void DBImpl::InstallSuperVersionAndScheduleWork( // Whenever we install new SuperVersion, we might need to issue new flushes or // compactions. 
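The SstFileManager gate added to BackgroundCompaction above is driven by user-level configuration: when EnoughRoomForCompaction reports that a compaction would blow the configured space budget, the compaction is cancelled with Status::CompactionTooLarge and requeued. From the application side the knobs look roughly like this (a sketch; sizes and path are arbitrary):

```cpp
#include <memory>
#include "rocksdb/db.h"
#include "rocksdb/env.h"
#include "rocksdb/sst_file_manager.h"

int main() {
  rocksdb::Options options;
  options.create_if_missing = true;

  // Track SST space and cancel compactions that would exceed the cap.
  std::shared_ptr<rocksdb::SstFileManager> sfm(
      rocksdb::NewSstFileManager(rocksdb::Env::Default()));
  sfm->SetMaxAllowedSpaceUsage(64ull << 30);  // 64 GB hard cap
  sfm->SetCompactionBufferSize(1ull << 30);   // keep 1 GB headroom
  options.sst_file_manager = sfm;

  rocksdb::DB* db = nullptr;
  rocksdb::Status s = rocksdb::DB::Open(options, "/tmp/sfm_demo", &db);
  delete db;
  return 0;
}
```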
- SchedulePendingFlush(cfd, FlushReason::kSuperVersionChange); + SchedulePendingFlush(cfd, flush_reason); SchedulePendingCompaction(cfd); MaybeScheduleFlushOrCompaction(); diff --git a/db/db_impl_debug.cc b/db/db_impl_debug.cc index 32c072b8f04..9d87f5c29a2 100644 --- a/db/db_impl_debug.cc +++ b/db/db_impl_debug.cc @@ -117,7 +117,7 @@ Status DBImpl::TEST_WaitForFlushMemTable(ColumnFamilyHandle* column_family) { return WaitForFlushMemTable(cfd); } -Status DBImpl::TEST_WaitForCompact() { +Status DBImpl::TEST_WaitForCompact(bool wait_unscheduled) { // Wait until the compaction completes // TODO: a bug here. This function actually does not necessarily @@ -126,7 +126,8 @@ Status DBImpl::TEST_WaitForCompact() { InstrumentedMutexLock l(&mutex_); while ((bg_bottom_compaction_scheduled_ || bg_compaction_scheduled_ || - bg_flush_scheduled_) && + bg_flush_scheduled_ || + (wait_unscheduled && unscheduled_compactions_)) && bg_error_.ok()) { bg_cv_.Wait(); } @@ -186,6 +187,12 @@ uint64_t DBImpl::TEST_FindMinLogContainingOutstandingPrep() { return FindMinLogContainingOutstandingPrep(); } +size_t DBImpl::TEST_PreparedSectionCompletedSize() { + return prepared_section_completed_.size(); +} + +size_t DBImpl::TEST_LogsWithPrepSize() { return logs_with_prep_.size(); } + uint64_t DBImpl::TEST_FindMinPrepLogReferencedByMemTable() { return FindMinPrepLogReferencedByMemTable(); } diff --git a/db/db_impl_files.cc b/db/db_impl_files.cc index 48bcb48aab7..885b7b8ec53 100644 --- a/db/db_impl_files.cc +++ b/db/db_impl_files.cc @@ -48,58 +48,61 @@ uint64_t DBImpl::FindMinPrepLogReferencedByMemTable() { return min_log; } -// TODO(myabandeh): Avoid using locks void DBImpl::MarkLogAsHavingPrepSectionFlushed(uint64_t log) { assert(log != 0); - std::lock_guard lock(prep_heap_mutex_); + std::lock_guard lock(prepared_section_completed_mutex_); auto it = prepared_section_completed_.find(log); - assert(it != prepared_section_completed_.end()); - it->second += 1; + if (UNLIKELY(it == prepared_section_completed_.end())) { + prepared_section_completed_[log] = 1; + } else { + it->second += 1; + } } -// TODO(myabandeh): Avoid using locks void DBImpl::MarkLogAsContainingPrepSection(uint64_t log) { assert(log != 0); - std::lock_guard lock(prep_heap_mutex_); - min_log_with_prep_.push(log); - auto it = prepared_section_completed_.find(log); - if (it == prepared_section_completed_.end()) { - prepared_section_completed_[log] = 0; + std::lock_guard lock(logs_with_prep_mutex_); + + auto rit = logs_with_prep_.rbegin(); + bool updated = false; + // Most probably the last log is the one that is being marked for + // having a prepare section; so search from the end. + for (; rit != logs_with_prep_.rend() && rit->log >= log; ++rit) { + if (rit->log == log) { + rit->cnt++; + updated = true; + break; + } + } + if (!updated) { + // We are either at the start, or at a position with rit->log < log + logs_with_prep_.insert(rit.base(), {log, 1}); } } uint64_t DBImpl::FindMinLogContainingOutstandingPrep() { - - if (!allow_2pc()) { - return 0; - } - - std::lock_guard lock(prep_heap_mutex_); - uint64_t min_log = 0; - - // first we look in the prepared heap where we keep - // track of transactions that have been prepared (written to WAL) - // but not yet committed. 
- while (!min_log_with_prep_.empty()) { - min_log = min_log_with_prep_.top(); - - auto it = prepared_section_completed_.find(min_log); - - // value was marked as 'deleted' from heap - if (it != prepared_section_completed_.end() && it->second > 0) { - it->second -= 1; - min_log_with_prep_.pop(); - - // back to squere one... - min_log = 0; - continue; - } else { - // found a valid value - break; + std::lock_guard lock(logs_with_prep_mutex_); + auto it = logs_with_prep_.begin(); + // start with the smallest log + for (; it != logs_with_prep_.end();) { + auto min_log = it->log; + { + std::lock_guard lock2(prepared_section_completed_mutex_); + auto completed_it = prepared_section_completed_.find(min_log); + if (completed_it == prepared_section_completed_.end() || + completed_it->second < it->cnt) { + return min_log; + } + assert(completed_it != prepared_section_completed_.end() && + completed_it->second == it->cnt); + prepared_section_completed_.erase(completed_it); } + // erasing from the beginning of a vector is not efficient but this + // function is not on the fast path. + it = logs_with_prep_.erase(it); } - - return min_log; + // no such log found + return 0; } uint64_t DBImpl::MinLogNumberToKeep() { @@ -110,11 +113,11 @@ uint64_t DBImpl::MinLogNumberToKeep() { // sections of outstanding transactions. // // We must check min logs with outstanding prep before we check - // logs referneces by the + // logs referenced by memtables because a log referenced by the // first data structure could transition to the second under us. // // TODO(horuff): iterating over all column families under db mutex. - // should find more optimial solution + // should find a more optimal solution auto min_log_in_prep_heap = FindMinLogContainingOutstandingPrep(); if (min_log_in_prep_heap != 0 && min_log_in_prep_heap < log_number) { @@ -150,7 +153,7 @@ void DBImpl::FindObsoleteFiles(JobContext* job_context, bool force, bool doing_the_full_scan = false; - // logic for figurint out if we're doing the full scan + // logic for figuring out if we're doing the full scan if (no_full_scan) { doing_the_full_scan = false; } else if (force || @@ -170,7 +173,7 @@ void DBImpl::FindObsoleteFiles(JobContext* job_context, bool force, // threads // Since job_context->min_pending_output is set, until file scan finishes, // mutex_ cannot be released. Otherwise, we might see no min_pending_output - // here but later find newer generated unfinalized files while scannint. + // here but later find newer generated unfinalized files while scanning. if (!pending_outputs_.empty()) { job_context->min_pending_output = *pending_outputs_.begin(); } else { @@ -341,7 +344,7 @@ void DBImpl::DeleteObsoleteFileImpl(int job_id, const std::string& fname, } // Diffs the files listed in filenames and those that do not -// belong to live files are posibly removed. Also, removes all the +// belong to live files are possibly removed. Also, removes all the // files in sst_delete_files and log_delete_files. // It is not necessary to hold the mutex when invoking this method.
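The heap is replaced by a sorted vector of (log, count) entries plus a completion map: a log is retired only once its completed count catches up with its prepare count. The core loop of FindMinLogContainingOutstandingPrep above, restated stand-alone (illustrative; the two mutexes are omitted):

```cpp
#include <cstdint>
#include <unordered_map>
#include <vector>

struct LogCnt {
  uint64_t log;  // WAL number
  uint64_t cnt;  // prepared sections written to that WAL
};

// Mirrors the rewritten FindMinLogContainingOutstandingPrep, minus locking.
uint64_t MinLogWithOutstandingPrep(
    std::vector<LogCnt>& logs_with_prep,  // kept sorted by log number
    std::unordered_map<uint64_t, uint64_t>& completed) {
  auto it = logs_with_prep.begin();
  while (it != logs_with_prep.end()) {
    auto done = completed.find(it->log);
    if (done == completed.end() || done->second < it->cnt) {
      return it->log;  // some prepared section is still outstanding
    }
    completed.erase(done);           // fully committed or rolled back
    it = logs_with_prep.erase(it);   // retire the log from both structures
  }
  return 0;  // nothing outstanding
}
```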
void DBImpl::PurgeObsoleteFiles(const JobContext& state, bool schedule_only) { diff --git a/db/db_impl_open.cc b/db/db_impl_open.cc index c9487b2b721..047a17b21fd 100644 --- a/db/db_impl_open.cc +++ b/db/db_impl_open.cc @@ -356,6 +356,29 @@ Status DBImpl::Recover( assert(s.IsIOError()); return s; } + // Verify compatibility of env_options_ and filesystem + { + unique_ptr idfile; + EnvOptions customized_env(env_options_); + customized_env.use_direct_reads |= + immutable_db_options_.use_direct_io_for_flush_and_compaction; + s = env_->NewRandomAccessFile(IdentityFileName(dbname_), &idfile, + customized_env); + if (!s.ok()) { + std::string error_msg = s.ToString(); + // Check if unsupported Direct I/O is the root cause + customized_env.use_direct_reads = false; + s = env_->NewRandomAccessFile(IdentityFileName(dbname_), &idfile, + customized_env); + if (s.ok()) { + return Status::InvalidArgument( + "Direct I/O is not supported by the specified DB."); + } else { + return Status::InvalidArgument( + "Found options incompatible with filesystem", error_msg); + } + } + } } Status s = versions_->Recover(column_families, read_only); diff --git a/db/db_impl_readonly.cc b/db/db_impl_readonly.cc index b7ebaa53f78..d77e1d8b94e 100644 --- a/db/db_impl_readonly.cc +++ b/db/db_impl_readonly.cc @@ -107,7 +107,7 @@ Status DBImplReadOnly::NewIterators( } Status DB::OpenForReadOnly(const Options& options, const std::string& dbname, - DB** dbptr, bool error_if_log_file_exist) { + DB** dbptr, bool /*error_if_log_file_exist*/) { *dbptr = nullptr; // Try to first open DB as fully compacted DB diff --git a/db/db_impl_readonly.h b/db/db_impl_readonly.h index 9bdc95cc874..6ebe1bce760 100644 --- a/db/db_impl_readonly.h +++ b/db/db_impl_readonly.h @@ -36,46 +36,49 @@ class DBImplReadOnly : public DBImpl { std::vector* iterators) override; using DBImpl::Put; - virtual Status Put(const WriteOptions& options, - ColumnFamilyHandle* column_family, const Slice& key, - const Slice& value) override { + virtual Status Put(const WriteOptions& /*options*/, + ColumnFamilyHandle* /*column_family*/, + const Slice& /*key*/, const Slice& /*value*/) override { return Status::NotSupported("Not supported operation in read only mode."); } using DBImpl::Merge; - virtual Status Merge(const WriteOptions& options, - ColumnFamilyHandle* column_family, const Slice& key, - const Slice& value) override { + virtual Status Merge(const WriteOptions& /*options*/, + ColumnFamilyHandle* /*column_family*/, + const Slice& /*key*/, const Slice& /*value*/) override { return Status::NotSupported("Not supported operation in read only mode."); } using DBImpl::Delete; - virtual Status Delete(const WriteOptions& options, - ColumnFamilyHandle* column_family, - const Slice& key) override { + virtual Status Delete(const WriteOptions& /*options*/, + ColumnFamilyHandle* /*column_family*/, + const Slice& /*key*/) override { return Status::NotSupported("Not supported operation in read only mode."); } using DBImpl::SingleDelete; - virtual Status SingleDelete(const WriteOptions& options, - ColumnFamilyHandle* column_family, - const Slice& key) override { + virtual Status SingleDelete(const WriteOptions& /*options*/, + ColumnFamilyHandle* /*column_family*/, + const Slice& /*key*/) override { return Status::NotSupported("Not supported operation in read only mode."); } - virtual Status Write(const WriteOptions& options, - WriteBatch* updates) override { + virtual Status Write(const WriteOptions& /*options*/, + WriteBatch* /*updates*/) override { return
Status::NotSupported("Not supported operation in read only mode."); } using DBImpl::CompactRange; - virtual Status CompactRange(const CompactRangeOptions& options, - ColumnFamilyHandle* column_family, - const Slice* begin, const Slice* end) override { + virtual Status CompactRange(const CompactRangeOptions& /*options*/, + ColumnFamilyHandle* /*column_family*/, + const Slice* /*begin*/, + const Slice* /*end*/) override { return Status::NotSupported("Not supported operation in read only mode."); } using DBImpl::CompactFiles; virtual Status CompactFiles( - const CompactionOptions& compact_options, - ColumnFamilyHandle* column_family, - const std::vector& input_file_names, - const int output_level, const int output_path_id = -1) override { + const CompactionOptions& /*compact_options*/, + ColumnFamilyHandle* /*column_family*/, + const std::vector& /*input_file_names*/, + const int /*output_level*/, const int /*output_path_id*/ = -1, + std::vector* const /*output_file_names*/ = nullptr + ) override { return Status::NotSupported("Not supported operation in read only mode."); } @@ -83,18 +86,18 @@ class DBImplReadOnly : public DBImpl { return Status::NotSupported("Not supported operation in read only mode."); } - virtual Status EnableFileDeletions(bool force) override { + virtual Status EnableFileDeletions(bool /*force*/) override { return Status::NotSupported("Not supported operation in read only mode."); } virtual Status GetLiveFiles(std::vector&, - uint64_t* manifest_file_size, - bool flush_memtable = true) override { + uint64_t* /*manifest_file_size*/, + bool /*flush_memtable*/ = true) override { return Status::NotSupported("Not supported operation in read only mode."); } using DBImpl::Flush; - virtual Status Flush(const FlushOptions& options, - ColumnFamilyHandle* column_family) override { + virtual Status Flush(const FlushOptions& /*options*/, + ColumnFamilyHandle* /*column_family*/) override { return Status::NotSupported("Not supported operation in read only mode."); } @@ -105,9 +108,9 @@ class DBImplReadOnly : public DBImpl { using DB::IngestExternalFile; virtual Status IngestExternalFile( - ColumnFamilyHandle* column_family, - const std::vector& external_files, - const IngestExternalFileOptions& ingestion_options) override { + ColumnFamilyHandle* /*column_family*/, + const std::vector& /*external_files*/, + const IngestExternalFileOptions& /*ingestion_options*/) override { return Status::NotSupported("Not supported operation in read only mode."); } diff --git a/db/db_impl_write.cc b/db/db_impl_write.cc index 9e83df86105..b4b92567a9c 100644 --- a/db/db_impl_write.cc +++ b/db/db_impl_write.cc @@ -133,7 +133,8 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options, for (auto* writer : *(w.write_group)) { if (!writer->CallbackFailed() && writer->pre_release_callback) { assert(writer->sequence != kMaxSequenceNumber); - Status ws = writer->pre_release_callback->Callback(writer->sequence); + Status ws = writer->pre_release_callback->Callback(writer->sequence, + disable_memtable); if (!ws.ok()) { status = ws; break; @@ -214,7 +215,7 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options, write_group.size > 1; size_t total_count = 0; size_t valid_batches = 0; - uint64_t total_byte_size = 0; + size_t total_byte_size = 0; for (auto* writer : write_group) { if (writer->CheckCallback(this)) { valid_batches += writer->batch_cnt; @@ -339,7 +340,7 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options, PERF_TIMER_START(write_pre_and_post_process_time); if (!w.CallbackFailed()) { - 
WriteCallbackStatusCheck(status); + WriteStatusCheck(status); } if (need_log_sync) { @@ -368,7 +369,8 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options, for (auto* writer : write_group) { if (!writer->CallbackFailed() && writer->pre_release_callback) { assert(writer->sequence != kMaxSequenceNumber); - Status ws = writer->pre_release_callback->Callback(writer->sequence); + Status ws = writer->pre_release_callback->Callback(writer->sequence, + disable_memtable); if (!ws.ok()) { status = ws; break; @@ -462,7 +464,7 @@ Status DBImpl::PipelinedWriteImpl(const WriteOptions& write_options, } if (!w.CallbackFailed()) { - WriteCallbackStatusCheck(w.status); + WriteStatusCheck(w.status); } if (need_log_sync) { @@ -550,7 +552,7 @@ Status DBImpl::WriteImplWALOnly(const WriteOptions& write_options, // Note: no need to update last_batch_group_size_ here since the batch writes // to WAL only - uint64_t total_byte_size = 0; + size_t total_byte_size = 0; for (auto* writer : write_group) { if (writer->CheckCallback(this)) { total_byte_size = WriteBatchInternal::AppendedByteSize( @@ -623,13 +625,15 @@ Status DBImpl::WriteImplWALOnly(const WriteOptions& write_options, PERF_TIMER_START(write_pre_and_post_process_time); if (!w.CallbackFailed()) { - WriteCallbackStatusCheck(status); + WriteStatusCheck(status); } if (status.ok()) { for (auto* writer : write_group) { if (!writer->CallbackFailed() && writer->pre_release_callback) { assert(writer->sequence != kMaxSequenceNumber); - Status ws = writer->pre_release_callback->Callback(writer->sequence); + const bool DISABLE_MEMTABLE = true; + Status ws = writer->pre_release_callback->Callback(writer->sequence, + DISABLE_MEMTABLE); if (!ws.ok()) { status = ws; break; @@ -647,7 +651,7 @@ Status DBImpl::WriteImplWALOnly(const WriteOptions& write_options, return status; } -void DBImpl::WriteCallbackStatusCheck(const Status& status) { +void DBImpl::WriteStatusCheck(const Status& status) { // Is setting bg_error_ enough here? This will at least stop // compaction and fail any further writes. if (immutable_db_options_.paranoid_checks && !status.ok() && @@ -804,7 +808,21 @@ Status DBImpl::WriteToWAL(const WriteBatch& merged_batch, assert(log_size != nullptr); Slice log_entry = WriteBatchInternal::Contents(&merged_batch); *log_size = log_entry.size(); + // When two_write_queues_ is set, WriteToWAL has to be protected from + // concurrent calls from the two queues anyway, and log_write_mutex_ is + // already held. Otherwise, if manual_wal_flush_ is enabled, we need to + // protect log_writer->AddRecord from possible concurrent calls via FlushWAL + // by the application. + const bool needs_locking = manual_wal_flush_ && !two_write_queues_; + // Due to performance concerns about missed branch prediction, we penalize + // the new manual_wal_flush_ feature (by UNLIKELY) instead of the more common + // case when we do not need any locking.
+ if (UNLIKELY(needs_locking)) { + log_write_mutex_.Lock(); + } Status status = log_writer->AddRecord(log_entry); + if (UNLIKELY(needs_locking)) { + log_write_mutex_.Unlock(); + } if (log_used != nullptr) { *log_used = logfile_number_; } @@ -1062,7 +1080,8 @@ Status DBImpl::HandleWriteBufferFull(WriteContext* write_context) { } } if (cfd_picked != nullptr) { - status = SwitchMemtable(cfd_picked, write_context); + status = SwitchMemtable(cfd_picked, write_context, + FlushReason::kWriteBufferFull); if (status.ok()) { cfd_picked->imm()->FlushRequested(); SchedulePendingFlush(cfd_picked, FlushReason::kWriteBufferFull); @@ -1163,7 +1182,7 @@ Status DBImpl::ThrottleLowPriWritesIfNeeded(const WriteOptions& write_options, Status DBImpl::ScheduleFlushes(WriteContext* context) { ColumnFamilyData* cfd; while ((cfd = flush_scheduler_.TakeNextColumnFamily()) != nullptr) { - auto status = SwitchMemtable(cfd, context); + auto status = SwitchMemtable(cfd, context, FlushReason::kWriteBufferFull); if (cfd->Unref()) { delete cfd; } @@ -1175,7 +1194,7 @@ Status DBImpl::ScheduleFlushes(WriteContext* context) { } #ifndef ROCKSDB_LITE -void DBImpl::NotifyOnMemTableSealed(ColumnFamilyData* cfd, +void DBImpl::NotifyOnMemTableSealed(ColumnFamilyData* /*cfd*/, const MemTableInfo& mem_table_info) { if (immutable_db_options_.listeners.size() == 0U) { return; } @@ -1192,7 +1211,8 @@ void DBImpl::NotifyOnMemTableSealed(ColumnFamilyData* cfd, // REQUIRES: mutex_ is held // REQUIRES: this thread is currently at the front of the writer queue -Status DBImpl::SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context) { +Status DBImpl::SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context, + FlushReason flush_reason) { mutex_.AssertHeld(); WriteThread::Writer nonmem_w; if (two_write_queues_) { @@ -1360,7 +1380,7 @@ Status DBImpl::SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context) { new_mem->Ref(); cfd->SetMemtable(new_mem); InstallSuperVersionAndScheduleWork(cfd, &context->superversion_context, - mutable_cf_options); + mutable_cf_options, flush_reason); if (two_write_queues_) { nonmem_write_thread_.ExitUnbatched(&nonmem_w); } @@ -1369,11 +1389,13 @@ Status DBImpl::SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context) { size_t DBImpl::GetWalPreallocateBlockSize(uint64_t write_buffer_size) const { mutex_.AssertHeld(); - size_t bsize = write_buffer_size / 10 + write_buffer_size; + size_t bsize = static_cast<size_t>( + write_buffer_size / 10 + write_buffer_size); // Some users might set very high write_buffer_size and rely on // max_total_wal_size or other parameters to control the WAL size.
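The sizing rule in `GetWalPreallocateBlockSize` (the hunk continues below) starts from roughly 1.1x the write buffer and then clamps by the optional WAL and global write-buffer limits. A self-contained sketch of the arithmetic, with parameter names mirroring the options involved; this free function is illustrative, not the `DBImpl` member itself:

```cpp
#include <algorithm>
#include <cstddef>
#include <cstdint>

// Illustrative only: the preallocation size starts at write_buffer_size
// plus 10%, then is capped by max_total_wal_size and db_write_buffer_size
// whenever those limits are configured (non-zero).
size_t WalPreallocateBlockSize(uint64_t write_buffer_size,
                               uint64_t max_total_wal_size,
                               size_t db_write_buffer_size) {
  size_t bsize =
      static_cast<size_t>(write_buffer_size / 10 + write_buffer_size);
  if (max_total_wal_size > 0) {
    bsize = std::min(bsize, static_cast<size_t>(max_total_wal_size));
  }
  if (db_write_buffer_size > 0) {
    bsize = std::min(bsize, db_write_buffer_size);
  }
  return bsize;
}
```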
if (mutable_db_options_.max_total_wal_size > 0) { - bsize = std::min(bsize, mutable_db_options_.max_total_wal_size); + bsize = std::min(bsize, static_cast<size_t>( + mutable_db_options_.max_total_wal_size)); } if (immutable_db_options_.db_write_buffer_size > 0) { bsize = std::min(bsize, immutable_db_options_.db_write_buffer_size); } diff --git a/db/db_iterator_test.cc b/db/db_iterator_test.cc index a0ecbb68160..24dbac41b2d 100644 --- a/db/db_iterator_test.cc +++ b/db/db_iterator_test.cc @@ -50,7 +50,7 @@ class DBIteratorTest : public DBTestBase, class FlushBlockEveryKeyPolicy : public FlushBlockPolicy { public: - virtual bool Update(const Slice& key, const Slice& value) override { + virtual bool Update(const Slice& /*key*/, const Slice& /*value*/) override { if (!start_) { start_ = true; return false; } @@ -70,8 +70,8 @@ class FlushBlockEveryKeyPolicyFactory : public FlushBlockPolicyFactory { } FlushBlockPolicy* NewFlushBlockPolicy( - const BlockBasedTableOptions& table_options, - const BlockBuilder& data_block_builder) const override { + const BlockBasedTableOptions& /*table_options*/, + const BlockBuilder& /*data_block_builder*/) const override { return new FlushBlockEveryKeyPolicy; } }; diff --git a/db/db_memtable_test.cc b/db/db_memtable_test.cc index 92109c6ca7c..5f47a94818e 100644 --- a/db/db_memtable_test.cc +++ b/db/db_memtable_test.cc @@ -119,7 +119,7 @@ class TestPrefixExtractor : public SliceTransform { return separator(key) != nullptr; } - virtual bool InRange(const Slice& key) const override { return false; } + virtual bool InRange(const Slice& /*key*/) const override { return false; } private: const char* separator(const Slice& key) const { diff --git a/db/db_merge_operator_test.cc b/db/db_merge_operator_test.cc index e109422fa32..7b9808d6634 100644 --- a/db/db_merge_operator_test.cc +++ b/db/db_merge_operator_test.cc @@ -284,7 +284,7 @@ TEST_P(MergeOperatorPinningTest, Randomized) { Random rnd(301); std::map<std::string, std::string> true_data; - const int kTotalMerges = 10000; + const int kTotalMerges = 5000; // Every key gets ~10 operands const int kKeyRange = kTotalMerges / 10; const int kOperandSize = 20; diff --git a/db/db_properties_test.cc b/db/db_properties_test.cc index 0da64b13656..c683a5a1294 100644 --- a/db/db_properties_test.cc +++ b/db/db_properties_test.cc @@ -14,6 +14,7 @@ #include "db/db_test_util.h" #include "port/stack_trace.h" +#include "rocksdb/listener.h" #include "rocksdb/options.h" #include "rocksdb/perf_context.h" #include "rocksdb/perf_level.h" @@ -68,27 +69,27 @@ TEST_F(DBPropertiesTest, Empty) { ASSERT_OK(db_->DisableFileDeletions()); ASSERT_TRUE( dbfull()->GetProperty("rocksdb.is-file-deletions-enabled", &num)); - ASSERT_EQ("1", num); + ASSERT_EQ("0", num); ASSERT_OK(db_->DisableFileDeletions()); ASSERT_TRUE( dbfull()->GetProperty("rocksdb.is-file-deletions-enabled", &num)); - ASSERT_EQ("2", num); + ASSERT_EQ("0", num); ASSERT_OK(db_->DisableFileDeletions()); ASSERT_TRUE( dbfull()->GetProperty("rocksdb.is-file-deletions-enabled", &num)); - ASSERT_EQ("3", num); + ASSERT_EQ("0", num); ASSERT_OK(db_->EnableFileDeletions(false)); ASSERT_TRUE( dbfull()->GetProperty("rocksdb.is-file-deletions-enabled", &num)); - ASSERT_EQ("2", num); + ASSERT_EQ("0", num); ASSERT_OK(db_->EnableFileDeletions()); ASSERT_TRUE( dbfull()->GetProperty("rocksdb.is-file-deletions-enabled", &num)); - ASSERT_EQ("0", num); + ASSERT_EQ("1", num); } while (ChangeOptions()); } @@ -985,8 +986,9 @@ class CountingUserTblPropCollector : public TablePropertiesCollector { return Status::OK(); } - Status AddUserKey(const Slice&
user_key, const Slice& value, EntryType type, - SequenceNumber seq, uint64_t file_size) override { + Status AddUserKey(const Slice& /*user_key*/, const Slice& /*value*/, + EntryType /*type*/, SequenceNumber /*seq*/, + uint64_t /*file_size*/) override { ++count_; return Status::OK(); } @@ -1027,8 +1029,9 @@ class CountingDeleteTabPropCollector : public TablePropertiesCollector { public: const char* Name() const override { return "CountingDeleteTabPropCollector"; } - Status AddUserKey(const Slice& user_key, const Slice& value, EntryType type, - SequenceNumber seq, uint64_t file_size) override { + Status AddUserKey(const Slice& /*user_key*/, const Slice& /*value*/, + EntryType type, SequenceNumber /*seq*/, + uint64_t /*file_size*/) override { if (type == kEntryDelete) { num_deletes_++; } @@ -1055,7 +1058,7 @@ class CountingDeleteTabPropCollectorFactory : public TablePropertiesCollectorFactory { public: virtual TablePropertiesCollector* CreateTablePropertiesCollector( - TablePropertiesCollectorFactory::Context context) override { + TablePropertiesCollectorFactory::Context /*context*/) override { return new CountingDeleteTabPropCollector(); } const char* Name() const override { @@ -1383,6 +1386,56 @@ TEST_F(DBPropertiesTest, EstimateOldestKeyTime) { Close(); } +TEST_F(DBPropertiesTest, SstFilesSize) { + struct TestListener : public EventListener { + void OnCompactionCompleted(DB* db, + const CompactionJobInfo& /*info*/) override { + assert(callback_triggered == false); + assert(size_before_compaction > 0); + callback_triggered = true; + uint64_t total_sst_size = 0; + uint64_t live_sst_size = 0; + bool ok = db->GetIntProperty(DB::Properties::kTotalSstFilesSize, + &total_sst_size); + ASSERT_TRUE(ok); + // total_sst_size includes files before and after compaction. + ASSERT_GT(total_sst_size, size_before_compaction); + ok = + db->GetIntProperty(DB::Properties::kLiveSstFilesSize, &live_sst_size); + ASSERT_TRUE(ok); + // live_sst_size only includes files after compaction. + ASSERT_GT(live_sst_size, 0); + ASSERT_LT(live_sst_size, size_before_compaction); + } + + uint64_t size_before_compaction = 0; + bool callback_triggered = false; + }; + std::shared_ptr<TestListener> listener = std::make_shared<TestListener>(); + + Options options; + options.disable_auto_compactions = true; + options.listeners.push_back(listener); + Reopen(options); + + for (int i = 0; i < 10; i++) { + ASSERT_OK(Put("key" + ToString(i), std::string(1000, 'v'))); + } + ASSERT_OK(Flush()); + for (int i = 0; i < 5; i++) { + ASSERT_OK(Delete("key" + ToString(i))); + } + ASSERT_OK(Flush()); + uint64_t sst_size; + bool ok = db_->GetIntProperty(DB::Properties::kTotalSstFilesSize, &sst_size); + ASSERT_TRUE(ok); + ASSERT_GT(sst_size, 0); + listener->size_before_compaction = sst_size; + // Compact to clean all keys and trigger the listener. + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + ASSERT_TRUE(listener->callback_triggered); +} + #endif // ROCKSDB_LITE } // namespace rocksdb diff --git a/db/db_sst_test.cc b/db/db_sst_test.cc index 8a6a81764b6..a64f6a9245a 100644 --- a/db/db_sst_test.cc +++ b/db/db_sst_test.cc @@ -409,6 +409,11 @@ TEST_F(DBSSTTest, DeleteSchedulerMultipleDBPaths) { rocksdb::SyncPoint::GetInstance()->SetCallBack( "DeleteScheduler::DeleteTrashFile:DeleteFile", [&](void* arg) { bg_delete_file++; }); + // The deletion scheduler sometimes skips marking a file as trash, according + // to a heuristic. In that case the deletion goes through the SyncPoint below.
+ rocksdb::SyncPoint::GetInstance()->SetCallBack( + "DeleteScheduler::DeleteFile", + [&](void* arg) { bg_delete_file++; }); rocksdb::SyncPoint::GetInstance()->EnableProcessing(); Options options = CurrentOptions(); @@ -461,13 +466,15 @@ TEST_F(DBSSTTest, DeleteSchedulerMultipleDBPaths) { sfm->WaitForEmptyTrash(); ASSERT_EQ(bg_delete_file, 8); + // Compaction will delete both files and regenerate a file in L1 in the + // second db path. The deleted files should still be cleaned up via the delete scheduler. compact_options.bottommost_level_compaction = BottommostLevelCompaction::kForce; ASSERT_OK(db_->CompactRange(compact_options, nullptr, nullptr)); ASSERT_EQ("0,1", FilesPerLevel(0)); sfm->WaitForEmptyTrash(); - ASSERT_EQ(bg_delete_file, 8); + ASSERT_EQ(bg_delete_file, 10); rocksdb::SyncPoint::GetInstance()->DisableProcessing(); } @@ -538,6 +545,52 @@ TEST_F(DBSSTTest, DBWithMaxSpaceAllowed) { ASSERT_NOK(Flush()); } +TEST_F(DBSSTTest, CancellingCompactionsWorks) { + std::shared_ptr<SstFileManager> sst_file_manager(NewSstFileManager(env_)); + auto sfm = static_cast<SstFileManagerImpl*>(sst_file_manager.get()); + + Options options = CurrentOptions(); + options.sst_file_manager = sst_file_manager; + options.level0_file_num_compaction_trigger = 2; + options.statistics = CreateDBStatistics(); + DestroyAndReopen(options); + + int completed_compactions = 0; + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::BackgroundCompaction():CancelledCompaction", [&](void* arg) { + sfm->SetMaxAllowedSpaceUsage(0); + }); + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::BackgroundCompaction:NonTrivial:AfterRun", + [&](void* arg) { completed_compactions++; }); + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + + Random rnd(301); + + // Generate a file containing 10 keys. + for (int i = 0; i < 10; i++) { + ASSERT_OK(Put(Key(i), RandomString(&rnd, 50))); + } + ASSERT_OK(Flush()); + uint64_t total_file_size = 0; + auto files_in_db = GetAllSSTFiles(&total_file_size); + // Cap allowed space so the flushed files fit but the upcoming compaction cannot reserve room. + sfm->SetMaxAllowedSpaceUsage(2 * total_file_size + 1); + + // Generate another file to trigger compaction. + for (int i = 0; i < 10; i++) { + ASSERT_OK(Put(Key(i), RandomString(&rnd, 50))); + } + ASSERT_OK(Flush()); + dbfull()->TEST_WaitForCompact(true); + + ASSERT_GT(completed_compactions, 0); + ASSERT_EQ(sfm->GetCompactionsReservedSize(), 0); + // Make sure the stat is bumped + ASSERT_GT(dbfull()->immutable_db_options().statistics.get()->getTickerCount(COMPACTION_CANCELLED), 0); + rocksdb::SyncPoint::GetInstance()->DisableProcessing(); +} + TEST_F(DBSSTTest, DBWithMaxSpaceAllowedRandomized) { // This test will set a maximum allowed space for the DB, then it will // keep filling the DB until the limit is reached and bg_error_ is set.
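`CancellingCompactionsWorks` above drives the cancellation path by shrinking the allowed space from inside a sync point; in application code the same limit is configured once, up front. A minimal sketch of wiring an `SstFileManager` with a hard space cap into a DB follows; the path and the 64 MB figure are arbitrary choices for illustration, not values from the patch:

```cpp
#include <memory>

#include "rocksdb/db.h"
#include "rocksdb/env.h"
#include "rocksdb/options.h"
#include "rocksdb/sst_file_manager.h"

int main() {
  rocksdb::Options options;
  options.create_if_missing = true;

  // Track SST space and enforce a hard cap; compactions that would
  // breach the cap can be cancelled, which the test above verifies.
  std::shared_ptr<rocksdb::SstFileManager> sfm(
      rocksdb::NewSstFileManager(rocksdb::Env::Default()));
  sfm->SetMaxAllowedSpaceUsage(64ull << 20);  // 64 MB cap, arbitrary
  options.sst_file_manager = sfm;

  rocksdb::DB* db = nullptr;
  rocksdb::Status s = rocksdb::DB::Open(options, "/tmp/sfm_demo", &db);
  if (!s.ok()) {
    return 1;
  }
  delete db;
  return 0;
}
```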
@@ -566,6 +619,12 @@ TEST_F(DBSSTTest, DBWithMaxSpaceAllowedRandomized) { estimate_multiplier++; // used in the main loop assert }); + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::BackgroundCompaction():CancelledCompaction", [&](void* arg) { + bool* enough_room = static_cast(arg); + *enough_room = true; + }); + rocksdb::SyncPoint::GetInstance()->SetCallBack( "CompactionJob::FinishCompactionOutputFile:MaxAllowedSpaceReached", [&](void* arg) { diff --git a/db/db_tailing_iter_test.cc b/db/db_tailing_iter_test.cc index d217828db9d..b1062aea643 100644 --- a/db/db_tailing_iter_test.cc +++ b/db/db_tailing_iter_test.cc @@ -214,9 +214,9 @@ TEST_F(DBTestTailingIterator, TailingIteratorTrimSeekToNext) { } ASSERT_TRUE(file_iters_renewed_null); ASSERT_TRUE(file_iters_renewed_copy); - iter = 0; - itern = 0; - iterh = 0; + iter = nullptr; + itern = nullptr; + iterh = nullptr; BlockBasedTableOptions table_options; table_options.no_block_cache = true; table_options.block_cache_compressed = nullptr; @@ -229,7 +229,7 @@ TEST_F(DBTestTailingIterator, TailingIteratorTrimSeekToNext) { Slice target1(buf5, 20); iteri->Seek(target1); ASSERT_TRUE(iteri->status().IsIncomplete()); - iteri = 0; + iteri = nullptr; read_options.read_tier = kReadAllTier; options.table_factory.reset(NewBlockBasedTableFactory()); diff --git a/db/db_test.cc b/db/db_test.cc index b21150c66ab..119883a287c 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -750,9 +750,9 @@ TEST_F(DBTest, FlushSchedule) { namespace { class KeepFilter : public CompactionFilter { public: - virtual bool Filter(int level, const Slice& key, const Slice& value, - std::string* new_value, - bool* value_changed) const override { + virtual bool Filter(int /*level*/, const Slice& /*key*/, + const Slice& /*value*/, std::string* /*new_value*/, + bool* /*value_changed*/) const override { return false; } @@ -782,9 +782,9 @@ class KeepFilterFactory : public CompactionFilterFactory { class DelayFilter : public CompactionFilter { public: explicit DelayFilter(DBTestBase* d) : db_test(d) {} - virtual bool Filter(int level, const Slice& key, const Slice& value, - std::string* new_value, - bool* value_changed) const override { + virtual bool Filter(int /*level*/, const Slice& /*key*/, + const Slice& /*value*/, std::string* /*new_value*/, + bool* /*value_changed*/) const override { db_test->env_->addon_time_.fetch_add(1000); return true; } @@ -799,7 +799,7 @@ class DelayFilterFactory : public CompactionFilterFactory { public: explicit DelayFilterFactory(DBTestBase* d) : db_test(d) {} virtual std::unique_ptr CreateCompactionFilter( - const CompactionFilter::Context& context) override { + const CompactionFilter::Context& /*context*/) override { return std::unique_ptr(new DelayFilter(db_test)); } @@ -2247,17 +2247,17 @@ class ModelDB : public DB { return Write(o, &batch); } using DB::Get; - virtual Status Get(const ReadOptions& options, ColumnFamilyHandle* cf, - const Slice& key, PinnableSlice* value) override { + virtual Status Get(const ReadOptions& /*options*/, ColumnFamilyHandle* /*cf*/, + const Slice& key, PinnableSlice* /*value*/) override { return Status::NotSupported(key); } using DB::MultiGet; virtual std::vector MultiGet( - const ReadOptions& options, - const std::vector& column_family, + const ReadOptions& /*options*/, + const std::vector& /*column_family*/, const std::vector& keys, - std::vector* values) override { + std::vector* /*values*/) override { std::vector s(keys.size(), Status::NotSupported("Not implemented.")); return s; @@ -2266,9 +2266,9 @@ class 
ModelDB : public DB { #ifndef ROCKSDB_LITE using DB::IngestExternalFile; virtual Status IngestExternalFile( - ColumnFamilyHandle* column_family, - const std::vector& external_files, - const IngestExternalFileOptions& options) override { + ColumnFamilyHandle* /*column_family*/, + const std::vector& /*external_files*/, + const IngestExternalFileOptions& /*options*/) override { return Status::NotSupported("Not implemented."); } @@ -2278,22 +2278,22 @@ class ModelDB : public DB { using DB::GetPropertiesOfAllTables; virtual Status GetPropertiesOfAllTables( - ColumnFamilyHandle* column_family, - TablePropertiesCollection* props) override { + ColumnFamilyHandle* /*column_family*/, + TablePropertiesCollection* /*props*/) override { return Status(); } virtual Status GetPropertiesOfTablesInRange( - ColumnFamilyHandle* column_family, const Range* range, std::size_t n, - TablePropertiesCollection* props) override { + ColumnFamilyHandle* /*column_family*/, const Range* /*range*/, + std::size_t /*n*/, TablePropertiesCollection* /*props*/) override { return Status(); } #endif // ROCKSDB_LITE using DB::KeyMayExist; - virtual bool KeyMayExist(const ReadOptions& options, - ColumnFamilyHandle* column_family, const Slice& key, - std::string* value, + virtual bool KeyMayExist(const ReadOptions& /*options*/, + ColumnFamilyHandle* /*column_family*/, + const Slice& /*key*/, std::string* /*value*/, bool* value_found = nullptr) override { if (value_found != nullptr) { *value_found = false; @@ -2301,8 +2301,9 @@ class ModelDB : public DB { return true; // Not Supported directly } using DB::NewIterator; - virtual Iterator* NewIterator(const ReadOptions& options, - ColumnFamilyHandle* column_family) override { + virtual Iterator* NewIterator( + const ReadOptions& options, + ColumnFamilyHandle* /*column_family*/) override { if (options.snapshot == nullptr) { KVMap* saved = new KVMap; *saved = map_; @@ -2314,9 +2315,9 @@ class ModelDB : public DB { } } virtual Status NewIterators( - const ReadOptions& options, - const std::vector& column_family, - std::vector* iterators) override { + const ReadOptions& /*options*/, + const std::vector& /*column_family*/, + std::vector* /*iterators*/) override { return Status::NotSupported("Not supported yet"); } virtual const Snapshot* GetSnapshot() override { @@ -2329,7 +2330,7 @@ class ModelDB : public DB { delete reinterpret_cast(snapshot); } - virtual Status Write(const WriteOptions& options, + virtual Status Write(const WriteOptions& /*options*/, WriteBatch* batch) override { class Handler : public WriteBatch::Handler { public: @@ -2337,7 +2338,8 @@ class ModelDB : public DB { virtual void Put(const Slice& key, const Slice& value) override { (*map_)[key.ToString()] = value.ToString(); } - virtual void Merge(const Slice& key, const Slice& value) override { + virtual void Merge(const Slice& /*key*/, + const Slice& /*value*/) override { // ignore merge for now // (*map_)[key.ToString()] = value.ToString(); } @@ -2351,62 +2353,67 @@ class ModelDB : public DB { } using DB::GetProperty; - virtual bool GetProperty(ColumnFamilyHandle* column_family, - const Slice& property, std::string* value) override { + virtual bool GetProperty(ColumnFamilyHandle* /*column_family*/, + const Slice& /*property*/, + std::string* /*value*/) override { return false; } using DB::GetIntProperty; - virtual bool GetIntProperty(ColumnFamilyHandle* column_family, - const Slice& property, uint64_t* value) override { + virtual bool GetIntProperty(ColumnFamilyHandle* /*column_family*/, + const Slice& /*property*/, + 
uint64_t* /*value*/) override { return false; } using DB::GetMapProperty; virtual bool GetMapProperty( - ColumnFamilyHandle* column_family, const Slice& property, - std::map* value) override { + ColumnFamilyHandle* /*column_family*/, const Slice& /*property*/, + std::map* /*value*/) override { return false; } using DB::GetAggregatedIntProperty; - virtual bool GetAggregatedIntProperty(const Slice& property, - uint64_t* value) override { + virtual bool GetAggregatedIntProperty(const Slice& /*property*/, + uint64_t* /*value*/) override { return false; } using DB::GetApproximateSizes; - virtual void GetApproximateSizes(ColumnFamilyHandle* column_family, - const Range* range, int n, uint64_t* sizes, - uint8_t include_flags + virtual void GetApproximateSizes(ColumnFamilyHandle* /*column_family*/, + const Range* /*range*/, int n, + uint64_t* sizes, + uint8_t /*include_flags*/ = INCLUDE_FILES) override { for (int i = 0; i < n; i++) { sizes[i] = 0; } } using DB::GetApproximateMemTableStats; - virtual void GetApproximateMemTableStats(ColumnFamilyHandle* column_family, - const Range& range, - uint64_t* const count, - uint64_t* const size) override { + virtual void GetApproximateMemTableStats( + ColumnFamilyHandle* /*column_family*/, const Range& /*range*/, + uint64_t* const count, uint64_t* const size) override { *count = 0; *size = 0; } using DB::CompactRange; - virtual Status CompactRange(const CompactRangeOptions& options, - ColumnFamilyHandle* column_family, - const Slice* start, const Slice* end) override { + virtual Status CompactRange(const CompactRangeOptions& /*options*/, + ColumnFamilyHandle* /*column_family*/, + const Slice* /*start*/, + const Slice* /*end*/) override { return Status::NotSupported("Not supported operation."); } virtual Status SetDBOptions( - const std::unordered_map& new_options) + const std::unordered_map& /*new_options*/) override { return Status::NotSupported("Not supported operation."); } using DB::CompactFiles; - virtual Status CompactFiles(const CompactionOptions& compact_options, - ColumnFamilyHandle* column_family, - const std::vector& input_file_names, - const int output_level, - const int output_path_id = -1) override { + virtual Status CompactFiles( + const CompactionOptions& /*compact_options*/, + ColumnFamilyHandle* /*column_family*/, + const std::vector& /*input_file_names*/, + const int /*output_level*/, const int /*output_path_id*/ = -1, + std::vector* const /*output_file_names*/ = nullptr + ) override { return Status::NotSupported("Not supported operation."); } @@ -2419,24 +2426,25 @@ class ModelDB : public DB { } Status EnableAutoCompaction( - const std::vector& column_family_handles) override { + const std::vector& /*column_family_handles*/) + override { return Status::NotSupported("Not supported operation."); } using DB::NumberLevels; - virtual int NumberLevels(ColumnFamilyHandle* column_family) override { + virtual int NumberLevels(ColumnFamilyHandle* /*column_family*/) override { return 1; } using DB::MaxMemCompactionLevel; virtual int MaxMemCompactionLevel( - ColumnFamilyHandle* column_family) override { + ColumnFamilyHandle* /*column_family*/) override { return 1; } using DB::Level0StopWriteTrigger; virtual int Level0StopWriteTrigger( - ColumnFamilyHandle* column_family) override { + ColumnFamilyHandle* /*column_family*/) override { return -1; } @@ -2445,7 +2453,8 @@ class ModelDB : public DB { virtual Env* GetEnv() const override { return nullptr; } using DB::GetOptions; - virtual Options GetOptions(ColumnFamilyHandle* column_family) const override 
{ + virtual Options GetOptions( + ColumnFamilyHandle* /*column_family*/) const override { return options_; } @@ -2453,8 +2462,8 @@ class ModelDB : public DB { virtual DBOptions GetDBOptions() const override { return options_; } using DB::Flush; - virtual Status Flush(const rocksdb::FlushOptions& options, - ColumnFamilyHandle* column_family) override { + virtual Status Flush(const rocksdb::FlushOptions& /*options*/, + ColumnFamilyHandle* /*column_family*/) override { Status ret; return ret; } @@ -2464,39 +2473,42 @@ class ModelDB : public DB { #ifndef ROCKSDB_LITE virtual Status DisableFileDeletions() override { return Status::OK(); } - virtual Status EnableFileDeletions(bool force) override { + virtual Status EnableFileDeletions(bool /*force*/) override { return Status::OK(); } - virtual Status GetLiveFiles(std::vector&, uint64_t* size, - bool flush_memtable = true) override { + virtual Status GetLiveFiles(std::vector&, uint64_t* /*size*/, + bool /*flush_memtable*/ = true) override { return Status::OK(); } - virtual Status GetSortedWalFiles(VectorLogPtr& files) override { + virtual Status GetSortedWalFiles(VectorLogPtr& /*files*/) override { return Status::OK(); } - virtual Status DeleteFile(std::string name) override { return Status::OK(); } + virtual Status DeleteFile(std::string /*name*/) override { + return Status::OK(); + } virtual Status GetUpdatesSince( rocksdb::SequenceNumber, unique_ptr*, - const TransactionLogIterator::ReadOptions& read_options = + const TransactionLogIterator::ReadOptions& /*read_options*/ = TransactionLogIterator::ReadOptions()) override { return Status::NotSupported("Not supported in Model DB"); } virtual void GetColumnFamilyMetaData( - ColumnFamilyHandle* column_family, - ColumnFamilyMetaData* metadata) override {} + ColumnFamilyHandle* /*column_family*/, + ColumnFamilyMetaData* /*metadata*/) override {} #endif // ROCKSDB_LITE - virtual Status GetDbIdentity(std::string& identity) const override { + virtual Status GetDbIdentity(std::string& /*identity*/) const override { return Status::OK(); } virtual SequenceNumber GetLatestSequenceNumber() const override { return 0; } - virtual bool SetPreserveDeletesSequenceNumber(SequenceNumber seqnum) override { + virtual bool SetPreserveDeletesSequenceNumber( + SequenceNumber /*seqnum*/) override { return true; } @@ -3336,6 +3348,56 @@ TEST_F(DBTest, WriteSingleThreadEntry) { } } +TEST_F(DBTest, ConcurrentFlushWAL) { + const size_t cnt = 100; + Options options; + WriteOptions wopt; + ReadOptions ropt; + for (bool two_write_queues : {false, true}) { + for (bool manual_wal_flush : {false, true}) { + options.two_write_queues = two_write_queues; + options.manual_wal_flush = manual_wal_flush; + options.create_if_missing = true; + DestroyAndReopen(options); + std::vector threads; + threads.emplace_back([&] { + for (size_t i = 0; i < cnt; i++) { + auto istr = ToString(i); + db_->Put(wopt, db_->DefaultColumnFamily(), "a" + istr, "b" + istr); + } + }); + if (two_write_queues) { + threads.emplace_back([&] { + for (size_t i = cnt; i < 2 * cnt; i++) { + auto istr = ToString(i); + WriteBatch batch; + batch.Put("a" + istr, "b" + istr); + dbfull()->WriteImpl(wopt, &batch, nullptr, nullptr, 0, true); + } + }); + } + threads.emplace_back([&] { + for (size_t i = 0; i < cnt * 100; i++) { // FlushWAL is faster than Put + db_->FlushWAL(false); + } + }); + for (auto& t : threads) { + t.join(); + } + options.create_if_missing = false; + // Recover from the wal and make sure that it is not corrupted + Reopen(options); + for (size_t i = 0; i < 
cnt; i++) { + PinnableSlice pval; + auto istr = ToString(i); + ASSERT_OK( + db_->Get(ropt, db_->DefaultColumnFamily(), "a" + istr, &pval)); + ASSERT_TRUE(pval == ("b" + istr)); + } + } + } +} + #ifndef ROCKSDB_LITE TEST_F(DBTest, DynamicMemtableOptions) { const uint64_t k64KB = 1 << 16; @@ -4776,7 +4838,7 @@ class DelayedMergeOperator : public MergeOperator { public: explicit DelayedMergeOperator(DBTest* d) : db_test_(d) {} - virtual bool FullMergeV2(const MergeOperationInput& merge_in, + virtual bool FullMergeV2(const MergeOperationInput& /*merge_in*/, MergeOperationOutput* merge_out) const override { db_test_->env_->addon_time_.fetch_add(1000); merge_out->new_value = ""; @@ -5143,7 +5205,7 @@ TEST_F(DBTest, AutomaticConflictsWithManualCompaction) { } ASSERT_OK(Flush()); } - std::thread manual_compaction_thread([this]() { + port::Thread manual_compaction_thread([this]() { CompactRangeOptions croptions; croptions.exclusive_manual_compaction = true; ASSERT_OK(db_->CompactRange(croptions, nullptr, nullptr)); @@ -5383,18 +5445,42 @@ TEST_F(DBTest, HardLimit) { #ifndef ROCKSDB_LITE class WriteStallListener : public EventListener { public: - WriteStallListener() : condition_(WriteStallCondition::kNormal) {} + WriteStallListener() : cond_(&mutex_), + condition_(WriteStallCondition::kNormal), + expected_(WriteStallCondition::kNormal), + expected_set_(false) + {} void OnStallConditionsChanged(const WriteStallInfo& info) override { MutexLock l(&mutex_); condition_ = info.condition.cur; + if (expected_set_ && + condition_ == expected_) { + cond_.Signal(); + expected_set_ = false; + } } bool CheckCondition(WriteStallCondition expected) { MutexLock l(&mutex_); - return expected == condition_; + if (expected != condition_) { + expected_ = expected; + expected_set_ = true; + while (expected != condition_) { + // We bail out on timeout 500 milliseconds + const uint64_t timeout_us = 500000; + if (cond_.TimedWait(timeout_us)) { + expected_set_ = false; + return false; + } + } + } + return true; } private: - port::Mutex mutex_; + port::Mutex mutex_; + port::CondVar cond_; WriteStallCondition condition_; + WriteStallCondition expected_; + bool expected_set_; }; TEST_F(DBTest, SoftLimit) { @@ -5733,7 +5819,7 @@ TEST_F(DBTest, ThreadLocalPtrDeadlock) { return flushes_done.load() > 10; }; - std::thread flushing_thread([&] { + port::Thread flushing_thread([&] { for (int i = 0; !done(); ++i) { ASSERT_OK(db_->Put(WriteOptions(), Slice("hi"), Slice(std::to_string(i).c_str()))); @@ -5743,12 +5829,12 @@ TEST_F(DBTest, ThreadLocalPtrDeadlock) { } }); - std::vector thread_spawning_threads(10); + std::vector thread_spawning_threads(10); for (auto& t: thread_spawning_threads) { - t = std::thread([&] { + t = port::Thread([&] { while (!done()) { { - std::thread tmp_thread([&] { + port::Thread tmp_thread([&] { auto it = db_->NewIterator(ReadOptions()); delete it; }); diff --git a/db/db_test2.cc b/db/db_test2.cc index ee591d3bbf5..7af5379e230 100644 --- a/db/db_test2.cc +++ b/db/db_test2.cc @@ -498,9 +498,9 @@ TEST_F(DBTest2, WalFilterTest) { apply_option_at_record_index_(apply_option_for_record_index), current_record_index_(0) {} - virtual WalProcessingOption LogRecord(const WriteBatch& batch, - WriteBatch* new_batch, - bool* batch_changed) const override { + virtual WalProcessingOption LogRecord( + const WriteBatch& /*batch*/, WriteBatch* /*new_batch*/, + bool* /*batch_changed*/) const override { WalFilter::WalProcessingOption option_to_return; if (current_record_index_ == apply_option_at_record_index_) { @@ -874,11 
+874,10 @@ TEST_F(DBTest2, WalFilterTestWithColumnFamilies) { cf_name_id_map_ = cf_name_id_map; } - virtual WalProcessingOption LogRecordFound(unsigned long long log_number, - const std::string& log_file_name, - const WriteBatch& batch, - WriteBatch* new_batch, - bool* batch_changed) override { + virtual WalProcessingOption LogRecordFound( + unsigned long long log_number, const std::string& /*log_file_name*/, + const WriteBatch& batch, WriteBatch* /*new_batch*/, + bool* /*batch_changed*/) override { class LogRecordBatchHandler : public WriteBatch::Handler { private: const std::map & cf_log_number_map_; @@ -1231,7 +1230,7 @@ class CompactionStallTestListener : public EventListener { public: CompactionStallTestListener() : compacted_files_cnt_(0) {} - void OnCompactionCompleted(DB* db, const CompactionJobInfo& ci) override { + void OnCompactionCompleted(DB* /*db*/, const CompactionJobInfo& ci) override { ASSERT_EQ(ci.cf_name, "default"); ASSERT_EQ(ci.base_input_level, 0); ASSERT_EQ(ci.compaction_reason, CompactionReason::kLevelL0FilesNum); @@ -1823,14 +1822,26 @@ TEST_F(DBTest2, ReadAmpBitmapLiveInCacheAfterDBClose) { { const int kIdBufLen = 100; char id_buf[kIdBufLen]; +#ifndef OS_WIN + // You can't open a directory on windows using random access file std::unique_ptr file; - env_->NewRandomAccessFile(dbname_, &file, EnvOptions()); + ASSERT_OK(env_->NewRandomAccessFile(dbname_, &file, EnvOptions())); if (file->GetUniqueId(id_buf, kIdBufLen) == 0) { // fs holding db directory doesn't support getting a unique file id, // this means that running this test will fail because lru_cache will load // the blocks again regardless of them being already in the cache return; } +#else + std::unique_ptr dir; + ASSERT_OK(env_->NewDirectory(dbname_, &dir)); + if (dir->GetUniqueId(id_buf, kIdBufLen) == 0) { + // fs holding db directory doesn't support getting a unique file id, + // this means that running this test will fail because lru_cache will load + // the blocks again regardless of them being already in the cache + return; + } +#endif } uint32_t bytes_per_bit[2] = {1, 16}; for (size_t k = 0; k < 2; k++) { diff --git a/db/db_test_util.cc b/db/db_test_util.cc index 6bdc5b9a85b..79f36415949 100644 --- a/db/db_test_util.cc +++ b/db/db_test_util.cc @@ -1143,17 +1143,18 @@ UpdateStatus DBTestBase::updateInPlaceSmallerVarintSize(char* prevValue, } } -UpdateStatus DBTestBase::updateInPlaceLargerSize(char* prevValue, - uint32_t* prevSize, +UpdateStatus DBTestBase::updateInPlaceLargerSize(char* /*prevValue*/, + uint32_t* /*prevSize*/, Slice delta, std::string* newValue) { *newValue = std::string(delta.size(), 'c'); return UpdateStatus::UPDATED; } -UpdateStatus DBTestBase::updateInPlaceNoAction(char* prevValue, - uint32_t* prevSize, Slice delta, - std::string* newValue) { +UpdateStatus DBTestBase::updateInPlaceNoAction(char* /*prevValue*/, + uint32_t* /*prevSize*/, + Slice /*delta*/, + std::string* /*newValue*/) { return UpdateStatus::UPDATE_FAILED; } diff --git a/db/db_test_util.h b/db/db_test_util.h index 9f634583583..936823eff20 100644 --- a/db/db_test_util.h +++ b/db/db_test_util.h @@ -187,7 +187,7 @@ class SpecialSkipListFactory : public MemTableRepFactory { using MemTableRepFactory::CreateMemTableRep; virtual MemTableRep* CreateMemTableRep( const MemTableRep::KeyComparator& compare, Allocator* allocator, - const SliceTransform* transform, Logger* logger) override { + const SliceTransform* transform, Logger* /*logger*/) override { return new SpecialMemTableRep( allocator, factory_.CreateMemTableRep(compare, 
allocator, transform, 0), num_entries_flush_); diff --git a/db/db_universal_compaction_test.cc b/db/db_universal_compaction_test.cc index 2bc78475ece..80d17de2196 100644 --- a/db/db_universal_compaction_test.cc +++ b/db/db_universal_compaction_test.cc @@ -56,9 +56,9 @@ void VerifyCompactionResult( class KeepFilter : public CompactionFilter { public: - virtual bool Filter(int level, const Slice& key, const Slice& value, - std::string* new_value, bool* value_changed) const - override { + virtual bool Filter(int /*level*/, const Slice& /*key*/, + const Slice& /*value*/, std::string* /*new_value*/, + bool* /*value_changed*/) const override { return false; } @@ -88,9 +88,9 @@ class KeepFilterFactory : public CompactionFilterFactory { class DelayFilter : public CompactionFilter { public: explicit DelayFilter(DBTestBase* d) : db_test(d) {} - virtual bool Filter(int level, const Slice& key, const Slice& value, - std::string* new_value, - bool* value_changed) const override { + virtual bool Filter(int /*level*/, const Slice& /*key*/, + const Slice& /*value*/, std::string* /*new_value*/, + bool* /*value_changed*/) const override { db_test->env_->addon_time_.fetch_add(1000); return true; } @@ -105,7 +105,7 @@ class DelayFilterFactory : public CompactionFilterFactory { public: explicit DelayFilterFactory(DBTestBase* d) : db_test(d) {} virtual std::unique_ptr CreateCompactionFilter( - const CompactionFilter::Context& context) override { + const CompactionFilter::Context& /*context*/) override { return std::unique_ptr(new DelayFilter(db_test)); } diff --git a/db/db_write_test.cc b/db/db_write_test.cc index 1a27f470ec7..917aef550e2 100644 --- a/db/db_write_test.cc +++ b/db/db_write_test.cc @@ -80,6 +80,22 @@ TEST_P(DBWriteTest, IOErrorOnWALWritePropagateToWriteThreadFollower) { Close(); } +TEST_P(DBWriteTest, IOErrorOnWALWriteTriggersReadOnlyMode) { + std::unique_ptr mock_env( + new FaultInjectionTestEnv(Env::Default())); + Options options = GetOptions(); + options.env = mock_env.get(); + Reopen(options); + for (int i = 0; i < 2; i++) { + // Forcibly fail WAL write for the first Put only. Subsequent Puts should + // fail due to read-only mode + mock_env->SetFilesystemActive(i != 0); + ASSERT_FALSE(Put("key" + ToString(i), "value").ok()); + } + // Close before mock_env destruct. 
+ Close(); +} + INSTANTIATE_TEST_CASE_P(DBWriteTestInstance, DBWriteTest, testing::Values(DBTestBase::kDefault, DBTestBase::kConcurrentWALWrites, diff --git a/db/dbformat.cc b/db/dbformat.cc index 9357c32786f..7b565abc52c 100644 --- a/db/dbformat.cc +++ b/db/dbformat.cc @@ -106,45 +106,6 @@ const char* InternalKeyComparator::Name() const { return name_.c_str(); } -int InternalKeyComparator::Compare(const Slice& akey, const Slice& bkey) const { - // Order by: - // increasing user key (according to user-supplied comparator) - // decreasing sequence number - // decreasing type (though sequence# should be enough to disambiguate) - int r = user_comparator_->Compare(ExtractUserKey(akey), ExtractUserKey(bkey)); - PERF_COUNTER_ADD(user_key_comparison_count, 1); - if (r == 0) { - const uint64_t anum = DecodeFixed64(akey.data() + akey.size() - 8); - const uint64_t bnum = DecodeFixed64(bkey.data() + bkey.size() - 8); - if (anum > bnum) { - r = -1; - } else if (anum < bnum) { - r = +1; - } - } - return r; -} - -int InternalKeyComparator::CompareKeySeq(const Slice& akey, - const Slice& bkey) const { - // Order by: - // increasing user key (according to user-supplied comparator) - // decreasing sequence number - int r = user_comparator_->Compare(ExtractUserKey(akey), ExtractUserKey(bkey)); - PERF_COUNTER_ADD(user_key_comparison_count, 1); - if (r == 0) { - // Shift the number to exclude the last byte which contains the value type - const uint64_t anum = DecodeFixed64(akey.data() + akey.size() - 8) >> 8; - const uint64_t bnum = DecodeFixed64(bkey.data() + bkey.size() - 8) >> 8; - if (anum > bnum) { - r = -1; - } else if (anum < bnum) { - r = +1; - } - } - return r; -} - int InternalKeyComparator::Compare(const ParsedInternalKey& a, const ParsedInternalKey& b) const { // Order by: diff --git a/db/dbformat.h b/db/dbformat.h index 52e668d1d0e..7262bb24bdf 100644 --- a/db/dbformat.h +++ b/db/dbformat.h @@ -11,6 +11,7 @@ #include #include #include +#include "monitoring/perf_context_imp.h" #include "rocksdb/comparator.h" #include "rocksdb/db.h" #include "rocksdb/filter_policy.h" @@ -607,4 +608,46 @@ struct RangeTombstone { } }; +inline +int InternalKeyComparator::Compare(const Slice& akey, const Slice& bkey) const { + // Order by: + // increasing user key (according to user-supplied comparator) + // decreasing sequence number + // decreasing type (though sequence# should be enough to disambiguate) + int r = user_comparator_->Compare(ExtractUserKey(akey), ExtractUserKey(bkey)); + PERF_COUNTER_ADD(user_key_comparison_count, 1); + if (r == 0) { + const uint64_t anum = DecodeFixed64(akey.data() + akey.size() - 8); + const uint64_t bnum = DecodeFixed64(bkey.data() + bkey.size() - 8); + if (anum > bnum) { + r = -1; + } else if (anum < bnum) { + r = +1; + } + } + return r; +} + +inline +int InternalKeyComparator::CompareKeySeq(const Slice& akey, + const Slice& bkey) const { + // Order by: + // increasing user key (according to user-supplied comparator) + // decreasing sequence number + int r = user_comparator_->Compare(ExtractUserKey(akey), ExtractUserKey(bkey)); + PERF_COUNTER_ADD(user_key_comparison_count, 1); + if (r == 0) { + // Shift the number to exclude the last byte which contains the value type + const uint64_t anum = DecodeFixed64(akey.data() + akey.size() - 8) >> 8; + const uint64_t bnum = DecodeFixed64(bkey.data() + bkey.size() - 8) >> 8; + if (anum > bnum) { + r = -1; + } else if (anum < bnum) { + r = +1; + } + } + return r; +} + + } // namespace rocksdb diff --git a/db/deletefile_test.cc 
b/db/deletefile_test.cc index 989c0c4118b..dbe77917400 100644 --- a/db/deletefile_test.cc +++ b/db/deletefile_test.cc @@ -159,7 +159,7 @@ class DeleteFileTest : public testing::Test { } // An empty job to guard all jobs are processed - static void GuardFinish(void* arg) { + static void GuardFinish(void* /*arg*/) { TEST_SYNC_POINT("DeleteFileTest::GuardFinish"); } }; @@ -228,7 +228,7 @@ TEST_F(DeleteFileTest, PurgeObsoleteFilesTest) { // this time, we keep an iterator alive ReopenDB(true); - Iterator *itr = 0; + Iterator *itr = nullptr; CreateTwoLevels(); itr = db_->NewIterator(ReadOptions()); db_->CompactRange(compact_options, &first_slice, &last_slice); @@ -249,7 +249,7 @@ TEST_F(DeleteFileTest, BackgroundPurgeTest) { Slice first_slice(first), last_slice(last); // We keep an iterator alive - Iterator* itr = 0; + Iterator* itr = nullptr; CreateTwoLevels(); ReadOptions options; options.background_purge_on_iterator_cleanup = true; @@ -289,7 +289,7 @@ TEST_F(DeleteFileTest, BackgroundPurgeCopyOptions) { Slice first_slice(first), last_slice(last); // We keep an iterator alive - Iterator* itr = 0; + Iterator* itr = nullptr; CreateTwoLevels(); ReadOptions* options = new ReadOptions(); options->background_purge_on_iterator_cleanup = true; diff --git a/db/external_sst_file_ingestion_job.cc b/db/external_sst_file_ingestion_job.cc index 8c29635089f..b37440e47c7 100644 --- a/db/external_sst_file_ingestion_job.cc +++ b/db/external_sst_file_ingestion_job.cc @@ -132,11 +132,15 @@ Status ExternalSstFileIngestionJob::Prepare( return status; } -Status ExternalSstFileIngestionJob::NeedsFlush(bool* flush_needed) { - SuperVersion* super_version = cfd_->GetSuperVersion(); +Status ExternalSstFileIngestionJob::NeedsFlush(bool* flush_needed, + SuperVersion* super_version) { + autovector ranges; + for (const IngestedFileInfo& file_to_ingest : files_to_ingest_) { + ranges.emplace_back(file_to_ingest.smallest_user_key, + file_to_ingest.largest_user_key); + } Status status = - IngestedFilesOverlapWithMemtables(super_version, flush_needed); - + cfd_->RangesOverlapWithMemtables(ranges, super_version, flush_needed); if (status.ok() && *flush_needed && !ingestion_options_.allow_blocking_flush) { status = Status::InvalidArgument("External file requires flush"); @@ -148,11 +152,12 @@ Status ExternalSstFileIngestionJob::NeedsFlush(bool* flush_needed) { // nonmem_write_thread_ Status ExternalSstFileIngestionJob::Run() { Status status; + SuperVersion* super_version = cfd_->GetSuperVersion(); #ifndef NDEBUG // We should never run the job with a memtable that is overlapping // with the files we are ingesting bool need_flush = false; - status = NeedsFlush(&need_flush); + status = NeedsFlush(&need_flush, super_version); assert(status.ok() && need_flush == false); #endif @@ -167,7 +172,6 @@ Status ExternalSstFileIngestionJob::Run() { // It is safe to use this instead of LastAllocatedSequence since we are // the only active writer, and hence they are equal const SequenceNumber last_seqno = versions_->LastSequence(); - SuperVersion* super_version = cfd_->GetSuperVersion(); edit_.SetColumnFamily(cfd_->GetID()); // The levels that the files will be ingested into @@ -375,54 +379,6 @@ Status ExternalSstFileIngestionJob::GetIngestedFileInfo( return status; } -Status ExternalSstFileIngestionJob::IngestedFilesOverlapWithMemtables( - SuperVersion* sv, bool* overlap) { - *overlap = false; - // Create an InternalIterator over all memtables - Arena arena; - ReadOptions ro; - ro.total_order_seek = true; - MergeIteratorBuilder 
merge_iter_builder(&cfd_->internal_comparator(), &arena); - merge_iter_builder.AddIterator(sv->mem->NewIterator(ro, &arena)); - sv->imm->AddIterators(ro, &merge_iter_builder); - ScopedArenaIterator memtable_iter(merge_iter_builder.Finish()); - - std::vector memtable_range_del_iters; - auto* active_range_del_iter = sv->mem->NewRangeTombstoneIterator(ro); - if (active_range_del_iter != nullptr) { - memtable_range_del_iters.push_back(active_range_del_iter); - } - sv->imm->AddRangeTombstoneIterators(ro, &memtable_range_del_iters); - RangeDelAggregator range_del_agg(cfd_->internal_comparator(), - {} /* snapshots */, - false /* collapse_deletions */); - Status status; - { - std::unique_ptr memtable_range_del_iter( - NewMergingIterator(&cfd_->internal_comparator(), - memtable_range_del_iters.empty() - ? nullptr - : &memtable_range_del_iters[0], - static_cast(memtable_range_del_iters.size()))); - status = range_del_agg.AddTombstones(std::move(memtable_range_del_iter)); - } - if (status.ok()) { - for (IngestedFileInfo& f : files_to_ingest_) { - status = IngestedFileOverlapWithIteratorRange(&f, memtable_iter.get(), - overlap); - if (!status.ok() || *overlap == true) { - break; - } - if (range_del_agg.IsRangeOverlapped(f.smallest_user_key, - f.largest_user_key)) { - *overlap = true; - break; - } - } - } - return status; -} - Status ExternalSstFileIngestionJob::AssignLevelAndSeqnoForIngestedFile( SuperVersion* sv, bool force_global_seqno, CompactionStyle compaction_style, IngestedFileInfo* file_to_ingest, SequenceNumber* assigned_seqno) { @@ -451,8 +407,9 @@ Status ExternalSstFileIngestionJob::AssignLevelAndSeqnoForIngestedFile( if (vstorage->NumLevelFiles(lvl) > 0) { bool overlap_with_level = false; - status = IngestedFileOverlapWithLevel(sv, file_to_ingest, lvl, - &overlap_with_level); + status = sv->current->OverlapWithLevelIterator(ro, env_options_, + file_to_ingest->smallest_user_key, file_to_ingest->largest_user_key, + lvl, &overlap_with_level); if (!status.ok()) { return status; } @@ -553,40 +510,15 @@ Status ExternalSstFileIngestionJob::AssignGlobalSeqnoForIngestedFile( std::string seqno_val; PutFixed64(&seqno_val, seqno); status = rwfile->Write(file_to_ingest->global_seqno_offset, seqno_val); + if (status.ok()) { + status = rwfile->Fsync(); + } if (status.ok()) { file_to_ingest->assigned_seqno = seqno; } return status; } -Status ExternalSstFileIngestionJob::IngestedFileOverlapWithIteratorRange( - const IngestedFileInfo* file_to_ingest, InternalIterator* iter, - bool* overlap) { - auto* vstorage = cfd_->current()->storage_info(); - auto* ucmp = vstorage->InternalComparator()->user_comparator(); - InternalKey range_start(file_to_ingest->smallest_user_key, kMaxSequenceNumber, - kValueTypeForSeek); - iter->Seek(range_start.Encode()); - if (!iter->status().ok()) { - return iter->status(); - } - - *overlap = false; - if (iter->Valid()) { - ParsedInternalKey seek_result; - if (!ParseInternalKey(iter->key(), &seek_result)) { - return Status::Corruption("DB have corrupted keys"); - } - - if (ucmp->Compare(seek_result.user_key, file_to_ingest->largest_user_key) <= - 0) { - *overlap = true; - } - } - - return iter->status(); -} - bool ExternalSstFileIngestionJob::IngestedFileFitInLevel( const IngestedFileInfo* file_to_ingest, int level) { if (level == 0) { @@ -615,38 +547,6 @@ bool ExternalSstFileIngestionJob::IngestedFileFitInLevel( return true; } -Status ExternalSstFileIngestionJob::IngestedFileOverlapWithLevel( - SuperVersion* sv, IngestedFileInfo* file_to_ingest, int lvl, - bool* overlap_with_level) 
{ - Arena arena; - ReadOptions ro; - ro.total_order_seek = true; - MergeIteratorBuilder merge_iter_builder(&cfd_->internal_comparator(), - &arena); - // Files are opened lazily when the iterator needs them, thus range deletions - // are also added lazily to the aggregator. We need to check for range - // deletion overlap only in the case where there's no point-key overlap. Then, - // we've already opened the file with range containing the ingested file's - // begin key, and iterated through all files until the one containing the - // ingested file's end key. So any files maybe containing range deletions - // overlapping the ingested file must have been opened and had their range - // deletions added to the aggregator. - RangeDelAggregator range_del_agg(cfd_->internal_comparator(), - {} /* snapshots */, - false /* collapse_deletions */); - sv->current->AddIteratorsForLevel(ro, env_options_, &merge_iter_builder, lvl, - &range_del_agg); - ScopedArenaIterator level_iter(merge_iter_builder.Finish()); - Status status = IngestedFileOverlapWithIteratorRange( - file_to_ingest, level_iter.get(), overlap_with_level); - if (status.ok() && *overlap_with_level == false && - range_del_agg.IsRangeOverlapped(file_to_ingest->smallest_user_key, - file_to_ingest->largest_user_key)) { - *overlap_with_level = true; - } - return status; -} - } // namespace rocksdb #endif // !ROCKSDB_LITE diff --git a/db/external_sst_file_ingestion_job.h b/db/external_sst_file_ingestion_job.h index e42c50603e5..ea0a7c46fa0 100644 --- a/db/external_sst_file_ingestion_job.h +++ b/db/external_sst_file_ingestion_job.h @@ -84,8 +84,12 @@ class ExternalSstFileIngestionJob { // Check if we need to flush the memtable before running the ingestion job // This will be true if the files we are ingesting are overlapping with any // key range in the memtable. - // REQUIRES: Mutex held - Status NeedsFlush(bool* flush_needed); + // + // @param super_version A referenced SuperVersion that will be held for the + // duration of this function. + // + // Thread-safe + Status NeedsFlush(bool* flush_needed, SuperVersion* super_version); // Will execute the ingestion job and prepare edit() to be applied. // REQUIRES: Mutex held @@ -110,10 +114,6 @@ class ExternalSstFileIngestionJob { Status GetIngestedFileInfo(const std::string& external_file, IngestedFileInfo* file_to_ingest); - // Check if the files we are ingesting overlap with any memtable. - // REQUIRES: Mutex held - Status IngestedFilesOverlapWithMemtables(SuperVersion* sv, bool* overlap); - // Assign `file_to_ingest` the appropriate sequence number and the lowest // possible level that it can be ingested to according to compaction_style. 
// REQUIRES: Mutex held @@ -133,17 +133,6 @@ class ExternalSstFileIngestionJob { Status AssignGlobalSeqnoForIngestedFile(IngestedFileInfo* file_to_ingest, SequenceNumber seqno); - // Check if `file_to_ingest` key range overlap with the range `iter` represent - // REQUIRES: Mutex held - Status IngestedFileOverlapWithIteratorRange( - const IngestedFileInfo* file_to_ingest, InternalIterator* iter, - bool* overlap); - - // Check if `file_to_ingest` key range overlap with level - // REQUIRES: Mutex held - Status IngestedFileOverlapWithLevel(SuperVersion* sv, - IngestedFileInfo* file_to_ingest, int lvl, bool* overlap_with_level); - // Check if `file_to_ingest` can fit in level `level` // REQUIRES: Mutex held bool IngestedFileFitInLevel(const IngestedFileInfo* file_to_ingest, diff --git a/db/external_sst_file_test.cc b/db/external_sst_file_test.cc index 8f7c868399f..fcdf07adc0e 100644 --- a/db/external_sst_file_test.cc +++ b/db/external_sst_file_test.cc @@ -396,8 +396,9 @@ class SstFileWriterCollector : public TablePropertiesCollector { return Status::OK(); } - Status AddUserKey(const Slice& user_key, const Slice& value, EntryType type, - SequenceNumber seq, uint64_t file_size) override { + Status AddUserKey(const Slice& /*user_key*/, const Slice& /*value*/, + EntryType /*type*/, SequenceNumber /*seq*/, + uint64_t /*file_size*/) override { ++count_; return Status::OK(); } @@ -417,7 +418,7 @@ class SstFileWriterCollectorFactory : public TablePropertiesCollectorFactory { explicit SstFileWriterCollectorFactory(std::string prefix) : prefix_(prefix), num_created_(0) {} virtual TablePropertiesCollector* CreateTablePropertiesCollector( - TablePropertiesCollectorFactory::Context context) override { + TablePropertiesCollectorFactory::Context /*context*/) override { num_created_++; return new SstFileWriterCollector(prefix_); } @@ -1798,7 +1799,7 @@ TEST_F(ExternalSSTFileTest, FileWithCFInfo) { class TestIngestExternalFileListener : public EventListener { public: - void OnExternalFileIngested(DB* db, + void OnExternalFileIngested(DB* /*db*/, const ExternalFileIngestionInfo& info) override { ingested_files.push_back(info); } diff --git a/db/fault_injection_test.cc b/db/fault_injection_test.cc index 70a36b66260..8feccfff9fe 100644 --- a/db/fault_injection_test.cc +++ b/db/fault_injection_test.cc @@ -76,8 +76,8 @@ class FaultInjectionTest : public testing::Test, sync_use_wal_(false), sync_use_compact_(true), base_env_(nullptr), - env_(NULL), - db_(NULL) { + env_(nullptr), + db_(nullptr) { } ~FaultInjectionTest() { @@ -139,9 +139,9 @@ class FaultInjectionTest : public testing::Test, } Status NewDB() { - assert(db_ == NULL); + assert(db_ == nullptr); assert(tiny_cache_ == nullptr); - assert(env_ == NULL); + assert(env_ == nullptr); env_ = new FaultInjectionTestEnv(base_env_ ? 
base_env_.get() : Env::Default()); @@ -176,7 +176,7 @@ class FaultInjectionTest : public testing::Test, Status s = DestroyDB(dbname_, options_); delete env_; - env_ = NULL; + env_ = nullptr; tiny_cache_.reset(); diff --git a/db/file_indexer_test.cc b/db/file_indexer_test.cc index 5cd8c2d2cf6..b424f91eacc 100644 --- a/db/file_indexer_test.cc +++ b/db/file_indexer_test.cc @@ -36,10 +36,10 @@ class IntComparator : public Comparator { const char* Name() const override { return "IntComparator"; } - void FindShortestSeparator(std::string* start, - const Slice& limit) const override {} + void FindShortestSeparator(std::string* /*start*/, + const Slice& /*limit*/) const override {} - void FindShortSuccessor(std::string* key) const override {} + void FindShortSuccessor(std::string* /*key*/) const override {} }; class FileIndexerTest : public testing::Test { diff --git a/db/flush_job.cc b/db/flush_job.cc index 2181bebb413..f01565697e6 100644 --- a/db/flush_job.cc +++ b/db/flush_job.cc @@ -56,8 +56,8 @@ namespace rocksdb { const char* GetFlushReasonString (FlushReason flush_reason) { switch (flush_reason) { - case FlushReason::kUnknown: - return "Unknown"; + case FlushReason::kOthers: + return "Other Reasons"; case FlushReason::kGetLiveFiles: return "Get Live Files"; case FlushReason::kShutDown: @@ -72,8 +72,12 @@ const char* GetFlushReasonString (FlushReason flush_reason) { return "Write Buffer Full"; case FlushReason::kTest: return "Test"; - case FlushReason::kSuperVersionChange: - return "SuperVersion Change"; + case FlushReason::kDeleteFiles: + return "Delete Files"; + case FlushReason::kAutoCompaction: + return "Auto Compaction"; + case FlushReason::kManualFlush: + return "Manual Flush"; default: return "Invalid"; } diff --git a/db/forward_iterator.cc b/db/forward_iterator.cc index e00e1d6d766..471d7c0b7ba 100644 --- a/db/forward_iterator.cc +++ b/db/forward_iterator.cc @@ -104,7 +104,7 @@ class ForwardLevelIterator : public InternalIterator { file_iter_->Seek(internal_key); valid_ = file_iter_->Valid(); } - void SeekForPrev(const Slice& internal_key) override { + void SeekForPrev(const Slice& /*internal_key*/) override { status_ = Status::NotSupported("ForwardLevelIterator::SeekForPrev()"); valid_ = false; } diff --git a/db/forward_iterator.h b/db/forward_iterator.h index c576a9d8396..146588d961c 100644 --- a/db/forward_iterator.h +++ b/db/forward_iterator.h @@ -55,7 +55,7 @@ class ForwardIterator : public InternalIterator { ColumnFamilyData* cfd, SuperVersion* current_sv = nullptr); virtual ~ForwardIterator(); - void SeekForPrev(const Slice& target) override { + void SeekForPrev(const Slice& /*target*/) override { status_ = Status::NotSupported("ForwardIterator::SeekForPrev()"); valid_ = false; } diff --git a/db/internal_stats.cc b/db/internal_stats.cc index 52ed4b4d93a..b3cf2380819 100644 --- a/db/internal_stats.cc +++ b/db/internal_stats.cc @@ -233,6 +233,7 @@ static const std::string estimate_live_data_size = "estimate-live-data-size"; static const std::string min_log_number_to_keep = "min-log-number-to-keep"; static const std::string base_level = "base-level"; static const std::string total_sst_files_size = "total-sst-files-size"; +static const std::string live_sst_files_size = "live-sst-files-size"; static const std::string estimate_pending_comp_bytes = "estimate-pending-compaction-bytes"; static const std::string aggregated_table_properties = @@ -307,6 +308,8 @@ const std::string DB::Properties::kMinLogNumberToKeep = rocksdb_prefix + min_log_number_to_keep; const std::string 
DB::Properties::kTotalSstFilesSize = rocksdb_prefix + total_sst_files_size; +const std::string DB::Properties::kLiveSstFilesSize = + rocksdb_prefix + live_sst_files_size; const std::string DB::Properties::kBaseLevel = rocksdb_prefix + base_level; const std::string DB::Properties::kEstimatePendingCompactionBytes = rocksdb_prefix + estimate_pending_comp_bytes; @@ -405,6 +408,8 @@ const std::unordered_map {false, nullptr, &InternalStats::HandleBaseLevel, nullptr}}, {DB::Properties::kTotalSstFilesSize, {false, nullptr, &InternalStats::HandleTotalSstFilesSize, nullptr}}, + {DB::Properties::kLiveSstFilesSize, + {false, nullptr, &InternalStats::HandleLiveSstFilesSize, nullptr}}, {DB::Properties::kEstimatePendingCompactionBytes, {false, nullptr, &InternalStats::HandleEstimatePendingCompactionBytes, nullptr}}, @@ -442,7 +447,7 @@ bool InternalStats::GetStringProperty(const DBPropertyInfo& property_info, } bool InternalStats::GetMapProperty(const DBPropertyInfo& property_info, - const Slice& property, + const Slice& /*property*/, std::map* value) { assert(value != nullptr); assert(property_info.handle_map != nullptr); @@ -494,7 +499,7 @@ bool InternalStats::HandleCompressionRatioAtLevelPrefix(std::string* value, return true; } -bool InternalStats::HandleLevelStats(std::string* value, Slice suffix) { +bool InternalStats::HandleLevelStats(std::string* value, Slice /*suffix*/) { char buf[1000]; const auto* vstorage = cfd_->current()->storage_info(); snprintf(buf, sizeof(buf), @@ -527,35 +532,36 @@ bool InternalStats::HandleCFMapStats( return true; } -bool InternalStats::HandleCFStats(std::string* value, Slice suffix) { +bool InternalStats::HandleCFStats(std::string* value, Slice /*suffix*/) { DumpCFStats(value); return true; } bool InternalStats::HandleCFStatsNoFileHistogram(std::string* value, - Slice suffix) { + Slice /*suffix*/) { DumpCFStatsNoFileHistogram(value); return true; } -bool InternalStats::HandleCFFileHistogram(std::string* value, Slice suffix) { +bool InternalStats::HandleCFFileHistogram(std::string* value, + Slice /*suffix*/) { DumpCFFileHistogram(value); return true; } -bool InternalStats::HandleDBStats(std::string* value, Slice suffix) { +bool InternalStats::HandleDBStats(std::string* value, Slice /*suffix*/) { DumpDBStats(value); return true; } -bool InternalStats::HandleSsTables(std::string* value, Slice suffix) { +bool InternalStats::HandleSsTables(std::string* value, Slice /*suffix*/) { auto* current = cfd_->current(); *value = current->DebugString(true, true); return true; } bool InternalStats::HandleAggregatedTableProperties(std::string* value, - Slice suffix) { + Slice /*suffix*/) { std::shared_ptr tp; auto s = cfd_->current()->GetAggregatedTableProperties(&tp); if (!s.ok()) { @@ -582,34 +588,34 @@ bool InternalStats::HandleAggregatedTablePropertiesAtLevel(std::string* value, return true; } -bool InternalStats::HandleNumImmutableMemTable(uint64_t* value, DBImpl* db, - Version* version) { +bool InternalStats::HandleNumImmutableMemTable(uint64_t* value, DBImpl* /*db*/, + Version* /*version*/) { *value = cfd_->imm()->NumNotFlushed(); return true; } bool InternalStats::HandleNumImmutableMemTableFlushed(uint64_t* value, - DBImpl* db, - Version* version) { + DBImpl* /*db*/, + Version* /*version*/) { *value = cfd_->imm()->NumFlushed(); return true; } -bool InternalStats::HandleMemTableFlushPending(uint64_t* value, DBImpl* db, - Version* version) { +bool InternalStats::HandleMemTableFlushPending(uint64_t* value, DBImpl* /*db*/, + Version* /*version*/) { // Return number of mem tables 
*value = (cfd_->imm()->IsFlushPending() ? 1 : 0); return true; } bool InternalStats::HandleNumRunningFlushes(uint64_t* value, DBImpl* db, - Version* version) { + Version* /*version*/) { *value = db->num_running_flushes(); return true; } -bool InternalStats::HandleCompactionPending(uint64_t* value, DBImpl* db, - Version* version) { +bool InternalStats::HandleCompactionPending(uint64_t* value, DBImpl* /*db*/, + Version* /*version*/) { // 1 if the system already determines at least one compaction is needed. // 0 otherwise. const auto* vstorage = cfd_->current()->storage_info(); @@ -618,70 +624,74 @@ bool InternalStats::HandleCompactionPending(uint64_t* value, DBImpl* db, } bool InternalStats::HandleNumRunningCompactions(uint64_t* value, DBImpl* db, - Version* version) { + Version* /*version*/) { *value = db->num_running_compactions_; return true; } -bool InternalStats::HandleBackgroundErrors(uint64_t* value, DBImpl* db, - Version* version) { +bool InternalStats::HandleBackgroundErrors(uint64_t* value, DBImpl* /*db*/, + Version* /*version*/) { // Accumulated number of errors in background flushes or compactions. *value = GetBackgroundErrorCount(); return true; } -bool InternalStats::HandleCurSizeActiveMemTable(uint64_t* value, DBImpl* db, - Version* version) { +bool InternalStats::HandleCurSizeActiveMemTable(uint64_t* value, DBImpl* /*db*/, + Version* /*version*/) { // Current size of the active memtable *value = cfd_->mem()->ApproximateMemoryUsage(); return true; } -bool InternalStats::HandleCurSizeAllMemTables(uint64_t* value, DBImpl* db, - Version* version) { +bool InternalStats::HandleCurSizeAllMemTables(uint64_t* value, DBImpl* /*db*/, + Version* /*version*/) { // Current size of the active memtable + immutable memtables *value = cfd_->mem()->ApproximateMemoryUsage() + cfd_->imm()->ApproximateUnflushedMemTablesMemoryUsage(); return true; } -bool InternalStats::HandleSizeAllMemTables(uint64_t* value, DBImpl* db, - Version* version) { +bool InternalStats::HandleSizeAllMemTables(uint64_t* value, DBImpl* /*db*/, + Version* /*version*/) { *value = cfd_->mem()->ApproximateMemoryUsage() + cfd_->imm()->ApproximateMemoryUsage(); return true; } -bool InternalStats::HandleNumEntriesActiveMemTable(uint64_t* value, DBImpl* db, - Version* version) { +bool InternalStats::HandleNumEntriesActiveMemTable(uint64_t* value, + DBImpl* /*db*/, + Version* /*version*/) { // Current number of entries in the active memtable *value = cfd_->mem()->num_entries(); return true; } -bool InternalStats::HandleNumEntriesImmMemTables(uint64_t* value, DBImpl* db, - Version* version) { +bool InternalStats::HandleNumEntriesImmMemTables(uint64_t* value, + DBImpl* /*db*/, + Version* /*version*/) { // Current number of entries in the immutable memtables *value = cfd_->imm()->current()->GetTotalNumEntries(); return true; } -bool InternalStats::HandleNumDeletesActiveMemTable(uint64_t* value, DBImpl* db, - Version* version) { +bool InternalStats::HandleNumDeletesActiveMemTable(uint64_t* value, + DBImpl* /*db*/, + Version* /*version*/) { // Current number of deletes in the active memtable *value = cfd_->mem()->num_deletes(); return true; } -bool InternalStats::HandleNumDeletesImmMemTables(uint64_t* value, DBImpl* db, - Version* version) { +bool InternalStats::HandleNumDeletesImmMemTables(uint64_t* value, + DBImpl* /*db*/, + Version* /*version*/) { // Current number of entries in the immutable memtables *value = cfd_->imm()->current()->GetTotalNumDeletes(); return true; } -bool
InternalStats::HandleEstimateNumKeys(uint64_t* value, DBImpl* db, - Version* version) { +bool InternalStats::HandleEstimateNumKeys(uint64_t* value, DBImpl* /*db*/, + Version* /*version*/) { // Estimate number of entries in the column family: // Use estimated entries in tables + total entries in memtables. const auto* vstorage = cfd_->current()->storage_info(); @@ -697,77 +707,85 @@ bool InternalStats::HandleEstimateNumKeys(uint64_t* value, DBImpl* db, } bool InternalStats::HandleNumSnapshots(uint64_t* value, DBImpl* db, - Version* version) { + Version* /*version*/) { *value = db->snapshots().count(); return true; } bool InternalStats::HandleOldestSnapshotTime(uint64_t* value, DBImpl* db, - Version* version) { + Version* /*version*/) { *value = static_cast(db->snapshots().GetOldestSnapshotTime()); return true; } -bool InternalStats::HandleNumLiveVersions(uint64_t* value, DBImpl* db, - Version* version) { +bool InternalStats::HandleNumLiveVersions(uint64_t* value, DBImpl* /*db*/, + Version* /*version*/) { *value = cfd_->GetNumLiveVersions(); return true; } -bool InternalStats::HandleCurrentSuperVersionNumber(uint64_t* value, DBImpl* db, - Version* version) { +bool InternalStats::HandleCurrentSuperVersionNumber(uint64_t* value, + DBImpl* /*db*/, + Version* /*version*/) { *value = cfd_->GetSuperVersionNumber(); return true; } bool InternalStats::HandleIsFileDeletionsEnabled(uint64_t* value, DBImpl* db, - Version* version) { + Version* /*version*/) { *value = db->IsFileDeletionsEnabled(); return true; } -bool InternalStats::HandleBaseLevel(uint64_t* value, DBImpl* db, - Version* version) { +bool InternalStats::HandleBaseLevel(uint64_t* value, DBImpl* /*db*/, + Version* /*version*/) { const auto* vstorage = cfd_->current()->storage_info(); *value = vstorage->base_level(); return true; } -bool InternalStats::HandleTotalSstFilesSize(uint64_t* value, DBImpl* db, - Version* version) { +bool InternalStats::HandleTotalSstFilesSize(uint64_t* value, DBImpl* /*db*/, + Version* /*version*/) { *value = cfd_->GetTotalSstFilesSize(); return true; } +bool InternalStats::HandleLiveSstFilesSize(uint64_t* value, DBImpl* /*db*/, + Version* /*version*/) { + *value = cfd_->GetLiveSstFilesSize(); + return true; +} + bool InternalStats::HandleEstimatePendingCompactionBytes(uint64_t* value, - DBImpl* db, - Version* version) { + DBImpl* /*db*/, + Version* /*version*/) { const auto* vstorage = cfd_->current()->storage_info(); *value = vstorage->estimated_compaction_needed_bytes(); return true; } -bool InternalStats::HandleEstimateTableReadersMem(uint64_t* value, DBImpl* db, +bool InternalStats::HandleEstimateTableReadersMem(uint64_t* value, + DBImpl* /*db*/, Version* version) { *value = (version == nullptr) ? 
0 : version->GetMemoryUsageByTableReaders(); return true; } -bool InternalStats::HandleEstimateLiveDataSize(uint64_t* value, DBImpl* db, - Version* version) { +bool InternalStats::HandleEstimateLiveDataSize(uint64_t* value, DBImpl* /*db*/, + Version* /*version*/) { const auto* vstorage = cfd_->current()->storage_info(); *value = vstorage->EstimateLiveDataSize(); return true; } bool InternalStats::HandleMinLogNumberToKeep(uint64_t* value, DBImpl* db, - Version* version) { + Version* /*version*/) { *value = db->MinLogNumberToKeep(); return true; } bool InternalStats::HandleActualDelayedWriteRate(uint64_t* value, DBImpl* db, - Version* version) { + Version* /*version*/) { const WriteController& wc = db->write_controller(); if (!wc.NeedsDelay()) { *value = 0; @@ -778,7 +796,7 @@ bool InternalStats::HandleActualDelayedWriteRate(uint64_t* value, DBImpl* db, } bool InternalStats::HandleIsWriteStopped(uint64_t* value, DBImpl* db, - Version* version) { + Version* /*version*/) { *value = db->write_controller().IsStopped() ? 1 : 0; return true; } diff --git a/db/internal_stats.h b/db/internal_stats.h index dea9c098727..481c6d32f97 100644 --- a/db/internal_stats.h +++ b/db/internal_stats.h @@ -467,6 +467,7 @@ class InternalStats { Version* version); bool HandleBaseLevel(uint64_t* value, DBImpl* db, Version* version); bool HandleTotalSstFilesSize(uint64_t* value, DBImpl* db, Version* version); + bool HandleLiveSstFilesSize(uint64_t* value, DBImpl* db, Version* version); bool HandleEstimatePendingCompactionBytes(uint64_t* value, DBImpl* db, Version* version); bool HandleEstimateTableReadersMem(uint64_t* value, DBImpl* db, diff --git a/db/listener_test.cc b/db/listener_test.cc index 86646fdde02..204c9216e84 100644 --- a/db/listener_test.cc +++ b/db/listener_test.cc @@ -46,11 +46,11 @@ class EventListenerTest : public DBTestBase { }; struct TestPropertiesCollector : public rocksdb::TablePropertiesCollector { - virtual rocksdb::Status AddUserKey(const rocksdb::Slice& key, - const rocksdb::Slice& value, - rocksdb::EntryType type, - rocksdb::SequenceNumber seq, - uint64_t file_size) override { + virtual rocksdb::Status AddUserKey(const rocksdb::Slice& /*key*/, + const rocksdb::Slice& /*value*/, + rocksdb::EntryType /*type*/, + rocksdb::SequenceNumber /*seq*/, + uint64_t /*file_size*/) override { return Status::OK(); } virtual rocksdb::Status Finish( @@ -73,7 +73,7 @@ struct TestPropertiesCollector : public rocksdb::TablePropertiesCollector { class TestPropertiesCollectorFactory : public TablePropertiesCollectorFactory { public: virtual TablePropertiesCollector* CreateTablePropertiesCollector( - TablePropertiesCollectorFactory::Context context) override { + TablePropertiesCollectorFactory::Context /*context*/) override { return new TestPropertiesCollector; } const char* Name() const override { return "TestTablePropertiesCollector"; } @@ -425,7 +425,7 @@ TEST_F(EventListenerTest, DisableBGCompaction) { class TestCompactionReasonListener : public EventListener { public: - void OnCompactionCompleted(DB* db, const CompactionJobInfo& ci) override { + void OnCompactionCompleted(DB* /*db*/, const CompactionJobInfo& ci) override { std::lock_guard lock(mutex_); compaction_reasons_.push_back(ci.compaction_reason); } @@ -807,7 +807,8 @@ class BackgroundErrorListener : public EventListener { public: BackgroundErrorListener(SpecialEnv* env) : env_(env), counter_(0) {} - void OnBackgroundError(BackgroundErrorReason reason, Status* bg_error) override { + void OnBackgroundError(BackgroundErrorReason /*reason*/, + Status* 
bg_error) override { if (counter_ == 0) { // suppress the first error and disable write-dropping such that a retry // can succeed. diff --git a/db/log_test.cc b/db/log_test.cc index 24187e0484b..ee79e10af8b 100644 --- a/db/log_test.cc +++ b/db/log_test.cc @@ -163,7 +163,7 @@ class LogTest : public ::testing::TestWithParam { source_holder_( test::GetSequentialFileReader(new StringSource(reader_contents_))), writer_(std::move(dest_holder_), 123, GetParam()), - reader_(NULL, std::move(source_holder_), &report_, true /*checksum*/, + reader_(nullptr, std::move(source_holder_), &report_, true /*checksum*/, 0 /*initial_offset*/, 123) { int header_size = GetParam() ? kRecyclableHeaderSize : kHeaderSize; initial_offset_last_record_offsets_[0] = 0; @@ -271,7 +271,7 @@ class LogTest : public ::testing::TestWithParam { unique_ptr<SequentialFileReader> file_reader( test::GetSequentialFileReader(new StringSource(reader_contents_))); unique_ptr<Reader> offset_reader( - new Reader(NULL, std::move(file_reader), &report_, + new Reader(nullptr, std::move(file_reader), &report_, true /*checksum*/, WrittenBytes() + offset_past_end, 123)); Slice record; std::string scratch; @@ -284,7 +284,7 @@ class LogTest : public ::testing::TestWithParam { unique_ptr<SequentialFileReader> file_reader( test::GetSequentialFileReader(new StringSource(reader_contents_))); unique_ptr<Reader> offset_reader( - new Reader(NULL, std::move(file_reader), &report_, + new Reader(nullptr, std::move(file_reader), &report_, true /*checksum*/, initial_offset, 123)); Slice record; std::string scratch; diff --git a/db/log_writer.cc b/db/log_writer.cc index b02eec89dd9..a767f19160c 100644 --- a/db/log_writer.cc +++ b/db/log_writer.cc @@ -58,7 +58,8 @@ Status Writer::AddRecord(const Slice& slice) { // kRecyclableHeaderSize being <= 11) assert(header_size <= 11); dest_->Append( - Slice("\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00", leftover)); + Slice("\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00", + static_cast<size_t>(leftover))); } block_offset_ = 0; } diff --git a/db/log_writer.h b/db/log_writer.h index a3a879924e9..143ad2674de 100644 --- a/db/log_writer.h +++ b/db/log_writer.h @@ -49,7 +49,7 @@ namespace log { * |CRC (4B) | Size (2B) | Type (1B) | Payload | * +---------+-----------+-----------+--- ... 
---+ * - * CRC = 32bit hash computed over the payload using CRC + * CRC = 32bit hash computed over the record type and payload using CRC * Size = Length of the payload data * Type = Type of record * (kZeroType, kFullType, kFirstType, kLastType, kMiddleType ) diff --git a/db/malloc_stats.cc b/db/malloc_stats.cc index 00197d04f26..02e89503862 100644 --- a/db/malloc_stats.cc +++ b/db/malloc_stats.cc @@ -54,6 +54,5 @@ void DumpMallocStats(std::string* stats) { #else void DumpMallocStats(std::string*) {} #endif // ROCKSDB_JEMALLOC - } #endif // !ROCKSDB_LITE diff --git a/db/manual_compaction_test.cc b/db/manual_compaction_test.cc index 039b9080ed3..f31a50b8191 100644 --- a/db/manual_compaction_test.cc +++ b/db/manual_compaction_test.cc @@ -46,9 +46,9 @@ class DestroyAllCompactionFilter : public CompactionFilter { public: DestroyAllCompactionFilter() {} - virtual bool Filter(int level, const Slice& key, const Slice& existing_value, - std::string* new_value, - bool* value_changed) const override { + virtual bool Filter(int /*level*/, const Slice& /*key*/, + const Slice& existing_value, std::string* /*new_value*/, + bool* /*value_changed*/) const override { return existing_value.ToString() == "destroy"; } diff --git a/db/memtable.cc b/db/memtable.cc index 6cf4c2a0368..46779013536 100644 --- a/db/memtable.cc +++ b/db/memtable.cc @@ -229,14 +229,14 @@ int MemTable::KeyComparator::operator()(const char* prefix_len_key1, } int MemTable::KeyComparator::operator()(const char* prefix_len_key, - const Slice& key) + const KeyComparator::DecodedType& key) const { // Internal keys are encoded as length-prefixed strings. Slice a = GetLengthPrefixedSlice(prefix_len_key); return comparator.CompareKeySeq(a, key); } -void MemTableRep::InsertConcurrently(KeyHandle handle) { +void MemTableRep::InsertConcurrently(KeyHandle /*handle*/) { #ifndef ROCKSDB_LITE throw std::runtime_error("concurrent insert not supported"); #else diff --git a/db/memtable.h b/db/memtable.h index 7a04eaf7728..6082a8e6cd9 100644 --- a/db/memtable.h +++ b/db/memtable.h @@ -64,7 +64,7 @@ struct MemTablePostProcessInfo { }; // Note: Many of the methods in this class have comments indicating that -// external synchromization is required as these methods are not thread-safe. +// external synchronization is required as these methods are not thread-safe. // It is up to higher layers of code to decide how to prevent concurrent // invocation of these methods. This is usually done by acquiring either // the db mutex or the single writer thread. @@ -84,7 +84,7 @@ class MemTable { virtual int operator()(const char* prefix_len_key1, const char* prefix_len_key2) const override; virtual int operator()(const char* prefix_len_key, - const Slice& key) const override; + const DecodedType& key) const override; }; // MemTables are reference counted. 
The initial reference count diff --git a/db/memtable_list.cc b/db/memtable_list.cc index a09a118b90d..e3cd64cfe13 100644 --- a/db/memtable_list.cc +++ b/db/memtable_list.cc @@ -154,7 +154,7 @@ bool MemTableListVersion::GetFromList( } Status MemTableListVersion::AddRangeTombstoneIterators( - const ReadOptions& read_opts, Arena* arena, + const ReadOptions& read_opts, Arena* /*arena*/, RangeDelAggregator* range_del_agg) { assert(range_del_agg != nullptr); for (auto& m : memlist_) { @@ -300,7 +300,7 @@ void MemTableList::PickMemtablesToFlush(autovector* ret) { } void MemTableList::RollbackMemtableFlush(const autovector& mems, - uint64_t file_number) { + uint64_t /*file_number*/) { AutoThreadOperationStageUpdater stage_updater( ThreadStatus::STAGE_MEMTABLE_ROLLBACK); assert(!mems.empty()); diff --git a/db/merge_test.cc b/db/merge_test.cc index b6582b7a596..c1b0cbfaefb 100644 --- a/db/merge_test.cc +++ b/db/merge_test.cc @@ -504,7 +504,7 @@ void runTest(int argc, const std::string& dbname, const bool use_ttl = false) { } } // namespace -int main(int argc, char *argv[]) { +int main(int argc, char* /*argv*/ []) { //TODO: Make this test like a general rocksdb unit-test rocksdb::port::InstallStackTraceHandler(); runTest(argc, test::TmpDir() + "/merge_testdb"); diff --git a/db/plain_table_db_test.cc b/db/plain_table_db_test.cc index 0b60332e53a..8fae9746d84 100644 --- a/db/plain_table_db_test.cc +++ b/db/plain_table_db_test.cc @@ -327,7 +327,7 @@ class TestPlainTableFactory : public PlainTableFactory { const TableReaderOptions& table_reader_options, unique_ptr&& file, uint64_t file_size, unique_ptr* table, - bool prefetch_index_and_filter_in_cache) const override { + bool /*prefetch_index_and_filter_in_cache*/) const override { TableProperties* props = nullptr; auto s = ReadTableProperties(file.get(), file_size, kPlainTableMagicNumber, diff --git a/db/pre_release_callback.h b/db/pre_release_callback.h index fdc4d50c53c..b3e6585770b 100644 --- a/db/pre_release_callback.h +++ b/db/pre_release_callback.h @@ -24,7 +24,9 @@ class PreReleaseCallback { // propagated to all the writers in the write group. // seq is the sequence number that is used for this write and will be // released. - virtual Status Callback(SequenceNumber seq) = 0; + // is_mem_disabled is currently used for debugging purposes to assert that + // the callback is done from the right write queue. 
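//
// [Editorial aside, not part of the patch] A minimal sketch of implementing the
// extended interface declared just below; PublishSeqCallback in
// db/write_callback_test.cc (later in this diff) does essentially the same thing.
// `db_impl` is an assumed DBImpl*:
class PublishSeqSketch : public PreReleaseCallback {
 public:
  explicit PublishSeqSketch(DBImpl* db_impl) : db_impl_(db_impl) {}
  Status Callback(SequenceNumber seq, const bool /*is_mem_disabled*/) override {
    db_impl_->SetLastPublishedSequence(seq);  // make seq visible to readers
    return Status::OK();
  }
 private:
  DBImpl* db_impl_;
};
//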
+ virtual Status Callback(SequenceNumber seq, const bool is_mem_disabled) = 0; }; } // namespace rocksdb diff --git a/db/prefix_test.cc b/db/prefix_test.cc index c3dfdea2e71..398d31893d5 100644 --- a/db/prefix_test.cc +++ b/db/prefix_test.cc @@ -126,10 +126,10 @@ class TestKeyComparator : public Comparator { return "TestKeyComparator"; } - virtual void FindShortestSeparator(std::string* start, - const Slice& limit) const override {} + virtual void FindShortestSeparator(std::string* /*start*/, + const Slice& /*limit*/) const override {} - virtual void FindShortSuccessor(std::string* key) const override {} + virtual void FindShortSuccessor(std::string* /*key*/) const override {} }; namespace { diff --git a/db/range_del_aggregator.cc b/db/range_del_aggregator.cc index fdd847a7ac3..10d22fbb688 100644 --- a/db/range_del_aggregator.cc +++ b/db/range_del_aggregator.cc @@ -536,4 +536,11 @@ bool RangeDelAggregator::IsEmpty() { return true; } +bool RangeDelAggregator::AddFile(uint64_t file_number) { + if (added_files_ == nullptr) { + added_files_.reset(new std::set<uint64_t>()); + } + return added_files_->emplace(file_number).second; +} + } // namespace rocksdb diff --git a/db/range_del_aggregator.h b/db/range_del_aggregator.h index f050e8917e0..1c1402d29de 100644 --- a/db/range_del_aggregator.h +++ b/db/range_del_aggregator.h @@ -6,6 +6,7 @@ #pragma once #include <map> +#include <set> #include <string> #include <vector> @@ -140,6 +141,7 @@ class RangeDelAggregator { CompactionIterationStats* range_del_out_stats = nullptr, bool bottommost_level = false); bool IsEmpty(); + bool AddFile(uint64_t file_number); private: // Maps tombstone user start key -> tombstone object @@ -180,6 +182,10 @@ class RangeDelAggregator { const InternalKeyComparator& icmp_; // collapse range deletions so they're binary searchable const bool collapse_deletions_; + + // Record files whose tombstones have been added, to avoid duplicate adding. + // Same as rep_, we initialize it lazily. 
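//
// [Editorial aside, not part of the patch] AddFile() above is the standard
// "insert tells you whether it was new" idiom: std::set::emplace returns a
// pair<iterator, bool> whose .second is true only on first insertion, so the
// call site in db/table_cache.cc (below) adds a file's range tombstones at
// most once per aggregator:
//
//   if (range_del_agg->AddFile(fd.GetNumber())) {
//     // build the tombstone iterator and call AddTombstones() exactly once
//   }
//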
+ std::unique_ptr<std::set<uint64_t>> added_files_; }; } // namespace rocksdb diff --git a/db/snapshot_checker.h b/db/snapshot_checker.h index 8a8738a5a88..3bc8bc3c55e 100644 --- a/db/snapshot_checker.h +++ b/db/snapshot_checker.h @@ -19,8 +19,8 @@ class SnapshotChecker { class DisableGCSnapshotChecker : public SnapshotChecker { public: virtual ~DisableGCSnapshotChecker() {} - virtual bool IsInSnapshot(SequenceNumber sequence, - SequenceNumber snapshot_sequence) const { + virtual bool IsInSnapshot(SequenceNumber /*sequence*/, + SequenceNumber /*snapshot_sequence*/) const { // By returning false, we prevent all the values from being GCed return false; } diff --git a/db/table_cache.cc b/db/table_cache.cc index 56b8272d46c..a3c02fa8be4 100644 --- a/db/table_cache.cc +++ b/db/table_cache.cc @@ -30,7 +30,7 @@ namespace rocksdb { namespace { template <class T> -static void DeleteEntry(const Slice& key, void* value) { +static void DeleteEntry(const Slice& /*key*/, void* value) { T* typed_value = reinterpret_cast<T*>(value); delete typed_value; } @@ -41,7 +41,7 @@ static void UnrefEntry(void* arg1, void* arg2) { cache->Release(h); } -static void DeleteTableReader(void* arg1, void* arg2) { +static void DeleteTableReader(void* arg1, void* /*arg2*/) { TableReader* table_reader = reinterpret_cast<TableReader*>(arg1); delete table_reader; } @@ -247,13 +247,15 @@ InternalIterator* TableCache::NewIterator( } } if (s.ok() && range_del_agg != nullptr && !options.ignore_range_deletions) { - std::unique_ptr<InternalIterator> range_del_iter( - table_reader->NewRangeTombstoneIterator(options)); - if (range_del_iter != nullptr) { - s = range_del_iter->status(); - } - if (s.ok()) { - s = range_del_agg->AddTombstones(std::move(range_del_iter)); + if (range_del_agg->AddFile(fd.GetNumber())) { + std::unique_ptr<InternalIterator> range_del_iter( + table_reader->NewRangeTombstoneIterator(options)); + if (range_del_iter != nullptr) { + s = range_del_iter->status(); + } + if (s.ok()) { + s = range_del_agg->AddTombstones(std::move(range_del_iter)); + } } } diff --git a/db/table_properties_collector.cc b/db/table_properties_collector.cc index fc27844b87a..084cf139db8 100644 --- a/db/table_properties_collector.cc +++ b/db/table_properties_collector.cc @@ -12,8 +12,8 @@ namespace rocksdb { Status InternalKeyPropertiesCollector::InternalAdd(const Slice& key, - const Slice& value, - uint64_t file_size) { + const Slice& /*value*/, + uint64_t /*file_size*/) { ParsedInternalKey ikey; if (!ParseInternalKey(key, &ikey)) { return Status::InvalidArgument("Invalid internal key"); } diff --git a/db/table_properties_collector.h b/db/table_properties_collector.h index d8cd75689d5..7216ec3190f 100644 --- a/db/table_properties_collector.h +++ b/db/table_properties_collector.h @@ -73,7 +73,7 @@ class InternalKeyPropertiesCollectorFactory : public IntTblPropCollectorFactory { public: virtual IntTblPropCollector* CreateIntTblPropCollector( - uint32_t column_family_id) override { + uint32_t /*column_family_id*/) override { return new InternalKeyPropertiesCollector(); } diff --git a/db/table_properties_collector_test.cc b/db/table_properties_collector_test.cc index 66c66c02531..bf382b4fddc 100644 --- a/db/table_properties_collector_test.cc +++ b/db/table_properties_collector_test.cc @@ -82,8 +82,9 @@ class RegularKeysStartWithA: public TablePropertiesCollector { return Status::OK(); } - Status AddUserKey(const Slice& user_key, const Slice& value, EntryType type, - SequenceNumber seq, uint64_t file_size) override { + Status AddUserKey(const Slice& user_key, const Slice& /*value*/, + EntryType type, SequenceNumber /*seq*/, +
uint64_t file_size) override { // simply assume all user keys are not empty. if (user_key.data()[0] == 'A') { ++count_; @@ -133,7 +134,7 @@ class RegularKeysStartWithABackwardCompatible return Status::OK(); } - Status Add(const Slice& user_key, const Slice& value) override { + Status Add(const Slice& user_key, const Slice& /*value*/) override { // simply assume all user keys are not empty. if (user_key.data()[0] == 'A') { ++count_; @@ -161,8 +162,8 @@ class RegularKeysStartWithAInternal : public IntTblPropCollector { return Status::OK(); } - Status InternalAdd(const Slice& user_key, const Slice& value, - uint64_t file_size) override { + Status InternalAdd(const Slice& user_key, const Slice& /*value*/, + uint64_t /*file_size*/) override { // simply assume all user keys are not empty. if (user_key.data()[0] == 'A') { ++count_; @@ -193,7 +194,7 @@ class RegularKeysStartWithAFactory : public IntTblPropCollectorFactory, } } virtual IntTblPropCollector* CreateIntTblPropCollector( - uint32_t column_family_id) override { + uint32_t /*column_family_id*/) override { return new RegularKeysStartWithAInternal(); } const char* Name() const override { return "RegularKeysStartWithA"; } @@ -203,7 +204,7 @@ class FlushBlockEveryThreePolicy : public FlushBlockPolicy { public: - virtual bool Update(const Slice& key, const Slice& value) override { + virtual bool Update(const Slice& /*key*/, const Slice& /*value*/) override { return (++count_ % 3U == 0); } @@ -220,8 +221,8 @@ class FlushBlockEveryThreePolicyFactory : public FlushBlockPolicyFactory { } FlushBlockPolicy* NewFlushBlockPolicy( - const BlockBasedTableOptions& table_options, - const BlockBuilder& data_block_builder) const override { + const BlockBasedTableOptions& /*table_options*/, + const BlockBuilder& /*data_block_builder*/) const override { return new FlushBlockEveryThreePolicy; } }; diff --git a/db/transaction_log_impl.cc b/db/transaction_log_impl.cc index 1dbba7de528..011673892c3 100644 --- a/db/transaction_log_impl.cc +++ b/db/transaction_log_impl.cc @@ -269,16 +269,18 @@ void TransactionLogIteratorImpl::UpdateCurrentWriteBatch(const Slice& record) { return Status::OK(); } - Status PutCF(uint32_t cf, const Slice& key, const Slice& val) override { + Status PutCF(uint32_t /*cf*/, const Slice& /*key*/, + const Slice& /*val*/) override { return Status::OK(); } - Status DeleteCF(uint32_t cf, const Slice& key) override { + Status DeleteCF(uint32_t /*cf*/, const Slice& /*key*/) override { return Status::OK(); } - Status SingleDeleteCF(uint32_t cf, const Slice& key) override { + Status SingleDeleteCF(uint32_t /*cf*/, const Slice& /*key*/) override { return Status::OK(); } - Status MergeCF(uint32_t cf, const Slice& key, const Slice& val) override { + Status MergeCF(uint32_t /*cf*/, const Slice& /*key*/, + const Slice& /*val*/) override { return Status::OK(); } Status MarkBeginPrepare() override { return Status::OK(); } diff --git a/db/version_builder.cc b/db/version_builder.cc index 00972d4e590..6507b8e2f29 100644 --- a/db/version_builder.cc +++ b/db/version_builder.cc @@ -199,7 +199,7 @@ class VersionBuilder::Rep { } } - void CheckConsistencyForDeletes(VersionEdit* edit, uint64_t number, + void CheckConsistencyForDeletes(VersionEdit* /*edit*/, uint64_t number, int level) { #ifdef NDEBUG if (!base_vstorage_->force_consistency_checks()) { @@ -420,7 +420,7 @@ class VersionBuilder::Rep { void MaybeAddFile(VersionStorageInfo* vstorage, int level, FileMetaData* f) { if 
(levels_[level].deleted_files.count(f->fd.GetNumber()) > 0) { - // f is to-be-delected table file + // f is to-be-deleted table file vstorage->RemoveCurrentStats(f); } else { vstorage->AddFile(level, f, info_log_); diff --git a/db/version_edit.cc b/db/version_edit.cc index b01f7bbdf70..ebfc10584c9 100644 --- a/db/version_edit.cc +++ b/db/version_edit.cc @@ -198,7 +198,7 @@ static bool GetInternalKey(Slice* input, InternalKey* dst) { } } -bool VersionEdit::GetLevel(Slice* input, int* level, const char** msg) { +bool VersionEdit::GetLevel(Slice* input, int* level, const char** /*msg*/) { uint32_t v; if (GetVarint32(input, &v)) { *level = v; diff --git a/db/version_set.cc b/db/version_set.cc index 0de142740fe..b7a62d5e7dd 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -78,6 +78,33 @@ int FindFileInRange(const InternalKeyComparator& icmp, return right; } +Status OverlapWithIterator(const Comparator* ucmp, + const Slice& smallest_user_key, + const Slice& largest_user_key, + InternalIterator* iter, + bool* overlap) { + InternalKey range_start(smallest_user_key, kMaxSequenceNumber, + kValueTypeForSeek); + iter->Seek(range_start.Encode()); + if (!iter->status().ok()) { + return iter->status(); + } + + *overlap = false; + if (iter->Valid()) { + ParsedInternalKey seek_result; + if (!ParseInternalKey(iter->key(), &seek_result)) { + return Status::Corruption("DB have corrupted keys"); + } + + if (ucmp->Compare(seek_result.user_key, largest_user_key) <= 0) { + *overlap = true; + } + } + + return iter->status(); +} + // Class to help choose the next file to search for the particular key. // Searches and returns files level by level. // We can search level-by-level since entries never hop across @@ -891,6 +918,15 @@ void Version::GetColumnFamilyMetaData(ColumnFamilyMetaData* cf_meta) { } } +uint64_t Version::GetSstFilesSize() { + uint64_t sst_files_size = 0; + for (int level = 0; level < storage_info_.num_levels_; level++) { + for (const auto& file_meta : storage_info_.LevelFiles(level)) { + sst_files_size += file_meta->fd.GetFileSize(); + } + } + return sst_files_size; +} uint64_t VersionStorageInfo::GetEstimatedActiveKeys() const { // Estimation will be inaccurate when: @@ -1001,6 +1037,59 @@ void Version::AddIteratorsForLevel(const ReadOptions& read_options, } } +Status Version::OverlapWithLevelIterator(const ReadOptions& read_options, + const EnvOptions& env_options, + const Slice& smallest_user_key, + const Slice& largest_user_key, + int level, bool* overlap) { + assert(storage_info_.finalized_); + + auto icmp = cfd_->internal_comparator(); + auto ucmp = icmp.user_comparator(); + + Arena arena; + Status status; + RangeDelAggregator range_del_agg(icmp, {}, false); + + *overlap = false; + + if (level == 0) { + for (size_t i = 0; i < storage_info_.LevelFilesBrief(0).num_files; i++) { + const auto file = &storage_info_.LevelFilesBrief(0).files[i]; + if (AfterFile(ucmp, &smallest_user_key, file) || + BeforeFile(ucmp, &largest_user_key, file)) { + continue; + } + ScopedArenaIterator iter(cfd_->table_cache()->NewIterator( + read_options, env_options, cfd_->internal_comparator(), file->fd, + &range_del_agg, nullptr, cfd_->internal_stats()->GetFileReadHist(0), + false, &arena, false /* skip_filters */, 0 /* level */)); + status = OverlapWithIterator( + ucmp, smallest_user_key, largest_user_key, iter.get(), overlap); + if (!status.ok() || *overlap) { + break; + } + } + } else if (storage_info_.LevelFilesBrief(level).num_files > 0) { + auto mem = arena.AllocateAligned(sizeof(LevelIterator)); + 
ScopedArenaIterator iter(new (mem) LevelIterator( + cfd_->table_cache(), read_options, env_options, + cfd_->internal_comparator(), &storage_info_.LevelFilesBrief(level), + should_sample_file_read(), + cfd_->internal_stats()->GetFileReadHist(level), + false /* for_compaction */, IsFilterSkipped(level), level, + &range_del_agg)); + status = OverlapWithIterator( + ucmp, smallest_user_key, largest_user_key, iter.get(), overlap); + } + + if (status.ok() && *overlap == false && + range_del_agg.IsRangeOverlapped(smallest_user_key, largest_user_key)) { + *overlap = true; + } + return status; +} + VersionStorageInfo::VersionStorageInfo( const InternalKeyComparator* internal_comparator, const Comparator* user_comparator, int levels, @@ -1745,7 +1834,8 @@ void SortFileByOverlappingRatio( void VersionStorageInfo::UpdateFilesByCompactionPri( CompactionPri compaction_pri) { - if (compaction_style_ == kCompactionStyleFIFO || + if (compaction_style_ == kCompactionStyleNone || + compaction_style_ == kCompactionStyleFIFO || compaction_style_ == kCompactionStyleUniversal) { // don't need this return; @@ -2898,7 +2988,7 @@ void VersionSet::LogAndApplyCFHelper(VersionEdit* edit) { } void VersionSet::LogAndApplyHelper(ColumnFamilyData* cfd, - VersionBuilder* builder, Version* v, + VersionBuilder* builder, Version* /*v*/, VersionEdit* edit, InstrumentedMutex* mu) { mu->AssertHeld(); assert(!edit->IsColumnFamilyManipulation()); @@ -3006,7 +3096,7 @@ Status VersionSet::Recover( { VersionSet::LogReporter reporter; reporter.status = &s; - log::Reader reader(NULL, std::move(manifest_file_reader), &reporter, + log::Reader reader(nullptr, std::move(manifest_file_reader), &reporter, true /*checksum*/, 0 /*initial_offset*/, 0); Slice record; std::string scratch; @@ -3275,7 +3365,7 @@ Status VersionSet::ListColumnFamilies(std::vector* column_families, column_family_names.insert({0, kDefaultColumnFamilyName}); VersionSet::LogReporter reporter; reporter.status = &s; - log::Reader reader(NULL, std::move(file_reader), &reporter, true /*checksum*/, + log::Reader reader(nullptr, std::move(file_reader), &reporter, true /*checksum*/, 0 /*initial_offset*/, 0); Slice record; std::string scratch; @@ -3435,7 +3525,7 @@ Status VersionSet::DumpManifest(Options& options, std::string& dscname, { VersionSet::LogReporter reporter; reporter.status = &s; - log::Reader reader(NULL, std::move(file_reader), &reporter, + log::Reader reader(nullptr, std::move(file_reader), &reporter, true /*checksum*/, 0 /*initial_offset*/, 0); Slice record; std::string scratch; diff --git a/db/version_set.h b/db/version_set.h index ea6e4e88a75..832857f6334 100644 --- a/db/version_set.h +++ b/db/version_set.h @@ -115,7 +115,7 @@ class VersionStorageInfo { // Update the accumulated stats from a file-meta. 
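//
// [Editorial aside, not part of the patch] OverlapWithIterator() in
// db/version_set.cc above leans on internal-key ordering: within one user key,
// entries with higher sequence numbers sort first, so seeking to
// InternalKey(smallest_user_key, kMaxSequenceNumber, kValueTypeForSeek) lands on
// the first entry whose user key is >= smallest_user_key. The range overlaps the
// iterator's contents iff that entry's user key is <= largest_user_key, which is
// exactly the comparison the helper performs:
//
//   *overlap = ucmp->Compare(seek_result.user_key, largest_user_key) <= 0;
//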
void UpdateAccumulatedStats(FileMetaData* file_meta); - // Decrease the current stat form a to-be-delected file-meta + // Decrease the current stat from a to-be-deleted file-meta void RemoveCurrentStats(FileMetaData* file_meta); void ComputeCompensatedSizes(); @@ -491,7 +491,7 @@ class VersionStorageInfo { uint64_t accumulated_num_deletions_; // current number of non_deletion entries uint64_t current_num_non_deletions_; - // current number of delection entries + // current number of deletion entries uint64_t current_num_deletions_; // current number of file samples uint64_t current_num_samples_; @@ -525,6 +525,11 @@ class Version { MergeIteratorBuilder* merger_iter_builder, int level, RangeDelAggregator* range_del_agg); + Status OverlapWithLevelIterator(const ReadOptions&, const EnvOptions&, + const Slice& smallest_user_key, + const Slice& largest_user_key, + int level, bool* overlap); + // Lookup the value for key. If found, store it in *val and // return OK. Else return a non-OK status. // Uses *operands to store merge_operator operations to apply later. @@ -565,13 +570,13 @@ class Version { // Return a human readable string that describes this version's contents. std::string DebugString(bool hex = false, bool print_stats = false) const; - // Returns the version nuber of this version + // Returns the version number of this version uint64_t GetVersionNumber() const { return version_number_; } // REQUIRES: lock is held // On success, "tp" will contain the table properties of the file // specified in "file_meta". If the file name of "file_meta" is - // known ahread, passing it by a non-null "fname" can save a + // known ahead, passing it by a non-null "fname" can save a // file-name conversion. Status GetTableProperties(std::shared_ptr<const TableProperties>* tp, const FileMetaData* file_meta, @@ -580,14 +585,14 @@ class Version { // REQUIRES: lock is held // On success, *props will be populated with all SSTables' table properties. // The keys of `props` are the sst file name, the values of `props` are the - // tables' propertis, represented as shared_ptr. + // tables' properties, represented as shared_ptr. Status GetPropertiesOfAllTables(TablePropertiesCollection* props); Status GetPropertiesOfAllTables(TablePropertiesCollection* props, int level); Status GetPropertiesOfTablesInRange(const Range* range, std::size_t n, TablePropertiesCollection* props) const; // REQUIRES: lock is held - // On success, "tp" will contains the aggregated table property amoug + // On success, "tp" will contain the aggregated table property among // the table properties of all sst files in this version. Status GetAggregatedTableProperties( std::shared_ptr<const TableProperties>* tp, int level = -1); @@ -613,6 +618,8 @@ class Version { void GetColumnFamilyMetaData(ColumnFamilyMetaData* cf_meta); + uint64_t GetSstFilesSize(); + private: Env* env_; friend class VersionSet; @@ -635,7 +642,7 @@ class Version { bool IsFilterSkipped(int level, bool is_file_last_in_level = false); // The helper function of UpdateAccumulatedStats, which may fill the missing - // fields of file_mata from its associated TableProperties. + // fields of file_meta from its associated TableProperties. // Returns true if it does initialize FileMetaData. bool MaybeInitializeFileMetaData(FileMetaData* file_meta); @@ -773,7 +780,7 @@ class VersionSet { // Set the last sequence number to s. 
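//
// [Editorial aside, not part of the patch] The release store in SetLastSequence()
// below is assumed to pair with an acquire load on the reader side, so a thread
// that observes the new sequence also observes every write made before it was
// published. Sketch of the matching reader, with <atomic> included:
SequenceNumber LastSequenceSketch(const std::atomic<uint64_t>& last_sequence) {
  return last_sequence.load(std::memory_order_acquire);  // pairs with the release store
}
//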
void SetLastSequence(uint64_t s) { assert(s >= last_sequence_); - // Last visible seqeunce must always be less than last written seq + // Last visible sequence must always be less than last written seq assert(!db_options_->two_write_queues || s <= last_allocated_sequence_); last_sequence_.store(s, std::memory_order_release); } @@ -870,7 +877,7 @@ class VersionSet { struct LogReporter : public log::Reader::Reporter { Status* status; - virtual void Corruption(size_t bytes, const Status& s) override { + virtual void Corruption(size_t /*bytes*/, const Status& s) override { if (this->status->ok()) *this->status = s; } }; @@ -911,7 +918,7 @@ class VersionSet { // The last allocated sequence that is also published to the readers. This is // applicable only when last_seq_same_as_publish_seq_ is not set. Otherwise // last_sequence_ also indicates the last published seq. - // We have last_sequence <= last_published_seqeunce_ <= + // We have last_sequence <= last_published_sequence_ <= // last_allocated_sequence_ std::atomic last_published_sequence_; uint64_t prev_log_number_; // 0 or backing store for memtable being compacted diff --git a/db/version_set_test.cc b/db/version_set_test.cc index 625d4592264..090e074cf0d 100644 --- a/db/version_set_test.cc +++ b/db/version_set_test.cc @@ -76,7 +76,9 @@ class CountingLogger : public Logger { public: CountingLogger() : log_count(0) {} using Logger::Logv; - virtual void Logv(const char* format, va_list ap) override { log_count++; } + virtual void Logv(const char* /*format*/, va_list /*ap*/) override { + log_count++; + } int log_count; }; diff --git a/db/wal_manager_test.cc b/db/wal_manager_test.cc index fe54b84cb8e..224defe261e 100644 --- a/db/wal_manager_test.cc +++ b/db/wal_manager_test.cc @@ -73,7 +73,7 @@ class WalManagerTest : public testing::Test { } // NOT thread safe - void RollTheLog(bool archived) { + void RollTheLog(bool /*archived*/) { current_log_number_++; std::string fname = ArchivedLogFileName(dbname_, current_log_number_); unique_ptr file; diff --git a/db/write_batch.cc b/db/write_batch.cc index 4e257b31984..f272604cbf7 100644 --- a/db/write_batch.cc +++ b/db/write_batch.cc @@ -52,6 +52,7 @@ #include "monitoring/statistics.h" #include "rocksdb/merge_operator.h" #include "util/coding.h" +#include "util/duplicate_detector.h" #include "util/string_util.h" namespace rocksdb { @@ -186,7 +187,7 @@ WriteBatch::~WriteBatch() { delete save_points_; } WriteBatch::Handler::~Handler() { } -void WriteBatch::Handler::LogData(const Slice& blob) { +void WriteBatch::Handler::LogData(const Slice& /*blob*/) { // If the user has not specified something to do with blobs, then we ignore // them. } @@ -396,7 +397,7 @@ Status WriteBatch::Iterate(Handler* handler) const { input.remove_prefix(WriteBatchInternal::kHeader); Slice key, value, blob, xid; // Sometimes a sub-batch starts with a Noop. We want to exclude such Noops as - // the batch boundry sybmols otherwise we would mis-count the number of + // the batch boundary symbols otherwise we would mis-count the number of // batches. We do that by checking whether the accumulated batch is empty // before seeing the next Noop. 
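//
// [Editorial aside, not part of the patch] Concretely: under seq_per_batch, a
// kTypeNoop should count as a batch boundary only if at least one entry has been
// accumulated since the previous boundary. The empty_batch flag below tracks
// exactly that, so a Noop that merely pads the start of a sub-batch is not
// mis-counted as an empty batch.
//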
bool empty_batch = true; @@ -569,7 +570,7 @@ void WriteBatchInternal::SetSequence(WriteBatch* b, SequenceNumber seq) { EncodeFixed64(&b->rep_[0], seq); } -size_t WriteBatchInternal::GetFirstOffset(WriteBatch* b) { +size_t WriteBatchInternal::GetFirstOffset(WriteBatch* /*b*/) { return WriteBatchInternal::kHeader; } @@ -1008,6 +1009,9 @@ class MemTableInserter : public WriteBatch::Handler { bool seq_per_batch_; // Whether the memtable write will be done only after the commit bool write_after_commit_; + using DupDetector = std::aligned_storage<sizeof(DuplicateDetector)>::type; + DupDetector duplicate_detector_; + bool dup_detector_on_; MemPostInfoMap& GetPostMap() { assert(concurrent_memtable_writes_); @@ -1018,6 +1022,17 @@ return *reinterpret_cast<MemPostInfoMap*>(&mem_post_info_map_); } + bool IsDuplicateKeySeq(uint32_t column_family_id, const Slice& key) { + assert(!write_after_commit_); + assert(rebuilding_trx_ != nullptr); + if (!dup_detector_on_) { + new (&duplicate_detector_) DuplicateDetector(db_); + dup_detector_on_ = true; + } + return reinterpret_cast<DuplicateDetector*> + (&duplicate_detector_)->IsDuplicateKeySeq(column_family_id, key, sequence_); + } + protected: virtual bool WriteAfterCommit() const override { return write_after_commit_; } @@ -1045,11 +1060,17 @@ // Write after commit currently uses one seq per key (instead of per // batch). So seq_per_batch being false indicates write_after_commit // approach. - write_after_commit_(!seq_per_batch) { + write_after_commit_(!seq_per_batch), + duplicate_detector_(), + dup_detector_on_(false) { assert(cf_mems_); } ~MemTableInserter() { + if (dup_detector_on_) { + reinterpret_cast<DuplicateDetector*> + (&duplicate_detector_)->~DuplicateDetector(); + } if (post_info_created_) { reinterpret_cast<MemPostInfoMap*> (&mem_post_info_map_)->~MemPostInfoMap(); @@ -1067,11 +1088,11 @@ // is set when a batch, which is tagged with seq, is read from the WAL. // Within a sequenced batch, which could be a merge of multiple batches, we // have two policies to advance the seq: i) seq_per_key (default) and ii) - // seq_per_batch. To implement the latter we need to mark the boundry between + // seq_per_batch. To implement the latter we need to mark the boundary between // the individual batches. The approach is this: 1) Use the terminating - // markers to indicate the boundry (kTypeEndPrepareXID, kTypeCommitXID, - // kTypeRollbackXID) 2) Terminate a batch with kTypeNoop in the absense of a - // natural boundy marker. + // markers to indicate the boundary (kTypeEndPrepareXID, kTypeCommitXID, + // kTypeRollbackXID) 2) Terminate a batch with kTypeNoop in the absence of a + // natural boundary marker. 
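//
// [Editorial aside, not part of the patch] The DupDetector member added above uses
// the lazy placement-new idiom: reserve aligned raw storage up front, construct on
// first use, destroy by hand. A generic, self-contained sketch of the same idiom
// (names are illustrative, not RocksDB API):
#include <new>
#include <type_traits>
#include <utility>
template <typename T>
class LazySlot {
 public:
  LazySlot() = default;
  ~LazySlot() {
    if (constructed_) {
      reinterpret_cast<T*>(&storage_)->~T();  // manual destructor call
    }
  }
  template <typename... Args>
  T& GetOrCreate(Args&&... args) {
    if (!constructed_) {
      new (&storage_) T(std::forward<Args>(args)...);  // placement new on first use
      constructed_ = true;
    }
    return *reinterpret_cast<T*>(&storage_);
  }
 private:
  typename std::aligned_storage<sizeof(T), alignof(T)>::type storage_;
  bool constructed_ = false;
};
//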
void MaybeAdvanceSeq(bool batch_boundry = false) { if (batch_boundry == seq_per_batch_) { sequence_++; @@ -1135,17 +1156,24 @@ class MemTableInserter : public WriteBatch::Handler { Status PutCFImpl(uint32_t column_family_id, const Slice& key, const Slice& value, ValueType value_type) { - if (rebuilding_trx_ != nullptr) { + // optimize for non-recovery mode + if (UNLIKELY(write_after_commit_ && rebuilding_trx_ != nullptr)) { WriteBatchInternal::Put(rebuilding_trx_, column_family_id, key, value); - if (write_after_commit_) { - return Status::OK(); - } + return Status::OK(); // else insert the values to the memtable right away } Status seek_status; - if (!SeekToColumnFamily(column_family_id, &seek_status)) { - MaybeAdvanceSeq(); + if (UNLIKELY(!SeekToColumnFamily(column_family_id, &seek_status))) { + bool batch_boundry = false; + if (rebuilding_trx_ != nullptr) { + assert(!write_after_commit_); + // The CF is probably flushed and hence no need for insert but we still + // need to keep track of the keys for upcoming rollback/commit. + WriteBatchInternal::Put(rebuilding_trx_, column_family_id, key, value); + batch_boundry = IsDuplicateKeySeq(column_family_id, key); + } + MaybeAdvanceSeq(batch_boundry); return seek_status; } Status ret_status; @@ -1215,7 +1243,14 @@ class MemTableInserter : public WriteBatch::Handler { } } } - // Since all Puts are logged in trasaction logs (if enabled), always bump + // optimize for non-recovery mode + if (UNLIKELY(!ret_status.IsTryAgain() && rebuilding_trx_ != nullptr)) { + assert(!write_after_commit_); + // If the ret_status is TryAgain then let the next try to add the key to + // the rebuilding transaction object. + WriteBatchInternal::Put(rebuilding_trx_, column_family_id, key, value); + } + // Since all Puts are logged in transaction logs (if enabled), always bump // sequence number. Even if the update eventually fails and does not result // in memtable add/update. MaybeAdvanceSeq(); @@ -1228,7 +1263,7 @@ return PutCFImpl(column_family_id, key, value, kTypeValue); } - Status DeleteImpl(uint32_t column_family_id, const Slice& key, + Status DeleteImpl(uint32_t /*column_family_id*/, const Slice& key, const Slice& value, ValueType delete_type) { Status ret_status; MemTable* mem = cf_mems_->GetMemTable(); @@ -1248,57 +1283,99 @@ virtual Status DeleteCF(uint32_t column_family_id, const Slice& key) override { - if (rebuilding_trx_ != nullptr) { + // optimize for non-recovery mode + if (UNLIKELY(write_after_commit_ && rebuilding_trx_ != nullptr)) { WriteBatchInternal::Delete(rebuilding_trx_, column_family_id, key); - if (write_after_commit_) { - return Status::OK(); - } + return Status::OK(); // else insert the values to the memtable right away } Status seek_status; - if (!SeekToColumnFamily(column_family_id, &seek_status)) { - MaybeAdvanceSeq(); + if (UNLIKELY(!SeekToColumnFamily(column_family_id, &seek_status))) { + bool batch_boundry = false; + if (rebuilding_trx_ != nullptr) { + assert(!write_after_commit_); + // The CF is probably flushed and hence no need for insert but we still + // need to keep track of the keys for upcoming rollback/commit.
+ WriteBatchInternal::Delete(rebuilding_trx_, column_family_id, key); + batch_boundry = IsDuplicateKeySeq(column_family_id, key); + } + MaybeAdvanceSeq(batch_boundry); return seek_status; } - return DeleteImpl(column_family_id, key, Slice(), kTypeDeletion); + auto ret_status = DeleteImpl(column_family_id, key, Slice(), kTypeDeletion); + // optimize for non-recovery mode + if (UNLIKELY(!ret_status.IsTryAgain() && rebuilding_trx_ != nullptr)) { + assert(!write_after_commit_); + // If the ret_status is TryAgain then let the next try to add the key to + // the rebuilding transaction object. + WriteBatchInternal::Delete(rebuilding_trx_, column_family_id, key); + } + return ret_status; } virtual Status SingleDeleteCF(uint32_t column_family_id, const Slice& key) override { - if (rebuilding_trx_ != nullptr) { + // optimize for non-recovery mode + if (UNLIKELY(write_after_commit_ && rebuilding_trx_ != nullptr)) { WriteBatchInternal::SingleDelete(rebuilding_trx_, column_family_id, key); - if (write_after_commit_) { - return Status::OK(); - } + return Status::OK(); // else insert the values to the memtable right away } Status seek_status; - if (!SeekToColumnFamily(column_family_id, &seek_status)) { - MaybeAdvanceSeq(); + if (UNLIKELY(!SeekToColumnFamily(column_family_id, &seek_status))) { + bool batch_boundry = false; + if (rebuilding_trx_ != nullptr) { + assert(!write_after_commit_); + // The CF is probably flushed and hence no need for insert but we still + // need to keep track of the keys for upcoming rollback/commit. + WriteBatchInternal::SingleDelete(rebuilding_trx_, column_family_id, + key); + batch_boundry = IsDuplicateKeySeq(column_family_id, key); + } + MaybeAdvanceSeq(batch_boundry); return seek_status; } - return DeleteImpl(column_family_id, key, Slice(), kTypeSingleDeletion); + auto ret_status = + DeleteImpl(column_family_id, key, Slice(), kTypeSingleDeletion); + // optimize for non-recovery mode + if (UNLIKELY(!ret_status.IsTryAgain() && rebuilding_trx_ != nullptr)) { + assert(!write_after_commit_); + // If the ret_status is TryAgain then let the next try to add the key to + // the rebuilding transaction object. + WriteBatchInternal::SingleDelete(rebuilding_trx_, column_family_id, key); + } + return ret_status; } virtual Status DeleteRangeCF(uint32_t column_family_id, const Slice& begin_key, const Slice& end_key) override { - if (rebuilding_trx_ != nullptr) { + // optimize for non-recovery mode + if (UNLIKELY(write_after_commit_ && rebuilding_trx_ != nullptr)) { WriteBatchInternal::DeleteRange(rebuilding_trx_, column_family_id, begin_key, end_key); - if (write_after_commit_) { - return Status::OK(); - } + return Status::OK(); // else insert the values to the memtable right away } Status seek_status; - if (!SeekToColumnFamily(column_family_id, &seek_status)) { - MaybeAdvanceSeq(); + if (UNLIKELY(!SeekToColumnFamily(column_family_id, &seek_status))) { + bool batch_boundry = false; + if (rebuilding_trx_ != nullptr) { + assert(!write_after_commit_); + // The CF is probably flushed and hence no need for insert but we still + // need to keep track of the keys for upcoming rollback/commit. + WriteBatchInternal::DeleteRange(rebuilding_trx_, column_family_id, + begin_key, end_key); + // TODO(myabandeh): when transactional DeleteRange support is added, + // check if end_key must also be added.
+ batch_boundry = IsDuplicateKeySeq(column_family_id, begin_key); + } + MaybeAdvanceSeq(batch_boundry); return seek_status; } if (db_ != nullptr) { @@ -1315,23 +1392,41 @@ } } - return DeleteImpl(column_family_id, begin_key, end_key, kTypeRangeDeletion); + auto ret_status = + DeleteImpl(column_family_id, begin_key, end_key, kTypeRangeDeletion); + // optimize for non-recovery mode + if (UNLIKELY(!ret_status.IsTryAgain() && rebuilding_trx_ != nullptr)) { + assert(!write_after_commit_); + // If the ret_status is TryAgain then let the next try to add the key to + // the rebuilding transaction object. + WriteBatchInternal::DeleteRange(rebuilding_trx_, column_family_id, + begin_key, end_key); + } + return ret_status; } virtual Status MergeCF(uint32_t column_family_id, const Slice& key, const Slice& value) override { assert(!concurrent_memtable_writes_); - if (rebuilding_trx_ != nullptr) { + // optimize for non-recovery mode + if (UNLIKELY(write_after_commit_ && rebuilding_trx_ != nullptr)) { WriteBatchInternal::Merge(rebuilding_trx_, column_family_id, key, value); - if (write_after_commit_) { - return Status::OK(); - } + return Status::OK(); // else insert the values to the memtable right away } Status seek_status; - if (!SeekToColumnFamily(column_family_id, &seek_status)) { - MaybeAdvanceSeq(); + if (UNLIKELY(!SeekToColumnFamily(column_family_id, &seek_status))) { + bool batch_boundry = false; + if (rebuilding_trx_ != nullptr) { + assert(!write_after_commit_); + // The CF is probably flushed and hence no need for insert but we still + // need to keep track of the keys for upcoming rollback/commit. + WriteBatchInternal::Merge(rebuilding_trx_, column_family_id, key, + value); + batch_boundry = IsDuplicateKeySeq(column_family_id, key); + } + MaybeAdvanceSeq(batch_boundry); return seek_status; } @@ -1412,6 +1507,13 @@ } } + // optimize for non-recovery mode + if (UNLIKELY(!ret_status.IsTryAgain() && rebuilding_trx_ != nullptr)) { + assert(!write_after_commit_); + // If the ret_status is TryAgain then let the next try to add the key to + // the rebuilding transaction object. + WriteBatchInternal::Merge(rebuilding_trx_, column_family_id, key, value); + } MaybeAdvanceSeq(); CheckMemtableFull(); return ret_status; @@ -1466,8 +1568,13 @@ if (recovering_log_number_ != 0) { assert(db_->allow_2pc()); + size_t batch_cnt = + write_after_commit_ + ? 0 // 0 will disable further checks + : static_cast<size_t>(sequence_ - rebuilding_trx_seq_ + 1); db_->InsertRecoveredTransaction(recovering_log_number_, name.ToString(), - rebuilding_trx_, rebuilding_trx_seq_); + rebuilding_trx_, rebuilding_trx_seq_, + batch_cnt); rebuilding_trx_ = nullptr; } else { assert(rebuilding_trx_ == nullptr); @@ -1502,7 +1609,7 @@ // and commit. auto trx = db_->GetRecoveredTransaction(name.ToString()); - // the log contaiting the prepared section may have + // the log containing the prepared section may have // been released in the last incarnation because the // data was flushed to L0 if (trx != nullptr) { @@ -1510,7 +1617,7 @@ // duplicate re-insertion of values. 
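//
// [Editorial aside, not part of the patch] The MaybeAdvanceSeq(batch_boundry) calls
// threaded through the handlers above collapse both sequencing policies into the
// single predicate shown earlier (sequence_ advances exactly when
// batch_boundry == seq_per_batch_):
//   seq-per-key mode  (seq_per_batch_ == false): advance on every ordinary entry;
//   seq-per-batch mode (seq_per_batch_ == true): advance only at batch boundaries,
//   e.g. where IsDuplicateKeySeq() reports a repeated key for the same CF.
//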
assert(log_number_ref_ == 0); if (write_after_commit_) { - // all insertes must reference this trx log number + // all inserts must reference this trx log number log_number_ref_ = trx->log_number_; s = trx->batch_->Iterate(this); log_number_ref_ = 0; diff --git a/db/write_batch_internal.h b/db/write_batch_internal.h index 9a200f3cbc9..ba0e9ffe450 100644 --- a/db/write_batch_internal.h +++ b/db/write_batch_internal.h @@ -117,10 +117,10 @@ class WriteBatchInternal { // Set the count for the number of entries in the batch. static void SetCount(WriteBatch* batch, int n); - // Return the seqeunce number for the start of this batch. + // Return the sequence number for the start of this batch. static SequenceNumber Sequence(const WriteBatch* batch); - // Store the specified number as the seqeunce number for the start of + // Store the specified number as the sequence number for the start of // this batch. static void SetSequence(WriteBatch* batch, SequenceNumber seq); @@ -168,7 +168,7 @@ class WriteBatchInternal { bool seq_per_batch = false); // Convenience form of InsertInto when you have only one batch - // next_seq returns the seq after last sequnce number used in MemTable insert + // next_seq returns the seq after last sequence number used in MemTable insert static Status InsertInto(const WriteBatch* batch, ColumnFamilyMemTables* memtables, FlushScheduler* flush_scheduler, diff --git a/db/write_batch_test.cc b/db/write_batch_test.cc index 4511f015b9d..cf2a121e473 100644 --- a/db/write_batch_test.cc +++ b/db/write_batch_test.cc @@ -437,7 +437,7 @@ TEST_F(WriteBatchTest, DISABLED_ManyUpdates) { struct NoopHandler : public WriteBatch::Handler { uint32_t num_seen = 0; char expected_char = 'A'; - virtual Status PutCF(uint32_t column_family_id, const Slice& key, + virtual Status PutCF(uint32_t /*column_family_id*/, const Slice& key, const Slice& value) override { EXPECT_EQ(kKeyValueSize, key.size()); EXPECT_EQ(kKeyValueSize, value.size()); @@ -452,22 +452,22 @@ TEST_F(WriteBatchTest, DISABLED_ManyUpdates) { ++num_seen; return Status::OK(); } - virtual Status DeleteCF(uint32_t column_family_id, - const Slice& key) override { + virtual Status DeleteCF(uint32_t /*column_family_id*/, + const Slice& /*key*/) override { ADD_FAILURE(); return Status::OK(); } - virtual Status SingleDeleteCF(uint32_t column_family_id, - const Slice& key) override { + virtual Status SingleDeleteCF(uint32_t /*column_family_id*/, + const Slice& /*key*/) override { ADD_FAILURE(); return Status::OK(); } - virtual Status MergeCF(uint32_t column_family_id, const Slice& key, - const Slice& value) override { + virtual Status MergeCF(uint32_t /*column_family_id*/, const Slice& /*key*/, + const Slice& /*value*/) override { ADD_FAILURE(); return Status::OK(); } - virtual void LogData(const Slice& blob) override { ADD_FAILURE(); } + virtual void LogData(const Slice& /*blob*/) override { ADD_FAILURE(); } virtual bool Continue() override { return num_seen < kNumUpdates; } } handler; @@ -492,7 +492,7 @@ TEST_F(WriteBatchTest, DISABLED_LargeKeyValue) { struct NoopHandler : public WriteBatch::Handler { int num_seen = 0; - virtual Status PutCF(uint32_t column_family_id, const Slice& key, + virtual Status PutCF(uint32_t /*column_family_id*/, const Slice& key, const Slice& value) override { EXPECT_EQ(kKeyValueSize, key.size()); EXPECT_EQ(kKeyValueSize, value.size()); @@ -503,22 +503,22 @@ TEST_F(WriteBatchTest, DISABLED_LargeKeyValue) { ++num_seen; return Status::OK(); } - virtual Status DeleteCF(uint32_t column_family_id, - const Slice& key) 
override { + virtual Status DeleteCF(uint32_t /*column_family_id*/, + const Slice& /*key*/) override { ADD_FAILURE(); return Status::OK(); } - virtual Status SingleDeleteCF(uint32_t column_family_id, - const Slice& key) override { + virtual Status SingleDeleteCF(uint32_t /*column_family_id*/, + const Slice& /*key*/) override { ADD_FAILURE(); return Status::OK(); } - virtual Status MergeCF(uint32_t column_family_id, const Slice& key, - const Slice& value) override { + virtual Status MergeCF(uint32_t /*column_family_id*/, const Slice& /*key*/, + const Slice& /*value*/) override { ADD_FAILURE(); return Status::OK(); } - virtual void LogData(const Slice& blob) override { ADD_FAILURE(); } + virtual void LogData(const Slice& /*blob*/) override { ADD_FAILURE(); } virtual bool Continue() override { return num_seen < 2; } } handler; diff --git a/db/write_callback_test.cc b/db/write_callback_test.cc index 7d04d501736..c91a4305cbd 100644 --- a/db/write_callback_test.cc +++ b/db/write_callback_test.cc @@ -54,9 +54,7 @@ class WriteCallbackTestWriteCallback1 : public WriteCallback { class WriteCallbackTestWriteCallback2 : public WriteCallback { public: - Status Callback(DB *db) override { - return Status::Busy(); - } + Status Callback(DB* /*db*/) override { return Status::Busy(); } bool AllowWriteBatching() override { return true; } }; @@ -74,7 +72,7 @@ class MockWriteCallback : public WriteCallback { was_called_.store(other.was_called_.load()); } - Status Callback(DB* db) override { + Status Callback(DB* /*db*/) override { was_called_.store(true); if (should_fail_) { return Status::Busy(); @@ -296,7 +294,8 @@ TEST_F(WriteCallbackTest, WriteWithCallbackTest) { public: PublishSeqCallback(DBImpl* db_impl_in) : db_impl_(db_impl_in) {} - virtual Status Callback(SequenceNumber last_seq) { + virtual Status Callback(SequenceNumber last_seq, + const bool /*not used*/) override { db_impl_->SetLastPublishedSequence(last_seq); return Status::OK(); } diff --git a/db/write_thread.cc b/db/write_thread.cc index e115ba53926..a44f028cecf 100644 --- a/db/write_thread.cc +++ b/db/write_thread.cc @@ -456,7 +456,8 @@ void WriteThread::EnterAsMemTableWriter(Writer* leader, last_writer->sequence + WriteBatchInternal::Count(last_writer->batch) - 1; } -void WriteThread::ExitAsMemTableWriter(Writer* self, WriteGroup& write_group) { +void WriteThread::ExitAsMemTableWriter(Writer* /*self*/, + WriteGroup& write_group) { Writer* leader = write_group.leader; Writer* last_writer = write_group.last_writer; diff --git a/env/env.cc b/env/env.cc index f428697cb41..9f165d6d126 100644 --- a/env/env.cc +++ b/env/env.cc @@ -73,7 +73,7 @@ RandomAccessFile::~RandomAccessFile() { WritableFile::~WritableFile() { } -Logger::~Logger() { } +Logger::~Logger() {} Status Logger::Close() { if (!closed_) { diff --git a/env/env_encryption.cc b/env/env_encryption.cc index 6b688a66020..e80796fe0c7 100644 --- a/env/env_encryption.cc +++ b/env/env_encryption.cc @@ -150,7 +150,7 @@ class EncryptedRandomAccessFile : public RandomAccessFile { // may not have been modified. // // This function guarantees, for IDs from a given environment, two unique ids - // cannot be made equal to eachother by adding arbitrary bytes to one of + // cannot be made equal to each other by adding arbitrary bytes to one of // them. That is, no unique ID is the prefix of another. 
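//
// [Editorial aside, not part of the patch] The prefix-freedom guarantee described
// above falls out of self-delimiting fields: with a fixed count of varints, no
// valid ID can be a proper prefix of another. A hedged sketch using RocksDB's
// varint helpers from util/coding.h (field names are illustrative):
#include <string>
#include "util/coding.h"
std::string MakePrefixFreeId(uint64_t dev, uint64_t inode, uint64_t gen) {
  std::string id;
  rocksdb::PutVarint64(&id, dev);    // each field decodes unambiguously,
  rocksdb::PutVarint64(&id, inode);  // so the 3-field encoding is prefix-free
  rocksdb::PutVarint64(&id, gen);
  return id;
}
//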
// // This function guarantees that the returned ID will not be interpretable as @@ -584,7 +584,7 @@ class EncryptedEnv : public EnvWrapper { return Status::OK(); } - // Open `fname` for random read and write, if file dont exist the file + // Open `fname` for random read and write, if file doesn't exist the file // will be created. On success, stores a pointer to the new file in // *result and returns OK. On failure returns non-OK. // @@ -828,7 +828,7 @@ Status CTRCipherStream::DecryptBlock(uint64_t blockIndex, char *data, char* scra // GetPrefixLength returns the length of the prefix that is added to every file // and used for storing encryption options. // For optimal performance, the prefix length should be a multiple of -// the a page size. +// the page size. size_t CTREncryptionProvider::GetPrefixLength() { return defaultPrefixLength; } @@ -844,7 +844,9 @@ static void decodeCTRParameters(const char *prefix, size_t blockSize, uint64_t & // CreateNewPrefix initialized an allocated block of prefix memory // for a new file. -Status CTREncryptionProvider::CreateNewPrefix(const std::string& fname, char *prefix, size_t prefixLength) { +Status CTREncryptionProvider::CreateNewPrefix(const std::string& /*fname*/, + char* prefix, + size_t prefixLength) { // Create & seed rnd. Random rnd((uint32_t)Env::Default()->NowMicros()); // Fill entire prefix block with random values. @@ -873,7 +875,9 @@ Status CTREncryptionProvider::CreateNewPrefix(const std::string& fname, char *pr // in plain text. // Returns the amount of space (starting from the start of the prefix) // that has been initialized. -size_t CTREncryptionProvider::PopulateSecretPrefixPart(char *prefix, size_t prefixLength, size_t blockSize) { +size_t CTREncryptionProvider::PopulateSecretPrefixPart(char* /*prefix*/, + size_t /*prefixLength*/, + size_t /*blockSize*/) { // Nothing to do here, put in custom data in override when needed. return 0; } @@ -898,8 +902,10 @@ Status CTREncryptionProvider::CreateCipherStream(const std::string& fname, const // CreateCipherStreamFromPrefix creates a block access cipher stream for a file given // given name and options. The given prefix is already decrypted. 
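As an aside, a hedged sketch of how the CTR provider hooks touched in these hunks are consumed in practice. ROT13BlockCipher is the toy cipher declared in env_encryption.h for testing only; the block size and DB path here are arbitrary:

```cpp
#include <memory>
#include "rocksdb/db.h"
#include "rocksdb/env_encryption.h"

int main() {
  // Toy cipher from the header; do not use for real data.
  rocksdb::ROT13BlockCipher cipher(/*blockSize=*/32);
  rocksdb::CTREncryptionProvider provider(cipher);
  std::unique_ptr<rocksdb::Env> encrypted_env(
      rocksdb::NewEncryptedEnv(rocksdb::Env::Default(), &provider));

  rocksdb::Options options;
  options.env = encrypted_env.get();
  options.create_if_missing = true;
  rocksdb::DB* db = nullptr;
  rocksdb::Status s = rocksdb::DB::Open(options, "/tmp/encrypted_db", &db);
  if (s.ok()) delete db;  // close the DB before the Env is destroyed
  return s.ok() ? 0 : 1;
}
```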
-Status CTREncryptionProvider::CreateCipherStreamFromPrefix(const std::string& fname, const EnvOptions& options, - uint64_t initialCounter, const Slice& iv, const Slice& prefix, unique_ptr<BlockAccessCipherStream>* result) { +Status CTREncryptionProvider::CreateCipherStreamFromPrefix( + const std::string& /*fname*/, const EnvOptions& /*options*/, + uint64_t initialCounter, const Slice& iv, const Slice& /*prefix*/, + unique_ptr<BlockAccessCipherStream>* result) { (*result) = unique_ptr<BlockAccessCipherStream>(new CTRCipherStream(cipher_, iv.data(), initialCounter)); return Status::OK(); } diff --git a/env/env_hdfs.cc b/env/env_hdfs.cc index c29eb7a1621..1eaea3a1ce5 100644 --- a/env/env_hdfs.cc +++ b/env/env_hdfs.cc @@ -288,9 +288,7 @@ class HdfsLogger : public Logger { } protected: - virtual Status CloseImpl() override { - return HdfsCloseHelper(); - } + virtual Status CloseImpl() override { return HdfsCloseHelper(); } public: HdfsLogger(HdfsWritableFile* f, uint64_t (*gettid)()) @@ -611,13 +609,13 @@ Status NewHdfsEnv(Env** hdfs_env, const std::string& fsname) { // dummy placeholders used when HDFS is not available namespace rocksdb { - Status HdfsEnv::NewSequentialFile(const std::string& fname, - unique_ptr<SequentialFile>* result, - const EnvOptions& options) { - return Status::NotSupported("Not compiled with hdfs support"); +Status HdfsEnv::NewSequentialFile(const std::string& /*fname*/, + unique_ptr<SequentialFile>* /*result*/, + const EnvOptions& /*options*/) { + return Status::NotSupported("Not compiled with hdfs support"); } - Status NewHdfsEnv(Env** hdfs_env, const std::string& fsname) { + Status NewHdfsEnv(Env** /*hdfs_env*/, const std::string& /*fsname*/) { return Status::NotSupported("Not compiled with hdfs support"); } } diff --git a/env/env_posix.cc b/env/env_posix.cc index c0e93603333..fa40f8fdce4 100644 --- a/env/env_posix.cc +++ b/env/env_posix.cc @@ -647,7 +647,7 @@ class PosixEnv : public Env { virtual void Schedule(void (*function)(void* arg1), void* arg, Priority pri = LOW, void* tag = nullptr, - void (*unschedFunction)(void* arg) = 0) override; + void (*unschedFunction)(void* arg) = nullptr) override; virtual int UnSchedule(void* arg, Priority pri) override; diff --git a/env/env_test.cc b/env/env_test.cc index 2360a5e8b3b..bf19980a6ca 100644 --- a/env/env_test.cc +++ b/env/env_test.cc @@ -135,6 +135,37 @@ TEST_F(EnvPosixTest, RunImmediately) { } } +#ifdef OS_WIN +TEST_F(EnvPosixTest, AreFilesSame) { + { + bool tmp; + if (env_->AreFilesSame("", "", &tmp).IsNotSupported()) { + fprintf(stderr, + "skipping EnvBasicTestWithParam.AreFilesSame due to " + "unsupported Env::AreFilesSame\n"); + return; + } + } + + const EnvOptions soptions; + auto* env = Env::Default(); + std::string same_file_name = test::TmpDir(env) + "/same_file"; + std::string same_file_link_name = same_file_name + "_link"; + + std::unique_ptr<WritableFile> same_file; + ASSERT_OK(env->NewWritableFile(same_file_name, + &same_file, soptions)); + same_file->Append("random_data"); + ASSERT_OK(same_file->Flush()); + same_file.reset(); + + ASSERT_OK(env->LinkFile(same_file_name, same_file_link_name)); + bool result = false; + ASSERT_OK(env->AreFilesSame(same_file_name, same_file_link_name, &result)); + ASSERT_TRUE(result); +} +#endif + TEST_P(EnvPosixTestWithParam, UnSchedule) { std::atomic<bool> called(false); env_->SetBackgroundThreads(1, Env::LOW); @@ -1143,7 +1174,7 @@ TEST_P(EnvPosixTestWithParam, Preallocation) { unique_ptr<WritableFile> srcfile; EnvOptions soptions; soptions.use_direct_reads = soptions.use_direct_writes = direct_io_; -#if !defined(OS_MACOSX) && !defined(OS_WIN) && !defined(OS_SOLARIS) && !defined(OS_AIX) && !defined(OS_OPENBSD)
+#if !defined(OS_MACOSX) && !defined(OS_WIN) && !defined(OS_SOLARIS) && !defined(OS_AIX) && !defined(OS_OPENBSD) && !defined(OS_FREEBSD) if (soptions.use_direct_writes) { rocksdb::SyncPoint::GetInstance()->SetCallBack( "NewWritableFile:O_DIRECT", [&](void* arg) { @@ -1205,7 +1236,7 @@ TEST_P(EnvPosixTestWithParam, ConsistentChildrenAttributes) { oss << test::TmpDir(env_) << "/testfile_" << i; const std::string path = oss.str(); unique_ptr<WritableFile> file; -#if !defined(OS_MACOSX) && !defined(OS_WIN) && !defined(OS_SOLARIS) && !defined(OS_AIX) && !defined(OS_OPENBSD) +#if !defined(OS_MACOSX) && !defined(OS_WIN) && !defined(OS_SOLARIS) && !defined(OS_AIX) && !defined(OS_OPENBSD) && !defined(OS_FREEBSD) if (soptions.use_direct_writes) { rocksdb::SyncPoint::GetInstance()->SetCallBack( "NewWritableFile:O_DIRECT", [&](void* arg) { @@ -1255,33 +1286,36 @@ TEST_P(EnvPosixTestWithParam, WritableFileWrapper) { inc(0); } - Status Append(const Slice& data) override { inc(1); return Status::OK(); } - Status Truncate(uint64_t size) override { return Status::OK(); } + Status Append(const Slice& /*data*/) override { + inc(1); + return Status::OK(); + } + Status Truncate(uint64_t /*size*/) override { return Status::OK(); } Status Close() override { inc(2); return Status::OK(); } Status Flush() override { inc(3); return Status::OK(); } Status Sync() override { inc(4); return Status::OK(); } Status Fsync() override { inc(5); return Status::OK(); } - void SetIOPriority(Env::IOPriority pri) override { inc(6); } + void SetIOPriority(Env::IOPriority /*pri*/) override { inc(6); } uint64_t GetFileSize() override { inc(7); return 0; } - void GetPreallocationStatus(size_t* block_size, - size_t* last_allocated_block) override { + void GetPreallocationStatus(size_t* /*block_size*/, + size_t* /*last_allocated_block*/) override { inc(8); } - size_t GetUniqueId(char* id, size_t max_size) const override { + size_t GetUniqueId(char* /*id*/, size_t /*max_size*/) const override { inc(9); return 0; } - Status InvalidateCache(size_t offset, size_t length) override { + Status InvalidateCache(size_t /*offset*/, size_t /*length*/) override { inc(10); return Status::OK(); } protected: - Status Allocate(uint64_t offset, uint64_t len) override { + Status Allocate(uint64_t /*offset*/, uint64_t /*len*/) override { inc(11); return Status::OK(); } - Status RangeSync(uint64_t offset, uint64_t nbytes) override { + Status RangeSync(uint64_t /*offset*/, uint64_t /*nbytes*/) override { inc(12); return Status::OK(); } @@ -1480,47 +1514,46 @@ class TestEnv : public EnvWrapper { explicit TestEnv() : EnvWrapper(Env::Default()), close_count(0) { } - class TestLogger : public Logger { - public: - using Logger::Logv; - TestLogger(TestEnv *env_ptr) : Logger() { env = env_ptr; } - ~TestLogger() { - if (!closed_) { - CloseHelper(); - } - } - virtual void Logv(const char *format, va_list ap) override { }; - protected: - virtual Status CloseImpl() override { - return CloseHelper(); - } - private: - Status CloseHelper() { - env->CloseCountInc();; - return Status::OK(); - } - TestEnv *env; - }; - - void CloseCountInc() { close_count++; } + class TestLogger : public Logger { + public: + using Logger::Logv; + TestLogger(TestEnv* env_ptr) : Logger() { env = env_ptr; } + ~TestLogger() { + if (!closed_) { + CloseHelper(); + } + } + virtual void Logv(const char* format, va_list ap) override{}; - int GetCloseCount() { return close_count; } + protected: + virtual Status CloseImpl() override { return CloseHelper(); } - virtual Status NewLogger(const std::string& fname, -
shared_ptr<Logger>* result) { - result->reset(new TestLogger(this)); + private: + Status CloseHelper() { + env->CloseCountInc();; return Status::OK(); } + TestEnv* env; + }; - private: - int close_count; -}; + void CloseCountInc() { close_count++; } + + int GetCloseCount() { return close_count; } + + virtual Status NewLogger(const std::string& fname, + shared_ptr<Logger>* result) { + result->reset(new TestLogger(this)); + return Status::OK(); + } -class EnvTest : public testing::Test { + private: + int close_count; }; +class EnvTest : public testing::Test {}; + TEST_F(EnvTest, Close) { - TestEnv *env = new TestEnv(); + TestEnv* env = new TestEnv(); std::shared_ptr<Logger> logger; Status s; @@ -1542,7 +1575,6 @@ TEST_F(EnvTest, Close) { delete env; } - INSTANTIATE_TEST_CASE_P(DefaultEnvWithoutDirectIO, EnvPosixTestWithParam, ::testing::Values(std::pair<Env*, bool>(Env::Default(), false))); diff --git a/env/io_posix.cc b/env/io_posix.cc index e2190670aed..b4f06c228b0 100644 --- a/env/io_posix.cc +++ b/env/io_posix.cc @@ -448,7 +448,7 @@ PosixMmapReadableFile::~PosixMmapReadableFile() { } Status PosixMmapReadableFile::Read(uint64_t offset, size_t n, Slice* result, - char* scratch) const { + char* /*scratch*/) const { Status s; if (offset > length_) { *result = Slice(); @@ -941,7 +941,7 @@ size_t PosixWritableFile::GetUniqueId(char* id, size_t max_size) const { */ PosixRandomRWFile::PosixRandomRWFile(const std::string& fname, int fd, - const EnvOptions& options) + const EnvOptions& /*options*/) : filename_(fname), fd_(fd) {} PosixRandomRWFile::~PosixRandomRWFile() { diff --git a/env/io_posix.h b/env/io_posix.h index 804864cd1f7..f29a159ae0d 100644 --- a/env/io_posix.h +++ b/env/io_posix.h @@ -202,7 +202,7 @@ class PosixMmapFile : public WritableFile { // Means Close() will properly take care of truncate // and it does not need any additional information - virtual Status Truncate(uint64_t size) override { return Status::OK(); } + virtual Status Truncate(uint64_t /*size*/) override { return Status::OK(); } virtual Status Close() override; virtual Status Append(const Slice& data) override; virtual Status Flush() override; diff --git a/env/mock_env.cc b/env/mock_env.cc index 4e46b467f7f..de008afe64f 100644 --- a/env/mock_env.cc +++ b/env/mock_env.cc @@ -94,35 +94,37 @@ class MemFile { uint64_t end = std::min(start + 512, size_.load()); MutexLock lock(&mutex_); for (uint64_t pos = start; pos < end; ++pos) { - data_[pos] = static_cast<char>(rnd_.Uniform(256)); + data_[static_cast<size_t>(pos)] = static_cast<char>(rnd_.Uniform(256)); } } Status Read(uint64_t offset, size_t n, Slice* result, char* scratch) const { MutexLock lock(&mutex_); const uint64_t available = Size() - std::min(Size(), offset); + size_t offset_ = static_cast<size_t>(offset); if (n > available) { - n = available; + n = static_cast<size_t>(available); } if (n == 0) { *result = Slice(); return Status::OK(); } if (scratch) { - memcpy(scratch, &(data_[offset]), n); + memcpy(scratch, &(data_[offset_]), n); *result = Slice(scratch, n); } else { - *result = Slice(&(data_[offset]), n); + *result = Slice(&(data_[offset_]), n); } return Status::OK(); } Status Write(uint64_t offset, const Slice& data) { MutexLock lock(&mutex_); + size_t offset_ = static_cast<size_t>(offset); if (offset + data.size() > data_.size()) { - data_.resize(offset + data.size()); + data_.resize(offset_ + data.size()); } - data_.replace(offset, data.size(), data.data(), data.size()); + data_.replace(offset_, data.size(), data.data(), data.size()); size_ = data_.size(); modified_time_ = Now(); return Status::OK(); @@ -203,7 +205,7 @@ class
MockSequentialFile : public SequentialFile { if (pos_ > file_->Size()) { return Status::IOError("pos_ > file_->Size()"); } - const size_t available = file_->Size() - pos_; + const uint64_t available = file_->Size() - pos_; if (n > available) { n = available; } @@ -273,7 +275,7 @@ class MockWritableFile : public WritableFile { } virtual Status Append(const Slice& data) override { - uint64_t bytes_written = 0; + size_t bytes_written = 0; while (bytes_written < data.size()) { auto bytes = RequestToken(data.size() - bytes_written); Status s = file_->Append(Slice(data.data() + bytes_written, bytes)); @@ -285,7 +287,7 @@ class MockWritableFile : public WritableFile { return Status::OK(); } virtual Status Truncate(uint64_t size) override { - file_->Truncate(size); + file_->Truncate(static_cast<size_t>(size)); return Status::OK(); } virtual Status Close() override { return file_->Fsync(); } @@ -447,12 +449,12 @@ MockEnv::~MockEnv() { // Partial implementation of the Env interface. Status MockEnv::NewSequentialFile(const std::string& fname, - unique_ptr<SequentialFile>* result, - const EnvOptions& soptions) { + unique_ptr<SequentialFile>* result, + const EnvOptions& /*soptions*/) { auto fn = NormalizePath(fname); MutexLock lock(&mutex_); if (file_map_.find(fn) == file_map_.end()) { - *result = NULL; + *result = nullptr; return Status::IOError(fn, "File not found"); } auto* f = file_map_[fn]; @@ -464,12 +466,12 @@ Status MockEnv::NewSequentialFile(const std::string& fname, } Status MockEnv::NewRandomAccessFile(const std::string& fname, - unique_ptr<RandomAccessFile>* result, - const EnvOptions& soptions) { + unique_ptr<RandomAccessFile>* result, + const EnvOptions& /*soptions*/) { auto fn = NormalizePath(fname); MutexLock lock(&mutex_); if (file_map_.find(fn) == file_map_.end()) { - *result = NULL; + *result = nullptr; return Status::IOError(fn, "File not found"); } auto* f = file_map_[fn]; @@ -482,11 +484,11 @@ Status MockEnv::NewRandomAccessFile(const std::string& fname, Status MockEnv::NewRandomRWFile(const std::string& fname, unique_ptr<RandomRWFile>* result, - const EnvOptions& soptions) { + const EnvOptions& /*soptions*/) { auto fn = NormalizePath(fname); MutexLock lock(&mutex_); if (file_map_.find(fn) == file_map_.end()) { - *result = NULL; + *result = nullptr; return Status::IOError(fn, "File not found"); } auto* f = file_map_[fn]; @@ -525,8 +527,8 @@ Status MockEnv::NewWritableFile(const std::string& fname, return Status::OK(); } -Status MockEnv::NewDirectory(const std::string& name, - unique_ptr<Directory>* result) { +Status MockEnv::NewDirectory(const std::string& /*name*/, + unique_ptr<Directory>* result) { result->reset(new MockEnvDirectory()); return Status::OK(); } diff --git a/env/posix_logger.h b/env/posix_logger.h index 121591e0dae..e983ba704e4 100644 --- a/env/posix_logger.h +++ b/env/posix_logger.h @@ -52,9 +52,7 @@ class PosixLogger : public Logger { std::atomic<bool> flush_pending_; protected: - virtual Status CloseImpl() override { - return PosixCloseHelper(); - } + virtual Status CloseImpl() override { return PosixCloseHelper(); } public: PosixLogger(FILE* f, uint64_t (*gettid)(), Env* env, diff --git a/hdfs/env_hdfs.h b/hdfs/env_hdfs.h index 3a62bc8cb92..b0c9e33fd78 100644 --- a/hdfs/env_hdfs.h +++ b/hdfs/env_hdfs.h @@ -245,7 +245,7 @@ static const Status notsup; class HdfsEnv : public Env { public: - explicit HdfsEnv(const std::string& fsname) { + explicit HdfsEnv(const std::string& /*fsname*/) { fprintf(stderr, "You have not build rocksdb with HDFS support\n"); fprintf(stderr, "Please see hdfs/README for details\n"); abort(); } @@ -258,112 +258,125 @@ class HdfsEnv : public Env {
unique_ptr<SequentialFile>* result, const EnvOptions& options) override; - virtual Status NewRandomAccessFile(const std::string& fname, - unique_ptr<RandomAccessFile>* result, - const EnvOptions& options) override { + virtual Status NewRandomAccessFile(const std::string& /*fname*/, + unique_ptr<RandomAccessFile>* /*result*/, + const EnvOptions& /*options*/) override { return notsup; } - virtual Status NewWritableFile(const std::string& fname, - unique_ptr<WritableFile>* result, - const EnvOptions& options) override { + virtual Status NewWritableFile(const std::string& /*fname*/, + unique_ptr<WritableFile>* /*result*/, + const EnvOptions& /*options*/) override { return notsup; } - virtual Status NewDirectory(const std::string& name, - unique_ptr<Directory>* result) override { + virtual Status NewDirectory(const std::string& /*name*/, + unique_ptr<Directory>* /*result*/) override { return notsup; } - virtual Status FileExists(const std::string& fname) override { + virtual Status FileExists(const std::string& /*fname*/) override { return notsup; } - virtual Status GetChildren(const std::string& path, - std::vector<std::string>* result) override { + virtual Status GetChildren(const std::string& /*path*/, + std::vector<std::string>* /*result*/) override { return notsup; } - virtual Status DeleteFile(const std::string& fname) override { + virtual Status DeleteFile(const std::string& /*fname*/) override { return notsup; } - virtual Status CreateDir(const std::string& name) override { return notsup; } + virtual Status CreateDir(const std::string& /*name*/) override { + return notsup; + } - virtual Status CreateDirIfMissing(const std::string& name) override { + virtual Status CreateDirIfMissing(const std::string& /*name*/) override { return notsup; } - virtual Status DeleteDir(const std::string& name) override { return notsup; } + virtual Status DeleteDir(const std::string& /*name*/) override { + return notsup; + } - virtual Status GetFileSize(const std::string& fname, - uint64_t* size) override { + virtual Status GetFileSize(const std::string& /*fname*/, + uint64_t* /*size*/) override { return notsup; } - virtual Status GetFileModificationTime(const std::string& fname, - uint64_t* time) override { + virtual Status GetFileModificationTime(const std::string& /*fname*/, + uint64_t* /*time*/) override { return notsup; } - virtual Status RenameFile(const std::string& src, - const std::string& target) override { + virtual Status RenameFile(const std::string& /*src*/, + const std::string& /*target*/) override { return notsup; } - virtual Status LinkFile(const std::string& src, - const std::string& target) override { + virtual Status LinkFile(const std::string& /*src*/, + const std::string& /*target*/) override { return notsup; } - virtual Status LockFile(const std::string& fname, FileLock** lock) override { + virtual Status LockFile(const std::string& /*fname*/, + FileLock** /*lock*/) override { return notsup; } - virtual Status UnlockFile(FileLock* lock) override { return notsup; } + virtual Status UnlockFile(FileLock* /*lock*/) override { return notsup; } - virtual Status NewLogger(const std::string& fname, - shared_ptr<Logger>* result) override { + virtual Status NewLogger(const std::string& /*fname*/, + shared_ptr<Logger>* /*result*/) override { return notsup; } - virtual void Schedule(void (*function)(void* arg), void* arg, - Priority pri = LOW, void* tag = nullptr, - void (*unschedFunction)(void* arg) = 0) override {} + virtual void Schedule(void (* /*function*/)(void* arg), void* /*arg*/, + Priority /*pri*/ = LOW, void* /*tag*/ = nullptr, + void (* /*unschedFunction*/)(void* arg) = 0) override {} - virtual int UnSchedule(void* tag,
Priority pri) override { return 0; } + virtual int UnSchedule(void* /*tag*/, Priority /*pri*/) override { return 0; } - virtual void StartThread(void (*function)(void* arg), void* arg) override {} + virtual void StartThread(void (* /*function*/)(void* arg), + void* /*arg*/) override {} virtual void WaitForJoin() override {} virtual unsigned int GetThreadPoolQueueLen( - Priority pri = LOW) const override { + Priority /*pri*/ = LOW) const override { return 0; } - virtual Status GetTestDirectory(std::string* path) override { return notsup; } + virtual Status GetTestDirectory(std::string* /*path*/) override { + return notsup; + } virtual uint64_t NowMicros() override { return 0; } - virtual void SleepForMicroseconds(int micros) override {} + virtual void SleepForMicroseconds(int /*micros*/) override {} - virtual Status GetHostName(char* name, uint64_t len) override { + virtual Status GetHostName(char* /*name*/, uint64_t /*len*/) override { return notsup; } - virtual Status GetCurrentTime(int64_t* unix_time) override { return notsup; } + virtual Status GetCurrentTime(int64_t* /*unix_time*/) override { + return notsup; + } - virtual Status GetAbsolutePath(const std::string& db_path, - std::string* outputpath) override { + virtual Status GetAbsolutePath(const std::string& /*db_path*/, + std::string* /*outputpath*/) override { return notsup; } - virtual void SetBackgroundThreads(int number, Priority pri = LOW) override {} - virtual int GetBackgroundThreads(Priority pri = LOW) override { return 0; } - virtual void IncBackgroundThreadsIfNeeded(int number, Priority pri) override { + virtual void SetBackgroundThreads(int /*number*/, + Priority /*pri*/ = LOW) override {} + virtual int GetBackgroundThreads(Priority /*pri*/ = LOW) override { + return 0; } - virtual std::string TimeToString(uint64_t number) override { return ""; } + virtual void IncBackgroundThreadsIfNeeded(int /*number*/, + Priority /*pri*/) override {} + virtual std::string TimeToString(uint64_t /*number*/) override { return ""; } virtual uint64_t GetThreadID() const override { return 0; diff --git a/include/rocksdb/advanced_options.h b/include/rocksdb/advanced_options.h index e7436c772d4..12cb6a31799 100644 --- a/include/rocksdb/advanced_options.h +++ b/include/rocksdb/advanced_options.h @@ -253,7 +253,7 @@ struct AdvancedColumnFamilyOptions { // if prefix_extractor is set and memtable_prefix_bloom_size_ratio is not 0, // create prefix bloom for memtable with the size of // write_buffer_size * memtable_prefix_bloom_size_ratio. - // If it is larger than 0.25, it is santinized to 0.25. + // If it is larger than 0.25, it is sanitized to 0.25. // // Default: 0 (disable) // @@ -560,7 +560,7 @@ struct AdvancedColumnFamilyOptions { // Default: false bool paranoid_file_checks = false; - // In debug mode, RocksDB run consistency checks on the LSM everytime the LSM + // In debug mode, RocksDB runs consistency checks on the LSM every time the LSM // changes (Flush, Compaction, AddFile). These checks are disabled in release // mode; use this option to enable them in release mode as well.
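A brief sketch of setting the two AdvancedColumnFamilyOptions fields touched in the hunk above; the prefix length and ratio are illustrative values, not recommendations:

```cpp
#include "rocksdb/options.h"
#include "rocksdb/slice_transform.h"

rocksdb::Options MakeOptions() {
  rocksdb::Options options;
  // Prefix bloom sized at 2% of write_buffer_size; ratios above 0.25 are
  // sanitized down to 0.25, as the comment above notes.
  options.prefix_extractor.reset(rocksdb::NewFixedPrefixTransform(4));
  options.memtable_prefix_bloom_size_ratio = 0.02;
  // Run LSM consistency checks even in release builds.
  options.force_consistency_checks = true;
  return options;
}
```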
// Default: false diff --git a/include/rocksdb/c.h b/include/rocksdb/c.h index d5b739ccc1e..45537255f6d 100644 --- a/include/rocksdb/c.h +++ b/include/rocksdb/c.h @@ -113,6 +113,7 @@ typedef struct rocksdb_envoptions_t rocksdb_envoptions_t; typedef struct rocksdb_ingestexternalfileoptions_t rocksdb_ingestexternalfileoptions_t; typedef struct rocksdb_sstfilewriter_t rocksdb_sstfilewriter_t; typedef struct rocksdb_ratelimiter_t rocksdb_ratelimiter_t; +typedef struct rocksdb_perfcontext_t rocksdb_perfcontext_t; typedef struct rocksdb_pinnableslice_t rocksdb_pinnableslice_t; typedef struct rocksdb_transactiondb_options_t rocksdb_transactiondb_options_t; typedef struct rocksdb_transactiondb_t rocksdb_transactiondb_t; @@ -129,6 +130,9 @@ typedef struct rocksdb_checkpoint_t rocksdb_checkpoint_t; extern ROCKSDB_LIBRARY_API rocksdb_t* rocksdb_open( const rocksdb_options_t* options, const char* name, char** errptr); +extern ROCKSDB_LIBRARY_API rocksdb_t* rocksdb_open_with_ttl( + const rocksdb_options_t* options, const char* name, int ttl, char** errptr); + extern ROCKSDB_LIBRARY_API rocksdb_t* rocksdb_open_for_read_only( const rocksdb_options_t* options, const char* name, unsigned char error_if_log_file_exist, char** errptr); @@ -966,6 +970,97 @@ extern ROCKSDB_LIBRARY_API rocksdb_ratelimiter_t* rocksdb_ratelimiter_create( int64_t rate_bytes_per_sec, int64_t refill_period_us, int32_t fairness); extern ROCKSDB_LIBRARY_API void rocksdb_ratelimiter_destroy(rocksdb_ratelimiter_t*); +/* PerfContext */ +enum { + rocksdb_uninitialized = 0, + rocksdb_disable = 1, + rocksdb_enable_count = 2, + rocksdb_enable_time_except_for_mutex = 3, + rocksdb_enable_time = 4, + rocksdb_out_of_bounds = 5 +}; + +enum { + rocksdb_user_key_comparison_count = 0, + rocksdb_block_cache_hit_count, + rocksdb_block_read_count, + rocksdb_block_read_byte, + rocksdb_block_read_time, + rocksdb_block_checksum_time, + rocksdb_block_decompress_time, + rocksdb_get_read_bytes, + rocksdb_multiget_read_bytes, + rocksdb_iter_read_bytes, + rocksdb_internal_key_skipped_count, + rocksdb_internal_delete_skipped_count, + rocksdb_internal_recent_skipped_count, + rocksdb_internal_merge_count, + rocksdb_get_snapshot_time, + rocksdb_get_from_memtable_time, + rocksdb_get_from_memtable_count, + rocksdb_get_post_process_time, + rocksdb_get_from_output_files_time, + rocksdb_seek_on_memtable_time, + rocksdb_seek_on_memtable_count, + rocksdb_next_on_memtable_count, + rocksdb_prev_on_memtable_count, + rocksdb_seek_child_seek_time, + rocksdb_seek_child_seek_count, + rocksdb_seek_min_heap_time, + rocksdb_seek_max_heap_time, + rocksdb_seek_internal_seek_time, + rocksdb_find_next_user_entry_time, + rocksdb_write_wal_time, + rocksdb_write_memtable_time, + rocksdb_write_delay_time, + rocksdb_write_pre_and_post_process_time, + rocksdb_db_mutex_lock_nanos, + rocksdb_db_condition_wait_nanos, + rocksdb_merge_operator_time_nanos, + rocksdb_read_index_block_nanos, + rocksdb_read_filter_block_nanos, + rocksdb_new_table_block_iter_nanos, + rocksdb_new_table_iterator_nanos, + rocksdb_block_seek_nanos, + rocksdb_find_table_nanos, + rocksdb_bloom_memtable_hit_count, + rocksdb_bloom_memtable_miss_count, + rocksdb_bloom_sst_hit_count, + rocksdb_bloom_sst_miss_count, + rocksdb_key_lock_wait_time, + rocksdb_key_lock_wait_count, + rocksdb_env_new_sequential_file_nanos, + rocksdb_env_new_random_access_file_nanos, + rocksdb_env_new_writable_file_nanos, + rocksdb_env_reuse_writable_file_nanos, + rocksdb_env_new_random_rw_file_nanos, + rocksdb_env_new_directory_nanos, + 
rocksdb_env_file_exists_nanos, + rocksdb_env_get_children_nanos, + rocksdb_env_get_children_file_attributes_nanos, + rocksdb_env_delete_file_nanos, + rocksdb_env_create_dir_nanos, + rocksdb_env_create_dir_if_missing_nanos, + rocksdb_env_delete_dir_nanos, + rocksdb_env_get_file_size_nanos, + rocksdb_env_get_file_modification_time_nanos, + rocksdb_env_rename_file_nanos, + rocksdb_env_link_file_nanos, + rocksdb_env_lock_file_nanos, + rocksdb_env_unlock_file_nanos, + rocksdb_env_new_logger_nanos, + rocksdb_total_metric_count = 68 +}; + +extern ROCKSDB_LIBRARY_API void rocksdb_set_perf_level(int); +extern ROCKSDB_LIBRARY_API rocksdb_perfcontext_t* rocksdb_perfcontext_create(); +extern ROCKSDB_LIBRARY_API void rocksdb_perfcontext_reset( + rocksdb_perfcontext_t* context); +extern ROCKSDB_LIBRARY_API char* rocksdb_perfcontext_report( + rocksdb_perfcontext_t* context, unsigned char exclude_zero_counters); +extern ROCKSDB_LIBRARY_API uint64_t rocksdb_perfcontext_metric( + rocksdb_perfcontext_t* context, int metric); + /* Compaction Filter */ extern ROCKSDB_LIBRARY_API rocksdb_compactionfilter_t* diff --git a/include/rocksdb/cache.h b/include/rocksdb/cache.h index c9efd5fcf25..86dafe3959f 100644 --- a/include/rocksdb/cache.h +++ b/include/rocksdb/cache.h @@ -216,7 +216,8 @@ class Cache { // Mark the last inserted object as being a raw data block. This will be used // in tests. The default implementation does nothing. - virtual void TEST_mark_as_data_block(const Slice& key, size_t charge) {} + virtual void TEST_mark_as_data_block(const Slice& /*key*/, + size_t /*charge*/) {} private: // No copying allowed diff --git a/include/rocksdb/cleanable.h b/include/rocksdb/cleanable.h index cd2e9425f12..ee4ee44241e 100644 --- a/include/rocksdb/cleanable.h +++ b/include/rocksdb/cleanable.h @@ -30,7 +30,7 @@ class Cleanable { Cleanable(Cleanable&) = delete; Cleanable& operator=(Cleanable&) = delete; - // Move consturctor and move assignment is allowed. + // Move constructor and move assignment is allowed. Cleanable(Cleanable&&); Cleanable& operator=(Cleanable&&); diff --git a/include/rocksdb/compaction_filter.h b/include/rocksdb/compaction_filter.h index 344b1001e8b..29b7e507719 100644 --- a/include/rocksdb/compaction_filter.h +++ b/include/rocksdb/compaction_filter.h @@ -94,8 +94,10 @@ class CompactionFilter { // be used by a single thread that is doing the compaction run, and this // call does not need to be thread-safe. However, multiple filters may be // in existence and operating concurrently. - virtual bool Filter(int level, const Slice& key, const Slice& existing_value, - std::string* new_value, bool* value_changed) const { + virtual bool Filter(int /*level*/, const Slice& /*key*/, + const Slice& /*existing_value*/, + std::string* /*new_value*/, + bool* /*value_changed*/) const { return false; } @@ -108,8 +110,8 @@ class CompactionFilter { // may not realize there is a write conflict and may allow a Transaction to // Commit that should have failed. Instead, it is better to implement any // Merge filtering inside the MergeOperator. - virtual bool FilterMergeOperand(int level, const Slice& key, - const Slice& operand) const { + virtual bool FilterMergeOperand(int /*level*/, const Slice& /*key*/, + const Slice& /*operand*/) const { return false; } @@ -154,7 +156,7 @@ class CompactionFilter { // MergeOperator. 
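A hedged sketch of driving the new PerfContext C bindings declared above, end to end. It assumes an already-opened rocksdb_t* db; rocksdb_perfcontext_destroy is assumed to belong to the same binding set even though it is not shown in this excerpt:

```cpp
#include <stdio.h>
#include <stdlib.h>
#include "rocksdb/c.h"

// Illustrative only: measure per-read counters around a rocksdb_get() call.
void profile_read(rocksdb_t* db, rocksdb_readoptions_t* ro) {
  rocksdb_set_perf_level(rocksdb_enable_time);
  rocksdb_perfcontext_t* ctx = rocksdb_perfcontext_create();
  rocksdb_perfcontext_reset(ctx);

  size_t vlen = 0;
  char* err = NULL;
  char* val = rocksdb_get(db, ro, "key", 3, &vlen, &err);
  free(val);
  free(err);

  uint64_t cmps =
      rocksdb_perfcontext_metric(ctx, rocksdb_user_key_comparison_count);
  char* report = rocksdb_perfcontext_report(ctx, /*exclude_zero_counters=*/1);
  printf("comparisons=%llu\n%s\n", (unsigned long long)cmps, report);
  free(report);                      // report string is heap-allocated
  rocksdb_perfcontext_destroy(ctx);  // assumed counterpart of create()
}
```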
virtual Decision FilterV2(int level, const Slice& key, ValueType value_type, const Slice& existing_value, std::string* new_value, - std::string* skip_until) const { + std::string* /*skip_until*/) const { switch (value_type) { case ValueType::kValue: { bool value_changed = false; diff --git a/include/rocksdb/compaction_job_stats.h b/include/rocksdb/compaction_job_stats.h index ebb04a46bff..e5d8af8bdcb 100644 --- a/include/rocksdb/compaction_job_stats.h +++ b/include/rocksdb/compaction_job_stats.h @@ -72,7 +72,7 @@ struct CompactionJobStats { // Time spent on file fsync. uint64_t file_fsync_nanos; - // Time spent on preparing file write (falocate, etc) + // Time spent on preparing file write (fallocate, etc) uint64_t file_prepare_write_nanos; // 0-terminated strings storing the first 8 bytes of the smallest and diff --git a/include/rocksdb/db.h b/include/rocksdb/db.h index 7e892615358..dc74398c3b3 100644 --- a/include/rocksdb/db.h +++ b/include/rocksdb/db.h @@ -93,8 +93,8 @@ static const int kMinorVersion = __ROCKSDB_MINOR__; // A range of keys struct Range { - Slice start; // Included in the range - Slice limit; // Not included in the range + Slice start; + Slice limit; Range() { } Range(const Slice& s, const Slice& l) : start(s), limit(l) { } @@ -172,9 +172,10 @@ class DB { std::vector<ColumnFamilyHandle*>* handles, DB** dbptr); // Close the DB by releasing resources, closing files etc. This should be - // called before calling the desctructor so that the caller can get back a + // called before calling the destructor so that the caller can get back a // status in case there are any errors. This will not fsync the WAL files. - // If syncing is required, the caller must first call SyncWAL. + // If syncing is required, the caller must first call SyncWAL(), or Write() + // using an empty write batch with WriteOptions.sync=true. // Regardless of the return status, the DB must be freed. If the return // status is NotSupported(), then the DB implementation does cleanup in the // destructor @@ -575,6 +576,10 @@ class DB { // WARNING: may slow down online queries if there are too many files. static const std::string kTotalSstFilesSize; + // "rocksdb.live-sst-files-size" - returns total size (bytes) of all SST + // files belong to the latest LSM tree. + static const std::string kLiveSstFilesSize; + // "rocksdb.base-level" - returns number of level to which L0 data will be // compacted.
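The new property is polled like any other integer property; a small sketch, assuming the usual DB::Properties home for these constants:

```cpp
#include <cinttypes>
#include <cstdio>
#include "rocksdb/db.h"

// Assumes an open DB; prints live SST bytes for the default column family.
void PrintLiveSstSize(rocksdb::DB* db) {
  uint64_t bytes = 0;
  if (db->GetIntProperty(rocksdb::DB::Properties::kLiveSstFilesSize, &bytes)) {
    printf("live SST files: %" PRIu64 " bytes\n", bytes);
  }
}
```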
static const std::string kBaseLevel; @@ -650,6 +655,7 @@ class DB { // "rocksdb.estimate-live-data-size" // "rocksdb.min-log-number-to-keep" // "rocksdb.total-sst-files-size" + // "rocksdb.live-sst-files-size" // "rocksdb.base-level" // "rocksdb.estimate-pending-compaction-bytes" // "rocksdb.num-running-compactions" @@ -808,19 +814,22 @@ class DB { const CompactionOptions& compact_options, ColumnFamilyHandle* column_family, const std::vector<std::string>& input_file_names, - const int output_level, const int output_path_id = -1) = 0; + const int output_level, const int output_path_id = -1, + std::vector<std::string>* const output_file_names = nullptr) = 0; virtual Status CompactFiles( const CompactionOptions& compact_options, const std::vector<std::string>& input_file_names, - const int output_level, const int output_path_id = -1) { + const int output_level, const int output_path_id = -1, + std::vector<std::string>* const output_file_names = nullptr) { return CompactFiles(compact_options, DefaultColumnFamily(), - input_file_names, output_level, output_path_id); + input_file_names, output_level, output_path_id, + output_file_names); } // This function will wait until all currently running background processes // finish. After it returns, no background process will be run until - // UnblockBackgroundWork is called + // ContinueBackgroundWork is called virtual Status PauseBackgroundWork() = 0; virtual Status ContinueBackgroundWork() = 0; @@ -880,7 +889,7 @@ class DB { // Flush the WAL memory buffer to the file. If sync is true, it calls SyncWAL // afterwards. - virtual Status FlushWAL(bool sync) { + virtual Status FlushWAL(bool /*sync*/) { return Status::NotSupported("FlushWAL not implemented"); } // Sync the wal. Note that Write() followed by SyncWAL() is not exactly the @@ -1134,13 +1143,14 @@ class DB { ColumnFamilyHandle* column_family, const Range* range, std::size_t n, TablePropertiesCollection* props) = 0; - virtual Status SuggestCompactRange(ColumnFamilyHandle* column_family, - const Slice* begin, const Slice* end) { + virtual Status SuggestCompactRange(ColumnFamilyHandle* /*column_family*/, + const Slice* /*begin*/, + const Slice* /*end*/) { return Status::NotSupported("SuggestCompactRange() is not implemented."); } - virtual Status PromoteL0(ColumnFamilyHandle* column_family, - int target_level) { + virtual Status PromoteL0(ColumnFamilyHandle* /*column_family*/, + int /*target_level*/) { return Status::NotSupported("PromoteL0() is not implemented."); } diff --git a/include/rocksdb/db_dump_tool.h b/include/rocksdb/db_dump_tool.h index cb9a265f5c8..aeaa3422df7 100644 --- a/include/rocksdb/db_dump_tool.h +++ b/include/rocksdb/db_dump_tool.h @@ -17,7 +17,7 @@ struct DumpOptions { std::string db_path; // File location that will contain dump output std::string dump_location; - // Dont include db information header in the dump + // Don't include db information header in the dump bool anonymous = false; }; diff --git a/include/rocksdb/env.h b/include/rocksdb/env.h index 3f0d4bf6796..54341d47b38 100644 --- a/include/rocksdb/env.h +++ b/include/rocksdb/env.h @@ -181,9 +181,9 @@ class Env { // returns non-OK. // // The returned file will only be accessed by one thread at a time. - virtual Status ReopenWritableFile(const std::string& fname, - unique_ptr<WritableFile>* result, - const EnvOptions& options) { + virtual Status ReopenWritableFile(const std::string& /*fname*/, + unique_ptr<WritableFile>* /*result*/, + const EnvOptions& /*options*/) { return Status::NotSupported(); } @@ -198,9 +198,9 @@ class Env { // *result and returns OK. On failure returns non-OK.
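Returning to the DB::Close()/SyncWAL() guidance earlier in this hunk set, a sketch of the shutdown sequence the revised comment prescribes (error handling condensed for brevity):

```cpp
#include "rocksdb/db.h"
#include "rocksdb/write_batch.h"

// Illustrative: persist the WAL before Close(), which does not fsync it.
rocksdb::Status ShutDown(rocksdb::DB* db) {
  rocksdb::Status s = db->SyncWAL();
  if (!s.ok()) {
    // Equivalent alternative named in the comment: an empty, synced write.
    rocksdb::WriteBatch empty;
    rocksdb::WriteOptions wo;
    wo.sync = true;
    s = db->Write(wo, &empty);
  }
  rocksdb::Status close_s = db->Close();
  delete db;
  return s.ok() ? close_s : s;
}
```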
// // The returned file will only be accessed by one thread at a time. - virtual Status NewRandomRWFile(const std::string& fname, - unique_ptr<RandomRWFile>* result, - const EnvOptions& options) { + virtual Status NewRandomRWFile(const std::string& /*fname*/, + unique_ptr<RandomRWFile>* /*result*/, + const EnvOptions& /*options*/) { return Status::NotSupported("RandomRWFile is not implemented in this Env"); } @@ -268,12 +268,13 @@ class Env { const std::string& target) = 0; // Hard Link file src to target. - virtual Status LinkFile(const std::string& src, const std::string& target) { + virtual Status LinkFile(const std::string& /*src*/, + const std::string& /*target*/) { return Status::NotSupported("LinkFile is not supported for this Env"); } - virtual Status AreFilesSame(const std::string& first, - const std::string& second, bool* res) { + virtual Status AreFilesSame(const std::string& /*first*/, + const std::string& /*second*/, bool* /*res*/) { return Status::NotSupported("AreFilesSame is not supported for this Env"); } @@ -320,11 +321,11 @@ class Env { // registered at the time of Schedule is invoked with arg as a parameter. virtual void Schedule(void (*function)(void* arg), void* arg, Priority pri = LOW, void* tag = nullptr, - void (*unschedFunction)(void* arg) = 0) = 0; + void (*unschedFunction)(void* arg) = nullptr) = 0; // Arrange to remove jobs for given arg from the queue_ if they are not // already scheduled. Caller is expected to have exclusive lock on arg. - virtual int UnSchedule(void* arg, Priority pri) { return 0; } + virtual int UnSchedule(void* /*arg*/, Priority /*pri*/) { return 0; } // Start a new thread, invoking "function(arg)" within the new thread. // When "function(arg)" returns, the thread will be destroyed. @@ -334,7 +335,7 @@ class Env { virtual void WaitForJoin() {} // Get thread pool queue length for specific thread pool. - virtual unsigned int GetThreadPoolQueueLen(Priority pri = LOW) const { + virtual unsigned int GetThreadPoolQueueLen(Priority /*pri*/ = LOW) const { return 0; } @@ -362,7 +363,7 @@ class Env { return NowMicros() * 1000; } - // Sleep/delay the thread for the perscribed number of micro-seconds. + // Sleep/delay the thread for the prescribed number of micro-seconds. virtual void SleepForMicroseconds(int micros) = 0; // Get the current host name. @@ -388,7 +389,7 @@ class Env { virtual void IncBackgroundThreadsIfNeeded(int number, Priority pri) = 0; // Lower IO priority for threads from the specified pool. - virtual void LowerThreadPoolIOPriority(Priority pool = LOW) {} + virtual void LowerThreadPoolIOPriority(Priority /*pool*/ = LOW) {} // Converts seconds-since-Jan-01-1970 to a printable string virtual std::string TimeToString(uint64_t time) = 0; @@ -432,7 +433,7 @@ class Env { const ImmutableDBOptions& db_options) const; // Returns the status of all threads that belong to the current Env. - virtual Status GetThreadList(std::vector<ThreadStatus>* thread_list) { + virtual Status GetThreadList(std::vector<ThreadStatus>* /*thread_list*/) { return Status::NotSupported("Not supported."); } @@ -498,14 +499,14 @@ class SequentialFile { // Remove any kind of caching of data from the offset to offset+length // of this file. If the length is 0, then it refers to the end of file. // If the system is not caching the file contents, then this is a noop.
- virtual Status InvalidateCache(size_t offset, size_t length) { + virtual Status InvalidateCache(size_t /*offset*/, size_t /*length*/) { return Status::NotSupported("InvalidateCache not supported."); } // Positioned Read for direct I/O // If Direct I/O enabled, offset, n, and scratch should be properly aligned - virtual Status PositionedRead(uint64_t offset, size_t n, Slice* result, - char* scratch) { + virtual Status PositionedRead(uint64_t /*offset*/, size_t /*n*/, + Slice* /*result*/, char* /*scratch*/) { return Status::NotSupported(); } }; @@ -531,7 +532,7 @@ class RandomAccessFile { char* scratch) const = 0; // Readahead the file starting from offset by n bytes for caching. - virtual Status Prefetch(uint64_t offset, size_t n) { + virtual Status Prefetch(uint64_t /*offset*/, size_t /*n*/) { return Status::OK(); } @@ -550,14 +551,14 @@ class RandomAccessFile { // a single varint. // // Note: these IDs are only valid for the duration of the process. - virtual size_t GetUniqueId(char* id, size_t max_size) const { + virtual size_t GetUniqueId(char* /*id*/, size_t /*max_size*/) const { return 0; // Default implementation to prevent issues with backwards // compatibility. }; enum AccessPattern { NORMAL, RANDOM, SEQUENTIAL, WILLNEED, DONTNEED }; - virtual void Hint(AccessPattern pattern) {} + virtual void Hint(AccessPattern /*pattern*/) {} // Indicates the upper layers if the current RandomAccessFile implementation // uses direct IO. @@ -570,7 +571,7 @@ class RandomAccessFile { // Remove any kind of caching of data from the offset to offset+length // of this file. If the length is 0, then it refers to the end of file. // If the system is not caching the file contents, then this is a noop. - virtual Status InvalidateCache(size_t offset, size_t length) { + virtual Status InvalidateCache(size_t /*offset*/, size_t /*length*/) { return Status::NotSupported("InvalidateCache not supported."); } }; @@ -621,9 +622,7 @@ class WritableFile { // before closing. It is not always possible to keep track of the file // size due to whole pages writes. The behavior is undefined if called // with other writes to follow. - virtual Status Truncate(uint64_t size) { - return Status::OK(); - } + virtual Status Truncate(uint64_t /*size*/) { return Status::OK(); } virtual Status Close() = 0; virtual Status Flush() = 0; virtual Status Sync() = 0; // sync data @@ -690,7 +689,7 @@ class WritableFile { } // For documentation, refer to RandomAccessFile::GetUniqueId() - virtual size_t GetUniqueId(char* id, size_t max_size) const { + virtual size_t GetUniqueId(char* /*id*/, size_t /*max_size*/) const { return 0; // Default implementation to prevent issues with backwards } @@ -698,7 +697,7 @@ class WritableFile { // of this file. If the length is 0, then it refers to the end of file. // If the system is not caching the file contents, then this is a noop. // This call has no effect on dirty pages in the cache. - virtual Status InvalidateCache(size_t offset, size_t length) { + virtual Status InvalidateCache(size_t /*offset*/, size_t /*length*/) { return Status::NotSupported("InvalidateCache not supported."); } @@ -708,7 +707,9 @@ class WritableFile { // This asks the OS to initiate flushing the cached data to disk, // without waiting for completion. // Default implementation does nothing. 
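Since Append/Close/Flush/Sync are the only pure virtuals in WritableFile, a stub file class stays small; a hypothetical sketch that inherits the no-op defaults shown in these hunks (Truncate, RangeSync, Allocate, InvalidateCache):

```cpp
#include <string>
#include "rocksdb/env.h"

// Hypothetical sink: buffers appends in memory, inheriting the default
// implementations for everything else.
class StringSinkFile : public rocksdb::WritableFile {
 public:
  rocksdb::Status Append(const rocksdb::Slice& data) override {
    contents_.append(data.data(), data.size());
    return rocksdb::Status::OK();
  }
  rocksdb::Status Close() override { return rocksdb::Status::OK(); }
  rocksdb::Status Flush() override { return rocksdb::Status::OK(); }
  rocksdb::Status Sync() override { return rocksdb::Status::OK(); }
  const std::string& contents() const { return contents_; }

 private:
  std::string contents_;
};
```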
- virtual Status RangeSync(uint64_t offset, uint64_t nbytes) { return Status::OK(); } + virtual Status RangeSync(uint64_t /*offset*/, uint64_t /*nbytes*/) { + return Status::OK(); + } // PrepareWrite performs any necessary preparation for a write // before the write actually occurs. This allows for pre-allocation @@ -735,7 +736,7 @@ class WritableFile { } // Pre-allocates space for a file. - virtual Status Allocate(uint64_t offset, uint64_t len) { + virtual Status Allocate(uint64_t /*offset*/, uint64_t /*len*/) { return Status::OK(); } @@ -801,6 +802,10 @@ class Directory { virtual ~Directory() {} // Fsync directory. Can be called concurrently from multiple threads. virtual Status Fsync() = 0; + + virtual size_t GetUniqueId(char* id, size_t max_size) const { + return 0; + } }; enum InfoLogLevel : unsigned char { @@ -1024,7 +1029,7 @@ class EnvWrapper : public Env { Status UnlockFile(FileLock* l) override { return target_->UnlockFile(l); } void Schedule(void (*f)(void* arg), void* a, Priority pri, - void* tag = nullptr, void (*u)(void* arg) = 0) override { + void* tag = nullptr, void (*u)(void* arg) = nullptr) override { return target_->Schedule(f, a, pri, tag, u); } diff --git a/include/rocksdb/env_encryption.h b/include/rocksdb/env_encryption.h index e4c924a4b4c..70dce616a62 100644 --- a/include/rocksdb/env_encryption.h +++ b/include/rocksdb/env_encryption.h @@ -133,7 +133,7 @@ class EncryptionProvider { // GetPrefixLength returns the length of the prefix that is added to every file // and used for storing encryption options. // For optimal performance, the prefix length should be a multiple of - // the a page size. + // the page size. virtual size_t GetPrefixLength() = 0; // CreateNewPrefix initialized an allocated block of prefix memory @@ -165,7 +165,7 @@ class CTREncryptionProvider : public EncryptionProvider { // GetPrefixLength returns the length of the prefix that is added to every file // and used for storing encryption options. // For optimal performance, the prefix length should be a multiple of - // the a page size. + // the page size. 
virtual size_t GetPrefixLength() override; // CreateNewPrefix initialized an allocated block of prefix memory diff --git a/include/rocksdb/filter_policy.h b/include/rocksdb/filter_policy.h index 452d1913eca..4706f38c30a 100644 --- a/include/rocksdb/filter_policy.h +++ b/include/rocksdb/filter_policy.h @@ -50,7 +50,7 @@ class FilterBitsBuilder { #pragma warning(push) #pragma warning(disable : 4702) // unreachable code #endif - virtual int CalculateNumEntry(const uint32_t space) { + virtual int CalculateNumEntry(const uint32_t /*space*/) { #ifndef ROCKSDB_LITE throw std::runtime_error("CalculateNumEntry not Implemented"); #else @@ -122,7 +122,8 @@ class FilterPolicy { // Get the FilterBitsReader, which is ONLY used for full filter block // It contains interface to tell if key can be in filter // The input slice should NOT be deleted by FilterPolicy - virtual FilterBitsReader* GetFilterBitsReader(const Slice& contents) const { + virtual FilterBitsReader* GetFilterBitsReader( + const Slice& /*contents*/) const { return nullptr; } }; diff --git a/include/rocksdb/listener.h b/include/rocksdb/listener.h index c32bd1cb63a..ad2df66f84a 100644 --- a/include/rocksdb/listener.h +++ b/include/rocksdb/listener.h @@ -83,7 +83,7 @@ enum class CompactionReason { }; enum class FlushReason : int { - kUnknown = 0x00, + kOthers = 0x00, kGetLiveFiles = 0x01, kShutDown = 0x02, kExternalFileIngestion = 0x03, @@ -91,7 +91,9 @@ enum class FlushReason : int { kWriteBufferManager = 0x05, kWriteBufferFull = 0x06, kTest = 0x07, - kSuperVersionChange = 0x08, + kDeleteFiles = 0x08, + kAutoCompaction = 0x09, + kManualFlush = 0x0a, }; enum class BackgroundErrorReason { @@ -227,30 +229,6 @@ struct ExternalFileIngestionInfo { TableProperties table_properties; }; -// A call-back function to RocksDB which will be called when the compaction -// iterator is compacting values. It is meant to be returned from -// EventListner::GetCompactionEventListner() at the beginning of compaction -// job. -class CompactionEventListener { - public: - enum CompactionListenerValueType { - kValue, - kMergeOperand, - kDelete, - kSingleDelete, - kRangeDelete, - kBlobIndex, - kInvalid, - }; - - virtual void OnCompaction(int level, const Slice& key, - CompactionListenerValueType value_type, - const Slice& existing_value, - const SequenceNumber& sn, bool is_new) = 0; - - virtual ~CompactionEventListener() = default; -}; - // EventListener class contains a set of call-back functions that will // be called when specific RocksDB event happens such as flush. It can // be used as a building block for developing custom features such as @@ -379,8 +357,8 @@ class EventListener { // returns. Otherwise, RocksDB may be blocked. // @param handle is a pointer to the column family handle to be deleted // which will become a dangling pointer after the deletion. - virtual void OnColumnFamilyHandleDeletionStarted(ColumnFamilyHandle* handle) { - } + virtual void OnColumnFamilyHandleDeletionStarted( + ColumnFamilyHandle* /*handle*/) {} // A call-back function for RocksDB which will be called after an external // file is ingested using IngestExternalFile. @@ -413,12 +391,6 @@ class EventListener { // returns. Otherwise, RocksDB may be blocked. virtual void OnStallConditionsChanged(const WriteStallInfo& /*info*/) {} - // Factory method to return CompactionEventListener. If multiple listeners - // provides CompactionEventListner, only the first one will be used. 
- virtual CompactionEventListener* GetCompactionEventListener() { - return nullptr; - } - virtual ~EventListener() {} }; diff --git a/include/rocksdb/memtablerep.h b/include/rocksdb/memtablerep.h index 1e8f41a4522..4b6e897a6d8 100644 --- a/include/rocksdb/memtablerep.h +++ b/include/rocksdb/memtablerep.h @@ -39,24 +39,34 @@ #include <memory> #include <stdint.h> #include <stdlib.h> +#include <rocksdb/slice.h> namespace rocksdb { class Arena; class Allocator; class LookupKey; -class Slice; class SliceTransform; class Logger; typedef void* KeyHandle; +extern Slice GetLengthPrefixedSlice(const char* data); + class MemTableRep { public: // KeyComparator provides a means to compare keys, which are internal keys // concatenated with values. class KeyComparator { public: + typedef rocksdb::Slice DecodedType; + + virtual DecodedType decode_key(const char* key) const { + // The format of key is frozen and can be treated as a part of the API + // contract. Refer to MemTable::Add for details. + return GetLengthPrefixedSlice(key); + } + // Compare a and b. Return a negative value if a is less than b, 0 if they // are equal, and a positive value if a is greater than b virtual int operator()(const char* prefix_len_key1, @@ -97,7 +107,7 @@ class MemTableRep { // // Currently only skip-list based memtable implement the interface. Other // implementations will fallback to Insert() by default. - virtual void InsertWithHint(KeyHandle handle, void** hint) { + virtual void InsertWithHint(KeyHandle handle, void** /*hint*/) { // Ignore the hint by default. Insert(handle); } @@ -149,8 +159,8 @@ class MemTableRep { virtual void Get(const LookupKey& k, void* callback_args, bool (*callback_func)(void* arg, const char* entry)); - virtual uint64_t ApproximateNumEntries(const Slice& start_ikey, - const Slice& end_key) { + virtual uint64_t ApproximateNumEntries(const Slice& /*start_ikey*/, + const Slice& /*end_key*/) { return 0; } @@ -346,7 +356,7 @@ extern MemTableRepFactory* NewHashLinkListRepFactory( // This factory creates a cuckoo-hashing based mem-table representation. // Cuckoo-hash is a closed-hash strategy, in which all key/value pairs -// are stored in the bucket array itself intead of in some data structures +// are stored in the bucket array itself instead of in some data structures // external to the bucket array. In addition, each key in cuckoo hash // has a constant number of possible buckets in the bucket array. These // two properties together makes cuckoo hash more memory efficient and diff --git a/include/rocksdb/merge_operator.h b/include/rocksdb/merge_operator.h index d263ae88b05..cd7563cff34 100644 --- a/include/rocksdb/merge_operator.h +++ b/include/rocksdb/merge_operator.h @@ -66,11 +66,9 @@ class MergeOperator { // internal corruption. This will be treated as an error by the library. // // Also make use of the *logger for error messages. - virtual bool FullMerge(const Slice& key, - const Slice* existing_value, - const std::deque<std::string>& operand_list, - std::string* new_value, - Logger* logger) const { + virtual bool FullMerge(const Slice& /*key*/, const Slice* /*existing_value*/, + const std::deque<std::string>& /*operand_list*/, + std::string* /*new_value*/, Logger* /*logger*/) const { // deprecated, please use FullMergeV2() assert(false); return false; @@ -89,7 +87,7 @@ class MergeOperator { // The key associated with the merge operation. const Slice& key; // The existing value of the current key, nullptr means that the - // value dont exist. + // value doesn't exist. const Slice* existing_value; // A list of operands to apply.
const std::vector<Slice>& operand_list; @@ -145,9 +143,10 @@ class MergeOperator { // If there is corruption in the data, handle it in the FullMergeV2() function // and return false there. The default implementation of PartialMerge will // always return false. - virtual bool PartialMerge(const Slice& key, const Slice& left_operand, - const Slice& right_operand, std::string* new_value, - Logger* logger) const { + virtual bool PartialMerge(const Slice& /*key*/, const Slice& /*left_operand*/, + const Slice& /*right_operand*/, + std::string* /*new_value*/, + Logger* /*logger*/) const { return false; } @@ -186,9 +185,9 @@ class MergeOperator { // Determines whether the MergeOperator can be called with just a single // merge operand. - // Override and return true for allowing a single operand. FullMergeV2 and - // PartialMerge/PartialMergeMulti should be implemented accordingly to handle - // a single operand. + // Override and return true for allowing a single operand. Both FullMergeV2 + // and PartialMerge/PartialMergeMulti should be overridden and implemented + // correctly to handle a single operand. virtual bool AllowSingleOperand() const { return false; } // Allows to control when to invoke a full merge during Get. @@ -196,7 +195,7 @@ class MergeOperator { // during a point lookup, thereby helping in limiting the number of levels to // read from. // Doesn't help with iterators. - virtual bool ShouldMerge(const std::vector<Slice>& operands) const { + virtual bool ShouldMerge(const std::vector<Slice>& /*operands*/) const { return false; } }; diff --git a/include/rocksdb/rate_limiter.h b/include/rocksdb/rate_limiter.h index 84183803503..a81a3ac9192 100644 --- a/include/rocksdb/rate_limiter.h +++ b/include/rocksdb/rate_limiter.h @@ -45,7 +45,7 @@ class RateLimiter { // Request for token for bytes. If this request can not be satisfied, the call // is blocked. Caller is responsible to make sure // bytes <= GetSingleBurstBytes() - virtual void Request(const int64_t bytes, const Env::IOPriority pri) { + virtual void Request(const int64_t /*bytes*/, const Env::IOPriority /*pri*/) { assert(false); } diff --git a/include/rocksdb/slice.h b/include/rocksdb/slice.h index 4f24c8a2217..76ecce68426 100644 --- a/include/rocksdb/slice.h +++ b/include/rocksdb/slice.h @@ -121,7 +121,7 @@ class Slice { /** * A Slice that can be pinned with some cleanup tasks, which will be run upon * ::Reset() or object destruction, whichever is invoked first. This can be used - * to avoid memcpy by having the PinnsableSlice object referring to the data + * to avoid memcpy by having the PinnableSlice object referring to the data * that is locked in the memory and release them after the data is consumed. */ class PinnableSlice : public Slice, public Cleanable { @@ -177,7 +177,7 @@ class PinnableSlice : public Slice, public Cleanable { } } - void remove_prefix(size_t n) { + void remove_prefix(size_t /*n*/) { assert(0); // Not implemented } diff --git a/include/rocksdb/slice_transform.h b/include/rocksdb/slice_transform.h index fc82bf58456..39999cc6247 100644 --- a/include/rocksdb/slice_transform.h +++ b/include/rocksdb/slice_transform.h @@ -22,7 +22,7 @@ namespace rocksdb { class Slice; /* - * A SliceTranform is a generic pluggable way of transforming one string + * A SliceTransform is a generic pluggable way of transforming one string * to another. Its primary use-case is in configuring rocksdb * to store prefix blooms by setting prefix_extractor in * ColumnFamilyOptions.
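A hedged sketch of a MergeOperator built on FullMergeV2, illustrating the AllowSingleOperand contract clarified above (the operator name and separator are arbitrary choices, not part of the patch):

```cpp
#include <string>
#include <vector>
#include "rocksdb/merge_operator.h"

// Hypothetical operator: appends operands to the existing value with commas.
class AppendOperator : public rocksdb::MergeOperator {
 public:
  bool FullMergeV2(const MergeOperationInput& in,
                   MergeOperationOutput* out) const override {
    out->new_value.clear();
    if (in.existing_value != nullptr) {
      out->new_value.assign(in.existing_value->data(),
                            in.existing_value->size());
    }
    for (const rocksdb::Slice& op : in.operand_list) {
      if (!out->new_value.empty()) out->new_value.push_back(',');
      out->new_value.append(op.data(), op.size());
    }
    return true;
  }
  // Both merge paths above handle a single operand, so this is safe.
  bool AllowSingleOperand() const override { return true; }
  const char* Name() const override { return "AppendOperator"; }
};
// Typical wiring: options.merge_operator = std::make_shared<AppendOperator>();
```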
@@ -58,7 +58,7 @@ class SliceTransform { virtual bool InDomain(const Slice& key) const = 0; // This is currently not used and remains here for backward compatibility. - virtual bool InRange(const Slice& dst) const { return false; } + virtual bool InRange(const Slice& /*dst*/) const { return false; } // Transform(s)=Transform(`prefix`) for any s with `prefix` as a prefix. // @@ -72,7 +72,7 @@ class SliceTransform { // by setting ReadOptions.total_order_seek = true. // // Here is an example: Suppose we implement a slice transform that returns - // the first part of the string after spliting it using delimiter ",": + // the first part of the string after splitting it using delimiter ",": // 1. SameResultWhenAppended("abc,") should return true. If applying prefix // bloom filter using it, all slices matching "abc:.*" will be extracted // to "abc,", so any SST file or memtable containing any of those key @@ -83,7 +83,7 @@ class SliceTransform { // "abcd,e", the file can be filtered out and the key will be invisible. // // i.e., an implementation always returning false is safe. - virtual bool SameResultWhenAppended(const Slice& prefix) const { + virtual bool SameResultWhenAppended(const Slice& /*prefix*/) const { return false; } }; diff --git a/include/rocksdb/sst_file_manager.h b/include/rocksdb/sst_file_manager.h index cb626b1a6c5..c0e109f1eb2 100644 --- a/include/rocksdb/sst_file_manager.h +++ b/include/rocksdb/sst_file_manager.h @@ -35,12 +35,22 @@ class SstFileManager { // thread-safe. virtual void SetMaxAllowedSpaceUsage(uint64_t max_allowed_space) = 0; + // Set the amount of buffer room each compaction should be able to leave. + // In other words, at its maximum disk space consumption, the compaction + // should still leave compaction_buffer_size available on the disk so that + // other background functions may continue, such as logging and flushing. + virtual void SetCompactionBufferSize(uint64_t compaction_buffer_size) = 0; + // Return true if the total size of SST files exceeded the maximum allowed // space usage. // // thread-safe. virtual bool IsMaxAllowedSpaceReached() = 0; + // Returns true if the total size of SST files as well as estimated size + // of ongoing compactions exceeds the maximums allowed space usage. + virtual bool IsMaxAllowedSpaceReachedIncludingCompactions() = 0; + // Return the total size of all tracked files. // thread-safe virtual uint64_t GetTotalSize() = 0; @@ -86,10 +96,14 @@ class SstFileManager { // @param max_trash_db_ratio: If the trash size constitutes for more than this // fraction of the total DB size we will start deleting new files passed to // DeleteScheduler immediately +// @param bytes_max_delete_chunk: if a single file is larger than delete chunk, +// ftruncate the file by this size each time, rather than dropping the whole +// file. 0 means to always delete the whole file. 
extern SstFileManager* NewSstFileManager( Env* env, std::shared_ptr<Logger> info_log = nullptr, std::string trash_dir = "", int64_t rate_bytes_per_sec = 0, bool delete_existing_trash = true, Status* status = nullptr, - double max_trash_db_ratio = 0.25); + double max_trash_db_ratio = 0.25, + uint64_t bytes_max_delete_chunk = 64 * 1024 * 1024); } // namespace rocksdb diff --git a/include/rocksdb/sst_file_writer.h b/include/rocksdb/sst_file_writer.h index c7a874ab11c..10c580c9cd1 100644 --- a/include/rocksdb/sst_file_writer.h +++ b/include/rocksdb/sst_file_writer.h @@ -67,7 +67,7 @@ class SstFileWriter { // be ingested into this column_family, note that passing nullptr means that // the column_family is unknown. // If invalidate_page_cache is set to true, SstFileWriter will give the OS a - // hint that this file pages is not needed everytime we write 1MB to the file. + // hint that this file pages is not needed every time we write 1MB to the file. // To use the rate limiter an io_priority smaller than IO_TOTAL can be passed. SstFileWriter(const EnvOptions& env_options, const Options& options, ColumnFamilyHandle* column_family = nullptr, diff --git a/include/rocksdb/statistics.h b/include/rocksdb/statistics.h index 2dd0db5ca6b..7fa287f0bd1 100644 --- a/include/rocksdb/statistics.h +++ b/include/rocksdb/statistics.h @@ -108,6 +108,8 @@ enum Tickers : uint32_t { COMPACTION_RANGE_DEL_DROP_OBSOLETE, // all keys in range were deleted. // Deletions obsoleted before bottom level due to file gap optimization. COMPACTION_OPTIMIZED_DEL_DROP_OBSOLETE, + // If a compaction was cancelled in sfm to prevent ENOSPC + COMPACTION_CANCELLED, // Number of keys written to the database via the Put and Write call's NUMBER_KEYS_WRITTEN, @@ -265,7 +267,16 @@ enum Tickers : uint32_t { BLOB_DB_BLOB_FILE_SYNCED, // # of blob index evicted from base DB by BlobDB compaction filter because // of expiration. - BLOB_DB_BLOB_INDEX_EXPIRED, + BLOB_DB_BLOB_INDEX_EXPIRED_COUNT, + // size of blob index evicted from base DB by BlobDB compaction filter + // because of expiration. + BLOB_DB_BLOB_INDEX_EXPIRED_SIZE, + // # of blob index evicted from base DB by BlobDB compaction filter because + // of corresponding file deleted. + BLOB_DB_BLOB_INDEX_EVICTED_COUNT, + // size of blob index evicted from base DB by BlobDB compaction filter + // because of corresponding file deleted. + BLOB_DB_BLOB_INDEX_EVICTED_SIZE, // # of blob files being garbage collected. BLOB_DB_GC_NUM_FILES, // # of blob files generated by garbage collection.
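As a usage sketch only (the sizes and rate below are arbitrary), the extended factory signature combines with the new SetCompactionBufferSize() like this:

```cpp
#include <memory>

#include "rocksdb/env.h"
#include "rocksdb/options.h"
#include "rocksdb/sst_file_manager.h"

rocksdb::Options MakeOptionsWithSpaceLimit() {
  rocksdb::Status status;
  std::shared_ptr<rocksdb::SstFileManager> sfm(rocksdb::NewSstFileManager(
      rocksdb::Env::Default(), nullptr /* info_log */, "" /* trash_dir */,
      0 /* rate_bytes_per_sec */, true /* delete_existing_trash */, &status,
      0.25 /* max_trash_db_ratio */,
      64 * 1024 * 1024 /* bytes_max_delete_chunk: ftruncate step */));
  sfm->SetMaxAllowedSpaceUsage(1024ull * 1024 * 1024);  // 1 GB space cap
  sfm->SetCompactionBufferSize(128ull * 1024 * 1024);   // 128 MB headroom
  rocksdb::Options options;
  options.sst_file_manager = sfm;
  return options;
}
```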
@@ -338,6 +349,8 @@ const std::vector<std::pair<Tickers, std::string>> TickersNameMap = { "rocksdb.compaction.range_del.drop.obsolete"}, {COMPACTION_OPTIMIZED_DEL_DROP_OBSOLETE, "rocksdb.compaction.optimized.del.drop.obsolete"}, + {COMPACTION_CANCELLED, + "rocksdb.compaction.cancelled"}, {NUMBER_KEYS_WRITTEN, "rocksdb.number.keys.written"}, {NUMBER_KEYS_READ, "rocksdb.number.keys.read"}, {NUMBER_KEYS_UPDATED, "rocksdb.number.keys.updated"}, @@ -417,7 +430,12 @@ const std::vector<std::pair<Tickers, std::string>> TickersNameMap = { {BLOB_DB_BLOB_FILE_BYTES_WRITTEN, "rocksdb.blobdb.blob.file.bytes.written"}, {BLOB_DB_BLOB_FILE_BYTES_READ, "rocksdb.blobdb.blob.file.bytes.read"}, {BLOB_DB_BLOB_FILE_SYNCED, "rocksdb.blobdb.blob.file.synced"}, - {BLOB_DB_BLOB_INDEX_EXPIRED, "rocksdb.blobdb.blob.index.expired"}, + {BLOB_DB_BLOB_INDEX_EXPIRED_COUNT, + "rocksdb.blobdb.blob.index.expired.count"}, + {BLOB_DB_BLOB_INDEX_EXPIRED_SIZE, "rocksdb.blobdb.blob.index.expired.size"}, + {BLOB_DB_BLOB_INDEX_EVICTED_COUNT, + "rocksdb.blobdb.blob.index.evicted.count"}, + {BLOB_DB_BLOB_INDEX_EVICTED_SIZE, "rocksdb.blobdb.blob.index.evicted.size"}, {BLOB_DB_GC_NUM_FILES, "rocksdb.blobdb.gc.num.files"}, {BLOB_DB_GC_NUM_NEW_FILES, "rocksdb.blobdb.gc.num.new.files"}, {BLOB_DB_GC_FAILURES, "rocksdb.blobdb.gc.failures"}, @@ -597,7 +615,7 @@ class Statistics { virtual uint64_t getTickerCount(uint32_t tickerType) const = 0; virtual void histogramData(uint32_t type, HistogramData* const data) const = 0; - virtual std::string getHistogramString(uint32_t type) const { return ""; } + virtual std::string getHistogramString(uint32_t /*type*/) const { return ""; } virtual void recordTick(uint32_t tickerType, uint64_t count = 0) = 0; virtual void setTickerCount(uint32_t tickerType, uint64_t count) = 0; virtual uint64_t getAndResetTickerCount(uint32_t tickerType) = 0; diff --git a/include/rocksdb/status.h b/include/rocksdb/status.h index 709f3837098..3573d37e320 100644 --- a/include/rocksdb/status.h +++ b/include/rocksdb/status.h @@ -58,7 +58,8 @@ class Status { kAborted = 10, kBusy = 11, kExpired = 12, - kTryAgain = 13 + kTryAgain = 13, + kCompactionTooLarge = 14 }; Code code() const { return code_; } @@ -162,6 +163,14 @@ class Status { return Status(kTryAgain, msg, msg2); } + static Status CompactionTooLarge(SubCode msg = kNone) { + return Status(kCompactionTooLarge, msg); + } + static Status CompactionTooLarge(const Slice& msg, + const Slice& msg2 = Slice()) { + return Status(kCompactionTooLarge, msg, msg2); + } + static Status NoSpace() { return Status(kIOError, kNoSpace); } static Status NoSpace(const Slice& msg, const Slice& msg2 = Slice()) { return Status(kIOError, kNoSpace, msg, msg2); @@ -221,6 +230,9 @@ class Status { // re-attempted. bool IsTryAgain() const { return code() == kTryAgain; } + // Returns true iff the status indicates the proposed compaction is too large + bool IsCompactionTooLarge() const { return code() == kCompactionTooLarge; } + // Returns true iff the status indicates a NoSpace error // This is caused by an I/O error returning the specific "out of space" // error condition. Stricto sensu, an NoSpace error is an I/O error diff --git a/include/rocksdb/table.h b/include/rocksdb/table.h index 8703a0eb584..a86c763c0d2 100644 --- a/include/rocksdb/table.h +++ b/include/rocksdb/table.h @@ -156,7 +156,7 @@ struct BlockBasedTableOptions { // well. // TODO(myabandeh): remove the note above once the limitation is lifted // Use partitioned full filters for each SST file. This option is - // incompatibile with block-based filters. + // incompatible with block-based filters.
bool partition_filters = false; // Use delta encoding to compress keys in blocks. @@ -214,7 +214,7 @@ struct BlockBasedTableOptions { // encode compressed blocks with LZ4, BZip2 and Zlib compression. If you // don't plan to run RocksDB before version 3.10, you should probably use // this. - // This option only affects newly written tables. When reading exising tables, + // This option only affects newly written tables. When reading existing tables, // the information about version is read from the footer. uint32_t format_version = 2; @@ -222,11 +222,14 @@ struct BlockBasedTableOptions { // false will avoid the overhead of decompression if index blocks are evicted // and read back bool enable_index_compression = true; + + // Align data blocks on lesser of page size and block size + bool block_align = false; }; // Table Properties that are specific to block-based table properties. struct BlockBasedTablePropertyNames { - // value of this propertis is a fixed int32 number. + // value of this properties is a fixed int32 number. static const std::string kIndexType; // value is "1" for true and "0" for false. static const std::string kWholeKeyFiltering; @@ -319,7 +322,7 @@ struct PlainTableOptions { }; // -- Plain Table with prefix-only seek -// For this factory, you need to set Options.prefix_extrator properly to make it +// For this factory, you need to set Options.prefix_extractor properly to make it // work. Look-up will starts with prefix hash lookup for key prefix. Inside the // hash bucket found, a binary search is executed for hash conflicts. Finally, // a linear search is used. @@ -382,7 +385,7 @@ struct CuckooTableOptions { bool identity_as_first_hash = false; // If this option is set to true, module is used during hash calculation. // This often yields better space efficiency at the cost of performance. - // If this optino is set to false, # of entries in table is constrained to be + // If this option is set to false, # of entries in table is constrained to be // power of two, and bit and is used to calculate hash, which is faster in // general. bool use_module_hash = true; @@ -467,8 +470,8 @@ class TableFactory { // RocksDB prints configurations at DB Open(). virtual std::string GetPrintableTableOptions() const = 0; - virtual Status GetOptionString(std::string* opt_string, - const std::string& delimiter) const { + virtual Status GetOptionString(std::string* /*opt_string*/, + const std::string& /*delimiter*/) const { return Status::NotSupported( "The table factory doesn't implement GetOptionString()."); } diff --git a/include/rocksdb/table_properties.h b/include/rocksdb/table_properties.h index 0ab4d53110b..4a525591bc0 100644 --- a/include/rocksdb/table_properties.h +++ b/include/rocksdb/table_properties.h @@ -15,7 +15,7 @@ namespace rocksdb { // Other than basic table properties, each table may also have the user // collected properties. // The value of the user-collected properties are encoded as raw bytes -- -// users have to interprete these values by themselves. +// users have to interpret these values by themselves. // Note: To do prefix seek/scan in `UserCollectedProperties`, you can do // something similar to: // @@ -59,7 +59,7 @@ extern const std::string kRangeDelBlock; // `TablePropertiesCollector` provides the mechanism for users to collect // their own properties that they are interested in. This class is essentially // a collection of callback functions that will be invoked during table -// building. It is construced with TablePropertiesCollectorFactory. 
The methods +// building. It is constructed with TablePropertiesCollectorFactory. The methods // don't need to be thread-safe, as we will create exactly one // TablePropertiesCollector object per table and then call it sequentially class TablePropertiesCollector { diff --git a/include/rocksdb/utilities/geo_db.h b/include/rocksdb/utilities/geo_db.h index 408774c5990..ec3cbdf265a 100644 --- a/include/rocksdb/utilities/geo_db.h +++ b/include/rocksdb/utilities/geo_db.h @@ -80,7 +80,7 @@ class GeoDB : public StackableDB { // GeoDB owns the pointer `DB* db` now. You should not delete it or // use it after the invocation of GeoDB // GeoDB(DB* db, const GeoDBOptions& options) : StackableDB(db) {} - GeoDB(DB* db, const GeoDBOptions& options) : StackableDB(db) {} + GeoDB(DB* db, const GeoDBOptions& /*options*/) : StackableDB(db) {} virtual ~GeoDB() {} // Insert a new object into the location database. The object is diff --git a/include/rocksdb/utilities/optimistic_transaction_db.h b/include/rocksdb/utilities/optimistic_transaction_db.h index 02917ff5830..518bc610c6d 100644 --- a/include/rocksdb/utilities/optimistic_transaction_db.h +++ b/include/rocksdb/utilities/optimistic_transaction_db.h @@ -62,7 +62,7 @@ class OptimisticTransactionDB { protected: // To Create an OptimisticTransactionDB, call Open() - explicit OptimisticTransactionDB(DB* db) {} + explicit OptimisticTransactionDB(DB* /*db*/) {} OptimisticTransactionDB() {} private: diff --git a/include/rocksdb/utilities/stackable_db.h b/include/rocksdb/utilities/stackable_db.h index b84c6341e22..721203f7ce4 100644 --- a/include/rocksdb/utilities/stackable_db.h +++ b/include/rocksdb/utilities/stackable_db.h @@ -219,10 +219,11 @@ class StackableDB : public DB { const CompactionOptions& compact_options, ColumnFamilyHandle* column_family, const std::vector<std::string>& input_file_names, - const int output_level, const int output_path_id = -1) override { + const int output_level, const int output_path_id = -1, + std::vector<std::string>* const output_file_names = nullptr) override { return db_->CompactFiles( compact_options, column_family, input_file_names, - output_level, output_path_id); + output_level, output_path_id, output_file_names); } virtual Status PauseBackgroundWork() override { diff --git a/include/rocksdb/utilities/transaction.h b/include/rocksdb/utilities/transaction.h index 11317540aee..d87dfb500cb 100644 --- a/include/rocksdb/utilities/transaction.h +++ b/include/rocksdb/utilities/transaction.h @@ -118,7 +118,7 @@ class Transaction { // longer be valid and should be discarded after a call to ClearSnapshot(). virtual void ClearSnapshot() = 0; - // Prepare the current transation for 2PC + // Prepare the current transaction for 2PC virtual Status Prepare() = 0; // Write all batched keys to the db atomically.
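A sketch of calling the widened CompactFiles() follows; the database handle, input list, and output level are placeholders, and the kCompactionTooLarge code from status.h above is what an SstFileManager-cancelled compaction surfaces as:

```cpp
#include <string>
#include <vector>

#include "rocksdb/db.h"
#include "rocksdb/options.h"

rocksdb::Status CompactAndListOutputs(
    rocksdb::DB* db, const std::vector<std::string>& input_file_names) {
  std::vector<std::string> output_file_names;
  rocksdb::Status s = db->CompactFiles(
      rocksdb::CompactionOptions(), input_file_names, 1 /* output_level */,
      -1 /* output_path_id */, &output_file_names);
  if (s.IsCompactionTooLarge()) {
    // Cancelled to preserve the SstFileManager's space limits; the new
    // COMPACTION_CANCELLED ticker is incremented in this case.
    return s;
  }
  // On success, output_file_names lists the SST files that were produced.
  return s;
}
```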
@@ -169,8 +169,8 @@ class Transaction { ColumnFamilyHandle* column_family, const Slice& key, std::string* value) = 0; - // An overload of the the above method that receives a PinnableSlice - // For backward compatiblity a default implementation is provided + // An overload of the above method that receives a PinnableSlice + // For backward compatibility a default implementation is provided virtual Status Get(const ReadOptions& options, ColumnFamilyHandle* column_family, const Slice& key, PinnableSlice* pinnable_val) { @@ -230,12 +230,12 @@ class Transaction { const Slice& key, std::string* value, bool exclusive = true) = 0; - // An overload of the the above method that receives a PinnableSlice - // For backward compatiblity a default implementation is provided + // An overload of the above method that receives a PinnableSlice + // For backward compatibility a default implementation is provided virtual Status GetForUpdate(const ReadOptions& options, - ColumnFamilyHandle* column_family, + ColumnFamilyHandle* /*column_family*/, const Slice& key, PinnableSlice* pinnable_val, - bool exclusive = true) { + bool /*exclusive*/ = true) { if (pinnable_val == nullptr) { std::string* null_str = nullptr; return GetForUpdate(options, key, null_str); @@ -368,7 +368,7 @@ class Transaction { virtual void EnableIndexing() = 0; // Returns the number of distinct Keys being tracked by this transaction. - // If this transaction was created by a TransactinDB, this is the number of + // If this transaction was created by a TransactionDB, this is the number of // keys that are currently locked by this transaction. // If this transaction was created by an OptimisticTransactionDB, this is the // number of keys that need to be checked for conflicts at commit time. @@ -440,8 +440,8 @@ class Transaction { virtual bool IsDeadlockDetect() const { return false; } - virtual std::vector<TransactionID> GetWaitingTxns(uint32_t* column_family_id, - std::string* key) const { + virtual std::vector<TransactionID> GetWaitingTxns( + uint32_t* /*column_family_id*/, std::string* /*key*/) const { assert(false); return std::vector<TransactionID>(); } @@ -469,7 +469,7 @@ class Transaction { uint64_t GetId() { return id_; } protected: - explicit Transaction(const TransactionDB* db) {} + explicit Transaction(const TransactionDB* /*db*/) {} Transaction() : log_number_(0), txn_state_(STARTED) {} // the log in which the prepared section for this txn resides diff --git a/include/rocksdb/utilities/transaction_db.h b/include/rocksdb/utilities/transaction_db.h index f3566928691..1482b246662 100644 --- a/include/rocksdb/utilities/transaction_db.h +++ b/include/rocksdb/utilities/transaction_db.h @@ -75,7 +75,7 @@ struct TransactionDBOptions { // expiration set. int64_t default_lock_timeout = 1000; // 1 second - // If set, the TransactionDB will use this implemenation of a mutex and + // If set, the TransactionDB will use this implementation of a mutex and // condition variable for all transaction locking instead of the default // mutex/condvar implementation. std::shared_ptr<TransactionDBMutexFactory> custom_mutex_factory; @@ -100,7 +100,7 @@ struct TransactionOptions { // If set, it states that the CommitTimeWriteBatch represents the latest state // of the application and meant to be used later during recovery. It enables // an optimization to postpone updating the memtable with CommitTimeWriteBatch - // to only SwithcMamtable or recovery. + // to only SwitchMemtable or recovery.
bool use_only_the_last_commit_time_batch_for_recovery = false; // TODO(agiardullo): TransactionDB does not yet support comparators that allow @@ -131,15 +131,15 @@ struct TransactionOptions { }; // The per-write optimizations that do not involve transactions. TransactionDB -// implemenation might or might not make use of the specified optimizations. +// implementation might or might not make use of the specified optimizations. struct TransactionDBWriteOptimizations { - // If it is true it means that the applicatinn guratnees that the + // If it is true it means that the application guarantees that the // key-set in the write batch do not conflict with any concurrent transaction // and hence the concurrency control mechanism could be skipped for this // write. bool skip_concurrency_control = false; // If true, the application guarantees that there is no duplicate <column - // family, key> in the write batch and any employed mechanism to hanlde + // family, key> in the write batch and any employed mechanism to handle // duplicate keys could be skipped. bool skip_duplicate_key_check = false; }; diff --git a/include/rocksdb/utilities/write_batch_with_index.h b/include/rocksdb/utilities/write_batch_with_index.h index a0c9635da84..5fd7700f191 100644 --- a/include/rocksdb/utilities/write_batch_with_index.h +++ b/include/rocksdb/utilities/write_batch_with_index.h @@ -188,7 +188,7 @@ class WriteBatchWithIndex : public WriteBatchBase { Status GetFromBatchAndDB(DB* db, const ReadOptions& read_options, const Slice& key, std::string* value); - // An overload of the the above method that receives a PinnableSlice + // An overload of the above method that receives a PinnableSlice Status GetFromBatchAndDB(DB* db, const ReadOptions& read_options, const Slice& key, PinnableSlice* value); @@ -196,7 +196,7 @@ class WriteBatchWithIndex : public WriteBatchBase { ColumnFamilyHandle* column_family, const Slice& key, std::string* value); - // An overload of the the above method that receives a PinnableSlice + // An overload of the above method that receives a PinnableSlice Status GetFromBatchAndDB(DB* db, const ReadOptions& read_options, ColumnFamilyHandle* column_family, const Slice& key, PinnableSlice* value); @@ -227,6 +227,7 @@ class WriteBatchWithIndex : public WriteBatchBase { void SetMaxBytes(size_t max_bytes) override; private: + friend class PessimisticTransactionDB; friend class WritePreparedTxn; friend class WriteBatchWithIndex_SubBatchCnt_Test; // Returns the number of sub-batches inside the write batch. A sub-batch diff --git a/include/rocksdb/version.h b/include/rocksdb/version.h index 20b82afbe60..1f114089135 100644 --- a/include/rocksdb/version.h +++ b/include/rocksdb/version.h @@ -5,7 +5,7 @@ #pragma once #define ROCKSDB_MAJOR 5 -#define ROCKSDB_MINOR 10 +#define ROCKSDB_MINOR 13 #define ROCKSDB_PATCH 0 // Do not use these. We made the mistake of declaring macros starting with diff --git a/include/rocksdb/wal_filter.h b/include/rocksdb/wal_filter.h index 686fa499893..a22dca92377 100644 --- a/include/rocksdb/wal_filter.h +++ b/include/rocksdb/wal_filter.h @@ -44,8 +44,8 @@ class WalFilter { // @params cf_name_id_map column_family_name to column_family_id map virtual void ColumnFamilyLogNumberMap( - const std::map<uint32_t, uint64_t>& cf_lognumber_map, - const std::map<std::string, uint32_t>& cf_name_id_map) {} + const std::map<uint32_t, uint64_t>& /*cf_lognumber_map*/, + const std::map<std::string, uint32_t>& /*cf_name_id_map*/) {} // LogRecord is invoked for each log record encountered for all the logs // during replay on logs on recovery.
This method can be used to: @@ -75,11 +75,9 @@ class WalFilter { // @returns Processing option for the current record. // Please see WalProcessingOption enum above for // details. - virtual WalProcessingOption LogRecordFound(unsigned long long log_number, - const std::string& log_file_name, - const WriteBatch& batch, - WriteBatch* new_batch, - bool* batch_changed) { + virtual WalProcessingOption LogRecordFound( + unsigned long long /*log_number*/, const std::string& /*log_file_name*/, + const WriteBatch& batch, WriteBatch* new_batch, bool* batch_changed) { // Default implementation falls back to older function for compatibility return LogRecord(batch, new_batch, batch_changed); } @@ -87,9 +85,9 @@ class WalFilter { // Please see the comments for LogRecord above. This function is for // compatibility only and contains a subset of parameters. // New code should use the function above. - virtual WalProcessingOption LogRecord(const WriteBatch& batch, - WriteBatch* new_batch, - bool* batch_changed) const { + virtual WalProcessingOption LogRecord(const WriteBatch& /*batch*/, + WriteBatch* /*new_batch*/, + bool* /*batch_changed*/) const { return WalProcessingOption::kContinueProcessing; } diff --git a/include/rocksdb/write_batch.h b/include/rocksdb/write_batch.h index dd738616360..38bdcfc6905 100644 --- a/include/rocksdb/write_batch.h +++ b/include/rocksdb/write_batch.h @@ -217,8 +217,9 @@ class WriteBatch : public WriteBatchBase { } virtual void SingleDelete(const Slice& /*key*/) {} - virtual Status DeleteRangeCF(uint32_t column_family_id, - const Slice& begin_key, const Slice& end_key) { + virtual Status DeleteRangeCF(uint32_t /*column_family_id*/, + const Slice& /*begin_key*/, + const Slice& /*end_key*/) { return Status::InvalidArgument("DeleteRangeCF not implemented"); } @@ -246,20 +247,20 @@ class WriteBatch : public WriteBatchBase { return Status::InvalidArgument("MarkBeginPrepare() handler not defined."); } - virtual Status MarkEndPrepare(const Slice& xid) { + virtual Status MarkEndPrepare(const Slice& /*xid*/) { return Status::InvalidArgument("MarkEndPrepare() handler not defined."); } - virtual Status MarkNoop(bool empty_batch) { + virtual Status MarkNoop(bool /*empty_batch*/) { return Status::InvalidArgument("MarkNoop() handler not defined."); } - virtual Status MarkRollback(const Slice& xid) { + virtual Status MarkRollback(const Slice& /*xid*/) { return Status::InvalidArgument( "MarkRollbackPrepare() handler not defined."); } - virtual Status MarkCommit(const Slice& xid) { + virtual Status MarkCommit(const Slice& /*xid*/) { return Status::InvalidArgument("MarkCommit() handler not defined."); } @@ -333,7 +334,7 @@ class WriteBatch : public WriteBatchBase { friend class WriteBatchInternal; friend class LocalSavePoint; // TODO(myabandeh): this is needed for a hack to collapse the write batch and - // remove duplicate keys. Remove it when the hack is replaced with a propper + // remove duplicate keys. Remove it when the hack is replaced with a proper // solution. friend class WriteBatchWithIndex; SavePoints* save_points_; diff --git a/include/rocksdb/write_batch_base.h b/include/rocksdb/write_batch_base.h index 3e6d011bd59..f91332ee2f6 100644 --- a/include/rocksdb/write_batch_base.h +++ b/include/rocksdb/write_batch_base.h @@ -20,7 +20,7 @@ struct SliceParts; // Abstract base class that defines the basic interface for a write batch. // See WriteBatch for a basic implementation and WrithBatchWithIndex for an -// indexed implemenation. +// indexed implementation. 
class WriteBatchBase { public: virtual ~WriteBatchBase() {} diff --git a/java/CMakeLists.txt b/java/CMakeLists.txt index c80d9220ebb..6dd828bebf8 100644 --- a/java/CMakeLists.txt +++ b/java/CMakeLists.txt @@ -24,6 +24,7 @@ set(JNI_NATIVE_SOURCES rocksjni/lru_cache.cc rocksjni/memtablejni.cc rocksjni/merge_operator.cc + rocksjni/native_comparator_wrapper_test.cc rocksjni/options.cc rocksjni/options_util.cc rocksjni/ratelimiterjni.cc @@ -87,6 +88,8 @@ set(NATIVE_JAVA_CLASSES org.rocksdb.LRUCache org.rocksdb.MemTableConfig org.rocksdb.MergeOperator + org.rocksdb.NativeComparatorWrapper + org.rocksdb.NativeComparatorWrapperTest.NativeStringComparatorWrapper org.rocksdb.NativeLibraryLoader org.rocksdb.Options org.rocksdb.OptionsUtil @@ -122,6 +125,8 @@ set(NATIVE_JAVA_CLASSES org.rocksdb.WriteBatchTestInternalHelper org.rocksdb.WriteBatchWithIndex org.rocksdb.WriteOptions + org.rocksdb.util.CapturingWriteBatchHandler + org.rocksdb.util.WriteBatchGetter ) include(FindJava) @@ -177,6 +182,7 @@ add_jar( src/main/java/org/rocksdb/CompactionStyle.java src/main/java/org/rocksdb/Comparator.java src/main/java/org/rocksdb/ComparatorOptions.java + src/main/java/org/rocksdb/ComparatorType.java src/main/java/org/rocksdb/CompressionOptions.java src/main/java/org/rocksdb/CompressionType.java src/main/java/org/rocksdb/DBOptions.java @@ -203,6 +209,7 @@ add_jar( src/main/java/org/rocksdb/MergeOperator.java src/main/java/org/rocksdb/MutableColumnFamilyOptions.java src/main/java/org/rocksdb/MutableColumnFamilyOptionsInterface.java + src/main/java/org/rocksdb/NativeComparatorWrapper.java src/main/java/org/rocksdb/NativeLibraryLoader.java src/main/java/org/rocksdb/Options.java src/main/java/org/rocksdb/OptionsUtil.java @@ -244,11 +251,14 @@ add_jar( src/main/java/org/rocksdb/WriteOptions.java src/test/java/org/rocksdb/BackupEngineTest.java src/test/java/org/rocksdb/IngestExternalFileOptionsTest.java + src/test/java/org/rocksdb/NativeComparatorWrapperTest.java src/test/java/org/rocksdb/PlatformRandomHelper.java src/test/java/org/rocksdb/RocksDBExceptionTest.java src/test/java/org/rocksdb/RocksMemoryResource.java src/test/java/org/rocksdb/SnapshotTest.java src/test/java/org/rocksdb/WriteBatchTest.java + src/test/java/org/rocksdb/util/CapturingWriteBatchHandler.java + src/test/java/org/rocksdb/util/WriteBatchGetter.java INCLUDE_JARS ${JAVA_TESTCLASSPATH} ) diff --git a/java/Makefile b/java/Makefile index 11c6c807e39..ec553cd3a2f 100644 --- a/java/Makefile +++ b/java/Makefile @@ -1,6 +1,7 @@ NATIVE_JAVA_CLASSES = org.rocksdb.AbstractCompactionFilter\ org.rocksdb.AbstractCompactionFilterFactory\ org.rocksdb.AbstractSlice\ + org.rocksdb.AbstractTransactionNotifier\ org.rocksdb.BackupEngine\ org.rocksdb.BackupableDBOptions\ org.rocksdb.BlockBasedTableConfig\ @@ -29,6 +30,9 @@ NATIVE_JAVA_CLASSES = org.rocksdb.AbstractCompactionFilter\ org.rocksdb.Logger\ org.rocksdb.LRUCache\ org.rocksdb.MergeOperator\ + org.rocksdb.NativeComparatorWrapper\ + org.rocksdb.OptimisticTransactionDB\ + org.rocksdb.OptimisticTransactionOptions\ org.rocksdb.Options\ org.rocksdb.OptionsUtil\ org.rocksdb.PlainTableConfig\ @@ -45,6 +49,10 @@ NATIVE_JAVA_CLASSES = org.rocksdb.AbstractCompactionFilter\ org.rocksdb.Slice\ org.rocksdb.SstFileWriter\ org.rocksdb.Statistics\ + org.rocksdb.Transaction\ + org.rocksdb.TransactionDB\ + org.rocksdb.TransactionDBOptions\ + org.rocksdb.TransactionOptions\ org.rocksdb.TransactionLogIterator\ org.rocksdb.TtlDB\ org.rocksdb.VectorMemTableConfig\ @@ -57,6 +65,7 @@ NATIVE_JAVA_CLASSES = 
org.rocksdb.AbstractCompactionFilter\ org.rocksdb.WBWIRocksIterator NATIVE_JAVA_TEST_CLASSES = org.rocksdb.RocksDBExceptionTest\ + org.rocksdb.NativeComparatorWrapperTest.NativeStringComparatorWrapper\ org.rocksdb.WriteBatchTest\ org.rocksdb.WriteBatchTestInternalHelper @@ -104,7 +113,11 @@ JAVA_TESTS = org.rocksdb.BackupableDBOptionsTest\ org.rocksdb.MergeTest\ org.rocksdb.MixedOptionsTest\ org.rocksdb.MutableColumnFamilyOptionsTest\ + org.rocksdb.NativeComparatorWrapperTest\ org.rocksdb.NativeLibraryLoaderTest\ + org.rocksdb.OptimisticTransactionTest\ + org.rocksdb.OptimisticTransactionDBTest\ + org.rocksdb.OptimisticTransactionOptionsTest\ org.rocksdb.OptionsUtilTest\ org.rocksdb.OptionsTest\ org.rocksdb.PlainTableConfigTest\ @@ -120,6 +133,10 @@ JAVA_TESTS = org.rocksdb.BackupableDBOptionsTest\ org.rocksdb.SliceTest\ org.rocksdb.SnapshotTest\ org.rocksdb.SstFileWriterTest\ + org.rocksdb.TransactionTest\ + org.rocksdb.TransactionDBTest\ + org.rocksdb.TransactionOptionsTest\ + org.rocksdb.TransactionDBOptionsTest\ org.rocksdb.TransactionLogIteratorTest\ org.rocksdb.TtlDBTest\ org.rocksdb.StatisticsTest\ @@ -209,6 +226,20 @@ column_family_sample: java java $(JAVA_ARGS) -Djava.library.path=target -cp $(MAIN_CLASSES):$(SAMPLES_MAIN_CLASSES) RocksDBColumnFamilySample /tmp/rocksdbjni $(AM_V_at)@rm -rf /tmp/rocksdbjni +transaction_sample: java + $(AM_V_GEN)mkdir -p $(SAMPLES_MAIN_CLASSES) + $(AM_V_at)javac -cp $(MAIN_CLASSES) -d $(SAMPLES_MAIN_CLASSES) $(SAMPLES_MAIN_SRC)/TransactionSample.java + $(AM_V_at)@rm -rf /tmp/rocksdbjni + java -ea -Xcheck:jni -Djava.library.path=target -cp $(MAIN_CLASSES):$(SAMPLES_MAIN_CLASSES) TransactionSample /tmp/rocksdbjni + $(AM_V_at)@rm -rf /tmp/rocksdbjni + +optimistic_transaction_sample: java + $(AM_V_GEN)mkdir -p $(SAMPLES_MAIN_CLASSES) + $(AM_V_at)javac -cp $(MAIN_CLASSES) -d $(SAMPLES_MAIN_CLASSES) $(SAMPLES_MAIN_SRC)/OptimisticTransactionSample.java + $(AM_V_at)@rm -rf /tmp/rocksdbjni + java -ea -Xcheck:jni -Djava.library.path=target -cp $(MAIN_CLASSES):$(SAMPLES_MAIN_CLASSES) OptimisticTransactionSample /tmp/rocksdbjni + $(AM_V_at)@rm -rf /tmp/rocksdbjni + resolve_test_deps: test -d "$(JAVA_TEST_LIBDIR)" || mkdir -p "$(JAVA_TEST_LIBDIR)" test -s "$(JAVA_JUNIT_JAR)" || cp $(MVN_LOCAL)/junit/junit/4.12/junit-4.12.jar $(JAVA_TEST_LIBDIR) || curl -k -L -o $(JAVA_JUNIT_JAR) $(SEARCH_REPO_URL)junit/junit/4.12/junit-4.12.jar diff --git a/java/rocksjni/columnfamilyhandle.cc b/java/rocksjni/columnfamilyhandle.cc index 6e40a7e010b..c3274ed98fe 100644 --- a/java/rocksjni/columnfamilyhandle.cc +++ b/java/rocksjni/columnfamilyhandle.cc @@ -3,8 +3,8 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). // -// This file implements the "bridge" between Java and C++ and enables -// calling c++ rocksdb::Iterator methods from Java side. +// This file implements the "bridge" between Java and C++ for +// rocksdb::ColumnFamilyHandle. 
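All of the rocksjni bridge files that follow use the same handle idiom: Java stores the C++ object's address in a long field, and every native method round-trips it through reinterpret_cast. A stripped-down sketch of the pattern (the Example class and method names are made up for illustration):

```cpp
#include <jni.h>

#include "rocksdb/options.h"

// Creation: allocate the C++ object and hand its address to Java as a jlong.
extern "C" jlong Java_org_rocksdb_Example_create(JNIEnv*, jclass) {
  auto* opts = new rocksdb::Options();
  return reinterpret_cast<jlong>(opts);  // ownership now tracked by Java
}

// Disposal: Java passes the handle back and the native side deletes it.
extern "C" void Java_org_rocksdb_Example_disposeInternal(JNIEnv*, jobject,
                                                         jlong jhandle) {
  delete reinterpret_cast<rocksdb::Options*>(jhandle);
}
```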
#include <jni.h> #include <stdio.h> @@ -13,14 +13,56 @@ #include "include/org_rocksdb_ColumnFamilyHandle.h" #include "rocksjni/portal.h" +/* + * Class: org_rocksdb_ColumnFamilyHandle + * Method: getName + * Signature: (J)[B + */ +jbyteArray Java_org_rocksdb_ColumnFamilyHandle_getName( + JNIEnv* env, jobject jobj, jlong jhandle) { + auto* cfh = reinterpret_cast<rocksdb::ColumnFamilyHandle*>(jhandle); + std::string cf_name = cfh->GetName(); + return rocksdb::JniUtil::copyBytes(env, cf_name); +} + +/* +* Class: org_rocksdb_ColumnFamilyHandle +* Method: getID +* Signature: (J)I +*/ +jint Java_org_rocksdb_ColumnFamilyHandle_getID( + JNIEnv* env, jobject jobj, jlong jhandle) { + auto* cfh = reinterpret_cast<rocksdb::ColumnFamilyHandle*>(jhandle); + const int32_t id = cfh->GetID(); + return static_cast<jint>(id); +} + +/* + * Class: org_rocksdb_ColumnFamilyHandle + * Method: getDescriptor + * Signature: (J)Lorg/rocksdb/ColumnFamilyDescriptor; + */ +jobject Java_org_rocksdb_ColumnFamilyHandle_getDescriptor( + JNIEnv* env, jobject jobj, jlong jhandle) { + auto* cfh = reinterpret_cast<rocksdb::ColumnFamilyHandle*>(jhandle); + rocksdb::ColumnFamilyDescriptor desc; + rocksdb::Status s = cfh->GetDescriptor(&desc); + if (s.ok()) { + return rocksdb::ColumnFamilyDescriptorJni::construct(env, &desc); + } else { + rocksdb::RocksDBExceptionJni::ThrowNew(env, s); + return nullptr; + } +} + /* * Class: org_rocksdb_ColumnFamilyHandle * Method: disposeInternal * Signature: (J)V */ void Java_org_rocksdb_ColumnFamilyHandle_disposeInternal( - JNIEnv* env, jobject jobj, jlong handle) { - auto* cfh = reinterpret_cast<rocksdb::ColumnFamilyHandle*>(handle); + JNIEnv* env, jobject jobj, jlong jhandle) { + auto* cfh = reinterpret_cast<rocksdb::ColumnFamilyHandle*>(jhandle); assert(cfh != nullptr); delete cfh; } diff --git a/java/rocksjni/comparator.cc b/java/rocksjni/comparator.cc index d4f02b28d6d..c5e590b7adf 100644 --- a/java/rocksjni/comparator.cc +++ b/java/rocksjni/comparator.cc @@ -14,6 +14,7 @@ #include "include/org_rocksdb_Comparator.h" #include "include/org_rocksdb_DirectComparator.h" +#include "include/org_rocksdb_NativeComparatorWrapper.h" #include "rocksjni/comparatorjnicallback.h" #include "rocksjni/portal.h" @@ -49,4 +50,16 @@ jlong Java_org_rocksdb_DirectComparator_createNewDirectComparator0( new rocksdb::DirectComparatorJniCallback(env, jobj, copt); return reinterpret_cast<jlong>(c); } + +/* + * Class: org_rocksdb_NativeComparatorWrapper + * Method: disposeInternal + * Signature: (J)V + */ +void Java_org_rocksdb_NativeComparatorWrapper_disposeInternal( + JNIEnv* env, jobject jobj, jlong jcomparator_handle) { + auto* comparator = + reinterpret_cast<rocksdb::Comparator*>(jcomparator_handle); + delete comparator; +} // diff --git a/java/rocksjni/merge_operator.cc b/java/rocksjni/merge_operator.cc index 1b94382ef04..01779081796 100644 --- a/java/rocksjni/merge_operator.cc +++ b/java/rocksjni/merge_operator.cc @@ -26,12 +26,12 @@ /* * Class: org_rocksdb_StringAppendOperator * Method: newSharedStringAppendOperator - * Signature: ()J + * Signature: (C)J */ jlong Java_org_rocksdb_StringAppendOperator_newSharedStringAppendOperator -(JNIEnv* env, jclass jclazz) { +(JNIEnv* env, jclass jclazz, jchar jdelim) { auto* sptr_string_append_op = new std::shared_ptr<rocksdb::MergeOperator>( - rocksdb::MergeOperators::CreateFromStringId("stringappend")); + rocksdb::MergeOperators::CreateStringAppendOperator((char) jdelim)); return reinterpret_cast<jlong>(sptr_string_append_op); } diff --git a/java/rocksjni/native_comparator_wrapper_test.cc b/java/rocksjni/native_comparator_wrapper_test.cc new file mode 100644 index 00000000000..6f4c64020eb --- /dev/null +++ b/java/rocksjni/native_comparator_wrapper_test.cc @@ -0,0 +1,50 @@ +// Copyright (c)
2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include <jni.h> +#include <string> + +#include "rocksdb/comparator.h" +#include "rocksdb/slice.h" + +#include "include/org_rocksdb_NativeComparatorWrapperTest_NativeStringComparatorWrapper.h" + +namespace rocksdb { + +class NativeComparatorWrapperTestStringComparator + : public Comparator { + + const char* Name() const { + return "NativeComparatorWrapperTestStringComparator"; + } + + int Compare( + const Slice& a, const Slice& b) const { + return a.ToString().compare(b.ToString()); + } + + void FindShortestSeparator( + std::string* start, const Slice& limit) const { + return; + } + + void FindShortSuccessor( + std::string* key) const { + return; + } +}; +} // end of rocksdb namespace + +/* + * Class: org_rocksdb_NativeComparatorWrapperTest_NativeStringComparatorWrapper + * Method: newStringComparator + * Signature: ()J + */ +jlong Java_org_rocksdb_NativeComparatorWrapperTest_00024NativeStringComparatorWrapper_newStringComparator( + JNIEnv* env , jobject jobj) { + auto* comparator = + new rocksdb::NativeComparatorWrapperTestStringComparator(); + return reinterpret_cast<jlong>(comparator); +} diff --git a/java/rocksjni/optimistic_transaction_db.cc b/java/rocksjni/optimistic_transaction_db.cc new file mode 100644 index 00000000000..3381c78f259 --- /dev/null +++ b/java/rocksjni/optimistic_transaction_db.cc @@ -0,0 +1,267 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// This file implements the "bridge" between Java and C++ +// for rocksdb::TransactionDB.
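The C++ API this new bridge wraps looks roughly like the following sketch (the path, key, and value are placeholders):

```cpp
#include <string>

#include "rocksdb/options.h"
#include "rocksdb/utilities/optimistic_transaction_db.h"
#include "rocksdb/utilities/transaction.h"

rocksdb::Status RunOptimisticTxn(const std::string& path) {
  rocksdb::Options options;
  options.create_if_missing = true;
  rocksdb::OptimisticTransactionDB* otdb = nullptr;
  rocksdb::Status s =
      rocksdb::OptimisticTransactionDB::Open(options, path, &otdb);
  if (!s.ok()) return s;

  rocksdb::Transaction* txn =
      otdb->BeginTransaction(rocksdb::WriteOptions());
  txn->Put("key", "value");
  s = txn->Commit();  // conflicts are detected here, not at Put() time
  delete txn;
  delete otdb;
  return s;
}
```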
+ +#include <jni.h> + +#include "include/org_rocksdb_OptimisticTransactionDB.h" + +#include "rocksdb/options.h" +#include "rocksdb/utilities/optimistic_transaction_db.h" +#include "rocksdb/utilities/transaction.h" + +#include "rocksjni/portal.h" + /* * Class: org_rocksdb_OptimisticTransactionDB * Method: open * Signature: (JLjava/lang/String;)J */ +jlong Java_org_rocksdb_OptimisticTransactionDB_open__JLjava_lang_String_2( + JNIEnv* env, jclass jcls, jlong joptions_handle, jstring jdb_path) { + const char* db_path = env->GetStringUTFChars(jdb_path, nullptr); + if (db_path == nullptr) { + // exception thrown: OutOfMemoryError + return 0; + } + + auto* options = reinterpret_cast<rocksdb::Options*>(joptions_handle); + rocksdb::OptimisticTransactionDB* otdb = nullptr; + rocksdb::Status s = + rocksdb::OptimisticTransactionDB::Open(*options, db_path, &otdb); + env->ReleaseStringUTFChars(jdb_path, db_path); + + if (s.ok()) { + return reinterpret_cast<jlong>(otdb); + } else { + rocksdb::RocksDBExceptionJni::ThrowNew(env, s); + return 0; + } +} + /* * Class: org_rocksdb_OptimisticTransactionDB * Method: open * Signature: (JLjava/lang/String;[[B[J)[J */ +jlongArray Java_org_rocksdb_OptimisticTransactionDB_open__JLjava_lang_String_2_3_3B_3J( + JNIEnv* env, jclass jcls, jlong jdb_options_handle, jstring jdb_path, + jobjectArray jcolumn_names, jlongArray jcolumn_options_handles) { + const char* db_path = env->GetStringUTFChars(jdb_path, nullptr); + if (db_path == nullptr) { + // exception thrown: OutOfMemoryError + return nullptr; + } + + std::vector<rocksdb::ColumnFamilyDescriptor> column_families; + const jsize len_cols = env->GetArrayLength(jcolumn_names); + if (len_cols > 0) { + if (env->EnsureLocalCapacity(len_cols) != 0) { + // out of memory + env->ReleaseStringUTFChars(jdb_path, db_path); + return nullptr; + } + + jlong* jco = + env->GetLongArrayElements(jcolumn_options_handles, nullptr); + if(jco == nullptr) { + // exception thrown: OutOfMemoryError + env->ReleaseStringUTFChars(jdb_path, db_path); + return nullptr; + } + + for (int i = 0; i < len_cols; i++) { + const jobject jcn = env->GetObjectArrayElement(jcolumn_names, i); + if (env->ExceptionCheck()) { + // exception thrown: ArrayIndexOutOfBoundsException + env->ReleaseLongArrayElements(jcolumn_options_handles, jco, JNI_ABORT); + env->ReleaseStringUTFChars(jdb_path, db_path); + return nullptr; + } + + const jbyteArray jcn_ba = reinterpret_cast<jbyteArray>(jcn); + const jsize jcf_name_len = env->GetArrayLength(jcn_ba); + if (env->EnsureLocalCapacity(jcf_name_len) != 0) { + // out of memory + env->DeleteLocalRef(jcn); + env->ReleaseLongArrayElements(jcolumn_options_handles, jco, JNI_ABORT); + env->ReleaseStringUTFChars(jdb_path, db_path); + return nullptr; + } + + jbyte* jcf_name = env->GetByteArrayElements(jcn_ba, nullptr); + if (jcf_name == nullptr) { + // exception thrown: OutOfMemoryError + env->DeleteLocalRef(jcn); + env->ReleaseLongArrayElements(jcolumn_options_handles, jco, JNI_ABORT); + env->ReleaseStringUTFChars(jdb_path, db_path); + return nullptr; + } + + const std::string cf_name(reinterpret_cast<char*>(jcf_name), jcf_name_len); + const rocksdb::ColumnFamilyOptions* cf_options = + reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jco[i]); + column_families.push_back( + rocksdb::ColumnFamilyDescriptor(cf_name, *cf_options)); + + env->ReleaseByteArrayElements(jcn_ba, jcf_name, JNI_ABORT); + env->DeleteLocalRef(jcn); + } + env->ReleaseLongArrayElements(jcolumn_options_handles, jco, JNI_ABORT); + } + + auto* db_options = reinterpret_cast<rocksdb::DBOptions*>(jdb_options_handle); + std::vector<rocksdb::ColumnFamilyHandle*> handles; + rocksdb::OptimisticTransactionDB* otdb = nullptr; +
const rocksdb::Status s = rocksdb::OptimisticTransactionDB::Open(*db_options, + db_path, column_families, &handles, &otdb); + + env->ReleaseStringUTFChars(jdb_path, db_path); + + // check if open operation was successful + if (s.ok()) { + const jsize resultsLen = 1 + len_cols; // db handle + column family handles + std::unique_ptr<jlong[]> results = + std::unique_ptr<jlong[]>(new jlong[resultsLen]); + results[0] = reinterpret_cast<jlong>(otdb); + for (int i = 1; i <= len_cols; i++) { + results[i] = reinterpret_cast<jlong>(handles[i - 1]); + } + + jlongArray jresults = env->NewLongArray(resultsLen); + if (jresults == nullptr) { + // exception thrown: OutOfMemoryError + return nullptr; + } + env->SetLongArrayRegion(jresults, 0, resultsLen, results.get()); + if (env->ExceptionCheck()) { + // exception thrown: ArrayIndexOutOfBoundsException + return nullptr; + } + return jresults; + } + + rocksdb::RocksDBExceptionJni::ThrowNew(env, s); + return nullptr; +} + /* * Class: org_rocksdb_OptimisticTransactionDB * Method: beginTransaction * Signature: (JJ)J */ +jlong Java_org_rocksdb_OptimisticTransactionDB_beginTransaction__JJ( + JNIEnv* env, jobject jobj, jlong jhandle, jlong jwrite_options_handle) { + auto* optimistic_txn_db = + reinterpret_cast<rocksdb::OptimisticTransactionDB*>(jhandle); + auto* write_options = + reinterpret_cast<rocksdb::WriteOptions*>(jwrite_options_handle); + rocksdb::Transaction* txn = + optimistic_txn_db->BeginTransaction(*write_options); + return reinterpret_cast<jlong>(txn); +} + /* * Class: org_rocksdb_OptimisticTransactionDB * Method: beginTransaction * Signature: (JJJ)J */ +jlong Java_org_rocksdb_OptimisticTransactionDB_beginTransaction__JJJ( + JNIEnv* env, jobject jobj, jlong jhandle, jlong jwrite_options_handle, + jlong joptimistic_txn_options_handle) { + auto* optimistic_txn_db = + reinterpret_cast<rocksdb::OptimisticTransactionDB*>(jhandle); + auto* write_options = + reinterpret_cast<rocksdb::WriteOptions*>(jwrite_options_handle); + auto* optimistic_txn_options = + reinterpret_cast<rocksdb::OptimisticTransactionOptions*>( + joptimistic_txn_options_handle); + rocksdb::Transaction* txn = + optimistic_txn_db->BeginTransaction(*write_options, + *optimistic_txn_options); + return reinterpret_cast<jlong>(txn); +} + /* * Class: org_rocksdb_OptimisticTransactionDB * Method: beginTransaction_withOld * Signature: (JJJ)J */ +jlong Java_org_rocksdb_OptimisticTransactionDB_beginTransaction_1withOld__JJJ( + JNIEnv* env, jobject jobj, jlong jhandle, jlong jwrite_options_handle, + jlong jold_txn_handle) { + auto* optimistic_txn_db = + reinterpret_cast<rocksdb::OptimisticTransactionDB*>(jhandle); + auto* write_options = + reinterpret_cast<rocksdb::WriteOptions*>(jwrite_options_handle); + auto* old_txn = + reinterpret_cast<rocksdb::Transaction*>( + jold_txn_handle); + rocksdb::OptimisticTransactionOptions optimistic_txn_options; + rocksdb::Transaction* txn = + optimistic_txn_db->BeginTransaction(*write_options, + optimistic_txn_options, old_txn); + + // RocksJava relies on the assumption that + // we do not allocate a new Transaction object + // when providing an old_optimistic_txn + assert(txn == old_txn); + + return reinterpret_cast<jlong>(txn); +} + /* * Class: org_rocksdb_OptimisticTransactionDB * Method: beginTransaction_withOld * Signature: (JJJJ)J */ +jlong Java_org_rocksdb_OptimisticTransactionDB_beginTransaction_1withOld__JJJJ( + JNIEnv* env, jobject jobj, jlong jhandle, jlong jwrite_options_handle, + jlong joptimistic_txn_options_handle, jlong jold_txn_handle) { + auto* optimistic_txn_db = + reinterpret_cast<rocksdb::OptimisticTransactionDB*>(jhandle); + auto* write_options = + reinterpret_cast<rocksdb::WriteOptions*>(jwrite_options_handle); + auto* optimistic_txn_options = + reinterpret_cast<rocksdb::OptimisticTransactionOptions*>( + joptimistic_txn_options_handle); + auto* old_txn = + reinterpret_cast<rocksdb::Transaction*>(
jold_txn_handle); + rocksdb::Transaction* txn = + optimistic_txn_db->BeginTransaction(*write_options, + *optimistic_txn_options, old_txn); + + // RocksJava relies on the assumption that + // we do not allocate a new Transaction object + // when providing an old_optimistic_txn + assert(txn == old_txn); + + return reinterpret_cast<jlong>(txn); +} + /* * Class: org_rocksdb_OptimisticTransactionDB * Method: getBaseDB * Signature: (J)J */ +jlong Java_org_rocksdb_OptimisticTransactionDB_getBaseDB( + JNIEnv* env, jobject jobj, jlong jhandle) { + auto* optimistic_txn_db = + reinterpret_cast<rocksdb::OptimisticTransactionDB*>(jhandle); + return reinterpret_cast<jlong>(optimistic_txn_db->GetBaseDB()); +} + /* * Class: org_rocksdb_OptimisticTransactionDB * Method: disposeInternal * Signature: (J)V */ +void Java_org_rocksdb_OptimisticTransactionDB_disposeInternal(JNIEnv* env, + jobject jobj, jlong jhandle) { + delete reinterpret_cast<rocksdb::OptimisticTransactionDB*>(jhandle); +} diff --git a/java/rocksjni/optimistic_transaction_options.cc b/java/rocksjni/optimistic_transaction_options.cc new file mode 100644 index 00000000000..4c666e6ac16 --- /dev/null +++ b/java/rocksjni/optimistic_transaction_options.cc @@ -0,0 +1,72 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// This file implements the "bridge" between Java and C++ +// for rocksdb::OptimisticTransactionOptions. + +#include <jni.h> + +#include "include/org_rocksdb_OptimisticTransactionOptions.h" + +#include "rocksdb/comparator.h" +#include "rocksdb/utilities/optimistic_transaction_db.h" + /* * Class: org_rocksdb_OptimisticTransactionOptions * Method: newOptimisticTransactionOptions * Signature: ()J */ +jlong Java_org_rocksdb_OptimisticTransactionOptions_newOptimisticTransactionOptions( + JNIEnv* env, jclass jcls) { + rocksdb::OptimisticTransactionOptions* opts = + new rocksdb::OptimisticTransactionOptions(); + return reinterpret_cast<jlong>(opts); +} + /* * Class: org_rocksdb_OptimisticTransactionOptions * Method: isSetSnapshot * Signature: (J)Z */ +jboolean Java_org_rocksdb_OptimisticTransactionOptions_isSetSnapshot( + JNIEnv* env, jobject jobj, jlong jhandle) { + auto* opts = + reinterpret_cast<rocksdb::OptimisticTransactionOptions*>(jhandle); + return opts->set_snapshot; +} + /* * Class: org_rocksdb_OptimisticTransactionOptions * Method: setSetSnapshot * Signature: (JZ)V */ +void Java_org_rocksdb_OptimisticTransactionOptions_setSetSnapshot(JNIEnv* env, + jobject jobj, jlong jhandle, jboolean jset_snapshot) { + auto* opts = + reinterpret_cast<rocksdb::OptimisticTransactionOptions*>(jhandle); + opts->set_snapshot = jset_snapshot; +} + /* * Class: org_rocksdb_OptimisticTransactionOptions * Method: setComparator * Signature: (JJ)V */ +void Java_org_rocksdb_OptimisticTransactionOptions_setComparator( + JNIEnv* env, jobject jobj, jlong jhandle, jlong jcomparator_handle) { + auto* opts = + reinterpret_cast<rocksdb::OptimisticTransactionOptions*>(jhandle); + opts->cmp = reinterpret_cast<const rocksdb::Comparator*>(jcomparator_handle); +} + /* * Class: org_rocksdb_OptimisticTransactionOptions * Method: disposeInternal * Signature: (J)V */ +void Java_org_rocksdb_OptimisticTransactionOptions_disposeInternal(JNIEnv* env, + jobject jobj, jlong jhandle) { + delete reinterpret_cast<rocksdb::OptimisticTransactionOptions*>(jhandle); +} diff --git a/java/rocksjni/options.cc b/java/rocksjni/options.cc index 8a55275a6ff..13fd1988cf2 100644 --- a/java/rocksjni/options.cc +++ b/java/rocksjni/options.cc @@ -158,19 +158,33 @@ void
Java_org_rocksdb_Options_setComparatorHandle__JI( /* * Class: org_rocksdb_Options * Method: setComparatorHandle - * Signature: (JJZ)V + * Signature: (JJB)V */ -void Java_org_rocksdb_Options_setComparatorHandle__JJZ( +void Java_org_rocksdb_Options_setComparatorHandle__JJB( JNIEnv* env, jobject jobj, jlong jopt_handle, jlong jcomparator_handle, - jboolean is_direct) { - auto* opt = reinterpret_cast<rocksdb::Options*>(jopt_handle); - if(is_direct) { - opt->comparator = - reinterpret_cast<rocksdb::DirectComparatorJniCallback*>(jcomparator_handle); - } else { - opt->comparator = - reinterpret_cast<rocksdb::ComparatorJniCallback*>(jcomparator_handle); + jbyte jcomparator_type) { + rocksdb::Comparator *comparator = nullptr; + switch(jcomparator_type) { + // JAVA_COMPARATOR + case 0x0: + comparator = + reinterpret_cast<rocksdb::ComparatorJniCallback*>(jcomparator_handle); + break; + + // JAVA_DIRECT_COMPARATOR + case 0x1: + comparator = + reinterpret_cast<rocksdb::DirectComparatorJniCallback*>(jcomparator_handle); + break; + + // JAVA_NATIVE_COMPARATOR_WRAPPER + case 0x2: + comparator = + reinterpret_cast<rocksdb::Comparator*>(jcomparator_handle); + break; } + auto* opt = reinterpret_cast<rocksdb::Options*>(jopt_handle); + opt->comparator = comparator; } /* @@ -2984,19 +2998,33 @@ void Java_org_rocksdb_ColumnFamilyOptions_setComparatorHandle__JI( /* * Class: org_rocksdb_ColumnFamilyOptions * Method: setComparatorHandle - * Signature: (JJZ)V + * Signature: (JJB)V */ -void Java_org_rocksdb_ColumnFamilyOptions_setComparatorHandle__JJZ( +void Java_org_rocksdb_ColumnFamilyOptions_setComparatorHandle__JJB( JNIEnv* env, jobject jobj, jlong jopt_handle, jlong jcomparator_handle, - jboolean is_direct) { - auto* opt = reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jopt_handle); - if(is_direct) { - opt->comparator = - reinterpret_cast<rocksdb::DirectComparatorJniCallback*>(jcomparator_handle); - } else { - opt->comparator = - reinterpret_cast<rocksdb::ComparatorJniCallback*>(jcomparator_handle); + jbyte jcomparator_type) { + rocksdb::Comparator *comparator = nullptr; + switch(jcomparator_type) { + // JAVA_COMPARATOR + case 0x0: + comparator = + reinterpret_cast<rocksdb::ComparatorJniCallback*>(jcomparator_handle); + break; + + // JAVA_DIRECT_COMPARATOR + case 0x1: + comparator = + reinterpret_cast<rocksdb::DirectComparatorJniCallback*>(jcomparator_handle); + break; + + // JAVA_NATIVE_COMPARATOR_WRAPPER + case 0x2: + comparator = + reinterpret_cast<rocksdb::Comparator*>(jcomparator_handle); + break; } + auto* opt = reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jopt_handle); + opt->comparator = comparator; } /* diff --git a/java/rocksjni/portal.h b/java/rocksjni/portal.h index 522c374acd3..912fc3578cd 100644 --- a/java/rocksjni/portal.h +++ b/java/rocksjni/portal.h @@ -14,9 +14,11 @@ #include #include #include +#include #include #include #include +#include #include #include "rocksdb/db.h" @@ -24,10 +26,12 @@ #include "rocksdb/rate_limiter.h" #include "rocksdb/status.h" #include "rocksdb/utilities/backupable_db.h" +#include "rocksdb/utilities/transaction_db.h" #include "rocksdb/utilities/write_batch_with_index.h" #include "rocksjni/compaction_filter_factory_jnicallback.h" #include "rocksjni/comparatorjnicallback.h" #include "rocksjni/loggerjnicallback.h" +#include "rocksjni/transaction_notifier_jnicallback.h" #include "rocksjni/writebatchhandlerjnicallback.h" // Remove macro on windows @@ -171,6 +175,107 @@ class RocksDBJni : public RocksDBNativeClass { } }; +// The portal class for org.rocksdb.Status.Code +class CodeJni : public JavaClass { + public: + /** + * Get the Java Class org.rocksdb.Status.Code + * + * @param env A pointer to the Java environment + * + * @return The Java Class or nullptr if one of the + * ClassFormatError, ClassCircularityError, NoClassDefFoundError, + * OutOfMemoryError or ExceptionInInitializerError exceptions is thrown + */ + static jclass getJClass(JNIEnv* env) { + return JavaClass::getJClass(env,
"org/rocksdb/Status$Code"); + } + + /** + * Get the Java Method: Status.Code#getValue + * + * @param env A pointer to the Java environment + * + * @return The Java Method ID or nullptr if the class or method id could not + * be retieved + */ + static jmethodID getValueMethod(JNIEnv* env) { + jclass jclazz = getJClass(env); + if(jclazz == nullptr) { + // exception occurred accessing class + return nullptr; + } + + static jmethodID mid = + env->GetMethodID(jclazz, "getValue", "()b"); + assert(mid != nullptr); + return mid; + } +}; + +// The portal class for org.rocksdb.Status.SubCode +class SubCodeJni : public JavaClass { + public: + /** + * Get the Java Class org.rocksdb.Status.SubCode + * + * @param env A pointer to the Java environment + * + * @return The Java Class or nullptr if one of the + * ClassFormatError, ClassCircularityError, NoClassDefFoundError, + * OutOfMemoryError or ExceptionInInitializerError exceptions is thrown + */ + static jclass getJClass(JNIEnv* env) { + return JavaClass::getJClass(env, "org/rocksdb/Status$SubCode"); + } + + /** + * Get the Java Method: Status.SubCode#getValue + * + * @param env A pointer to the Java environment + * + * @return The Java Method ID or nullptr if the class or method id could not + * be retieved + */ + static jmethodID getValueMethod(JNIEnv* env) { + jclass jclazz = getJClass(env); + if(jclazz == nullptr) { + // exception occurred accessing class + return nullptr; + } + + static jmethodID mid = + env->GetMethodID(jclazz, "getValue", "()b"); + assert(mid != nullptr); + return mid; + } + + static rocksdb::Status::SubCode toCppSubCode(const jbyte jsub_code) { + switch (jsub_code) { + case 0x0: + return rocksdb::Status::SubCode::kNone; + case 0x1: + return rocksdb::Status::SubCode::kMutexTimeout; + case 0x2: + return rocksdb::Status::SubCode::kLockTimeout; + case 0x3: + return rocksdb::Status::SubCode::kLockLimit; + case 0x4: + return rocksdb::Status::SubCode::kNoSpace; + case 0x5: + return rocksdb::Status::SubCode::kDeadlock; + case 0x6: + return rocksdb::Status::SubCode::kStaleFile; + case 0x7: + return rocksdb::Status::SubCode::kMemoryLimit; + + case 0x7F: + default: + return rocksdb::Status::SubCode::kNone; + } + } +}; + // The portal class for org.rocksdb.Status class StatusJni : public RocksDBNativeClass { public: @@ -187,6 +292,69 @@ class StatusJni : public RocksDBNativeClass { return RocksDBNativeClass::getJClass(env, "org/rocksdb/Status"); } + /** + * Get the Java Method: Status#getCode + * + * @param env A pointer to the Java environment + * + * @return The Java Method ID or nullptr if the class or method id could not + * be retieved + */ + static jmethodID getCodeMethod(JNIEnv* env) { + jclass jclazz = getJClass(env); + if(jclazz == nullptr) { + // exception occurred accessing class + return nullptr; + } + + static jmethodID mid = + env->GetMethodID(jclazz, "getCode", "()Lorg/rocksdb/Status$Code;"); + assert(mid != nullptr); + return mid; + } + + /** + * Get the Java Method: Status#getSubCode + * + * @param env A pointer to the Java environment + * + * @return The Java Method ID or nullptr if the class or method id could not + * be retieved + */ + static jmethodID getSubCodeMethod(JNIEnv* env) { + jclass jclazz = getJClass(env); + if(jclazz == nullptr) { + // exception occurred accessing class + return nullptr; + } + + static jmethodID mid = + env->GetMethodID(jclazz, "getSubCode", "()Lorg/rocksdb/Status$SubCode;"); + assert(mid != nullptr); + return mid; + } + + /** + * Get the Java Method: Status#getState + * + * @param env A 
pointer to the Java environment + * + * @return The Java Method ID or nullptr if the class or method id could not + * be retieved + */ + static jmethodID getStateMethod(JNIEnv* env) { + jclass jclazz = getJClass(env); + if(jclazz == nullptr) { + // exception occurred accessing class + return nullptr; + } + + static jmethodID mid = + env->GetMethodID(jclazz, "getState", "()Ljava/lang/String;"); + assert(mid != nullptr); + return mid; + } + /** * Create a new Java org.rocksdb.Status object with the same properties as * the provided C++ rocksdb::Status object @@ -303,6 +471,158 @@ class StatusJni : public RocksDBNativeClass { return 0x7F; // undefined } } + + // Returns the equivalent rocksdb::Status for the Java org.rocksdb.Status + static std::unique_ptr<rocksdb::Status> toCppStatus(JNIEnv* env, const jobject jstatus) { + jmethodID mid_code = getCodeMethod(env); + if (mid_code == nullptr) { + // exception occurred + return nullptr; + } + jobject jcode = env->CallObjectMethod(jstatus, mid_code); + if (env->ExceptionCheck()) { + // exception occurred + return nullptr; + } + + jmethodID mid_code_value = rocksdb::CodeJni::getValueMethod(env); + if (mid_code_value == nullptr) { + // exception occurred + return nullptr; + } + jbyte jcode_value = env->CallByteMethod(jcode, mid_code_value); + if (env->ExceptionCheck()) { + // exception occurred + if (jcode != nullptr) { + env->DeleteLocalRef(jcode); + } + return nullptr; + } + + jmethodID mid_subCode = getSubCodeMethod(env); + if (mid_subCode == nullptr) { + // exception occurred + return nullptr; + } + jobject jsubCode = env->CallObjectMethod(jstatus, mid_subCode); + if (env->ExceptionCheck()) { + // exception occurred + if (jcode != nullptr) { + env->DeleteLocalRef(jcode); + } + return nullptr; + } + + jbyte jsubCode_value = 0x0; // None + if (jsubCode != nullptr) { + jmethodID mid_subCode_value = rocksdb::SubCodeJni::getValueMethod(env); + if (mid_subCode_value == nullptr) { + // exception occurred + return nullptr; + } + jsubCode_value = env->CallByteMethod(jsubCode, mid_subCode_value); + if (env->ExceptionCheck()) { + // exception occurred + if (jcode != nullptr) { + env->DeleteLocalRef(jcode); + } + return nullptr; + } + } + + jmethodID mid_state = getStateMethod(env); + if (mid_state == nullptr) { + // exception occurred + return nullptr; + } + jobject jstate = env->CallObjectMethod(jstatus, mid_state); + if (env->ExceptionCheck()) { + // exception occurred + if (jsubCode != nullptr) { + env->DeleteLocalRef(jsubCode); + } + if (jcode != nullptr) { + env->DeleteLocalRef(jcode); + } + return nullptr; + } + + std::unique_ptr<rocksdb::Status> status; + switch (jcode_value) { + case 0x0: + //Ok + status = std::unique_ptr<rocksdb::Status>(new rocksdb::Status(rocksdb::Status::OK())); + break; + case 0x1: + //NotFound + status = std::unique_ptr<rocksdb::Status>(new rocksdb::Status(rocksdb::Status::NotFound(rocksdb::SubCodeJni::toCppSubCode(jsubCode_value)))); + break; + case 0x2: + //Corruption + status = std::unique_ptr<rocksdb::Status>(new rocksdb::Status(rocksdb::Status::Corruption(rocksdb::SubCodeJni::toCppSubCode(jsubCode_value)))); + break; + case 0x3: + //NotSupported + status = std::unique_ptr<rocksdb::Status>(new rocksdb::Status(rocksdb::Status::NotSupported(rocksdb::SubCodeJni::toCppSubCode(jsubCode_value)))); + break; + case 0x4: + //InvalidArgument + status = std::unique_ptr<rocksdb::Status>(new rocksdb::Status(rocksdb::Status::InvalidArgument(rocksdb::SubCodeJni::toCppSubCode(jsubCode_value)))); + break; + case 0x5: + //IOError + status = std::unique_ptr<rocksdb::Status>(new rocksdb::Status(rocksdb::Status::IOError(rocksdb::SubCodeJni::toCppSubCode(jsubCode_value)))); +
break; + case 0x6: + //MergeInProgress + status = std::unique_ptr<rocksdb::Status>(new rocksdb::Status(rocksdb::Status::MergeInProgress(rocksdb::SubCodeJni::toCppSubCode(jsubCode_value)))); + break; + case 0x7: + //Incomplete + status = std::unique_ptr<rocksdb::Status>(new rocksdb::Status(rocksdb::Status::Incomplete(rocksdb::SubCodeJni::toCppSubCode(jsubCode_value)))); + break; + case 0x8: + //ShutdownInProgress + status = std::unique_ptr<rocksdb::Status>(new rocksdb::Status(rocksdb::Status::ShutdownInProgress(rocksdb::SubCodeJni::toCppSubCode(jsubCode_value)))); + break; + case 0x9: + //TimedOut + status = std::unique_ptr<rocksdb::Status>(new rocksdb::Status(rocksdb::Status::TimedOut(rocksdb::SubCodeJni::toCppSubCode(jsubCode_value)))); + break; + case 0xA: + //Aborted + status = std::unique_ptr<rocksdb::Status>(new rocksdb::Status(rocksdb::Status::Aborted(rocksdb::SubCodeJni::toCppSubCode(jsubCode_value)))); + break; + case 0xB: + //Busy + status = std::unique_ptr<rocksdb::Status>(new rocksdb::Status(rocksdb::Status::Busy(rocksdb::SubCodeJni::toCppSubCode(jsubCode_value)))); + break; + case 0xC: + //Expired + status = std::unique_ptr<rocksdb::Status>(new rocksdb::Status(rocksdb::Status::Expired(rocksdb::SubCodeJni::toCppSubCode(jsubCode_value)))); + break; + case 0xD: + //TryAgain + status = std::unique_ptr<rocksdb::Status>(new rocksdb::Status(rocksdb::Status::TryAgain(rocksdb::SubCodeJni::toCppSubCode(jsubCode_value)))); + break; + case 0x7F: + default: + return nullptr; + } + + // delete all local refs + if (jstate != nullptr) { + env->DeleteLocalRef(jstate); + } + if (jsubCode != nullptr) { + env->DeleteLocalRef(jsubCode); + } + if (jcode != nullptr) { + env->DeleteLocalRef(jcode); + } + + return status; + } }; // The portal class for org.rocksdb.RocksDBException @@ -334,6 +654,20 @@ class RocksDBExceptionJni : return JavaException::ThrowNew(env, msg); } + /** + * Create and throw a Java RocksDBException with the provided status + * + * If s->ok() == true, then this function will not throw any exception.
+   *
+   * @param env A pointer to the Java environment
+   * @param s The status for the exception
+   *
+   * @return true if an exception was thrown, false otherwise
+   */
+  static bool ThrowNew(JNIEnv* env, std::unique_ptr<rocksdb::Status>& s) {
+    return rocksdb::RocksDBExceptionJni::ThrowNew(env, *(s.get()));
+  }
+
   /**
    * Create and throw a Java RocksDBException with the provided status
    *
@@ -510,6 +844,54 @@ class RocksDBExceptionJni :

     return true;
   }
+
+  /**
+   * Get the Java Method: RocksDBException#getStatus
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Method ID or nullptr if the class or method id could not
+   *     be retrieved
+   */
+  static jmethodID getStatusMethod(JNIEnv* env) {
+    jclass jclazz = getJClass(env);
+    if(jclazz == nullptr) {
+      // exception occurred accessing class
+      return nullptr;
+    }
+
+    static jmethodID mid =
+        env->GetMethodID(jclazz, "getStatus", "()Lorg/rocksdb/Status;");
+    assert(mid != nullptr);
+    return mid;
+  }
+
+  static std::unique_ptr<rocksdb::Status> toCppStatus(
+      JNIEnv* env, jthrowable jrocksdb_exception) {
+    if(!env->IsInstanceOf(jrocksdb_exception, getJClass(env))) {
+      // not an instance of RocksDBException
+      return nullptr;
+    }
+
+    // get the java status object
+    jmethodID mid = getStatusMethod(env);
+    if(mid == nullptr) {
+      // exception occurred accessing class or method
+      return nullptr;
+    }
+
+    jobject jstatus = env->CallObjectMethod(jrocksdb_exception, mid);
+    if(env->ExceptionCheck()) {
+      // exception occurred
+      return nullptr;
+    }
+
+    if(jstatus == nullptr) {
+      return nullptr;  // no status available
+    }
+
+    return rocksdb::StatusJni::toCppStatus(env, jstatus);
+  }
 };

 // The portal class for java.lang.IllegalArgumentException
@@ -698,20 +1080,50 @@ class WriteBatchJni : public RocksDBNativeClass<
   static jclass getJClass(JNIEnv* env) {
     return RocksDBNativeClass::getJClass(env, "org/rocksdb/WriteBatch");
   }
-};

-// The portal class for org.rocksdb.WriteBatch.Handler
-class WriteBatchHandlerJni : public RocksDBNativeClass<
-    const rocksdb::WriteBatchHandlerJniCallback*,
-    WriteBatchHandlerJni> {
- public:
   /**
-   * Get the Java Class org.rocksdb.WriteBatch.Handler
+   * Create a new Java org.rocksdb.WriteBatch object
    *
    * @param env A pointer to the Java environment
+   * @param wb A pointer to rocksdb::WriteBatch object
    *
-   * @return The Java Class or nullptr if one of the
-   *     ClassFormatError, ClassCircularityError, NoClassDefFoundError,
+   * @return A reference to a Java org.rocksdb.WriteBatch object, or
+   *     nullptr if an exception occurs
+   */
+  static jobject construct(JNIEnv* env, const WriteBatch* wb) {
+    jclass jclazz = getJClass(env);
+    if(jclazz == nullptr) {
+      // exception occurred accessing class
+      return nullptr;
+    }
+
+    jmethodID mid = env->GetMethodID(jclazz, "<init>", "(J)V");
+    if (mid == nullptr) {
+      // exception thrown: NoSuchMethodException or OutOfMemoryError
+      return nullptr;
+    }
+
+    jobject jwb = env->NewObject(jclazz, mid, reinterpret_cast<jlong>(wb));
+    if (env->ExceptionCheck()) {
+      return nullptr;
+    }
+
+    return jwb;
+  }
+};
+
+// The portal class for org.rocksdb.WriteBatch.Handler
+class WriteBatchHandlerJni : public RocksDBNativeClass<
+    const rocksdb::WriteBatchHandlerJniCallback*,
+    WriteBatchHandlerJni> {
+ public:
+  /**
+   * Get the Java Class org.rocksdb.WriteBatch.Handler
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Class or nullptr if one of the
+   *     ClassFormatError, ClassCircularityError, NoClassDefFoundError,
    *     OutOfMemoryError or ExceptionInInitializerError exceptions is thrown
    */
   static jclass getJClass(JNIEnv* env) {
@@ -719,6 +1131,26 @@ class WriteBatchHandlerJni : public RocksDBNativeClass<
         "org/rocksdb/WriteBatch$Handler");
   }

+  /**
+   * Get the Java Method: WriteBatch.Handler#put
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Method ID or nullptr if the class or method id could not
+   *     be retrieved
+   */
+  static jmethodID getPutCfMethodId(JNIEnv* env) {
+    jclass jclazz = getJClass(env);
+    if(jclazz == nullptr) {
+      // exception occurred accessing class
+      return nullptr;
+    }
+
+    static jmethodID mid = env->GetMethodID(jclazz, "put", "(I[B[B)V");
+    assert(mid != nullptr);
+    return mid;
+  }
+
   /**
    * Get the Java Method: WriteBatch.Handler#put
    *
@@ -739,6 +1171,26 @@ class WriteBatchHandlerJni : public RocksDBNativeClass<
     return mid;
   }

+  /**
+   * Get the Java Method: WriteBatch.Handler#merge
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Method ID or nullptr if the class or method id could not
+   *     be retrieved
+   */
+  static jmethodID getMergeCfMethodId(JNIEnv* env) {
+    jclass jclazz = getJClass(env);
+    if(jclazz == nullptr) {
+      // exception occurred accessing class
+      return nullptr;
+    }
+
+    static jmethodID mid = env->GetMethodID(jclazz, "merge", "(I[B[B)V");
+    assert(mid != nullptr);
+    return mid;
+  }
+
   /**
    * Get the Java Method: WriteBatch.Handler#merge
    *
@@ -759,6 +1211,26 @@ class WriteBatchHandlerJni : public RocksDBNativeClass<
     return mid;
   }

+  /**
+   * Get the Java Method: WriteBatch.Handler#delete
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Method ID or nullptr if the class or method id could not
+   *     be retrieved
+   */
+  static jmethodID getDeleteCfMethodId(JNIEnv* env) {
+    jclass jclazz = getJClass(env);
+    if(jclazz == nullptr) {
+      // exception occurred accessing class
+      return nullptr;
+    }
+
+    static jmethodID mid = env->GetMethodID(jclazz, "delete", "(I[B)V");
+    assert(mid != nullptr);
+    return mid;
+  }
+
   /**
    * Get the Java Method: WriteBatch.Handler#delete
    *
@@ -779,6 +1251,66 @@ class WriteBatchHandlerJni : public RocksDBNativeClass<
     return mid;
   }

+  /**
+   * Get the Java Method: WriteBatch.Handler#singleDelete
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Method ID or nullptr if the class or method id could not
+   *     be retrieved
+   */
+  static jmethodID getSingleDeleteCfMethodId(JNIEnv* env) {
+    jclass jclazz = getJClass(env);
+    if(jclazz == nullptr) {
+      // exception occurred accessing class
+      return nullptr;
+    }
+
+    static jmethodID mid = env->GetMethodID(jclazz, "singleDelete", "(I[B)V");
+    assert(mid != nullptr);
+    return mid;
+  }
+
+  /**
+   * Get the Java Method: WriteBatch.Handler#singleDelete
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Method ID or nullptr if the class or method id could not
+   *     be retrieved
+   */
+  static jmethodID getSingleDeleteMethodId(JNIEnv* env) {
+    jclass jclazz = getJClass(env);
+    if(jclazz == nullptr) {
+      // exception occurred accessing class
+      return nullptr;
+    }
+
+    static jmethodID mid = env->GetMethodID(jclazz, "singleDelete", "([B)V");
+    assert(mid != nullptr);
+    return mid;
+  }
+
+  /**
+   * Get the Java Method: WriteBatch.Handler#deleteRange
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Method ID or nullptr if the class or method id could not
+   *     be retrieved
+   */
+  static jmethodID getDeleteRangeCfMethodId(JNIEnv* env) {
+    jclass jclazz = getJClass(env);
+    if (jclazz == nullptr) {
+      // exception occurred accessing class
+      return nullptr;
+    }
+
+    static jmethodID mid = env->GetMethodID(jclazz, "deleteRange", "(I[B[B)V");
+    assert(mid != nullptr);
+    return mid;
+  }
+
   /**
    * Get the Java Method: WriteBatch.Handler#deleteRange
    *
@@ -819,6 +1351,126 @@ class WriteBatchHandlerJni : public RocksDBNativeClass<
     return mid;
   }

+  /**
+   * Get the Java Method: WriteBatch.Handler#putBlobIndex
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Method ID or nullptr if the class or method id could not
+   *     be retrieved
+   */
+  static jmethodID getPutBlobIndexCfMethodId(JNIEnv* env) {
+    jclass jclazz = getJClass(env);
+    if(jclazz == nullptr) {
+      // exception occurred accessing class
+      return nullptr;
+    }
+
+    static jmethodID mid = env->GetMethodID(jclazz, "putBlobIndex", "(I[B[B)V");
+    assert(mid != nullptr);
+    return mid;
+  }
+
+  /**
+   * Get the Java Method: WriteBatch.Handler#markBeginPrepare
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Method ID or nullptr if the class or method id could not
+   *     be retrieved
+   */
+  static jmethodID getMarkBeginPrepareMethodId(JNIEnv* env) {
+    jclass jclazz = getJClass(env);
+    if(jclazz == nullptr) {
+      // exception occurred accessing class
+      return nullptr;
+    }
+
+    static jmethodID mid = env->GetMethodID(jclazz, "markBeginPrepare", "()V");
+    assert(mid != nullptr);
+    return mid;
+  }
+
+  /**
+   * Get the Java Method: WriteBatch.Handler#markEndPrepare
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Method ID or nullptr if the class or method id could not
+   *     be retrieved
+   */
+  static jmethodID getMarkEndPrepareMethodId(JNIEnv* env) {
+    jclass jclazz = getJClass(env);
+    if(jclazz == nullptr) {
+      // exception occurred accessing class
+      return nullptr;
+    }
+
+    static jmethodID mid = env->GetMethodID(jclazz, "markEndPrepare", "([B)V");
+    assert(mid != nullptr);
+    return mid;
+  }
+
+  /**
+   * Get the Java Method: WriteBatch.Handler#markNoop
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Method ID or nullptr if the class or method id could not
+   *     be retrieved
+   */
+  static jmethodID getMarkNoopMethodId(JNIEnv* env) {
+    jclass jclazz = getJClass(env);
+    if(jclazz == nullptr) {
+      // exception occurred accessing class
+      return nullptr;
+    }
+
+    static jmethodID mid = env->GetMethodID(jclazz, "markNoop", "(Z)V");
+    assert(mid != nullptr);
+    return mid;
+  }
+
+  /**
+   * Get the Java Method: WriteBatch.Handler#markRollback
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Method ID or nullptr if the class or method id could not
+   *     be retrieved
+   */
+  static jmethodID getMarkRollbackMethodId(JNIEnv* env) {
+    jclass jclazz = getJClass(env);
+    if(jclazz == nullptr) {
+      // exception occurred accessing class
+      return nullptr;
+    }
+
+    static jmethodID mid = env->GetMethodID(jclazz, "markRollback", "([B)V");
+    assert(mid != nullptr);
+    return mid;
+  }
+
+  /**
+   * Get the Java Method: WriteBatch.Handler#markCommit
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Method ID or nullptr if the class or method id could not
+   *     be retrieved
+   */
+  static jmethodID getMarkCommitMethodId(JNIEnv* env) {
+    jclass jclazz = getJClass(env);
+    if(jclazz == nullptr) {
+      // exception occurred accessing class
+      return nullptr;
+    }
+
+    static jmethodID mid = env->GetMethodID(jclazz, "markCommit", "([B)V");
+    assert(mid != nullptr);
+    return mid;
+  }
+
   /**
    * Get the Java Method: WriteBatch.Handler#shouldContinue
    *
@@ -840,6 +1492,75 @@ class WriteBatchHandlerJni : public RocksDBNativeClass<
     return mid;
   }
 };
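All of the Handler method-ID getters above share one pattern: resolve the class, look the method up once, and memoise it in a function-local static so that repeated callbacks avoid the GetMethodID round trip. A minimal sketch of how a native WriteBatch::Handler callback might drive one of these cached IDs follows; the surrounding wrapper members (m_env, m_jcallback_obj) are illustrative assumptions, not part of this patch:

// Sketch only: invoke the Java-side put(int, byte[], byte[]) override
// from a native WriteBatch::Handler callback. m_env and m_jcallback_obj
// are assumed members of a JniCallback-style wrapper class.
void PutCF(uint32_t column_family_id, const rocksdb::Slice& key,
           const rocksdb::Slice& value) {
  jmethodID mid = rocksdb::WriteBatchHandlerJni::getPutCfMethodId(m_env);
  if (mid == nullptr) {
    return;  // exception already pending on m_env
  }

  // copy the slices into Java byte[] before crossing the JNI boundary
  jbyteArray jkey = rocksdb::JniUtil::copyBytes(m_env, key);
  if (jkey == nullptr) {
    return;  // OutOfMemoryError pending
  }
  jbyteArray jvalue = rocksdb::JniUtil::copyBytes(m_env, value);
  if (jvalue == nullptr) {
    m_env->DeleteLocalRef(jkey);
    return;  // OutOfMemoryError pending
  }

  // matches the JNI signature "(I[B[B)V" cached above
  m_env->CallVoidMethod(m_jcallback_obj, mid,
                        static_cast<jint>(column_family_id), jkey, jvalue);

  m_env->DeleteLocalRef(jvalue);
  m_env->DeleteLocalRef(jkey);
}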
+class WriteBatchSavePointJni : public JavaClass {
+ public:
+  /**
+   * Get the Java Class org.rocksdb.WriteBatch.SavePoint
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Class or nullptr if one of the
+   *     ClassFormatError, ClassCircularityError, NoClassDefFoundError,
+   *     OutOfMemoryError or ExceptionInInitializerError exceptions is thrown
+   */
+  static jclass getJClass(JNIEnv* env) {
+    return JavaClass::getJClass(env, "org/rocksdb/WriteBatch$SavePoint");
+  }
+
+  /**
+   * Get the Java Method: WriteBatch.SavePoint constructor
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Method ID or nullptr if the class or method id could not
+   *     be retrieved
+   */
+  static jmethodID getConstructorMethodId(JNIEnv* env) {
+    jclass jclazz = getJClass(env);
+    if(jclazz == nullptr) {
+      // exception occurred accessing class
+      return nullptr;
+    }
+
+    static jmethodID mid = env->GetMethodID(jclazz, "<init>", "(JJJ)V");
+    assert(mid != nullptr);
+    return mid;
+  }
+
+  /**
+   * Create a new Java org.rocksdb.WriteBatch.SavePoint object
+   *
+   * @param env A pointer to the Java environment
+   * @param save_point The rocksdb::WriteBatch::SavePoint object
+   *
+   * @return A reference to a Java org.rocksdb.WriteBatch.SavePoint object, or
+   *     nullptr if an exception occurs
+   */
+  static jobject construct(JNIEnv* env, const SavePoint &save_point) {
+    jclass jclazz = getJClass(env);
+    if(jclazz == nullptr) {
+      // exception occurred accessing class
+      return nullptr;
+    }
+
+    jmethodID mid = getConstructorMethodId(env);
+    if (mid == nullptr) {
+      // exception thrown: NoSuchMethodException or OutOfMemoryError
+      return nullptr;
+    }
+
+    jobject jsave_point = env->NewObject(jclazz, mid,
+        static_cast<jlong>(save_point.size),
+        static_cast<jlong>(save_point.count),
+        static_cast<jlong>(save_point.content_flags));
+    if (env->ExceptionCheck()) {
+      return nullptr;
+    }
+
+    return jsave_point;
+  }
+};
+
 // The portal class for org.rocksdb.WriteBatchWithIndex
 class WriteBatchWithIndexJni : public RocksDBNativeClass<
     rocksdb::WriteBatchWithIndex*, WriteBatchWithIndexJni> {
@@ -1087,6 +1808,31 @@ class AbstractCompactionFilterFactoryJni : public RocksDBNativeClass<
   }
 };

+// The portal class for org.rocksdb.AbstractTransactionNotifier
+class AbstractTransactionNotifierJni : public RocksDBNativeClass<
+    const rocksdb::TransactionNotifierJniCallback*,
+    AbstractTransactionNotifierJni> {
+ public:
+  static jclass getJClass(JNIEnv* env) {
+    return RocksDBNativeClass::getJClass(env,
+        "org/rocksdb/AbstractTransactionNotifier");
+  }
+
+  // Get the java method `snapshotCreated`
+  // of org.rocksdb.AbstractTransactionNotifier.
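+  // The JNI signature is `(J)V`: the callback receives the jlong handle
+  // of the newly created snapshot.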
+ static jmethodID getSnapshotCreatedMethodId(JNIEnv* env) { + jclass jclazz = getJClass(env); + if(jclazz == nullptr) { + // exception occurred accessing class + return nullptr; + } + + static jmethodID mid = env->GetMethodID(jclazz, "snapshotCreated", "(J)V"); + assert(mid != nullptr); + return mid; + } +}; + // The portal class for org.rocksdb.AbstractComparator class AbstractComparatorJni : public RocksDBNativeClass< const rocksdb::BaseComparatorJniCallback*, @@ -1790,53 +2536,76 @@ class WBWIRocksIteratorJni : public JavaClass { // The portal class for org.rocksdb.WBWIRocksIterator.WriteType class WriteTypeJni : public JavaClass { public: - /** - * Get the PUT enum field value of WBWIRocksIterator.WriteType - * - * @param env A pointer to the Java environment - * - * @return A reference to the enum field value or a nullptr if - * the enum field value could not be retrieved - */ - static jobject PUT(JNIEnv* env) { - return getEnum(env, "PUT"); - } + /** + * Get the PUT enum field value of WBWIRocksIterator.WriteType + * + * @param env A pointer to the Java environment + * + * @return A reference to the enum field value or a nullptr if + * the enum field value could not be retrieved + */ + static jobject PUT(JNIEnv* env) { + return getEnum(env, "PUT"); + } - /** - * Get the MERGE enum field value of WBWIRocksIterator.WriteType - * - * @param env A pointer to the Java environment - * - * @return A reference to the enum field value or a nullptr if - * the enum field value could not be retrieved - */ - static jobject MERGE(JNIEnv* env) { - return getEnum(env, "MERGE"); - } + /** + * Get the MERGE enum field value of WBWIRocksIterator.WriteType + * + * @param env A pointer to the Java environment + * + * @return A reference to the enum field value or a nullptr if + * the enum field value could not be retrieved + */ + static jobject MERGE(JNIEnv* env) { + return getEnum(env, "MERGE"); + } - /** - * Get the DELETE enum field value of WBWIRocksIterator.WriteType - * - * @param env A pointer to the Java environment - * - * @return A reference to the enum field value or a nullptr if - * the enum field value could not be retrieved - */ - static jobject DELETE(JNIEnv* env) { - return getEnum(env, "DELETE"); - } + /** + * Get the DELETE enum field value of WBWIRocksIterator.WriteType + * + * @param env A pointer to the Java environment + * + * @return A reference to the enum field value or a nullptr if + * the enum field value could not be retrieved + */ + static jobject DELETE(JNIEnv* env) { + return getEnum(env, "DELETE"); + } - /** - * Get the LOG enum field value of WBWIRocksIterator.WriteType - * - * @param env A pointer to the Java environment - * - * @return A reference to the enum field value or a nullptr if - * the enum field value could not be retrieved - */ - static jobject LOG(JNIEnv* env) { - return getEnum(env, "LOG"); + /** + * Get the LOG enum field value of WBWIRocksIterator.WriteType + * + * @param env A pointer to the Java environment + * + * @return A reference to the enum field value or a nullptr if + * the enum field value could not be retrieved + */ + static jobject LOG(JNIEnv* env) { + return getEnum(env, "LOG"); + } + + // Returns the equivalent org.rocksdb.WBWIRocksIterator.WriteType for the + // provided C++ rocksdb::WriteType enum + static jbyte toJavaWriteType(const rocksdb::WriteType& writeType) { + switch (writeType) { + case rocksdb::WriteType::kPutRecord: + return 0x0; + case rocksdb::WriteType::kMergeRecord: + return 0x1; + case rocksdb::WriteType::kDeleteRecord: + 
return 0x2; + case rocksdb::WriteType::kSingleDeleteRecord: + return 0x3; + case rocksdb::WriteType::kDeleteRangeRecord: + return 0x4; + case rocksdb::WriteType::kLogDataRecord: + return 0x5; + case rocksdb::WriteType::kXIDRecord: + return 0x6; + default: + return 0x7F; // undefined } + } private: /** @@ -2915,62 +3684,390 @@ class StatsLevelJni { // undefined/default return 0x0; } - } + } + + // Returns the equivalent C++ rocksdb::StatsLevel enum for the + // provided Java org.rocksdb.StatsLevel + static rocksdb::StatsLevel toCppStatsLevel(jbyte jstats_level) { + switch(jstats_level) { + case 0x0: + return rocksdb::StatsLevel::kExceptDetailedTimers; + case 0x1: + return rocksdb::StatsLevel::kExceptTimeForMutex; + case 0x2: + return rocksdb::StatsLevel::kAll; + + default: + // undefined/default + return rocksdb::StatsLevel::kExceptDetailedTimers; + } + } +}; + +// The portal class for org.rocksdb.RateLimiterMode +class RateLimiterModeJni { + public: + // Returns the equivalent org.rocksdb.RateLimiterMode for the provided + // C++ rocksdb::RateLimiter::Mode enum + static jbyte toJavaRateLimiterMode( + const rocksdb::RateLimiter::Mode& rate_limiter_mode) { + switch(rate_limiter_mode) { + case rocksdb::RateLimiter::Mode::kReadsOnly: + return 0x0; + case rocksdb::RateLimiter::Mode::kWritesOnly: + return 0x1; + case rocksdb::RateLimiter::Mode::kAllIo: + return 0x2; + + default: + // undefined/default + return 0x1; + } + } + + // Returns the equivalent C++ rocksdb::RateLimiter::Mode enum for the + // provided Java org.rocksdb.RateLimiterMode + static rocksdb::RateLimiter::Mode toCppRateLimiterMode(jbyte jrate_limiter_mode) { + switch(jrate_limiter_mode) { + case 0x0: + return rocksdb::RateLimiter::Mode::kReadsOnly; + case 0x1: + return rocksdb::RateLimiter::Mode::kWritesOnly; + case 0x2: + return rocksdb::RateLimiter::Mode::kAllIo; + + default: + // undefined/default + return rocksdb::RateLimiter::Mode::kWritesOnly; + } + } +}; + +// The portal class for org.rocksdb.Transaction +class TransactionJni : public JavaClass { + public: + /** + * Get the Java Class org.rocksdb.Transaction + * + * @param env A pointer to the Java environment + * + * @return The Java Class or nullptr if one of the + * ClassFormatError, ClassCircularityError, NoClassDefFoundError, + * OutOfMemoryError or ExceptionInInitializerError exceptions is thrown + */ + static jclass getJClass(JNIEnv* env) { + return JavaClass::getJClass(env, + "org/rocksdb/Transaction"); + } + + /** + * Create a new Java org.rocksdb.Transaction.WaitingTransactions object + * + * @param env A pointer to the Java environment + * @param jtransaction A Java org.rocksdb.Transaction object + * @param column_family_id The id of the column family + * @param key The key + * @param transaction_ids The transaction ids + * + * @return A reference to a Java + * org.rocksdb.Transaction.WaitingTransactions object, + * or nullptr if an an exception occurs + */ + static jobject newWaitingTransactions(JNIEnv* env, jobject jtransaction, + const uint32_t column_family_id, const std::string &key, + const std::vector &transaction_ids) { + jclass jclazz = getJClass(env); + if(jclazz == nullptr) { + // exception occurred accessing class + return nullptr; + } + + jmethodID mid = env->GetMethodID( + jclazz, "newWaitingTransactions", "(JLjava/lang/String;[J)Lorg/rocksdb/Transaction$WaitingTransactions;"); + if(mid == nullptr) { + // exception thrown: NoSuchMethodException or OutOfMemoryError + return nullptr; + } + + jstring jkey = env->NewStringUTF(key.c_str()); + if(jkey 
== nullptr) { + // exception thrown: OutOfMemoryError + return nullptr; + } + + const size_t len = transaction_ids.size(); + jlongArray jtransaction_ids = env->NewLongArray(static_cast(len)); + if(jtransaction_ids == nullptr) { + // exception thrown: OutOfMemoryError + env->DeleteLocalRef(jkey); + return nullptr; + } + + jlong *body = env->GetLongArrayElements(jtransaction_ids, nullptr); + if(body == nullptr) { + // exception thrown: OutOfMemoryError + env->DeleteLocalRef(jkey); + env->DeleteLocalRef(jtransaction_ids); + return nullptr; + } + for(size_t i = 0; i < len; ++i) { + body[i] = static_cast(transaction_ids[i]); + } + env->ReleaseLongArrayElements(jtransaction_ids, body, 0); + + jobject jwaiting_transactions = env->CallObjectMethod(jtransaction, + mid, static_cast(column_family_id), jkey, jtransaction_ids); + if(env->ExceptionCheck()) { + // exception thrown: InstantiationException or OutOfMemoryError + env->DeleteLocalRef(jkey); + env->DeleteLocalRef(jtransaction_ids); + return nullptr; + } + + return jwaiting_transactions; + } +}; + +// The portal class for org.rocksdb.TransactionDB +class TransactionDBJni : public JavaClass { + public: + /** + * Get the Java Class org.rocksdb.TransactionDB + * + * @param env A pointer to the Java environment + * + * @return The Java Class or nullptr if one of the + * ClassFormatError, ClassCircularityError, NoClassDefFoundError, + * OutOfMemoryError or ExceptionInInitializerError exceptions is thrown + */ + static jclass getJClass(JNIEnv* env) { + return JavaClass::getJClass(env, + "org/rocksdb/TransactionDB"); + } + + /** + * Create a new Java org.rocksdb.TransactionDB.DeadlockInfo object + * + * @param env A pointer to the Java environment + * @param jtransaction A Java org.rocksdb.Transaction object + * @param column_family_id The id of the column family + * @param key The key + * @param transaction_ids The transaction ids + * + * @return A reference to a Java + * org.rocksdb.Transaction.WaitingTransactions object, + * or nullptr if an an exception occurs + */ + static jobject newDeadlockInfo(JNIEnv* env, jobject jtransaction_db, + const rocksdb::TransactionID transaction_id, + const uint32_t column_family_id, const std::string &waiting_key, + const bool exclusive) { + jclass jclazz = getJClass(env); + if(jclazz == nullptr) { + // exception occurred accessing class + return nullptr; + } + + jmethodID mid = env->GetMethodID( + jclazz, "newDeadlockInfo", "(JJLjava/lang/String;Z)Lorg/rocksdb/TransactionDB$DeadlockInfo;"); + if(mid == nullptr) { + // exception thrown: NoSuchMethodException or OutOfMemoryError + return nullptr; + } + + jstring jwaiting_key = env->NewStringUTF(waiting_key.c_str()); + if(jwaiting_key == nullptr) { + // exception thrown: OutOfMemoryError + return nullptr; + } + + // resolve the column family id to a ColumnFamilyHandle + jobject jdeadlock_info = env->CallObjectMethod(jtransaction_db, + mid, transaction_id, static_cast(column_family_id), + jwaiting_key, exclusive); + if(env->ExceptionCheck()) { + // exception thrown: InstantiationException or OutOfMemoryError + env->DeleteLocalRef(jwaiting_key); + return nullptr; + } + + return jdeadlock_info; + } +}; + +// The portal class for org.rocksdb.TxnDBWritePolicy +class TxnDBWritePolicyJni { + public: + // Returns the equivalent org.rocksdb.TxnDBWritePolicy for the provided + // C++ rocksdb::TxnDBWritePolicy enum + static jbyte toJavaTxnDBWritePolicy( + const rocksdb::TxnDBWritePolicy& txndb_write_policy) { + switch(txndb_write_policy) { + case 
rocksdb::TxnDBWritePolicy::WRITE_COMMITTED: + return 0x0; + case rocksdb::TxnDBWritePolicy::WRITE_PREPARED: + return 0x1; + case rocksdb::TxnDBWritePolicy::WRITE_UNPREPARED: + return 0x2; + default: + return 0x7F; // undefined + } + } + + // Returns the equivalent C++ rocksdb::TxnDBWritePolicy enum for the + // provided Java org.rocksdb.TxnDBWritePolicy + static rocksdb::TxnDBWritePolicy toCppTxnDBWritePolicy( + jbyte jtxndb_write_policy) { + switch(jtxndb_write_policy) { + case 0x0: + return rocksdb::TxnDBWritePolicy::WRITE_COMMITTED; + case 0x1: + return rocksdb::TxnDBWritePolicy::WRITE_PREPARED; + case 0x2: + return rocksdb::TxnDBWritePolicy::WRITE_UNPREPARED; + default: + // undefined/default + return rocksdb::TxnDBWritePolicy::WRITE_COMMITTED; + } + } +}; + +// The portal class for org.rocksdb.TransactionDB.KeyLockInfo +class KeyLockInfoJni : public JavaClass { + public: + /** + * Get the Java Class org.rocksdb.TransactionDB.KeyLockInfo + * + * @param env A pointer to the Java environment + * + * @return The Java Class or nullptr if one of the + * ClassFormatError, ClassCircularityError, NoClassDefFoundError, + * OutOfMemoryError or ExceptionInInitializerError exceptions is thrown + */ + static jclass getJClass(JNIEnv* env) { + return JavaClass::getJClass(env, + "org/rocksdb/TransactionDB$KeyLockInfo"); + } + + /** + * Create a new Java org.rocksdb.TransactionDB.KeyLockInfo object + * with the same properties as the provided C++ rocksdb::KeyLockInfo object + * + * @param env A pointer to the Java environment + * @param key_lock_info The rocksdb::KeyLockInfo object + * + * @return A reference to a Java + * org.rocksdb.TransactionDB.KeyLockInfo object, + * or nullptr if an an exception occurs + */ + static jobject construct(JNIEnv* env, + const rocksdb::KeyLockInfo& key_lock_info) { + jclass jclazz = getJClass(env); + if(jclazz == nullptr) { + // exception occurred accessing class + return nullptr; + } + + jmethodID mid = env->GetMethodID( + jclazz, "", "(Ljava/lang/String;[JZ)V"); + if (mid == nullptr) { + // exception thrown: NoSuchMethodException or OutOfMemoryError + return nullptr; + } - // Returns the equivalent C++ rocksdb::StatsLevel enum for the - // provided Java org.rocksdb.StatsLevel - static rocksdb::StatsLevel toCppStatsLevel(jbyte jstats_level) { - switch(jstats_level) { - case 0x0: - return rocksdb::StatsLevel::kExceptDetailedTimers; - case 0x1: - return rocksdb::StatsLevel::kExceptTimeForMutex; - case 0x2: - return rocksdb::StatsLevel::kAll; + jstring jkey = env->NewStringUTF(key_lock_info.key.c_str()); + if (jkey == nullptr) { + // exception thrown: OutOfMemoryError + return nullptr; + } - default: - // undefined/default - return rocksdb::StatsLevel::kExceptDetailedTimers; + const jsize jtransaction_ids_len = static_cast(key_lock_info.ids.size()); + jlongArray jtransactions_ids = env->NewLongArray(jtransaction_ids_len); + if (jtransactions_ids == nullptr) { + // exception thrown: OutOfMemoryError + env->DeleteLocalRef(jkey); + return nullptr; + } + + const jobject jkey_lock_info = env->NewObject(jclazz, mid, + jkey, jtransactions_ids, key_lock_info.exclusive); + if(jkey_lock_info == nullptr) { + // exception thrown: InstantiationException or OutOfMemoryError + env->DeleteLocalRef(jtransactions_ids); + env->DeleteLocalRef(jkey); + return nullptr; } + + return jkey_lock_info; } }; -// The portal class for org.rocksdb.RateLimiterMode -class RateLimiterModeJni { +// The portal class for org.rocksdb.TransactionDB.DeadlockInfo +class DeadlockInfoJni : public JavaClass { 
  public:
-  // Returns the equivalent org.rocksdb.RateLimiterMode for the provided
-  // C++ rocksdb::RateLimiter::Mode enum
-  static jbyte toJavaRateLimiterMode(
-      const rocksdb::RateLimiter::Mode& rate_limiter_mode) {
-    switch(rate_limiter_mode) {
-      case rocksdb::RateLimiter::Mode::kReadsOnly:
-        return 0x0;
-      case rocksdb::RateLimiter::Mode::kWritesOnly:
-        return 0x1;
-      case rocksdb::RateLimiter::Mode::kAllIo:
-        return 0x2;
+  /**
+   * Get the Java Class org.rocksdb.TransactionDB.DeadlockInfo
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Class or nullptr if one of the
+   *     ClassFormatError, ClassCircularityError, NoClassDefFoundError,
+   *     OutOfMemoryError or ExceptionInInitializerError exceptions is thrown
+   */
+  static jclass getJClass(JNIEnv* env) {
+    return JavaClass::getJClass(env, "org/rocksdb/TransactionDB$DeadlockInfo");
+  }
+};

-      default:
-        // undefined/default
-        return 0x1;
-    }
+// The portal class for org.rocksdb.TransactionDB.DeadlockPath
+class DeadlockPathJni : public JavaClass {
+ public:
+  /**
+   * Get the Java Class org.rocksdb.TransactionDB.DeadlockPath
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Class or nullptr if one of the
+   *     ClassFormatError, ClassCircularityError, NoClassDefFoundError,
+   *     OutOfMemoryError or ExceptionInInitializerError exceptions is thrown
+   */
+  static jclass getJClass(JNIEnv* env) {
+    return JavaClass::getJClass(env,
+        "org/rocksdb/TransactionDB$DeadlockPath");
   }

-  // Returns the equivalent C++ rocksdb::RateLimiter::Mode enum for the
-  // provided Java org.rocksdb.RateLimiterMode
-  static rocksdb::RateLimiter::Mode toCppRateLimiterMode(jbyte jrate_limiter_mode) {
-    switch(jrate_limiter_mode) {
-      case 0x0:
-        return rocksdb::RateLimiter::Mode::kReadsOnly;
-      case 0x1:
-        return rocksdb::RateLimiter::Mode::kWritesOnly;
-      case 0x2:
-        return rocksdb::RateLimiter::Mode::kAllIo;
+  /**
+   * Create a new Java org.rocksdb.TransactionDB.DeadlockPath object
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return A reference to a Java
+   *     org.rocksdb.TransactionDB.DeadlockPath object,
+   *     or nullptr if an exception occurs
+   */
+  static jobject construct(JNIEnv* env,
+      const jobjectArray jdeadlock_infos, const bool limit_exceeded) {
+    jclass jclazz = getJClass(env);
+    if(jclazz == nullptr) {
+      // exception occurred accessing class
+      return nullptr;
+    }

-      default:
-        // undefined/default
-        return rocksdb::RateLimiter::Mode::kWritesOnly;
+    jmethodID mid = env->GetMethodID(
+        jclazz, "<init>", "([LDeadlockInfo;Z)V");
+    if (mid == nullptr) {
+      // exception thrown: NoSuchMethodException or OutOfMemoryError
+      return nullptr;
+    }
+
+    const jobject jdeadlock_path = env->NewObject(jclazz, mid,
+        jdeadlock_infos, limit_exceeded);
+    if(jdeadlock_path == nullptr) {
+      // exception thrown: InstantiationException or OutOfMemoryError
+      return nullptr;
     }
+
+    return jdeadlock_path;
   }
 };

@@ -3275,6 +4372,27 @@ class JniUtil {
       std::function<T(const char*, const size_t)> string_fn,
       jboolean* has_exception) {
     const jsize jbyte_string_len = env->GetArrayLength(jbyte_string_ary);
+    return byteString(env, jbyte_string_ary, jbyte_string_len, string_fn,
+        has_exception);
+  }
+
+  /**
+   * Given a Java String which is expressed as a Java Byte Array byte[],
+   * the passed function `string_fn` will be called on the String
+   * and the result returned
+   *
+   * @param env (IN) A pointer to the java environment
+   * @param jbyte_string_ary (IN) A Java String expressed in bytes
+   * @param jbyte_string_len (IN) The length of the Java String
+   *     expressed in bytes
+ * @param string_fn (IN) A transform function to call on the String + * @param has_exception (OUT) will be set to JNI_TRUE + * if an OutOfMemoryError exception occurs + */ + template static T byteString(JNIEnv* env, + jbyteArray jbyte_string_ary, const jsize jbyte_string_len, + std::function string_fn, + jboolean* has_exception) { jbyte* jbyte_string = env->GetByteArrayElements(jbyte_string_ary, nullptr); if(jbyte_string == nullptr) { @@ -3352,71 +4470,101 @@ class JniUtil { return jbyte_strings; } + /** + * Copies bytes from a rocksdb::Slice to a jByteArray + * + * @param env A pointer to the java environment + * @param bytes The bytes to copy + * + * @return the Java byte[] or nullptr if an exception occurs + */ + static jbyteArray copyBytes(JNIEnv* env, const Slice& bytes) { + const jsize jlen = static_cast(bytes.size()); + + jbyteArray jbytes = env->NewByteArray(jlen); + if(jbytes == nullptr) { + // exception thrown: OutOfMemoryError + return nullptr; + } + + env->SetByteArrayRegion(jbytes, 0, jlen, + const_cast(reinterpret_cast(bytes.data()))); + if(env->ExceptionCheck()) { + // exception thrown: ArrayIndexOutOfBoundsException + env->DeleteLocalRef(jbytes); + return nullptr; + } + + return jbytes; + } + /* * Helper for operations on a key and value * for example WriteBatch->Put * - * TODO(AR) could be extended to cover returning rocksdb::Status - * from `op` and used for RocksDB->Put etc. + * TODO(AR) could be used for RocksDB->Put etc. */ - static void kv_op( - std::function op, + static std::unique_ptr kv_op( + std::function op, JNIEnv* env, jobject jobj, jbyteArray jkey, jint jkey_len, - jbyteArray jentry_value, jint jentry_value_len) { + jbyteArray jvalue, jint jvalue_len) { jbyte* key = env->GetByteArrayElements(jkey, nullptr); if(env->ExceptionCheck()) { // exception thrown: OutOfMemoryError - return; + return nullptr; } - jbyte* value = env->GetByteArrayElements(jentry_value, nullptr); + jbyte* value = env->GetByteArrayElements(jvalue, nullptr); if(env->ExceptionCheck()) { // exception thrown: OutOfMemoryError if(key != nullptr) { env->ReleaseByteArrayElements(jkey, key, JNI_ABORT); } - return; + return nullptr; } rocksdb::Slice key_slice(reinterpret_cast(key), jkey_len); rocksdb::Slice value_slice(reinterpret_cast(value), - jentry_value_len); + jvalue_len); - op(key_slice, value_slice); + auto status = op(key_slice, value_slice); if(value != nullptr) { - env->ReleaseByteArrayElements(jentry_value, value, JNI_ABORT); + env->ReleaseByteArrayElements(jvalue, value, JNI_ABORT); } if(key != nullptr) { env->ReleaseByteArrayElements(jkey, key, JNI_ABORT); } + + return std::unique_ptr(new rocksdb::Status(status)); } /* * Helper for operations on a key * for example WriteBatch->Delete * - * TODO(AR) could be extended to cover returning rocksdb::Status - * from `op` and used for RocksDB->Delete etc. + * TODO(AR) could be used for RocksDB->Delete etc. 
*/ - static void k_op( - std::function op, + static std::unique_ptr k_op( + std::function op, JNIEnv* env, jobject jobj, jbyteArray jkey, jint jkey_len) { jbyte* key = env->GetByteArrayElements(jkey, nullptr); if(env->ExceptionCheck()) { // exception thrown: OutOfMemoryError - return; + return nullptr; } rocksdb::Slice key_slice(reinterpret_cast(key), jkey_len); - op(key_slice); + auto status = op(key_slice); if(key != nullptr) { env->ReleaseByteArrayElements(jkey, key, JNI_ABORT); } + + return std::unique_ptr(new rocksdb::Status(status)); } /* @@ -3497,7 +4645,7 @@ class ColumnFamilyDescriptorJni : public JavaClass { * nullptr if an an exception occurs */ static jobject construct(JNIEnv* env, ColumnFamilyDescriptor* cfd) { - jbyteArray cfname = JniUtil::copyBytes(env, cfd->name); + jbyteArray jcf_name = JniUtil::copyBytes(env, cfd->name); jobject cfopts = ColumnFamilyOptionsJni::construct(env, &(cfd->options)); jclass jclazz = getJClass(env); @@ -3510,11 +4658,13 @@ class ColumnFamilyDescriptorJni : public JavaClass { "([BLorg/rocksdb/ColumnFamilyOptions;)V"); if (mid == nullptr) { // exception thrown: NoSuchMethodException or OutOfMemoryError + env->DeleteLocalRef(jcf_name); return nullptr; } - jobject jcfd = env->NewObject(jclazz, mid, cfname, cfopts); + jobject jcfd = env->NewObject(jclazz, mid, jcf_name, cfopts); if (env->ExceptionCheck()) { + env->DeleteLocalRef(jcf_name); return nullptr; } @@ -3563,5 +4713,169 @@ class ColumnFamilyDescriptorJni : public JavaClass { } }; +class MapJni : public JavaClass { + public: + /** + * Get the Java Class java.util.Map + * + * @param env A pointer to the Java environment + * + * @return The Java Class or nullptr if one of the + * ClassFormatError, ClassCircularityError, NoClassDefFoundError, + * OutOfMemoryError or ExceptionInInitializerError exceptions is thrown + */ + static jclass getClass(JNIEnv* env) { + return JavaClass::getJClass(env, "java/util/Map"); + } + + /** + * Get the Java Method: Map#put + * + * @param env A pointer to the Java environment + * + * @return The Java Method ID or nullptr if the class or method id could not + * be retieved + */ + static jmethodID getMapPutMethodId(JNIEnv* env) { + jclass jlist_clazz = getClass(env); + if(jlist_clazz == nullptr) { + // exception occurred accessing class + return nullptr; + } + + static jmethodID mid = + env->GetMethodID(jlist_clazz, "put", "(Ljava/lang/Object;Ljava/lang/Object;)Ljava/lang/Object;"); + assert(mid != nullptr); + return mid; + } +}; + +class HashMapJni : public JavaClass { + public: + /** + * Get the Java Class java.util.HashMap + * + * @param env A pointer to the Java environment + * + * @return The Java Class or nullptr if one of the + * ClassFormatError, ClassCircularityError, NoClassDefFoundError, + * OutOfMemoryError or ExceptionInInitializerError exceptions is thrown + */ + static jclass getJClass(JNIEnv* env) { + return JavaClass::getJClass(env, "java/util/HashMap"); + } + + /** + * Create a new Java java.util.HashMap object. 
+ * + * @param env A pointer to the Java environment + * + * @return A reference to a Java java.util.HashMap object, or + * nullptr if an an exception occurs + */ + static jobject construct(JNIEnv* env, const uint32_t initial_capacity = 16) { + jclass jclazz = getJClass(env); + if (jclazz == nullptr) { + // exception occurred accessing class + return nullptr; + } + + jmethodID mid = env->GetMethodID(jclazz, "", "(I)V"); + if (mid == nullptr) { + // exception thrown: NoSuchMethodException or OutOfMemoryError + return nullptr; + } + + jobject jhash_map = env->NewObject(jclazz, mid, static_cast(initial_capacity)); + if (env->ExceptionCheck()) { + return nullptr; + } + + return jhash_map; + } + + /** + * A function which maps a std::pair to a std::pair + * + * @return Either a pointer to a std::pair, or nullptr + * if an error occurs during the mapping + */ + template + using FnMapKV = std::function> (const std::pair&)>; + + // template ::value_type, std::pair>::value, int32_t>::type = 0> + // static void putAll(JNIEnv* env, const jobject jhash_map, I iterator, const FnMapKV &fn_map_kv) { + /** + * Returns true if it succeeds, false if an error occurs + */ + template + static bool putAll(JNIEnv* env, const jobject jhash_map, iterator_type iterator, iterator_type end, const FnMapKV &fn_map_kv) { + const jmethodID jmid_put = rocksdb::MapJni::getMapPutMethodId(env); + if (jmid_put == nullptr) { + return false; + } + + for (auto it = iterator; it != end; ++it) { + const std::unique_ptr> result = fn_map_kv(*it); + if (result == nullptr) { + // an error occurred during fn_map_kv + return false; + } + env->CallObjectMethod(jhash_map, jmid_put, result->first, result->second); + if (env->ExceptionCheck()) { + // exception occurred + env->DeleteLocalRef(result->second); + env->DeleteLocalRef(result->first); + return false; + } + + // release local references + env->DeleteLocalRef(result->second); + env->DeleteLocalRef(result->first); + } + + return true; + } +}; + +class LongJni : public JavaClass { + public: + /** + * Get the Java Class java.lang.Long + * + * @param env A pointer to the Java environment + * + * @return The Java Class or nullptr if one of the + * ClassFormatError, ClassCircularityError, NoClassDefFoundError, + * OutOfMemoryError or ExceptionInInitializerError exceptions is thrown + */ + static jclass getJClass(JNIEnv* env) { + return JavaClass::getJClass(env, "java/lang/Long"); + } + + static jobject valueOf(JNIEnv* env, jlong jprimitive_long) { + jclass jclazz = getJClass(env); + if (jclazz == nullptr) { + // exception occurred accessing class + return nullptr; + } + + jmethodID mid = + env->GetStaticMethodID(jclazz, "valueOf", "(J)Ljava/lang/Long;"); + if (mid == nullptr) { + // exception thrown: NoSuchMethodException or OutOfMemoryError + return nullptr; + } + + const jobject jlong_obj = + env->CallStaticObjectMethod(jclazz, mid, jprimitive_long); + if (env->ExceptionCheck()) { + // exception occurred + return nullptr; + } + + return jlong_obj; + } +}; } // namespace rocksdb #endif // JAVA_ROCKSJNI_PORTAL_H_ diff --git a/java/rocksjni/sst_file_writerjni.cc b/java/rocksjni/sst_file_writerjni.cc index 83f6b614511..2abb8d5ffad 100644 --- a/java/rocksjni/sst_file_writerjni.cc +++ b/java/rocksjni/sst_file_writerjni.cc @@ -20,24 +20,34 @@ /* * Class: org_rocksdb_SstFileWriter * Method: newSstFileWriter - * Signature: (JJJZ)J + * Signature: (JJJB)J */ -jlong Java_org_rocksdb_SstFileWriter_newSstFileWriter__JJJZ(JNIEnv *env, - jclass jcls, jlong jenvoptions, jlong joptions, jlong 
jcomparator, - jboolean is_direct) { +jlong Java_org_rocksdb_SstFileWriter_newSstFileWriter__JJJB(JNIEnv *env, + jclass jcls, jlong jenvoptions, jlong joptions, jlong jcomparator_handle, + jbyte jcomparator_type) { + rocksdb::Comparator *comparator = nullptr; + switch(jcomparator_type) { + // JAVA_COMPARATOR + case 0x0: + comparator = + reinterpret_cast(jcomparator_handle); + break; + + // JAVA_DIRECT_COMPARATOR + case 0x1: + comparator = + reinterpret_cast(jcomparator_handle); + break; + + // JAVA_NATIVE_COMPARATOR_WRAPPER + case 0x2: + comparator = + reinterpret_cast(jcomparator_handle); + break; + } auto *env_options = reinterpret_cast(jenvoptions); auto *options = reinterpret_cast(joptions); - - rocksdb::Comparator *comparator = nullptr; - if(is_direct) { - comparator = - reinterpret_cast(jcomparator); - } else { - comparator = - reinterpret_cast(jcomparator); - } - rocksdb::SstFileWriter *sst_file_writer = new rocksdb::SstFileWriter(*env_options, *options, comparator); return reinterpret_cast(sst_file_writer); diff --git a/java/rocksjni/table.cc b/java/rocksjni/table.cc index 5f0a4735fed..ae6504c8233 100644 --- a/java/rocksjni/table.cc +++ b/java/rocksjni/table.cc @@ -38,13 +38,14 @@ jlong Java_org_rocksdb_PlainTableConfig_newTableFactoryHandle( /* * Class: org_rocksdb_BlockBasedTableConfig * Method: newTableFactoryHandle - * Signature: (ZJIJIIZIZZZJIBBI)J + * Signature: (ZJIJJIIZIZZZJIBBI)J */ jlong Java_org_rocksdb_BlockBasedTableConfig_newTableFactoryHandle( - JNIEnv* env, jobject jobj, jboolean no_block_cache, jlong block_cache_size, - jint block_cache_num_shardbits, jlong block_size, jint block_size_deviation, - jint block_restart_interval, jboolean whole_key_filtering, - jlong jfilterPolicy, jboolean cache_index_and_filter_blocks, + JNIEnv *env, jobject jobj, jboolean no_block_cache, jlong block_cache_size, + jint block_cache_num_shardbits, jlong jblock_cache, jlong block_size, + jint block_size_deviation, jint block_restart_interval, + jboolean whole_key_filtering, jlong jfilter_policy, + jboolean cache_index_and_filter_blocks, jboolean pin_l0_filter_and_index_blocks_in_cache, jboolean hash_index_allow_collision, jlong block_cache_compressed_size, jint block_cache_compressd_num_shard_bits, jbyte jchecksum_type, @@ -52,22 +53,28 @@ jlong Java_org_rocksdb_BlockBasedTableConfig_newTableFactoryHandle( rocksdb::BlockBasedTableOptions options; options.no_block_cache = no_block_cache; - if (!no_block_cache && block_cache_size > 0) { - if (block_cache_num_shardbits > 0) { - options.block_cache = - rocksdb::NewLRUCache(block_cache_size, block_cache_num_shardbits); - } else { - options.block_cache = rocksdb::NewLRUCache(block_cache_size); + if (!no_block_cache) { + if (jblock_cache > 0) { + std::shared_ptr *pCache = + reinterpret_cast *>(jblock_cache); + options.block_cache = *pCache; + } else if (block_cache_size > 0) { + if (block_cache_num_shardbits > 0) { + options.block_cache = + rocksdb::NewLRUCache(block_cache_size, block_cache_num_shardbits); + } else { + options.block_cache = rocksdb::NewLRUCache(block_cache_size); + } } } options.block_size = block_size; options.block_size_deviation = block_size_deviation; options.block_restart_interval = block_restart_interval; options.whole_key_filtering = whole_key_filtering; - if (jfilterPolicy > 0) { + if (jfilter_policy > 0) { std::shared_ptr *pFilterPolicy = reinterpret_cast *>( - jfilterPolicy); + jfilter_policy); options.filter_policy = *pFilterPolicy; } options.cache_index_and_filter_blocks = cache_index_and_filter_blocks; diff --git 
a/java/rocksjni/transaction.cc b/java/rocksjni/transaction.cc new file mode 100644 index 00000000000..b2f928fb925 --- /dev/null +++ b/java/rocksjni/transaction.cc @@ -0,0 +1,1535 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// This file implements the "bridge" between Java and C++ +// for rocksdb::Transaction. + +#include +#include + +#include "include/org_rocksdb_Transaction.h" + +#include "rocksdb/utilities/transaction.h" +#include "rocksjni/portal.h" + +using namespace std::placeholders; + +/* + * Class: org_rocksdb_Transaction + * Method: setSnapshot + * Signature: (J)V + */ +void Java_org_rocksdb_Transaction_setSnapshot(JNIEnv* env, jobject jobj, + jlong jhandle) { + auto* txn = reinterpret_cast(jhandle); + txn->SetSnapshot(); +} + +/* + * Class: org_rocksdb_Transaction + * Method: setSnapshotOnNextOperation + * Signature: (J)V + */ +void Java_org_rocksdb_Transaction_setSnapshotOnNextOperation__J(JNIEnv* env, + jobject jobj, jlong jhandle) { + auto* txn = reinterpret_cast(jhandle); + txn->SetSnapshotOnNextOperation(nullptr); +} + +/* + * Class: org_rocksdb_Transaction + * Method: setSnapshotOnNextOperation + * Signature: (JJ)V + */ +void Java_org_rocksdb_Transaction_setSnapshotOnNextOperation__JJ(JNIEnv* env, + jobject jobj, jlong jhandle, jlong jtxn_notifier_handle) { + auto* txn = reinterpret_cast(jhandle); + auto* txn_notifier = + reinterpret_cast*>( + jtxn_notifier_handle); + txn->SetSnapshotOnNextOperation(*txn_notifier); +} + +/* + * Class: org_rocksdb_Transaction + * Method: getSnapshot + * Signature: (J)J + */ +jlong Java_org_rocksdb_Transaction_getSnapshot(JNIEnv* env, jobject jobj, + jlong jhandle) { + auto* txn = reinterpret_cast(jhandle); + const rocksdb::Snapshot* snapshot = txn->GetSnapshot(); + return reinterpret_cast(snapshot); +} + +/* + * Class: org_rocksdb_Transaction + * Method: clearSnapshot + * Signature: (J)V + */ +void Java_org_rocksdb_Transaction_clearSnapshot(JNIEnv* env, jobject jobj, + jlong jhandle) { + auto* txn = reinterpret_cast(jhandle); + txn->ClearSnapshot(); +} + +/* + * Class: org_rocksdb_Transaction + * Method: prepare + * Signature: (J)V + */ +void Java_org_rocksdb_Transaction_prepare(JNIEnv* env, jobject jobj, + jlong jhandle) { + auto* txn = reinterpret_cast(jhandle); + rocksdb::Status s = txn->Prepare(); + if (!s.ok()) { + rocksdb::RocksDBExceptionJni::ThrowNew(env, s); + } +} + +/* + * Class: org_rocksdb_Transaction + * Method: commit + * Signature: (J)V + */ +void Java_org_rocksdb_Transaction_commit(JNIEnv* env, jobject jobj, + jlong jhandle) { + auto* txn = reinterpret_cast(jhandle); + rocksdb::Status s = txn->Commit(); + if (!s.ok()) { + rocksdb::RocksDBExceptionJni::ThrowNew(env, s); + } +} + +/* + * Class: org_rocksdb_Transaction + * Method: rollback + * Signature: (J)V + */ +void Java_org_rocksdb_Transaction_rollback(JNIEnv* env, jobject jobj, + jlong jhandle) { + auto* txn = reinterpret_cast(jhandle); + rocksdb::Status s = txn->Rollback(); + if (!s.ok()) { + rocksdb::RocksDBExceptionJni::ThrowNew(env, s); + } +} + +/* + * Class: org_rocksdb_Transaction + * Method: setSavePoint + * Signature: (J)V + */ +void Java_org_rocksdb_Transaction_setSavePoint(JNIEnv* env, jobject jobj, + jlong jhandle) { + auto* txn = reinterpret_cast(jhandle); + txn->SetSavePoint(); +} + +/* + * Class: org_rocksdb_Transaction + * Method: 
rollbackToSavePoint + * Signature: (J)V + */ +void Java_org_rocksdb_Transaction_rollbackToSavePoint(JNIEnv* env, jobject jobj, + jlong jhandle) { + auto* txn = reinterpret_cast(jhandle); + rocksdb::Status s = txn->RollbackToSavePoint(); + if (!s.ok()) { + rocksdb::RocksDBExceptionJni::ThrowNew(env, s); + } +} + +typedef std::function FnGet; + +// TODO(AR) consider refactoring to share this between here and rocksjni.cc +jbyteArray txn_get_helper(JNIEnv* env, const FnGet &fn_get, + const jlong &jread_options_handle, const jbyteArray &jkey, + const jint &jkey_part_len) { + jbyte* key = env->GetByteArrayElements(jkey, nullptr); + if (key == nullptr) { + // exception thrown: OutOfMemoryError + return nullptr; + } + rocksdb::Slice key_slice(reinterpret_cast(key), jkey_part_len); + + auto* read_options = + reinterpret_cast(jread_options_handle); + std::string value; + rocksdb::Status s = fn_get(*read_options, key_slice, &value); + + // trigger java unref on key. + // by passing JNI_ABORT, it will simply release the reference without + // copying the result back to the java byte array. + env->ReleaseByteArrayElements(jkey, key, JNI_ABORT); + + if (s.IsNotFound()) { + return nullptr; + } + + if (s.ok()) { + jbyteArray jret_value = + env->NewByteArray(static_cast(value.size())); + if (jret_value == nullptr) { + // exception thrown: OutOfMemoryError + return nullptr; + } + env->SetByteArrayRegion(jret_value, 0, static_cast(value.size()), + reinterpret_cast(value.c_str())); + if (env->ExceptionCheck()) { + // exception thrown: ArrayIndexOutOfBoundsException + return nullptr; + } + return jret_value; + } + + rocksdb::RocksDBExceptionJni::ThrowNew(env, s); + return nullptr; +} + +/* + * Class: org_rocksdb_Transaction + * Method: get + * Signature: (JJ[BIJ)[B + */ +jbyteArray Java_org_rocksdb_Transaction_get__JJ_3BIJ(JNIEnv* env, jobject jobj, + jlong jhandle, jlong jread_options_handle, jbyteArray jkey, jint jkey_part_len, + jlong jcolumn_family_handle) { + auto* txn = reinterpret_cast(jhandle); + auto* column_family_handle = + reinterpret_cast(jcolumn_family_handle); + FnGet fn_get = + std::bind( + &rocksdb::Transaction::Get, txn, _1, column_family_handle, _2, _3); + return txn_get_helper(env, fn_get, jread_options_handle, jkey, + jkey_part_len); +} + +/* + * Class: org_rocksdb_Transaction + * Method: get + * Signature: (JJ[BI)[B + */ +jbyteArray Java_org_rocksdb_Transaction_get__JJ_3BI(JNIEnv* env, jobject jobj, + jlong jhandle, jlong jread_options_handle, jbyteArray jkey, + jint jkey_part_len) { + auto* txn = reinterpret_cast(jhandle); + FnGet fn_get = + std::bind( + &rocksdb::Transaction::Get, txn, _1, _2, _3); + return txn_get_helper(env, fn_get, jread_options_handle, jkey, + jkey_part_len); +} + +// TODO(AR) consider refactoring to share this between here and rocksjni.cc +// used by txn_multi_get_helper below +std::vector txn_column_families_helper( + JNIEnv* env, jlongArray jcolumn_family_handles, bool* has_exception) { + std::vector cf_handles; + if (jcolumn_family_handles != nullptr) { + const jsize len_cols = env->GetArrayLength(jcolumn_family_handles); + if (len_cols > 0) { + if (env->EnsureLocalCapacity(len_cols) != 0) { + // out of memory + *has_exception = JNI_TRUE; + return std::vector(); + } + + jlong* jcfh = env->GetLongArrayElements(jcolumn_family_handles, nullptr); + if (jcfh == nullptr) { + // exception thrown: OutOfMemoryError + *has_exception = JNI_TRUE; + return std::vector(); + } + for (int i = 0; i < len_cols; i++) { + auto* cf_handle = + reinterpret_cast(jcfh[i]); + 
cf_handles.push_back(cf_handle); + } + env->ReleaseLongArrayElements(jcolumn_family_handles, jcfh, JNI_ABORT); + } + } + return cf_handles; +} + +typedef std::function ( + const rocksdb::ReadOptions&, + const std::vector&, + std::vector*)> FnMultiGet; + +void free_key_parts(JNIEnv* env, std::vector> key_parts_to_free) { + for (std::vector>::size_type i = 0; + i < key_parts_to_free.size(); i++) { + jobject jk; + jbyteArray jk_ba; + jbyte* jk_val; + std::tie(jk_ba, jk_val, jk) = key_parts_to_free[i]; + env->ReleaseByteArrayElements(jk_ba, jk_val, JNI_ABORT); + env->DeleteLocalRef(jk); + } +} + +// TODO(AR) consider refactoring to share this between here and rocksjni.cc +// cf multi get +jobjectArray txn_multi_get_helper(JNIEnv* env, const FnMultiGet &fn_multi_get, + const jlong &jread_options_handle, const jobjectArray &jkey_parts) { + const jsize len_key_parts = env->GetArrayLength(jkey_parts); + if (env->EnsureLocalCapacity(len_key_parts) != 0) { + // out of memory + return nullptr; + } + + std::vector key_parts; + std::vector> key_parts_to_free; + for (int i = 0; i < len_key_parts; i++) { + const jobject jk = env->GetObjectArrayElement(jkey_parts, i); + if (env->ExceptionCheck()) { + // exception thrown: ArrayIndexOutOfBoundsException + free_key_parts(env, key_parts_to_free); + return nullptr; + } + jbyteArray jk_ba = reinterpret_cast(jk); + const jsize len_key = env->GetArrayLength(jk_ba); + if (env->EnsureLocalCapacity(len_key) != 0) { + // out of memory + env->DeleteLocalRef(jk); + free_key_parts(env, key_parts_to_free); + return nullptr; + } + jbyte* jk_val = env->GetByteArrayElements(jk_ba, nullptr); + if (jk_val == nullptr) { + // exception thrown: OutOfMemoryError + env->DeleteLocalRef(jk); + free_key_parts(env, key_parts_to_free); + return nullptr; + } + + rocksdb::Slice key_slice(reinterpret_cast(jk_val), len_key); + key_parts.push_back(key_slice); + + key_parts_to_free.push_back(std::make_tuple(jk_ba, jk_val, jk)); + } + + auto* read_options = + reinterpret_cast(jread_options_handle); + std::vector value_parts; + std::vector s = + fn_multi_get(*read_options, key_parts, &value_parts); + + // free up allocated byte arrays + free_key_parts(env, key_parts_to_free); + + // prepare the results + const jclass jcls_ba = env->FindClass("[B"); + jobjectArray jresults = + env->NewObjectArray(static_cast(s.size()), jcls_ba, nullptr); + if (jresults == nullptr) { + // exception thrown: OutOfMemoryError + return nullptr; + } + + // add to the jresults + for (std::vector::size_type i = 0; i != s.size(); i++) { + if (s[i].ok()) { + jbyteArray jentry_value = + env->NewByteArray(static_cast(value_parts[i].size())); + if (jentry_value == nullptr) { + // exception thrown: OutOfMemoryError + return nullptr; + } + + env->SetByteArrayRegion( + jentry_value, 0, static_cast(value_parts[i].size()), + reinterpret_cast(value_parts[i].c_str())); + if (env->ExceptionCheck()) { + // exception thrown: ArrayIndexOutOfBoundsException + env->DeleteLocalRef(jentry_value); + return nullptr; + } + + env->SetObjectArrayElement(jresults, static_cast(i), jentry_value); + env->DeleteLocalRef(jentry_value); + } + } + + return jresults; +} + +/* + * Class: org_rocksdb_Transaction + * Method: multiGet + * Signature: (JJ[[B[J)[[B + */ +jobjectArray Java_org_rocksdb_Transaction_multiGet__JJ_3_3B_3J(JNIEnv* env, + jobject jobj, jlong jhandle, jlong jread_options_handle, + jobjectArray jkey_parts, jlongArray jcolumn_family_handles) { + bool has_exception = false; + const std::vector column_family_handles = + 
txn_column_families_helper(env, jcolumn_family_handles, &has_exception); + if (has_exception) { + // exception thrown: OutOfMemoryError + return nullptr; + } + auto* txn = reinterpret_cast(jhandle); + FnMultiGet fn_multi_get = + std::bind (rocksdb::Transaction::*) (const rocksdb::ReadOptions&, const std::vector&, const std::vector&, std::vector*)>( + &rocksdb::Transaction::MultiGet, txn, _1, column_family_handles, _2, + _3); + return txn_multi_get_helper(env, fn_multi_get, jread_options_handle, + jkey_parts); +} + +/* + * Class: org_rocksdb_Transaction + * Method: multiGet + * Signature: (JJ[[B)[[B + */ +jobjectArray Java_org_rocksdb_Transaction_multiGet__JJ_3_3B(JNIEnv* env, + jobject jobj, jlong jhandle, jlong jread_options_handle, + jobjectArray jkey_parts) { + auto* txn = reinterpret_cast(jhandle); + FnMultiGet fn_multi_get = + std::bind (rocksdb::Transaction::*) (const rocksdb::ReadOptions&, const std::vector&, std::vector*)>( + &rocksdb::Transaction::MultiGet, txn, _1, _2, _3); + return txn_multi_get_helper(env, fn_multi_get, jread_options_handle, + jkey_parts); +} + +/* + * Class: org_rocksdb_Transaction + * Method: getForUpdate + * Signature: (JJ[BIJZ)[B + */ +jbyteArray Java_org_rocksdb_Transaction_getForUpdate__JJ_3BIJZ(JNIEnv* env, + jobject jobj, jlong jhandle, jlong jread_options_handle, jbyteArray jkey, + jint jkey_part_len, jlong jcolumn_family_handle, jboolean jexclusive) { + auto* column_family_handle = + reinterpret_cast(jcolumn_family_handle); + auto* txn = reinterpret_cast(jhandle); + FnGet fn_get_for_update = + std::bind( + &rocksdb::Transaction::GetForUpdate, txn, _1, column_family_handle, + _2, _3, jexclusive); + return txn_get_helper(env, fn_get_for_update, jread_options_handle, jkey, + jkey_part_len); +} + +/* + * Class: org_rocksdb_Transaction + * Method: getForUpdate + * Signature: (JJ[BIZ)[B + */ +jbyteArray Java_org_rocksdb_Transaction_getForUpdate__JJ_3BIZ(JNIEnv* env, + jobject jobj, jlong jhandle, jlong jread_options_handle, jbyteArray jkey, + jint jkey_part_len, jboolean jexclusive) { + auto* txn = reinterpret_cast(jhandle); + FnGet fn_get_for_update = + std::bind( + &rocksdb::Transaction::GetForUpdate, txn, _1, _2, _3, jexclusive); + return txn_get_helper(env, fn_get_for_update, jread_options_handle, jkey, + jkey_part_len); +} + +/* + * Class: org_rocksdb_Transaction + * Method: multiGetForUpdate + * Signature: (JJ[[B[J)[[B + */ +jobjectArray Java_org_rocksdb_Transaction_multiGetForUpdate__JJ_3_3B_3J( + JNIEnv* env, jobject jobj, jlong jhandle, jlong jread_options_handle, + jobjectArray jkey_parts, jlongArray jcolumn_family_handles) { + bool has_exception = false; + const std::vector column_family_handles = + txn_column_families_helper(env, jcolumn_family_handles, &has_exception); + if (has_exception) { + // exception thrown: OutOfMemoryError + return nullptr; + } + auto* txn = reinterpret_cast(jhandle); + FnMultiGet fn_multi_get_for_update = + std::bind (rocksdb::Transaction::*) (const rocksdb::ReadOptions&, const std::vector&, const std::vector&, std::vector*)>( + &rocksdb::Transaction::MultiGetForUpdate, txn, _1, + column_family_handles, _2, _3); + return txn_multi_get_helper(env, fn_multi_get_for_update, + jread_options_handle, jkey_parts); +} + +/* + * Class: org_rocksdb_Transaction + * Method: multiGetForUpdate + * Signature: (JJ[[B)[[B + */ +jobjectArray Java_org_rocksdb_Transaction_multiGetForUpdate__JJ_3_3B( + JNIEnv* env, jobject jobj, jlong jhandle, jlong jread_options_handle, + jobjectArray jkey_parts) { + auto* txn = reinterpret_cast(jhandle); 
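+  // MultiGetForUpdate is overloaded, so std::bind cannot deduce which
+  // member function is meant; the static_cast below selects the exact
+  // member-function-pointer type of the overload without column family
+  // handles.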
+  FnMultiGet fn_multi_get_for_update =
+      std::bind<std::vector<rocksdb::Status>
+          (rocksdb::Transaction::*)(const rocksdb::ReadOptions&,
+          const std::vector<rocksdb::Slice>&, std::vector<std::string>*)>(
+          &rocksdb::Transaction::MultiGetForUpdate, txn, _1, _2, _3);
+  return txn_multi_get_helper(env, fn_multi_get_for_update,
+      jread_options_handle, jkey_parts);
+}
+
+/*
+ * Class: org_rocksdb_Transaction
+ * Method: getIterator
+ * Signature: (JJ)J
+ */
+jlong Java_org_rocksdb_Transaction_getIterator__JJ(JNIEnv* env, jobject jobj,
+    jlong jhandle, jlong jread_options_handle) {
+  auto* txn = reinterpret_cast<rocksdb::Transaction*>(jhandle);
+  auto* read_options =
+      reinterpret_cast<rocksdb::ReadOptions*>(jread_options_handle);
+  return reinterpret_cast<jlong>(txn->GetIterator(*read_options));
+}
+
+/*
+ * Class: org_rocksdb_Transaction
+ * Method: getIterator
+ * Signature: (JJJ)J
+ */
+jlong Java_org_rocksdb_Transaction_getIterator__JJJ(JNIEnv* env, jobject jobj,
+    jlong jhandle, jlong jread_options_handle, jlong jcolumn_family_handle) {
+  auto* txn = reinterpret_cast<rocksdb::Transaction*>(jhandle);
+  auto* read_options =
+      reinterpret_cast<rocksdb::ReadOptions*>(jread_options_handle);
+  auto* column_family_handle =
+      reinterpret_cast<rocksdb::ColumnFamilyHandle*>(jcolumn_family_handle);
+  return reinterpret_cast<jlong>(
+      txn->GetIterator(*read_options, column_family_handle));
+}
+
+typedef std::function<rocksdb::Status
+    (const rocksdb::Slice&, const rocksdb::Slice&)> FnWriteKV;
+
+// TODO(AR) consider refactoring to share this between here and rocksjni.cc
+void txn_write_kv_helper(JNIEnv* env, const FnWriteKV &fn_write_kv,
+    const jbyteArray &jkey, const jint &jkey_part_len,
+    const jbyteArray &jval, const jint &jval_len) {
+  jbyte* key = env->GetByteArrayElements(jkey, nullptr);
+  if (key == nullptr) {
+    // exception thrown: OutOfMemoryError
+    return;
+  }
+  jbyte* value = env->GetByteArrayElements(jval, nullptr);
+  if (value == nullptr) {
+    // exception thrown: OutOfMemoryError
+    env->ReleaseByteArrayElements(jkey, key, JNI_ABORT);
+    return;
+  }
+  rocksdb::Slice key_slice(reinterpret_cast<char*>(key), jkey_part_len);
+  rocksdb::Slice value_slice(reinterpret_cast<char*>(value), jval_len);
+
+  rocksdb::Status s = fn_write_kv(key_slice, value_slice);
+
+  // trigger java unref on key.
+  // by passing JNI_ABORT, it will simply release the reference without
+  // copying the result back to the java byte array.
+ env->ReleaseByteArrayElements(jval, value, JNI_ABORT); + env->ReleaseByteArrayElements(jkey, key, JNI_ABORT); + + if (s.ok()) { + return; + } + rocksdb::RocksDBExceptionJni::ThrowNew(env, s); +} + +/* + * Class: org_rocksdb_Transaction + * Method: put + * Signature: (J[BI[BIJ)V + */ +void Java_org_rocksdb_Transaction_put__J_3BI_3BIJ(JNIEnv* env, jobject jobj, + jlong jhandle, jbyteArray jkey, jint jkey_part_len, jbyteArray jval, + jint jval_len, jlong jcolumn_family_handle) { + auto* txn = reinterpret_cast(jhandle); + auto* column_family_handle = + reinterpret_cast(jcolumn_family_handle); + FnWriteKV fn_put = + std::bind( + &rocksdb::Transaction::Put, txn, column_family_handle, _1, _2); + txn_write_kv_helper(env, fn_put, jkey, jkey_part_len, jval, jval_len); +} + +/* + * Class: org_rocksdb_Transaction + * Method: put + * Signature: (J[BI[BI)V + */ +void Java_org_rocksdb_Transaction_put__J_3BI_3BI(JNIEnv* env, jobject jobj, + jlong jhandle, jbyteArray jkey, jint jkey_part_len, jbyteArray jval, + jint jval_len) { + auto* txn = reinterpret_cast(jhandle); + FnWriteKV fn_put = + std::bind( + &rocksdb::Transaction::Put, txn, _1, _2); + txn_write_kv_helper(env, fn_put, jkey, jkey_part_len, jval, jval_len); +} + +typedef std::function FnWriteKVParts; + +void free_key_value_parts(JNIEnv* env, const int32_t len, + std::tuple jkey_parts_to_free[], + std::tuple jvalue_parts_to_free[]) { + for (int32_t i = len - 1; i >= 0; --i) { + jbyteArray jba_value_part; + jbyte* jvalue_part; + jobject jobj_value_part; + std::tie(jba_value_part, jvalue_part, jobj_value_part) = + jvalue_parts_to_free[i]; + env->ReleaseByteArrayElements(jba_value_part, jvalue_part, JNI_ABORT); + env->DeleteLocalRef(jobj_value_part); + + jbyteArray jba_key_part; + jbyte* jkey_part; + jobject jobj_key_part; + std::tie(jba_key_part, jkey_part, jobj_key_part) = + jkey_parts_to_free[i]; + env->ReleaseByteArrayElements(jba_key_part, jkey_part, JNI_ABORT); + env->DeleteLocalRef(jobj_key_part); + } +} + +// TODO(AR) consider refactoring to share this between here and rocksjni.cc +void txn_write_kv_parts_helper(JNIEnv* env, + const FnWriteKVParts &fn_write_kv_parts, const jobjectArray &jkey_parts, + const jint &jkey_parts_len, const jobjectArray &jvalue_parts, + const jint &jvalue_parts_len) { + assert(jkey_parts_len == jvalue_parts_len); + + rocksdb::Slice key_parts[jkey_parts_len]; + rocksdb::Slice value_parts[jvalue_parts_len]; + std::tuple jkey_parts_to_free[jkey_parts_len]; + std::tuple jvalue_parts_to_free[jvalue_parts_len]; + + // convert java key_parts/value_parts byte[][] to Slice(s) + for (jsize i = 0; i < jkey_parts_len; ++i) { + const jobject jobj_key_part = env->GetObjectArrayElement(jkey_parts, i); + if (env->ExceptionCheck()) { + // exception thrown: ArrayIndexOutOfBoundsException + free_key_value_parts(env, jkey_parts_len, jkey_parts_to_free, + jvalue_parts_to_free); + return; + } + const jobject jobj_value_part = env->GetObjectArrayElement(jvalue_parts, i); + if (env->ExceptionCheck()) { + // exception thrown: ArrayIndexOutOfBoundsException + env->DeleteLocalRef(jobj_key_part); + free_key_value_parts(env, jkey_parts_len, jkey_parts_to_free, + jvalue_parts_to_free); + return; + } + + const jbyteArray jba_key_part = reinterpret_cast(jobj_key_part); + const jsize jkey_part_len = env->GetArrayLength(jba_key_part); + if (env->EnsureLocalCapacity(jkey_part_len) != 0) { + // out of memory + env->DeleteLocalRef(jobj_value_part); + env->DeleteLocalRef(jobj_key_part); + free_key_value_parts(env, jkey_parts_len, jkey_parts_to_free, + 
jvalue_parts_to_free); + return; + } + jbyte* jkey_part = env->GetByteArrayElements(jba_key_part, nullptr); + if (jkey_part == nullptr) { + // exception thrown: OutOfMemoryError + env->DeleteLocalRef(jobj_value_part); + env->DeleteLocalRef(jobj_key_part); + free_key_value_parts(env, jkey_parts_len, jkey_parts_to_free, + jvalue_parts_to_free); + return; + } + + const jbyteArray jba_value_part = reinterpret_cast(jobj_value_part); + const jsize jvalue_part_len = env->GetArrayLength(jba_value_part); + if (env->EnsureLocalCapacity(jvalue_part_len) != 0) { + // out of memory + env->DeleteLocalRef(jobj_value_part); + env->DeleteLocalRef(jobj_key_part); + free_key_value_parts(env, jkey_parts_len, jkey_parts_to_free, + jvalue_parts_to_free); + return; + } + jbyte* jvalue_part = env->GetByteArrayElements(jba_value_part, nullptr); + if (jvalue_part == nullptr) { + // exception thrown: OutOfMemoryError + env->ReleaseByteArrayElements(jba_value_part, jvalue_part, JNI_ABORT); + env->DeleteLocalRef(jobj_value_part); + env->DeleteLocalRef(jobj_key_part); + free_key_value_parts(env, jkey_parts_len, jkey_parts_to_free, + jvalue_parts_to_free); + return; + } + + jkey_parts_to_free[i] = std::tuple( + jba_key_part, jkey_part, jobj_key_part); + jvalue_parts_to_free[i] = std::tuple( + jba_value_part, jvalue_part, jobj_value_part); + + key_parts[i] = + rocksdb::Slice(reinterpret_cast(jkey_part), jkey_part_len); + value_parts[i] = + rocksdb::Slice(reinterpret_cast(jvalue_part), jvalue_part_len); + } + + // call the write_multi function + rocksdb::Status s = fn_write_kv_parts( + rocksdb::SliceParts(key_parts, jkey_parts_len), + rocksdb::SliceParts(value_parts, jvalue_parts_len)); + + // cleanup temporary memory + free_key_value_parts(env, jkey_parts_len, jkey_parts_to_free, + jvalue_parts_to_free); + + // return + if (s.ok()) { + return; + } + + rocksdb::RocksDBExceptionJni::ThrowNew(env, s); +} + +/* + * Class: org_rocksdb_Transaction + * Method: put + * Signature: (J[[BI[[BIJ)V + */ +void Java_org_rocksdb_Transaction_put__J_3_3BI_3_3BIJ(JNIEnv* env, + jobject jobj, jlong jhandle, jobjectArray jkey_parts, jint jkey_parts_len, + jobjectArray jvalue_parts, jint jvalue_parts_len, + jlong jcolumn_family_handle) { + auto* txn = reinterpret_cast(jhandle); + auto* column_family_handle = + reinterpret_cast(jcolumn_family_handle); + FnWriteKVParts fn_put_parts = + std::bind( + &rocksdb::Transaction::Put, txn, column_family_handle, _1, _2); + txn_write_kv_parts_helper(env, fn_put_parts, jkey_parts, jkey_parts_len, + jvalue_parts, jvalue_parts_len); +} + +/* + * Class: org_rocksdb_Transaction + * Method: put + * Signature: (J[[BI[[BI)V + */ +void Java_org_rocksdb_Transaction_put__J_3_3BI_3_3BI(JNIEnv* env, + jobject jobj, jlong jhandle, jobjectArray jkey_parts, jint jkey_parts_len, + jobjectArray jvalue_parts, jint jvalue_parts_len) { + auto* txn = reinterpret_cast(jhandle); + FnWriteKVParts fn_put_parts = + std::bind( + &rocksdb::Transaction::Put, txn, _1, _2); + txn_write_kv_parts_helper(env, fn_put_parts, jkey_parts, jkey_parts_len, + jvalue_parts, jvalue_parts_len); +} + +/* + * Class: org_rocksdb_Transaction + * Method: merge + * Signature: (J[BI[BIJ)V + */ +void Java_org_rocksdb_Transaction_merge__J_3BI_3BIJ(JNIEnv* env, jobject jobj, + jlong jhandle, jbyteArray jkey, jint jkey_part_len, jbyteArray jval, + jint jval_len, jlong jcolumn_family_handle) { + auto* txn = reinterpret_cast(jhandle); + auto* column_family_handle = + reinterpret_cast(jcolumn_family_handle); + FnWriteKV fn_merge = + std::bind( + 
&rocksdb::Transaction::Merge, txn, column_family_handle, _1, _2); + txn_write_kv_helper(env, fn_merge, jkey, jkey_part_len, jval, jval_len); +} + +/* + * Class: org_rocksdb_Transaction + * Method: merge + * Signature: (J[BI[BI)V + */ +void Java_org_rocksdb_Transaction_merge__J_3BI_3BI(JNIEnv* env, jobject jobj, + jlong jhandle, jbyteArray jkey, jint jkey_part_len, jbyteArray jval, + jint jval_len) { + auto* txn = reinterpret_cast(jhandle); + FnWriteKV fn_merge = + std::bind( + &rocksdb::Transaction::Merge, txn, _1, _2); + txn_write_kv_helper(env, fn_merge, jkey, jkey_part_len, jval, jval_len); +} + +typedef std::function FnWriteK; + +// TODO(AR) consider refactoring to share this between here and rocksjni.cc +void txn_write_k_helper(JNIEnv* env, const FnWriteK &fn_write_k, + const jbyteArray &jkey, const jint &jkey_part_len) { + jbyte* key = env->GetByteArrayElements(jkey, nullptr); + if (key == nullptr) { + // exception thrown: OutOfMemoryError + return; + } + rocksdb::Slice key_slice(reinterpret_cast(key), jkey_part_len); + + rocksdb::Status s = fn_write_k(key_slice); + + // trigger java unref on key. + // by passing JNI_ABORT, it will simply release the reference without + // copying the result back to the java byte array. + env->ReleaseByteArrayElements(jkey, key, JNI_ABORT); + + if (s.ok()) { + return; + } + rocksdb::RocksDBExceptionJni::ThrowNew(env, s); +} + +/* + * Class: org_rocksdb_Transaction + * Method: delete + * Signature: (J[BIJ)V + */ +void Java_org_rocksdb_Transaction_delete__J_3BIJ(JNIEnv* env, jobject jobj, + jlong jhandle, jbyteArray jkey, jint jkey_part_len, + jlong jcolumn_family_handle) { + auto* txn = reinterpret_cast(jhandle); + auto* column_family_handle = + reinterpret_cast(jcolumn_family_handle); + FnWriteK fn_delete = + std::bind( + &rocksdb::Transaction::Delete, txn, column_family_handle, _1); + txn_write_k_helper(env, fn_delete, jkey, jkey_part_len); +} + +/* + * Class: org_rocksdb_Transaction + * Method: delete + * Signature: (J[BI)V + */ +void Java_org_rocksdb_Transaction_delete__J_3BI(JNIEnv* env, jobject jobj, + jlong jhandle, jbyteArray jkey, jint jkey_part_len) { + auto* txn = reinterpret_cast(jhandle); + FnWriteK fn_delete = + std::bind( + &rocksdb::Transaction::Delete, txn, _1); + txn_write_k_helper(env, fn_delete, jkey, jkey_part_len); +} + +typedef std::function FnWriteKParts; + +void free_key_parts(JNIEnv* env, const int32_t len, + std::tuple jkey_parts_to_free[]) { + for (int32_t i = len - 1; i >= 0; --i) { + jbyteArray jba_key_part; + jbyte* jkey; + jobject jobj_key_part; + std::tie(jba_key_part, jkey, jobj_key_part) = jkey_parts_to_free[i]; + env->ReleaseByteArrayElements(jba_key_part, jkey, JNI_ABORT); + env->DeleteLocalRef(jobj_key_part); + } +} + +// TODO(AR) consider refactoring to share this between here and rocksjni.cc +void txn_write_k_parts_helper(JNIEnv* env, + const FnWriteKParts &fn_write_k_parts, const jobjectArray &jkey_parts, + const jint &jkey_parts_len) { + + rocksdb::Slice key_parts[jkey_parts_len]; + std::tuple jkey_parts_to_free[jkey_parts_len]; + + // convert java key_parts byte[][] to Slice(s) + for (jint i = 0; i < jkey_parts_len; ++i) { + const jobject jobj_key_part = env->GetObjectArrayElement(jkey_parts, i); + if (env->ExceptionCheck()) { + // exception thrown: ArrayIndexOutOfBoundsException + free_key_parts(env, jkey_parts_len, jkey_parts_to_free); + return; + } + + const jbyteArray jba_key_part = reinterpret_cast(jobj_key_part); + const jsize jkey_part_len = env->GetArrayLength(jba_key_part); + if 
(env->EnsureLocalCapacity(jkey_part_len) != 0) { + // out of memory + env->DeleteLocalRef(jobj_key_part); + free_key_parts(env, jkey_parts_len, jkey_parts_to_free); + return; + } + jbyte* jkey_part = env->GetByteArrayElements(jba_key_part, nullptr); + if (jkey_part == nullptr) { + // exception thrown: OutOfMemoryError + env->DeleteLocalRef(jobj_key_part); + free_key_parts(env, jkey_parts_len, jkey_parts_to_free); + return; + } + + jkey_parts_to_free[i] = std::tuple( + jba_key_part, jkey_part, jobj_key_part); + + key_parts[i] = rocksdb::Slice(reinterpret_cast(jkey_part), jkey_part_len); + } + + // call the write_multi function + rocksdb::Status s = fn_write_k_parts( + rocksdb::SliceParts(key_parts, jkey_parts_len)); + + // cleanup temporary memory + free_key_parts(env, jkey_parts_len, jkey_parts_to_free); + + // return + if (s.ok()) { + return; + } + rocksdb::RocksDBExceptionJni::ThrowNew(env, s); +} + +/* + * Class: org_rocksdb_Transaction + * Method: delete + * Signature: (J[[BIJ)V + */ +void Java_org_rocksdb_Transaction_delete__J_3_3BIJ(JNIEnv* env, jobject jobj, + jlong jhandle, jobjectArray jkey_parts, jint jkey_parts_len, + jlong jcolumn_family_handle) { + auto* txn = reinterpret_cast(jhandle); + auto* column_family_handle = + reinterpret_cast(jcolumn_family_handle); + FnWriteKParts fn_delete_parts = + std::bind( + &rocksdb::Transaction::Delete, txn, column_family_handle, _1); + txn_write_k_parts_helper(env, fn_delete_parts, jkey_parts, jkey_parts_len); +} + +/* + * Class: org_rocksdb_Transaction + * Method: delete + * Signature: (J[[BI)V + */ +void Java_org_rocksdb_Transaction_delete__J_3_3BI(JNIEnv* env, jobject jobj, + jlong jhandle, jobjectArray jkey_parts, jint jkey_parts_len) { + auto* txn = reinterpret_cast(jhandle); + FnWriteKParts fn_delete_parts = + std::bind( + &rocksdb::Transaction::Delete, txn, _1); + txn_write_k_parts_helper(env, fn_delete_parts, jkey_parts, jkey_parts_len); +} + +/* + * Class: org_rocksdb_Transaction + * Method: singleDelete + * Signature: (J[BIJ)V + */ +void Java_org_rocksdb_Transaction_singleDelete__J_3BIJ(JNIEnv* env, + jobject jobj, jlong jhandle, jbyteArray jkey, jint jkey_part_len, + jlong jcolumn_family_handle) { + auto* txn = reinterpret_cast(jhandle); + auto* column_family_handle = + reinterpret_cast(jcolumn_family_handle); + FnWriteK fn_single_delete = + std::bind( + &rocksdb::Transaction::SingleDelete, txn, column_family_handle, _1); + txn_write_k_helper(env, fn_single_delete, jkey, jkey_part_len); +} + +/* + * Class: org_rocksdb_Transaction + * Method: singleDelete + * Signature: (J[BI)V + */ +void Java_org_rocksdb_Transaction_singleDelete__J_3BI(JNIEnv* env, + jobject jobj, jlong jhandle, jbyteArray jkey, jint jkey_part_len) { + auto* txn = reinterpret_cast(jhandle); + FnWriteK fn_single_delete = + std::bind( + &rocksdb::Transaction::SingleDelete, txn, _1); + txn_write_k_helper(env, fn_single_delete, jkey, jkey_part_len); +} + +/* + * Class: org_rocksdb_Transaction + * Method: singleDelete + * Signature: (J[[BIJ)V + */ +void Java_org_rocksdb_Transaction_singleDelete__J_3_3BIJ(JNIEnv* env, + jobject jobj, jlong jhandle, jobjectArray jkey_parts, jint jkey_parts_len, + jlong jcolumn_family_handle) { + auto* txn = reinterpret_cast(jhandle); + auto* column_family_handle = + reinterpret_cast(jcolumn_family_handle); + FnWriteKParts fn_single_delete_parts = + std::bind( + &rocksdb::Transaction::SingleDelete, txn, column_family_handle, _1); + txn_write_k_parts_helper(env, fn_single_delete_parts, jkey_parts, + jkey_parts_len); +} + +/* + * Class: 
org_rocksdb_Transaction + * Method: singleDelete + * Signature: (J[[BI)V + */ +void Java_org_rocksdb_Transaction_singleDelete__J_3_3BI(JNIEnv* env, + jobject jobj, jlong jhandle, jobjectArray jkey_parts, jint jkey_parts_len) { + auto* txn = reinterpret_cast(jhandle); + FnWriteKParts fn_single_delete_parts = + std::bind( + &rocksdb::Transaction::SingleDelete, txn, _1); + txn_write_k_parts_helper(env, fn_single_delete_parts, jkey_parts, + jkey_parts_len); +} + +/* + * Class: org_rocksdb_Transaction + * Method: putUntracked + * Signature: (J[BI[BIJ)V + */ +void Java_org_rocksdb_Transaction_putUntracked__J_3BI_3BIJ(JNIEnv* env, + jobject jobj, jlong jhandle, jbyteArray jkey, jint jkey_part_len, + jbyteArray jval, jint jval_len, jlong jcolumn_family_handle) { + auto* txn = reinterpret_cast(jhandle); + auto* column_family_handle = + reinterpret_cast(jcolumn_family_handle); + FnWriteKV fn_put_untracked = + std::bind( + &rocksdb::Transaction::PutUntracked, txn, column_family_handle, _1, + _2); + txn_write_kv_helper(env, fn_put_untracked, jkey, jkey_part_len, jval, + jval_len); +} + +/* + * Class: org_rocksdb_Transaction + * Method: putUntracked + * Signature: (J[BI[BI)V + */ +void Java_org_rocksdb_Transaction_putUntracked__J_3BI_3BI(JNIEnv* env, + jobject jobj, jlong jhandle, jbyteArray jkey, jint jkey_part_len, + jbyteArray jval, jint jval_len) { + auto* txn = reinterpret_cast(jhandle); + FnWriteKV fn_put_untracked = + std::bind( + &rocksdb::Transaction::PutUntracked, txn, _1, _2); + txn_write_kv_helper(env, fn_put_untracked, jkey, jkey_part_len, jval, + jval_len); +} + +/* + * Class: org_rocksdb_Transaction + * Method: putUntracked + * Signature: (J[[BI[[BIJ)V + */ +void Java_org_rocksdb_Transaction_putUntracked__J_3_3BI_3_3BIJ(JNIEnv* env, + jobject jobj, jlong jhandle, jobjectArray jkey_parts, jint jkey_parts_len, + jobjectArray jvalue_parts, jint jvalue_parts_len, + jlong jcolumn_family_handle) { + auto* txn = reinterpret_cast(jhandle); + auto* column_family_handle = + reinterpret_cast(jcolumn_family_handle); + FnWriteKVParts fn_put_parts_untracked = + std::bind( + &rocksdb::Transaction::PutUntracked, txn, column_family_handle, _1, + _2); + txn_write_kv_parts_helper(env, fn_put_parts_untracked, jkey_parts, + jkey_parts_len, jvalue_parts, jvalue_parts_len); +} + +/* + * Class: org_rocksdb_Transaction + * Method: putUntracked + * Signature: (J[[BI[[BI)V + */ +void Java_org_rocksdb_Transaction_putUntracked__J_3_3BI_3_3BI(JNIEnv* env, + jobject jobj, jlong jhandle, jobjectArray jkey_parts, jint jkey_parts_len, + jobjectArray jvalue_parts, jint jvalue_parts_len) { + auto* txn = reinterpret_cast(jhandle); + FnWriteKVParts fn_put_parts_untracked = + std::bind( + &rocksdb::Transaction::PutUntracked, txn, _1, _2); + txn_write_kv_parts_helper(env, fn_put_parts_untracked, jkey_parts, + jkey_parts_len, jvalue_parts, jvalue_parts_len); +} + +/* + * Class: org_rocksdb_Transaction + * Method: mergeUntracked + * Signature: (J[BI[BIJ)V + */ +void Java_org_rocksdb_Transaction_mergeUntracked__J_3BI_3BIJ(JNIEnv* env, + jobject jobj, jlong jhandle, jbyteArray jkey, jint jkey_part_len, + jbyteArray jval, jint jval_len, jlong jcolumn_family_handle) { + auto* txn = reinterpret_cast(jhandle); + auto* column_family_handle = + reinterpret_cast(jcolumn_family_handle); + FnWriteKV fn_merge_untracked = + std::bind( + &rocksdb::Transaction::MergeUntracked, txn, column_family_handle, _1, + _2); + txn_write_kv_helper(env, fn_merge_untracked, jkey, jkey_part_len, jval, + jval_len); +} + +/* + * Class: org_rocksdb_Transaction 
+ * Method: mergeUntracked + * Signature: (J[BI[BI)V + */ +void Java_org_rocksdb_Transaction_mergeUntracked__J_3BI_3BI(JNIEnv* env, + jobject jobj, jlong jhandle, jbyteArray jkey, jint jkey_part_len, + jbyteArray jval, jint jval_len) { + auto* txn = reinterpret_cast(jhandle); + FnWriteKV fn_merge_untracked = + std::bind( + &rocksdb::Transaction::MergeUntracked, txn, _1, _2); + txn_write_kv_helper(env, fn_merge_untracked, jkey, jkey_part_len, jval, + jval_len); +} + +/* + * Class: org_rocksdb_Transaction + * Method: deleteUntracked + * Signature: (J[BIJ)V + */ +void Java_org_rocksdb_Transaction_deleteUntracked__J_3BIJ(JNIEnv* env, + jobject jobj, jlong jhandle, jbyteArray jkey, jint jkey_part_len, + jlong jcolumn_family_handle) { + auto* txn = reinterpret_cast(jhandle); + auto* column_family_handle = + reinterpret_cast(jcolumn_family_handle); + FnWriteK fn_delete_untracked = + std::bind( + &rocksdb::Transaction::DeleteUntracked, txn, column_family_handle, + _1); + txn_write_k_helper(env, fn_delete_untracked, jkey, jkey_part_len); +} + +/* + * Class: org_rocksdb_Transaction + * Method: deleteUntracked + * Signature: (J[BI)V + */ +void Java_org_rocksdb_Transaction_deleteUntracked__J_3BI(JNIEnv* env, + jobject jobj, jlong jhandle, jbyteArray jkey, jint jkey_part_len) { + auto* txn = reinterpret_cast(jhandle); + FnWriteK fn_delete_untracked = + std::bind( + &rocksdb::Transaction::DeleteUntracked, txn, _1); + txn_write_k_helper(env, fn_delete_untracked, jkey, jkey_part_len); +} + +/* + * Class: org_rocksdb_Transaction + * Method: deleteUntracked + * Signature: (J[[BIJ)V + */ +void Java_org_rocksdb_Transaction_deleteUntracked__J_3_3BIJ(JNIEnv* env, + jobject jobj, jlong jhandle, jobjectArray jkey_parts, jint jkey_parts_len, + jlong jcolumn_family_handle) { + auto* txn = reinterpret_cast(jhandle); + auto* column_family_handle = + reinterpret_cast(jcolumn_family_handle); + FnWriteKParts fn_delete_untracked_parts = + std::bind( + &rocksdb::Transaction::DeleteUntracked, txn, column_family_handle, + _1); + txn_write_k_parts_helper(env, fn_delete_untracked_parts, jkey_parts, + jkey_parts_len); +} + +/* + * Class: org_rocksdb_Transaction + * Method: deleteUntracked + * Signature: (J[[BI)V + */ +void Java_org_rocksdb_Transaction_deleteUntracked__J_3_3BI(JNIEnv* env, + jobject jobj, jlong jhandle, jobjectArray jkey_parts, + jint jkey_parts_len) { + auto* txn = reinterpret_cast(jhandle); + FnWriteKParts fn_delete_untracked_parts = + std::bind( + &rocksdb::Transaction::DeleteUntracked, txn, _1); + txn_write_k_parts_helper(env, fn_delete_untracked_parts, jkey_parts, + jkey_parts_len); +} + +/* + * Class: org_rocksdb_Transaction + * Method: putLogData + * Signature: (J[BI)V + */ +void Java_org_rocksdb_Transaction_putLogData(JNIEnv* env, + jobject jobj, jlong jhandle, jbyteArray jkey, jint jkey_part_len) { + auto* txn = reinterpret_cast(jhandle); + + jbyte* key = env->GetByteArrayElements(jkey, nullptr); + if (key == nullptr) { + // exception thrown: OutOfMemoryError + return; + } + + rocksdb::Slice key_slice(reinterpret_cast(key), jkey_part_len); + txn->PutLogData(key_slice); + + // trigger java unref on key. + // by passing JNI_ABORT, it will simply release the reference without + // copying the result back to the java byte array. 
+ env->ReleaseByteArrayElements(jkey, key, JNI_ABORT); +} + +/* + * Class: org_rocksdb_Transaction + * Method: disableIndexing + * Signature: (J)V + */ +void Java_org_rocksdb_Transaction_disableIndexing(JNIEnv* env, jobject jobj, + jlong jhandle) { + auto* txn = reinterpret_cast(jhandle); + txn->DisableIndexing(); +} + +/* + * Class: org_rocksdb_Transaction + * Method: enableIndexing + * Signature: (J)V + */ +void Java_org_rocksdb_Transaction_enableIndexing(JNIEnv* env, jobject jobj, + jlong jhandle) { + auto* txn = reinterpret_cast(jhandle); + txn->EnableIndexing(); +} + +/* + * Class: org_rocksdb_Transaction + * Method: getNumKeys + * Signature: (J)J + */ +jlong Java_org_rocksdb_Transaction_getNumKeys(JNIEnv* env, jobject jobj, + jlong jhandle) { + auto* txn = reinterpret_cast(jhandle); + return txn->GetNumKeys(); +} + +/* + * Class: org_rocksdb_Transaction + * Method: getNumPuts + * Signature: (J)J + */ +jlong Java_org_rocksdb_Transaction_getNumPuts(JNIEnv* env, jobject jobj, + jlong jhandle) { + auto* txn = reinterpret_cast(jhandle); + return txn->GetNumPuts(); +} + +/* + * Class: org_rocksdb_Transaction + * Method: getNumDeletes + * Signature: (J)J + */ +jlong Java_org_rocksdb_Transaction_getNumDeletes(JNIEnv* env, jobject jobj, + jlong jhandle) { + auto* txn = reinterpret_cast(jhandle); + return txn->GetNumDeletes(); +} + +/* + * Class: org_rocksdb_Transaction + * Method: getNumMerges + * Signature: (J)J + */ +jlong Java_org_rocksdb_Transaction_getNumMerges(JNIEnv* env, jobject jobj, + jlong jhandle) { + auto* txn = reinterpret_cast(jhandle); + return txn->GetNumMerges(); +} + +/* + * Class: org_rocksdb_Transaction + * Method: getElapsedTime + * Signature: (J)J + */ +jlong Java_org_rocksdb_Transaction_getElapsedTime(JNIEnv* env, jobject jobj, + jlong jhandle) { + auto* txn = reinterpret_cast(jhandle); + return txn->GetElapsedTime(); +} + +/* + * Class: org_rocksdb_Transaction + * Method: getWriteBatch + * Signature: (J)J + */ +jlong Java_org_rocksdb_Transaction_getWriteBatch(JNIEnv* env, jobject jobj, + jlong jhandle) { + auto* txn = reinterpret_cast(jhandle); + return reinterpret_cast(txn->GetWriteBatch()); +} + +/* + * Class: org_rocksdb_Transaction + * Method: setLockTimeout + * Signature: (JJ)V + */ +void Java_org_rocksdb_Transaction_setLockTimeout(JNIEnv* env, jobject jobj, + jlong jhandle, jlong jlock_timeout) { + auto* txn = reinterpret_cast(jhandle); + txn->SetLockTimeout(jlock_timeout); +} + +/* + * Class: org_rocksdb_Transaction + * Method: getWriteOptions + * Signature: (J)J + */ +jlong Java_org_rocksdb_Transaction_getWriteOptions(JNIEnv* env, jobject jobj, + jlong jhandle) { + auto* txn = reinterpret_cast(jhandle); + return reinterpret_cast(txn->GetWriteOptions()); +} + +/* + * Class: org_rocksdb_Transaction + * Method: setWriteOptions + * Signature: (JJ)V + */ +void Java_org_rocksdb_Transaction_setWriteOptions(JNIEnv* env, jobject jobj, + jlong jhandle, jlong jwrite_options_handle) { + auto* txn = reinterpret_cast(jhandle); + auto* write_options = + reinterpret_cast(jwrite_options_handle); + txn->SetWriteOptions(*write_options); +} + +/* + * Class: org_rocksdb_Transaction + * Method: undo + * Signature: (J[BIJ)V + */ +void Java_org_rocksdb_Transaction_undoGetForUpdate__J_3BIJ(JNIEnv* env, + jobject jobj, jlong jhandle, jbyteArray jkey, jint jkey_part_len, + jlong jcolumn_family_handle) { + auto* txn = reinterpret_cast(jhandle); + auto* column_family_handle = + reinterpret_cast(jcolumn_family_handle); + jbyte* key = env->GetByteArrayElements(jkey, nullptr); + if (key == 
nullptr) { + // exception thrown: OutOfMemoryError + return; + } + + rocksdb::Slice key_slice(reinterpret_cast(key), jkey_part_len); + txn->UndoGetForUpdate(column_family_handle, key_slice); + + env->ReleaseByteArrayElements(jkey, key, JNI_ABORT); +} + +/* + * Class: org_rocksdb_Transaction + * Method: undoGetForUpdate + * Signature: (J[BI)V + */ +void Java_org_rocksdb_Transaction_undoGetForUpdate__J_3BI(JNIEnv* env, + jobject jobj, jlong jhandle, jbyteArray jkey, jint jkey_part_len) { + auto* txn = reinterpret_cast(jhandle); + jbyte* key = env->GetByteArrayElements(jkey, nullptr); + if (key == nullptr) { + // exception thrown: OutOfMemoryError + return; + } + + rocksdb::Slice key_slice(reinterpret_cast(key), jkey_part_len); + txn->UndoGetForUpdate(key_slice); + + env->ReleaseByteArrayElements(jkey, key, JNI_ABORT); +} + +/* + * Class: org_rocksdb_Transaction + * Method: rebuildFromWriteBatch + * Signature: (JJ)V + */ +void Java_org_rocksdb_Transaction_rebuildFromWriteBatch(JNIEnv* env, + jobject jobj, jlong jhandle, jlong jwrite_batch_handle) { + auto* txn = reinterpret_cast(jhandle); + auto* write_batch = + reinterpret_cast(jwrite_batch_handle); + rocksdb::Status s = txn->RebuildFromWriteBatch(write_batch); + if (!s.ok()) { + rocksdb::RocksDBExceptionJni::ThrowNew(env, s); + } +} + +/* + * Class: org_rocksdb_Transaction + * Method: getCommitTimeWriteBatch + * Signature: (J)J + */ +jlong Java_org_rocksdb_Transaction_getCommitTimeWriteBatch(JNIEnv* env, + jobject jobj, jlong jhandle) { + auto* txn = reinterpret_cast(jhandle); + return reinterpret_cast(txn->GetCommitTimeWriteBatch()); +} + +/* + * Class: org_rocksdb_Transaction + * Method: setLogNumber + * Signature: (JJ)V + */ +void Java_org_rocksdb_Transaction_setLogNumber(JNIEnv* env, jobject jobj, + jlong jhandle, jlong jlog_number) { + auto* txn = reinterpret_cast(jhandle); + txn->SetLogNumber(jlog_number); +} + +/* + * Class: org_rocksdb_Transaction + * Method: getLogNumber + * Signature: (J)J + */ +jlong Java_org_rocksdb_Transaction_getLogNumber(JNIEnv* env, jobject jobj, + jlong jhandle) { + auto* txn = reinterpret_cast(jhandle); + return txn->GetLogNumber(); +} + +/* + * Class: org_rocksdb_Transaction + * Method: setName + * Signature: (JLjava/lang/String;)V + */ +void Java_org_rocksdb_Transaction_setName(JNIEnv* env, jobject jobj, + jlong jhandle, jstring jname) { + auto* txn = reinterpret_cast(jhandle); + const char* name = env->GetStringUTFChars(jname, nullptr); + if (name == nullptr) { + // exception thrown: OutOfMemoryError + return; + } + + rocksdb::Status s = txn->SetName(name); + + env->ReleaseStringUTFChars(jname, name); + + if (!s.ok()) { + rocksdb::RocksDBExceptionJni::ThrowNew(env, s); + } +} + +/* + * Class: org_rocksdb_Transaction + * Method: getName + * Signature: (J)Ljava/lang/String; + */ +jstring Java_org_rocksdb_Transaction_getName(JNIEnv* env, jobject jobj, + jlong jhandle) { + auto* txn = reinterpret_cast(jhandle); + rocksdb::TransactionName name = txn->GetName(); + return env->NewStringUTF(name.data()); +} + +/* + * Class: org_rocksdb_Transaction + * Method: getID + * Signature: (J)J + */ +jlong Java_org_rocksdb_Transaction_getID(JNIEnv* env, jobject jobj, + jlong jhandle) { + auto* txn = reinterpret_cast(jhandle); + rocksdb::TransactionID id = txn->GetID(); + return static_cast(id); +} + +/* + * Class: org_rocksdb_Transaction + * Method: isDeadlockDetect + * Signature: (J)Z + */ +jboolean Java_org_rocksdb_Transaction_isDeadlockDetect(JNIEnv* env, + jobject jobj, jlong jhandle) { + auto* txn = 
reinterpret_cast<rocksdb::Transaction*>(jhandle);
+  return static_cast<jboolean>(txn->IsDeadlockDetect());
+}
+
+/*
+ * Class: org_rocksdb_Transaction
+ * Method: getWaitingTxns
+ * Signature: (J)Lorg/rocksdb/Transaction/WaitingTransactions;
+ */
+jobject Java_org_rocksdb_Transaction_getWaitingTxns(JNIEnv* env,
+    jobject jtransaction_obj, jlong jhandle) {
+  auto* txn = reinterpret_cast<rocksdb::Transaction*>(jhandle);
+  uint32_t column_family_id;
+  std::string key;
+  std::vector<rocksdb::TransactionID> waiting_txns =
+      txn->GetWaitingTxns(&column_family_id, &key);
+  jobject jwaiting_txns =
+      rocksdb::TransactionJni::newWaitingTransactions(
+          env, jtransaction_obj, column_family_id, key, waiting_txns);
+  return jwaiting_txns;
+}
+
+/*
+ * Class: org_rocksdb_Transaction
+ * Method: getState
+ * Signature: (J)B
+ */
+jbyte Java_org_rocksdb_Transaction_getState(JNIEnv* env,
+    jobject jobj, jlong jhandle) {
+  auto* txn = reinterpret_cast<rocksdb::Transaction*>(jhandle);
+  rocksdb::Transaction::TransactionState txn_status = txn->GetState();
+  switch (txn_status) {
+    case rocksdb::Transaction::TransactionState::STARTED:
+      return 0x0;
+
+    case rocksdb::Transaction::TransactionState::AWAITING_PREPARE:
+      return 0x1;
+
+    case rocksdb::Transaction::TransactionState::PREPARED:
+      return 0x2;
+
+    case rocksdb::Transaction::TransactionState::AWAITING_COMMIT:
+      return 0x3;
+
+    case rocksdb::Transaction::TransactionState::COMMITED:
+      return 0x4;
+
+    case rocksdb::Transaction::TransactionState::AWAITING_ROLLBACK:
+      return 0x5;
+
+    case rocksdb::Transaction::TransactionState::ROLLEDBACK:
+      return 0x6;
+
+    case rocksdb::Transaction::TransactionState::LOCKS_STOLEN:
+      return 0x7;
+  }
+
+  assert(false);
+  return 0xFF;
+}
+
+/*
+ * Class: org_rocksdb_Transaction
+ * Method: getId
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_Transaction_getId(JNIEnv* env, jobject jobj,
+    jlong jhandle) {
+  auto* txn = reinterpret_cast<rocksdb::Transaction*>(jhandle);
+  uint64_t id = txn->GetId();
+  return static_cast<jlong>(id);
+}
+
+/*
+ * Class: org_rocksdb_Transaction
+ * Method: disposeInternal
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_Transaction_disposeInternal(JNIEnv* env, jobject jobj,
+    jlong jhandle) {
+  delete reinterpret_cast<rocksdb::Transaction*>(jhandle);
+}
diff --git a/java/rocksjni/transaction_db.cc b/java/rocksjni/transaction_db.cc
new file mode 100644
index 00000000000..6bb8024202b
--- /dev/null
+++ b/java/rocksjni/transaction_db.cc
@@ -0,0 +1,431 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// This file implements the "bridge" between Java and C++
+// for rocksdb::TransactionDB.
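For orientation, the JNI functions in this new file are thin wrappers over the native rocksdb::TransactionDB API. A minimal C++ sketch of the calls being bridged, shown only as an aside (the database path is illustrative):

#include <cassert>
#include "rocksdb/utilities/transaction.h"
#include "rocksdb/utilities/transaction_db.h"

int main() {
  rocksdb::Options options;
  options.create_if_missing = true;
  rocksdb::TransactionDBOptions txn_db_options;
  rocksdb::TransactionDB* txn_db = nullptr;

  // what the open() bridge ultimately calls
  rocksdb::Status s = rocksdb::TransactionDB::Open(
      options, txn_db_options, "/tmp/rocksdb_txn_example", &txn_db);
  assert(s.ok());

  // what beginTransaction() returns a jlong handle to
  rocksdb::WriteOptions write_options;
  rocksdb::Transaction* txn = txn_db->BeginTransaction(write_options);
  s = txn->Put("key", "value");
  assert(s.ok());
  s = txn->Commit();
  assert(s.ok());

  delete txn;
  delete txn_db;
  return 0;
}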
+ +#include +#include +#include +#include + + +#include "include/org_rocksdb_TransactionDB.h" + +#include "rocksdb/options.h" +#include "rocksdb/utilities/transaction.h" +#include "rocksdb/utilities/transaction_db.h" + +#include "rocksjni/portal.h" + +/* + * Class: org_rocksdb_TransactionDB + * Method: open + * Signature: (JJLjava/lang/String;)J + */ +jlong Java_org_rocksdb_TransactionDB_open__JJLjava_lang_String_2(JNIEnv* env, + jclass jcls, jlong joptions_handle, jlong jtxn_db_options_handle, + jstring jdb_path) { + auto* options = reinterpret_cast(joptions_handle); + auto* txn_db_options = + reinterpret_cast(jtxn_db_options_handle); + rocksdb::TransactionDB* tdb = nullptr; + const char* db_path = env->GetStringUTFChars(jdb_path, nullptr); + if (db_path == nullptr) { + // exception thrown: OutOfMemoryError + return 0; + } + rocksdb::Status s = + rocksdb::TransactionDB::Open(*options, *txn_db_options, db_path, &tdb); + env->ReleaseStringUTFChars(jdb_path, db_path); + + if (s.ok()) { + return reinterpret_cast(tdb); + } else { + rocksdb::RocksDBExceptionJni::ThrowNew(env, s); + return 0; + } +} + +/* + * Class: org_rocksdb_TransactionDB + * Method: open + * Signature: (JJLjava/lang/String;[[B[J)[J + */ +jlongArray Java_org_rocksdb_TransactionDB_open__JJLjava_lang_String_2_3_3B_3J( + JNIEnv* env, jclass jcls, jlong jdb_options_handle, + jlong jtxn_db_options_handle, jstring jdb_path, + jobjectArray jcolumn_names, + jlongArray jcolumn_options_handles) { + const char* db_path = env->GetStringUTFChars(jdb_path, nullptr); + if (db_path == nullptr) { + // exception thrown: OutOfMemoryError + return nullptr; + } + + const jsize len_cols = env->GetArrayLength(jcolumn_names); + if (env->EnsureLocalCapacity(len_cols) != 0) { + // out of memory + env->ReleaseStringUTFChars(jdb_path, db_path); + return nullptr; + } + + jlong* jco = env->GetLongArrayElements(jcolumn_options_handles, nullptr); + if (jco == nullptr) { + // exception thrown: OutOfMemoryError + env->ReleaseStringUTFChars(jdb_path, db_path); + return nullptr; + } + std::vector column_families; + for (int i = 0; i < len_cols; i++) { + const jobject jcn = env->GetObjectArrayElement(jcolumn_names, i); + if (env->ExceptionCheck()) { + // exception thrown: ArrayIndexOutOfBoundsException + env->ReleaseLongArrayElements(jcolumn_options_handles, jco, JNI_ABORT); + env->ReleaseStringUTFChars(jdb_path, db_path); + return nullptr; + } + const jbyteArray jcn_ba = reinterpret_cast(jcn); + jbyte* jcf_name = env->GetByteArrayElements(jcn_ba, nullptr); + if (jcf_name == nullptr) { + // exception thrown: OutOfMemoryError + env->DeleteLocalRef(jcn); + env->ReleaseLongArrayElements(jcolumn_options_handles, jco, JNI_ABORT); + env->ReleaseStringUTFChars(jdb_path, db_path); + return nullptr; + } + + const int jcf_name_len = env->GetArrayLength(jcn_ba); + if (env->EnsureLocalCapacity(jcf_name_len) != 0) { + // out of memory + env->ReleaseByteArrayElements(jcn_ba, jcf_name, JNI_ABORT); + env->DeleteLocalRef(jcn); + env->ReleaseLongArrayElements(jcolumn_options_handles, jco, JNI_ABORT); + env->ReleaseStringUTFChars(jdb_path, db_path); + return nullptr; + } + const std::string cf_name(reinterpret_cast(jcf_name), jcf_name_len); + const rocksdb::ColumnFamilyOptions* cf_options = + reinterpret_cast(jco[i]); + column_families.push_back( + rocksdb::ColumnFamilyDescriptor(cf_name, *cf_options)); + + env->ReleaseByteArrayElements(jcn_ba, jcf_name, JNI_ABORT); + env->DeleteLocalRef(jcn); + } + env->ReleaseLongArrayElements(jcolumn_options_handles, jco, JNI_ABORT); + + auto* 
db_options = reinterpret_cast(jdb_options_handle); + auto* txn_db_options = + reinterpret_cast(jtxn_db_options_handle); + std::vector handles; + rocksdb::TransactionDB* tdb = nullptr; + const rocksdb::Status s = rocksdb::TransactionDB::Open(*db_options, *txn_db_options, + db_path, column_families, &handles, &tdb); + + // check if open operation was successful + if (s.ok()) { + const jsize resultsLen = 1 + len_cols; // db handle + column family handles + std::unique_ptr results = + std::unique_ptr(new jlong[resultsLen]); + results[0] = reinterpret_cast(tdb); + for (int i = 1; i <= len_cols; i++) { + results[i] = reinterpret_cast(handles[i - 1]); + } + + jlongArray jresults = env->NewLongArray(resultsLen); + if (jresults == nullptr) { + // exception thrown: OutOfMemoryError + return nullptr; + } + env->SetLongArrayRegion(jresults, 0, resultsLen, results.get()); + if (env->ExceptionCheck()) { + // exception thrown: ArrayIndexOutOfBoundsException + env->DeleteLocalRef(jresults); + return nullptr; + } + return jresults; + } else { + rocksdb::RocksDBExceptionJni::ThrowNew(env, s); + return nullptr; + } +} + +/* + * Class: org_rocksdb_TransactionDB + * Method: beginTransaction + * Signature: (JJ)J + */ +jlong Java_org_rocksdb_TransactionDB_beginTransaction__JJ(JNIEnv* env, + jobject jobj, jlong jhandle, jlong jwrite_options_handle) { + auto* txn_db = reinterpret_cast(jhandle); + auto* write_options = + reinterpret_cast(jwrite_options_handle); + rocksdb::Transaction* txn = txn_db->BeginTransaction(*write_options); + return reinterpret_cast(txn); +} + +/* + * Class: org_rocksdb_TransactionDB + * Method: beginTransaction + * Signature: (JJJ)J + */ +jlong Java_org_rocksdb_TransactionDB_beginTransaction__JJJ(JNIEnv* env, + jobject jobj, jlong jhandle, jlong jwrite_options_handle, + jlong jtxn_options_handle) { + auto* txn_db = reinterpret_cast(jhandle); + auto* write_options = + reinterpret_cast(jwrite_options_handle); + auto* txn_options = + reinterpret_cast(jtxn_options_handle); + rocksdb::Transaction* txn = + txn_db->BeginTransaction(*write_options, *txn_options); + return reinterpret_cast(txn); +} + +/* + * Class: org_rocksdb_TransactionDB + * Method: beginTransaction_withOld + * Signature: (JJJ)J + */ +jlong Java_org_rocksdb_TransactionDB_beginTransaction_1withOld__JJJ( + JNIEnv* env, jobject jobj, jlong jhandle, jlong jwrite_options_handle, + jlong jold_txn_handle) { + auto* txn_db = reinterpret_cast(jhandle); + auto* write_options = + reinterpret_cast(jwrite_options_handle); + auto* old_txn = reinterpret_cast(jold_txn_handle); + rocksdb::TransactionOptions txn_options; + rocksdb::Transaction* txn = + txn_db->BeginTransaction(*write_options, txn_options, old_txn); + + // RocksJava relies on the assumption that + // we do not allocate a new Transaction object + // when providing an old_txn + assert(txn == old_txn); + + return reinterpret_cast(txn); +} + +/* + * Class: org_rocksdb_TransactionDB + * Method: beginTransaction_withOld + * Signature: (JJJJ)J + */ +jlong Java_org_rocksdb_TransactionDB_beginTransaction_1withOld__JJJJ( + JNIEnv* env, jobject jobj, jlong jhandle, jlong jwrite_options_handle, + jlong jtxn_options_handle, jlong jold_txn_handle) { + auto* txn_db = reinterpret_cast(jhandle); + auto* write_options = + reinterpret_cast(jwrite_options_handle); + auto* txn_options = + reinterpret_cast(jtxn_options_handle); + auto* old_txn = reinterpret_cast(jold_txn_handle); + rocksdb::Transaction* txn = txn_db->BeginTransaction(*write_options, + *txn_options, old_txn); + + // RocksJava relies on 
the assumption that
+  // we do not allocate a new Transaction object
+  // when providing an old_txn
+  assert(txn == old_txn);
+
+  return reinterpret_cast<jlong>(txn);
+}
+
+/*
+ * Class: org_rocksdb_TransactionDB
+ * Method: getTransactionByName
+ * Signature: (JLjava/lang/String;)J
+ */
+jlong Java_org_rocksdb_TransactionDB_getTransactionByName(JNIEnv* env,
+    jobject jobj, jlong jhandle, jstring jname) {
+  auto* txn_db = reinterpret_cast<rocksdb::TransactionDB*>(jhandle);
+  const char* name = env->GetStringUTFChars(jname, nullptr);
+  if (name == nullptr) {
+    // exception thrown: OutOfMemoryError
+    return 0;
+  }
+  rocksdb::Transaction* txn = txn_db->GetTransactionByName(name);
+  env->ReleaseStringUTFChars(jname, name);
+  return reinterpret_cast<jlong>(txn);
+}
+
+/*
+ * Class: org_rocksdb_TransactionDB
+ * Method: getAllPreparedTransactions
+ * Signature: (J)[J
+ */
+jlongArray Java_org_rocksdb_TransactionDB_getAllPreparedTransactions(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  auto* txn_db = reinterpret_cast<rocksdb::TransactionDB*>(jhandle);
+  std::vector<rocksdb::Transaction*> txns;
+  txn_db->GetAllPreparedTransactions(&txns);
+
+  const size_t size = txns.size();
+  assert(size < UINT32_MAX);  // does it fit in a jint?
+
+  const jsize len = static_cast<jsize>(size);
+  jlong tmp[len];
+  for (jsize i = 0; i < len; ++i) {
+    tmp[i] = reinterpret_cast<jlong>(txns[i]);
+  }
+
+  jlongArray jtxns = env->NewLongArray(len);
+  if (jtxns == nullptr) {
+    // exception thrown: OutOfMemoryError
+    return nullptr;
+  }
+  env->SetLongArrayRegion(jtxns, 0, len, tmp);
+  if (env->ExceptionCheck()) {
+    // exception thrown: ArrayIndexOutOfBoundsException
+    env->DeleteLocalRef(jtxns);
+    return nullptr;
+  }
+
+  return jtxns;
+}
+
+/*
+ * Class: org_rocksdb_TransactionDB
+ * Method: getLockStatusData
+ * Signature: (J)Ljava/util/Map;
+ */
+jobject Java_org_rocksdb_TransactionDB_getLockStatusData(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  auto* txn_db = reinterpret_cast<rocksdb::TransactionDB*>(jhandle);
+  const std::unordered_multimap<uint32_t, rocksdb::KeyLockInfo>
+      lock_status_data = txn_db->GetLockStatusData();
+  const jobject jlock_status_data = rocksdb::HashMapJni::construct(env,
+      static_cast<uint32_t>(lock_status_data.size()));
+  if (jlock_status_data == nullptr) {
+    // exception occurred
+    return nullptr;
+  }
+
+  const rocksdb::HashMapJni::FnMapKV<const uint32_t, rocksdb::KeyLockInfo>
+      fn_map_kv =
+      [env, txn_db, &lock_status_data](
+          const std::pair<const uint32_t, rocksdb::KeyLockInfo>& pair) {
+        const jobject jlong_column_family_id =
+            rocksdb::LongJni::valueOf(env, pair.first);
+        if (jlong_column_family_id == nullptr) {
+          // an error occurred
+          return std::unique_ptr<std::pair<jobject, jobject>>(nullptr);
+        }
+        const jobject jkey_lock_info =
+            rocksdb::KeyLockInfoJni::construct(env, pair.second);
+        if (jkey_lock_info == nullptr) {
+          // an error occurred
+          return std::unique_ptr<std::pair<jobject, jobject>>(nullptr);
+        }
+        return std::unique_ptr<std::pair<jobject, jobject>>(
+            new std::pair<jobject, jobject>(jlong_column_family_id,
+                jkey_lock_info));
+      };
+
+  if(!rocksdb::HashMapJni::putAll(env, jlock_status_data,
+      lock_status_data.begin(), lock_status_data.end(), fn_map_kv)) {
+    // exception occurred
+    return nullptr;
+  }
+
+  return jlock_status_data;
+}
+
+/*
+* Class: org_rocksdb_TransactionDB
+* Method: getDeadlockInfoBuffer
+* Signature: (J)[Lorg/rocksdb/TransactionDB/DeadlockPath;
+*/
+jobjectArray Java_org_rocksdb_TransactionDB_getDeadlockInfoBuffer(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  auto* txn_db = reinterpret_cast<rocksdb::TransactionDB*>(jhandle);
+  const std::vector<rocksdb::DeadlockPath> deadlock_info_buffer =
+      txn_db->GetDeadlockInfoBuffer();
+
+  const jsize deadlock_info_buffer_len =
+      static_cast<jsize>(deadlock_info_buffer.size());
+  jobjectArray jdeadlock_info_buffer =
+      env->NewObjectArray(deadlock_info_buffer_len,
+          rocksdb::DeadlockPathJni::getJClass(env), nullptr);
+  if (jdeadlock_info_buffer == nullptr) {
+    // exception thrown: OutOfMemoryError
+    return nullptr;
+  }
+  jsize jdeadlock_info_buffer_offset = 0;
+
+  auto buf_end = deadlock_info_buffer.end();
+  for (auto buf_it = deadlock_info_buffer.begin(); buf_it != buf_end;
+      ++buf_it) {
+    const rocksdb::DeadlockPath deadlock_path = *buf_it;
+    const std::vector<rocksdb::DeadlockInfo> deadlock_infos
+        = deadlock_path.path;
+    const jsize deadlock_infos_len =
+        static_cast<jsize>(deadlock_infos.size());
+    jobjectArray jdeadlock_infos = env->NewObjectArray(deadlock_infos_len,
+        rocksdb::DeadlockInfoJni::getJClass(env), nullptr);
+    if (jdeadlock_infos == nullptr) {
+      // exception thrown: OutOfMemoryError
+      env->DeleteLocalRef(jdeadlock_info_buffer);
+      return nullptr;
+    }
+    jsize jdeadlock_infos_offset = 0;
+
+    auto infos_end = deadlock_infos.end();
+    for (auto infos_it = deadlock_infos.begin(); infos_it != infos_end;
+        ++infos_it) {
+      const rocksdb::DeadlockInfo deadlock_info = *infos_it;
+      const jobject jdeadlock_info =
+          rocksdb::TransactionDBJni::newDeadlockInfo(
+              env, jobj, deadlock_info.m_txn_id, deadlock_info.m_cf_id,
+              deadlock_info.m_waiting_key, deadlock_info.m_exclusive);
+      if (jdeadlock_info == nullptr) {
+        // exception occurred
+        env->DeleteLocalRef(jdeadlock_info_buffer);
+        return nullptr;
+      }
+      env->SetObjectArrayElement(jdeadlock_infos, jdeadlock_infos_offset++,
+          jdeadlock_info);
+      if (env->ExceptionCheck()) {
+        // exception thrown: ArrayIndexOutOfBoundsException or
+        // ArrayStoreException
+        env->DeleteLocalRef(jdeadlock_info);
+        env->DeleteLocalRef(jdeadlock_info_buffer);
+        return nullptr;
+      }
+    }
+
+    const jobject jdeadlock_path =
+        rocksdb::DeadlockPathJni::construct(env, jdeadlock_infos,
+            deadlock_path.limit_exceeded);
+    if(jdeadlock_path == nullptr) {
+      // exception occurred
+      env->DeleteLocalRef(jdeadlock_info_buffer);
+      return nullptr;
+    }
+    env->SetObjectArrayElement(jdeadlock_info_buffer,
+        jdeadlock_info_buffer_offset++, jdeadlock_path);
+    if (env->ExceptionCheck()) {
+      // exception thrown: ArrayIndexOutOfBoundsException or
+      // ArrayStoreException
+      env->DeleteLocalRef(jdeadlock_path);
+      env->DeleteLocalRef(jdeadlock_info_buffer);
+      return nullptr;
+    }
+  }
+
+  return jdeadlock_info_buffer;
+}
+
+/*
+* Class: org_rocksdb_TransactionDB
+* Method: setDeadlockInfoBufferSize
+* Signature: (JI)V
+*/
+void Java_org_rocksdb_TransactionDB_setDeadlockInfoBufferSize(
+    JNIEnv* env, jobject jobj, jlong jhandle,
+    jint jdeadlock_info_buffer_size) {
+  auto* txn_db = reinterpret_cast<rocksdb::TransactionDB*>(jhandle);
+  txn_db->SetDeadlockInfoBufferSize(jdeadlock_info_buffer_size);
+}
+
+/*
+ * Class: org_rocksdb_TransactionDB
+ * Method: disposeInternal
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_TransactionDB_disposeInternal(JNIEnv* env, jobject jobj,
+    jlong jhandle) {
+  delete reinterpret_cast<rocksdb::TransactionDB*>(jhandle);
+}
diff --git a/java/rocksjni/transaction_db_options.cc b/java/rocksjni/transaction_db_options.cc
new file mode 100644
index 00000000000..600bce18fb0
--- /dev/null
+++ b/java/rocksjni/transaction_db_options.cc
@@ -0,0 +1,147 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// This file implements the "bridge" between Java and C++
+// for rocksdb::TransactionDBOptions.
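As an aside, the getters and setters that follow read and write plain fields on rocksdb::TransactionDBOptions. A minimal sketch of the C++ side (field names are from rocksdb/utilities/transaction_db.h; the values shown are believed to match the library defaults but are restated here only for illustration):

rocksdb::TransactionDBOptions txn_db_options;
txn_db_options.max_num_locks = -1;               // unlimited locked keys per transaction
txn_db_options.num_stripes = 16;                 // lock-table striping for concurrency
txn_db_options.transaction_lock_timeout = 1000;  // ms a transaction waits for a key lock
txn_db_options.default_lock_timeout = 1000;      // ms for implicit (non-transactional) writes
txn_db_options.write_policy = rocksdb::TxnDBWritePolicy::WRITE_COMMITTED;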
+
+#include <jni.h>
+
+#include "include/org_rocksdb_TransactionDBOptions.h"
+
+#include "rocksdb/utilities/transaction_db.h"
+
+#include "rocksjni/portal.h"
+
+/*
+ * Class: org_rocksdb_TransactionDBOptions
+ * Method: newTransactionDBOptions
+ * Signature: ()J
+ */
+jlong Java_org_rocksdb_TransactionDBOptions_newTransactionDBOptions(
+    JNIEnv* env, jclass jcls) {
+  rocksdb::TransactionDBOptions* opts = new rocksdb::TransactionDBOptions();
+  return reinterpret_cast<jlong>(opts);
+}
+
+/*
+ * Class: org_rocksdb_TransactionDBOptions
+ * Method: getMaxNumLocks
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_TransactionDBOptions_getMaxNumLocks(JNIEnv* env,
+    jobject jobj, jlong jhandle) {
+  auto* opts = reinterpret_cast<rocksdb::TransactionDBOptions*>(jhandle);
+  return opts->max_num_locks;
+}
+
+/*
+ * Class: org_rocksdb_TransactionDBOptions
+ * Method: setMaxNumLocks
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_TransactionDBOptions_setMaxNumLocks(JNIEnv* env,
+    jobject jobj, jlong jhandle, jlong jmax_num_locks) {
+  auto* opts = reinterpret_cast<rocksdb::TransactionDBOptions*>(jhandle);
+  opts->max_num_locks = jmax_num_locks;
+}
+
+/*
+ * Class: org_rocksdb_TransactionDBOptions
+ * Method: getNumStripes
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_TransactionDBOptions_getNumStripes(JNIEnv* env,
+    jobject jobj, jlong jhandle) {
+  auto* opts = reinterpret_cast<rocksdb::TransactionDBOptions*>(jhandle);
+  return opts->num_stripes;
+}
+
+/*
+ * Class: org_rocksdb_TransactionDBOptions
+ * Method: setNumStripes
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_TransactionDBOptions_setNumStripes(JNIEnv* env,
+    jobject jobj, jlong jhandle, jlong jnum_stripes) {
+  auto* opts = reinterpret_cast<rocksdb::TransactionDBOptions*>(jhandle);
+  opts->num_stripes = jnum_stripes;
+}
+
+/*
+ * Class: org_rocksdb_TransactionDBOptions
+ * Method: getTransactionLockTimeout
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_TransactionDBOptions_getTransactionLockTimeout(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  auto* opts = reinterpret_cast<rocksdb::TransactionDBOptions*>(jhandle);
+  return opts->transaction_lock_timeout;
+}
+
+/*
+ * Class: org_rocksdb_TransactionDBOptions
+ * Method: setTransactionLockTimeout
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_TransactionDBOptions_setTransactionLockTimeout(
+    JNIEnv* env, jobject jobj, jlong jhandle, jlong jtransaction_lock_timeout) {
+  auto* opts = reinterpret_cast<rocksdb::TransactionDBOptions*>(jhandle);
+  opts->transaction_lock_timeout = jtransaction_lock_timeout;
+}
+
+/*
+ * Class: org_rocksdb_TransactionDBOptions
+ * Method: getDefaultLockTimeout
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_TransactionDBOptions_getDefaultLockTimeout(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  auto* opts = reinterpret_cast<rocksdb::TransactionDBOptions*>(jhandle);
+  return opts->default_lock_timeout;
+}
+
+/*
+ * Class: org_rocksdb_TransactionDBOptions
+ * Method: setDefaultLockTimeout
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_TransactionDBOptions_setDefaultLockTimeout(
+    JNIEnv* env, jobject jobj, jlong jhandle, jlong jdefault_lock_timeout) {
+  auto* opts = reinterpret_cast<rocksdb::TransactionDBOptions*>(jhandle);
+  opts->default_lock_timeout = jdefault_lock_timeout;
+}
+
+/*
+ * Class: org_rocksdb_TransactionDBOptions
+ * Method: getWritePolicy
+ * Signature: (J)B
+ */
+jbyte Java_org_rocksdb_TransactionDBOptions_getWritePolicy(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  auto* opts = reinterpret_cast<rocksdb::TransactionDBOptions*>(jhandle);
+  return rocksdb::TxnDBWritePolicyJni::toJavaTxnDBWritePolicy(
+      opts->write_policy);
+}
+
+/*
+* Class: org_rocksdb_TransactionDBOptions
+* Method: setWritePolicy
+* Signature: (JB)V
+*/
+void Java_org_rocksdb_TransactionDBOptions_setWritePolicy(
+    JNIEnv* env, jobject jobj, jlong jhandle, jbyte jwrite_policy) {
+  auto* opts = reinterpret_cast<rocksdb::TransactionDBOptions*>(jhandle);
+  opts->write_policy =
+      rocksdb::TxnDBWritePolicyJni::toCppTxnDBWritePolicy(jwrite_policy);
+}
+
+/*
+ * Class: org_rocksdb_TransactionDBOptions
+ * Method: disposeInternal
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_TransactionDBOptions_disposeInternal(JNIEnv* env,
+    jobject jobj, jlong jhandle) {
+  delete reinterpret_cast<rocksdb::TransactionDBOptions*>(jhandle);
+}
diff --git a/java/rocksjni/transaction_notifier.cc b/java/rocksjni/transaction_notifier.cc
new file mode 100644
index 00000000000..3fdb4fb17e1
--- /dev/null
+++ b/java/rocksjni/transaction_notifier.cc
@@ -0,0 +1,42 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// This file implements the "bridge" between Java and C++
+// for rocksdb::TransactionNotifier.
+
+#include <jni.h>
+
+#include "include/org_rocksdb_AbstractTransactionNotifier.h"
+#include "rocksjni/transaction_notifier_jnicallback.h"
+
+/*
+ * Class: org_rocksdb_AbstractTransactionNotifier
+ * Method: createNewTransactionNotifier
+ * Signature: ()J
+ */
+jlong Java_org_rocksdb_AbstractTransactionNotifier_createNewTransactionNotifier(
+    JNIEnv* env, jobject jobj) {
+  auto* transaction_notifier =
+      new rocksdb::TransactionNotifierJniCallback(env, jobj);
+  auto* sptr_transaction_notifier =
+      new std::shared_ptr<rocksdb::TransactionNotifierJniCallback>(
+          transaction_notifier);
+  return reinterpret_cast<jlong>(sptr_transaction_notifier);
+}
+
+/*
+ * Class: org_rocksdb_AbstractTransactionNotifier
+ * Method: disposeInternal
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_AbstractTransactionNotifier_disposeInternal(JNIEnv* env,
+    jobject jobj, jlong jhandle) {
+  // TODO(AR) refactor to use JniCallback::JniCallback
+  // when https://github.com/facebook/rocksdb/pull/1241/ is merged
+  std::shared_ptr<rocksdb::TransactionNotifierJniCallback>* handle =
+      reinterpret_cast<
+          std::shared_ptr<rocksdb::TransactionNotifierJniCallback>*>(jhandle);
+  delete handle;
+}
diff --git a/java/rocksjni/transaction_notifier_jnicallback.cc b/java/rocksjni/transaction_notifier_jnicallback.cc
new file mode 100644
index 00000000000..85f2a194bed
--- /dev/null
+++ b/java/rocksjni/transaction_notifier_jnicallback.cc
@@ -0,0 +1,39 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// This file implements the callback "bridge" between Java and C++ for
+// rocksdb::TransactionNotifier.
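The notifier bridge above relies on a common RocksJava pattern: the std::shared_ptr is itself allocated on the heap so that its address can travel through Java as a jlong, and disposeInternal later deletes that heap box, dropping the reference count. A minimal standalone sketch of the pattern (Widget, create_handle, and dispose_handle are hypothetical names for illustration):

#include <cstdint>
#include <memory>

struct Widget {};  // stands in for TransactionNotifierJniCallback

// box a shared_ptr on the heap and hand its address across the JNI boundary
int64_t create_handle() {
  auto* sptr = new std::shared_ptr<Widget>(std::make_shared<Widget>());
  return reinterpret_cast<int64_t>(sptr);
}

// delete the box; the shared_ptr releases its reference (and, if it was the
// last owner, the object itself)
void dispose_handle(int64_t handle) {
  delete reinterpret_cast<std::shared_ptr<Widget>*>(handle);
}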
+ +#include "rocksjni/transaction_notifier_jnicallback.h" +#include "rocksjni/portal.h" + +namespace rocksdb { + +TransactionNotifierJniCallback::TransactionNotifierJniCallback(JNIEnv* env, + jobject jtransaction_notifier) : JniCallback(env, jtransaction_notifier) { + // we cache the method id for the JNI callback + m_jsnapshot_created_methodID = + AbstractTransactionNotifierJni::getSnapshotCreatedMethodId(env); +} + +void TransactionNotifierJniCallback::SnapshotCreated( + const Snapshot* newSnapshot) { + jboolean attached_thread = JNI_FALSE; + JNIEnv* env = getJniEnv(&attached_thread); + assert(env != nullptr); + + env->CallVoidMethod(m_jcallback_obj, + m_jsnapshot_created_methodID, reinterpret_cast(newSnapshot)); + + if(env->ExceptionCheck()) { + // exception thrown from CallVoidMethod + env->ExceptionDescribe(); // print out exception to stderr + releaseJniEnv(attached_thread); + return; + } + + releaseJniEnv(attached_thread); +} +} // namespace rocksdb diff --git a/java/rocksjni/transaction_notifier_jnicallback.h b/java/rocksjni/transaction_notifier_jnicallback.h new file mode 100644 index 00000000000..8f67cdb8bc8 --- /dev/null +++ b/java/rocksjni/transaction_notifier_jnicallback.h @@ -0,0 +1,42 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// This file implements the callback "bridge" between Java and C++ for +// rocksdb::TransactionNotifier. + +#ifndef JAVA_ROCKSJNI_TRANSACTION_NOTIFIER_JNICALLBACK_H_ +#define JAVA_ROCKSJNI_TRANSACTION_NOTIFIER_JNICALLBACK_H_ + +#include + +#include "rocksdb/utilities/transaction.h" +#include "rocksjni/jnicallback.h" + +namespace rocksdb { + +/** + * This class acts as a bridge between C++ + * and Java. The methods in this class will be + * called back from the RocksDB TransactionDB or OptimisticTransactionDB (C++), + * we then callback to the appropriate Java method + * this enables TransactionNotifier to be implemented in Java. + * + * Unlike RocksJava's Comparator JNI Callback, we do not attempt + * to reduce Java object allocations by caching the Snapshot object + * presented to the callback. This could be revisited in future + * if performance is lacking. + */ +class TransactionNotifierJniCallback: public JniCallback, + public TransactionNotifier { + public: + TransactionNotifierJniCallback(JNIEnv* env, jobject jtransaction_notifier); + virtual void SnapshotCreated(const Snapshot* newSnapshot); + + private: + jmethodID m_jsnapshot_created_methodID; +}; +} // namespace rocksdb + +#endif // JAVA_ROCKSJNI_TRANSACTION_NOTIFIER_JNICALLBACK_H_ diff --git a/java/rocksjni/transaction_options.cc b/java/rocksjni/transaction_options.cc new file mode 100644 index 00000000000..13ec3b9fd55 --- /dev/null +++ b/java/rocksjni/transaction_options.cc @@ -0,0 +1,166 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// This file implements the "bridge" between Java and C++ +// for rocksdb::TransactionOptions. 
+
+#include <jni.h>
+
+#include "include/org_rocksdb_TransactionOptions.h"
+
+#include "rocksdb/utilities/transaction_db.h"
+
+/*
+ * Class: org_rocksdb_TransactionOptions
+ * Method: newTransactionOptions
+ * Signature: ()J
+ */
+jlong Java_org_rocksdb_TransactionOptions_newTransactionOptions(JNIEnv* env,
+    jclass jcls) {
+  auto* opts = new rocksdb::TransactionOptions();
+  return reinterpret_cast<jlong>(opts);
+}
+
+/*
+ * Class: org_rocksdb_TransactionOptions
+ * Method: isSetSnapshot
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_TransactionOptions_isSetSnapshot(JNIEnv* env,
+    jobject jobj, jlong jhandle) {
+  auto* opts = reinterpret_cast<rocksdb::TransactionOptions*>(jhandle);
+  return opts->set_snapshot;
+}
+
+/*
+ * Class: org_rocksdb_TransactionOptions
+ * Method: setSetSnapshot
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_TransactionOptions_setSetSnapshot(JNIEnv* env,
+    jobject jobj, jlong jhandle, jboolean jset_snapshot) {
+  auto* opts = reinterpret_cast<rocksdb::TransactionOptions*>(jhandle);
+  opts->set_snapshot = jset_snapshot;
+}
+
+/*
+ * Class: org_rocksdb_TransactionOptions
+ * Method: isDeadlockDetect
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_TransactionOptions_isDeadlockDetect(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  auto* opts = reinterpret_cast<rocksdb::TransactionOptions*>(jhandle);
+  return opts->deadlock_detect;
+}
+
+/*
+* Class: org_rocksdb_TransactionOptions
+* Method: setDeadlockDetect
+* Signature: (JZ)V
+*/
+void Java_org_rocksdb_TransactionOptions_setDeadlockDetect(
+    JNIEnv* env, jobject jobj, jlong jhandle, jboolean jdeadlock_detect) {
+  auto* opts = reinterpret_cast<rocksdb::TransactionOptions*>(jhandle);
+  opts->deadlock_detect = jdeadlock_detect;
+}
+
+/*
+ * Class: org_rocksdb_TransactionOptions
+ * Method: getLockTimeout
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_TransactionOptions_getLockTimeout(JNIEnv* env,
+    jobject jobj, jlong jhandle) {
+  auto* opts = reinterpret_cast<rocksdb::TransactionOptions*>(jhandle);
+  return opts->lock_timeout;
+}
+
+/*
+ * Class: org_rocksdb_TransactionOptions
+ * Method: setLockTimeout
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_TransactionOptions_setLockTimeout(JNIEnv* env,
+    jobject jobj, jlong jhandle, jlong jlock_timeout) {
+  auto* opts = reinterpret_cast<rocksdb::TransactionOptions*>(jhandle);
+  opts->lock_timeout = jlock_timeout;
+}
+
+/*
+ * Class: org_rocksdb_TransactionOptions
+ * Method: getExpiration
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_TransactionOptions_getExpiration(JNIEnv* env,
+    jobject jobj, jlong jhandle) {
+  auto* opts = reinterpret_cast<rocksdb::TransactionOptions*>(jhandle);
+  return opts->expiration;
+}
+
+/*
+ * Class: org_rocksdb_TransactionOptions
+ * Method: setExpiration
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_TransactionOptions_setExpiration(JNIEnv* env,
+    jobject jobj, jlong jhandle, jlong jexpiration) {
+  auto* opts = reinterpret_cast<rocksdb::TransactionOptions*>(jhandle);
+  opts->expiration = jexpiration;
+}
+
+/*
+ * Class: org_rocksdb_TransactionOptions
+ * Method: getDeadlockDetectDepth
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_TransactionOptions_getDeadlockDetectDepth(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  auto* opts = reinterpret_cast<rocksdb::TransactionOptions*>(jhandle);
+  return opts->deadlock_detect_depth;
+}
+
+/*
+* Class: org_rocksdb_TransactionOptions
+* Method: setDeadlockDetectDepth
+* Signature: (JJ)V
+*/
+void Java_org_rocksdb_TransactionOptions_setDeadlockDetectDepth(
+    JNIEnv* env, jobject jobj, jlong jhandle, jlong jdeadlock_detect_depth) {
+  auto* opts = reinterpret_cast<rocksdb::TransactionOptions*>(jhandle);
+  opts->deadlock_detect_depth = jdeadlock_detect_depth;
+}
+
+/*
+* Class: org_rocksdb_TransactionOptions
+* Method: getMaxWriteBatchSize
+* Signature: (J)J
+*/
+jlong Java_org_rocksdb_TransactionOptions_getMaxWriteBatchSize(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  auto* opts = reinterpret_cast<rocksdb::TransactionOptions*>(jhandle);
+  return opts->max_write_batch_size;
+}
+
+/*
+* Class: org_rocksdb_TransactionOptions
+* Method: setMaxWriteBatchSize
+* Signature: (JJ)V
+*/
+void Java_org_rocksdb_TransactionOptions_setMaxWriteBatchSize(
+    JNIEnv* env, jobject jobj, jlong jhandle, jlong jmax_write_batch_size) {
+  auto* opts = reinterpret_cast<rocksdb::TransactionOptions*>(jhandle);
+  opts->max_write_batch_size = jmax_write_batch_size;
+}
+
+/*
+ * Class: org_rocksdb_TransactionOptions
+ * Method: disposeInternal
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_TransactionOptions_disposeInternal(JNIEnv* env,
+    jobject jobj, jlong jhandle) {
+  delete reinterpret_cast<rocksdb::TransactionOptions*>(jhandle);
+}
diff --git a/java/rocksjni/write_batch.cc b/java/rocksjni/write_batch.cc
index 4ec1244ed9f..925c43e80b9 100644
--- a/java/rocksjni/write_batch.cc
+++ b/java/rocksjni/write_batch.cc
@@ -27,12 +27,34 @@
  * Method: newWriteBatch
  * Signature: (I)J
  */
-jlong Java_org_rocksdb_WriteBatch_newWriteBatch(
+jlong Java_org_rocksdb_WriteBatch_newWriteBatch__I(
     JNIEnv* env, jclass jcls, jint jreserved_bytes) {
   auto* wb = new rocksdb::WriteBatch(static_cast<size_t>(jreserved_bytes));
   return reinterpret_cast<jlong>(wb);
 }
 
+/*
+ * Class: org_rocksdb_WriteBatch
+ * Method: newWriteBatch
+ * Signature: ([BI)J
+ */
+jlong Java_org_rocksdb_WriteBatch_newWriteBatch___3BI(
+    JNIEnv* env, jclass jcls, jbyteArray jserialized,
+    jint jserialized_length) {
+  jboolean has_exception = JNI_FALSE;
+  std::string serialized = rocksdb::JniUtil::byteString<std::string>(env,
+      jserialized, jserialized_length,
+      [](const char* str, const size_t len) { return std::string(str, len); },
+      &has_exception);
+  if(has_exception == JNI_TRUE) {
+    // exception occurred
+    return 0;
+  }
+
+  auto* wb = new rocksdb::WriteBatch(serialized);
+  return reinterpret_cast<jlong>(wb);
+}
+
 /*
  * Class: org_rocksdb_WriteBatch
  * Method: count0
@@ -90,6 +112,37 @@ void Java_org_rocksdb_WriteBatch_rollbackToSavePoint0(
   rocksdb::RocksDBExceptionJni::ThrowNew(env, s);
 }
 
+/*
+ * Class: org_rocksdb_WriteBatch
+ * Method: popSavePoint
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_WriteBatch_popSavePoint(
+    JNIEnv* env, jobject jobj, jlong jwb_handle) {
+  auto* wb = reinterpret_cast<rocksdb::WriteBatch*>(jwb_handle);
+  assert(wb != nullptr);
+
+  auto s = wb->PopSavePoint();
+
+  if (s.ok()) {
+    return;
+  }
+  rocksdb::RocksDBExceptionJni::ThrowNew(env, s);
+}
+
+/*
+ * Class: org_rocksdb_WriteBatch
+ * Method: setMaxBytes
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_WriteBatch_setMaxBytes(
+    JNIEnv* env, jobject jobj, jlong jwb_handle, jlong jmax_bytes) {
+  auto* wb = reinterpret_cast<rocksdb::WriteBatch*>(jwb_handle);
+  assert(wb != nullptr);
+
+  wb->SetMaxBytes(static_cast<size_t>(jmax_bytes));
+}
+
 /*
  * Class: org_rocksdb_WriteBatch
  * Method: put
@@ -102,10 +155,13 @@ void Java_org_rocksdb_WriteBatch_put__J_3BI_3BI(
   auto* wb = reinterpret_cast<rocksdb::WriteBatch*>(jwb_handle);
   assert(wb != nullptr);
   auto put = [&wb] (rocksdb::Slice key, rocksdb::Slice value) {
-    wb->Put(key, value);
+    return wb->Put(key, value);
   };
-  rocksdb::JniUtil::kv_op(put, env, jobj, jkey, jkey_len, jentry_value,
-      jentry_value_len);
+  std::unique_ptr<rocksdb::Status> status = rocksdb::JniUtil::kv_op(put, env,
+      jobj, jkey, jkey_len, jentry_value, jentry_value_len);
+  if (status != nullptr && !status->ok()) {
+    rocksdb::RocksDBExceptionJni::ThrowNew(env, status);
+  }
 }
 
 /*
@@ -122,10 +178,13 @@ void Java_org_rocksdb_WriteBatch_put__J_3BI_3BIJ(
   auto* cf_handle = reinterpret_cast<rocksdb::ColumnFamilyHandle*>(jcf_handle);
   assert(cf_handle != nullptr);
   auto put = [&wb,
&cf_handle] (rocksdb::Slice key, rocksdb::Slice value) { - wb->Put(cf_handle, key, value); + return wb->Put(cf_handle, key, value); }; - rocksdb::JniUtil::kv_op(put, env, jobj, jkey, jkey_len, jentry_value, - jentry_value_len); + std::unique_ptr status = rocksdb::JniUtil::kv_op(put, env, + jobj, jkey, jkey_len, jentry_value, jentry_value_len); + if (status != nullptr && !status->ok()) { + rocksdb::RocksDBExceptionJni::ThrowNew(env, status); + } } /* @@ -140,10 +199,13 @@ void Java_org_rocksdb_WriteBatch_merge__J_3BI_3BI( auto* wb = reinterpret_cast(jwb_handle); assert(wb != nullptr); auto merge = [&wb] (rocksdb::Slice key, rocksdb::Slice value) { - wb->Merge(key, value); + return wb->Merge(key, value); }; - rocksdb::JniUtil::kv_op(merge, env, jobj, jkey, jkey_len, jentry_value, - jentry_value_len); + std::unique_ptr status = rocksdb::JniUtil::kv_op(merge, env, + jobj, jkey, jkey_len, jentry_value, jentry_value_len); + if (status != nullptr && !status->ok()) { + rocksdb::RocksDBExceptionJni::ThrowNew(env, status); + } } /* @@ -160,34 +222,41 @@ void Java_org_rocksdb_WriteBatch_merge__J_3BI_3BIJ( auto* cf_handle = reinterpret_cast(jcf_handle); assert(cf_handle != nullptr); auto merge = [&wb, &cf_handle] (rocksdb::Slice key, rocksdb::Slice value) { - wb->Merge(cf_handle, key, value); + return wb->Merge(cf_handle, key, value); }; - rocksdb::JniUtil::kv_op(merge, env, jobj, jkey, jkey_len, jentry_value, - jentry_value_len); + std::unique_ptr status = rocksdb::JniUtil::kv_op(merge, env, + jobj, jkey, jkey_len, jentry_value, jentry_value_len); + if (status != nullptr && !status->ok()) { + rocksdb::RocksDBExceptionJni::ThrowNew(env, status); + } } /* * Class: org_rocksdb_WriteBatch - * Method: remove + * Method: delete * Signature: (J[BI)V */ -void Java_org_rocksdb_WriteBatch_remove__J_3BI( +void Java_org_rocksdb_WriteBatch_delete__J_3BI( JNIEnv* env, jobject jobj, jlong jwb_handle, jbyteArray jkey, jint jkey_len) { auto* wb = reinterpret_cast(jwb_handle); assert(wb != nullptr); auto remove = [&wb] (rocksdb::Slice key) { - wb->Delete(key); + return wb->Delete(key); }; - rocksdb::JniUtil::k_op(remove, env, jobj, jkey, jkey_len); + std::unique_ptr status = rocksdb::JniUtil::k_op(remove, env, + jobj, jkey, jkey_len); + if (status != nullptr && !status->ok()) { + rocksdb::RocksDBExceptionJni::ThrowNew(env, status); + } } /* * Class: org_rocksdb_WriteBatch - * Method: remove + * Method: delete * Signature: (J[BIJ)V */ -void Java_org_rocksdb_WriteBatch_remove__J_3BIJ( +void Java_org_rocksdb_WriteBatch_delete__J_3BIJ( JNIEnv* env, jobject jobj, jlong jwb_handle, jbyteArray jkey, jint jkey_len, jlong jcf_handle) { auto* wb = reinterpret_cast(jwb_handle); @@ -195,9 +264,55 @@ void Java_org_rocksdb_WriteBatch_remove__J_3BIJ( auto* cf_handle = reinterpret_cast(jcf_handle); assert(cf_handle != nullptr); auto remove = [&wb, &cf_handle] (rocksdb::Slice key) { - wb->Delete(cf_handle, key); + return wb->Delete(cf_handle, key); + }; + std::unique_ptr status = rocksdb::JniUtil::k_op(remove, env, + jobj, jkey, jkey_len); + if (status != nullptr && !status->ok()) { + rocksdb::RocksDBExceptionJni::ThrowNew(env, status); + } +} + +/* + * Class: org_rocksdb_WriteBatch + * Method: singleDelete + * Signature: (J[BI)V + */ +void Java_org_rocksdb_WriteBatch_singleDelete__J_3BI( + JNIEnv* env, jobject jobj, jlong jwb_handle, jbyteArray jkey, + jint jkey_len) { + auto* wb = reinterpret_cast(jwb_handle); + assert(wb != nullptr); + auto single_delete = [&wb] (rocksdb::Slice key) { + return wb->SingleDelete(key); + }; + 
std::unique_ptr status = rocksdb::JniUtil::k_op(single_delete, + env, jobj, jkey, jkey_len); + if (status != nullptr && !status->ok()) { + rocksdb::RocksDBExceptionJni::ThrowNew(env, status); + } +} + +/* + * Class: org_rocksdb_WriteBatch + * Method: singleDelete + * Signature: (J[BIJ)V + */ +void Java_org_rocksdb_WriteBatch_singleDelete__J_3BIJ( + JNIEnv* env, jobject jobj, jlong jwb_handle, jbyteArray jkey, + jint jkey_len, jlong jcf_handle) { + auto* wb = reinterpret_cast(jwb_handle); + assert(wb != nullptr); + auto* cf_handle = reinterpret_cast(jcf_handle); + assert(cf_handle != nullptr); + auto single_delete = [&wb, &cf_handle] (rocksdb::Slice key) { + return wb->SingleDelete(cf_handle, key); }; - rocksdb::JniUtil::k_op(remove, env, jobj, jkey, jkey_len); + std::unique_ptr status = rocksdb::JniUtil::k_op(single_delete, + env, jobj, jkey, jkey_len); + if (status != nullptr && !status->ok()) { + rocksdb::RocksDBExceptionJni::ThrowNew(env, status); + } } /* @@ -205,19 +320,20 @@ void Java_org_rocksdb_WriteBatch_remove__J_3BIJ( * Method: deleteRange * Signature: (J[BI[BI)V */ -JNIEXPORT void JNICALL Java_org_rocksdb_WriteBatch_deleteRange__J_3BI_3BI( - JNIEnv*, jobject, jlong, jbyteArray, jint, jbyteArray, jint); - void Java_org_rocksdb_WriteBatch_deleteRange__J_3BI_3BI( JNIEnv* env, jobject jobj, jlong jwb_handle, jbyteArray jbegin_key, jint jbegin_key_len, jbyteArray jend_key, jint jend_key_len) { auto* wb = reinterpret_cast(jwb_handle); assert(wb != nullptr); auto deleteRange = [&wb](rocksdb::Slice beginKey, rocksdb::Slice endKey) { - wb->DeleteRange(beginKey, endKey); + return wb->DeleteRange(beginKey, endKey); }; - rocksdb::JniUtil::kv_op(deleteRange, env, jobj, jbegin_key, jbegin_key_len, - jend_key, jend_key_len); + std::unique_ptr status = rocksdb::JniUtil::kv_op( + deleteRange, env, jobj, jbegin_key, jbegin_key_len, jend_key, + jend_key_len); + if (status != nullptr && !status->ok()) { + rocksdb::RocksDBExceptionJni::ThrowNew(env, status); + } } /* @@ -234,11 +350,15 @@ void Java_org_rocksdb_WriteBatch_deleteRange__J_3BI_3BIJ( auto* cf_handle = reinterpret_cast(jcf_handle); assert(cf_handle != nullptr); auto deleteRange = [&wb, &cf_handle](rocksdb::Slice beginKey, - rocksdb::Slice endKey) { - wb->DeleteRange(cf_handle, beginKey, endKey); + rocksdb::Slice endKey) { + return wb->DeleteRange(cf_handle, beginKey, endKey); }; - rocksdb::JniUtil::kv_op(deleteRange, env, jobj, jbegin_key, jbegin_key_len, - jend_key, jend_key_len); + std::unique_ptr status = rocksdb::JniUtil::kv_op( + deleteRange, env, jobj, jbegin_key, jbegin_key_len, jend_key, + jend_key_len); + if (status != nullptr && !status->ok()) { + rocksdb::RocksDBExceptionJni::ThrowNew(env, status); + } } /* @@ -252,9 +372,13 @@ void Java_org_rocksdb_WriteBatch_putLogData( auto* wb = reinterpret_cast(jwb_handle); assert(wb != nullptr); auto putLogData = [&wb] (rocksdb::Slice blob) { - wb->PutLogData(blob); + return wb->PutLogData(blob); }; - rocksdb::JniUtil::k_op(putLogData, env, jobj, jblob, jblob_len); + std::unique_ptr status = rocksdb::JniUtil::k_op(putLogData, + env, jobj, jblob, jblob_len); + if (status != nullptr && !status->ok()) { + rocksdb::RocksDBExceptionJni::ThrowNew(env, status); + } } /* @@ -263,7 +387,7 @@ void Java_org_rocksdb_WriteBatch_putLogData( * Signature: (JJ)V */ void Java_org_rocksdb_WriteBatch_iterate( - JNIEnv* env , jobject jobj, jlong jwb_handle, jlong handlerHandle) { + JNIEnv* env, jobject jobj, jlong jwb_handle, jlong handlerHandle) { auto* wb = reinterpret_cast(jwb_handle); assert(wb != 
nullptr); @@ -276,6 +400,178 @@ void Java_org_rocksdb_WriteBatch_iterate( rocksdb::RocksDBExceptionJni::ThrowNew(env, s); } +/* + * Class: org_rocksdb_WriteBatch + * Method: data + * Signature: (J)[B + */ +jbyteArray Java_org_rocksdb_WriteBatch_data( + JNIEnv* env, jobject jobj, jlong jwb_handle) { + auto* wb = reinterpret_cast(jwb_handle); + assert(wb != nullptr); + + auto data = wb->Data(); + return rocksdb::JniUtil::copyBytes(env, data); +} + +/* + * Class: org_rocksdb_WriteBatch + * Method: getDataSize + * Signature: (J)J + */ +jlong Java_org_rocksdb_WriteBatch_getDataSize( + JNIEnv* env, jobject jobj, jlong jwb_handle) { + auto* wb = reinterpret_cast(jwb_handle); + assert(wb != nullptr); + + auto data_size = wb->GetDataSize(); + return static_cast(data_size); +} + +/* + * Class: org_rocksdb_WriteBatch + * Method: hasPut + * Signature: (J)Z + */ +jboolean Java_org_rocksdb_WriteBatch_hasPut( + JNIEnv* env, jobject jobj, jlong jwb_handle) { + auto* wb = reinterpret_cast(jwb_handle); + assert(wb != nullptr); + + return wb->HasPut(); +} + +/* + * Class: org_rocksdb_WriteBatch + * Method: hasDelete + * Signature: (J)Z + */ +jboolean Java_org_rocksdb_WriteBatch_hasDelete( + JNIEnv* env, jobject jobj, jlong jwb_handle) { + auto* wb = reinterpret_cast(jwb_handle); + assert(wb != nullptr); + + return wb->HasDelete(); +} + +/* + * Class: org_rocksdb_WriteBatch + * Method: hasSingleDelete + * Signature: (J)Z + */ +JNIEXPORT jboolean JNICALL Java_org_rocksdb_WriteBatch_hasSingleDelete( + JNIEnv* env , jobject jobj, jlong jwb_handle) { + auto* wb = reinterpret_cast(jwb_handle); + assert(wb != nullptr); + + return wb->HasSingleDelete(); +} + +/* + * Class: org_rocksdb_WriteBatch + * Method: hasDeleteRange + * Signature: (J)Z + */ +JNIEXPORT jboolean JNICALL Java_org_rocksdb_WriteBatch_hasDeleteRange( + JNIEnv* env , jobject jobj, jlong jwb_handle) { + auto* wb = reinterpret_cast(jwb_handle); + assert(wb != nullptr); + + return wb->HasDeleteRange(); +} + +/* + * Class: org_rocksdb_WriteBatch + * Method: hasMerge + * Signature: (J)Z + */ +JNIEXPORT jboolean JNICALL Java_org_rocksdb_WriteBatch_hasMerge( + JNIEnv* env , jobject jobj, jlong jwb_handle) { + auto* wb = reinterpret_cast(jwb_handle); + assert(wb != nullptr); + + return wb->HasMerge(); +} + +/* + * Class: org_rocksdb_WriteBatch + * Method: hasBeginPrepare + * Signature: (J)Z + */ +JNIEXPORT jboolean JNICALL Java_org_rocksdb_WriteBatch_hasBeginPrepare( + JNIEnv* env , jobject jobj, jlong jwb_handle) { + auto* wb = reinterpret_cast(jwb_handle); + assert(wb != nullptr); + + return wb->HasBeginPrepare(); +} + +/* + * Class: org_rocksdb_WriteBatch + * Method: hasEndPrepare + * Signature: (J)Z + */ +JNIEXPORT jboolean JNICALL Java_org_rocksdb_WriteBatch_hasEndPrepare( + JNIEnv* env , jobject jobj, jlong jwb_handle) { + auto* wb = reinterpret_cast(jwb_handle); + assert(wb != nullptr); + + return wb->HasEndPrepare(); +} + +/* + * Class: org_rocksdb_WriteBatch + * Method: hasCommit + * Signature: (J)Z + */ +JNIEXPORT jboolean JNICALL Java_org_rocksdb_WriteBatch_hasCommit( + JNIEnv* env , jobject jobj, jlong jwb_handle) { + auto* wb = reinterpret_cast(jwb_handle); + assert(wb != nullptr); + + return wb->HasCommit(); +} + +/* + * Class: org_rocksdb_WriteBatch + * Method: hasRollback + * Signature: (J)Z + */ +JNIEXPORT jboolean JNICALL Java_org_rocksdb_WriteBatch_hasRollback( + JNIEnv* env , jobject jobj, jlong jwb_handle) { + auto* wb = reinterpret_cast(jwb_handle); + assert(wb != nullptr); + + return wb->HasRollback(); +} + +/* + * Class: 
org_rocksdb_WriteBatch + * Method: markWalTerminationPoint + * Signature: (J)V + */ +void Java_org_rocksdb_WriteBatch_markWalTerminationPoint( + JNIEnv* env, jobject jobj, jlong jwb_handle) { + auto* wb = reinterpret_cast(jwb_handle); + assert(wb != nullptr); + + wb->MarkWalTerminationPoint(); +} + +/* + * Class: org_rocksdb_WriteBatch + * Method: getWalTerminationPoint + * Signature: (J)Lorg/rocksdb/WriteBatch/SavePoint; + */ +jobject Java_org_rocksdb_WriteBatch_getWalTerminationPoint( + JNIEnv* env, jobject jobj, jlong jwb_handle) { + auto* wb = reinterpret_cast(jwb_handle); + assert(wb != nullptr); + + auto save_point = wb->GetWalTerminationPoint(); + return rocksdb::WriteBatchSavePointJni::construct(env, save_point); +} + /* * Class: org_rocksdb_WriteBatch * Method: disposeInternal diff --git a/java/rocksjni/write_batch_test.cc b/java/rocksjni/write_batch_test.cc index 199ad239d79..c65757f86b2 100644 --- a/java/rocksjni/write_batch_test.cc +++ b/java/rocksjni/write_batch_test.cc @@ -87,8 +87,32 @@ jbyteArray Java_org_rocksdb_WriteBatchTest_getContents( state.append(")"); count++; break; + case rocksdb::kTypeSingleDeletion: + state.append("SingleDelete("); + state.append(ikey.user_key.ToString()); + state.append(")"); + count++; + break; + case rocksdb::kTypeRangeDeletion: + state.append("DeleteRange("); + state.append(ikey.user_key.ToString()); + state.append(", "); + state.append(iter->value().ToString()); + state.append(")"); + count++; + break; + case rocksdb::kTypeLogData: + state.append("LogData("); + state.append(ikey.user_key.ToString()); + state.append(")"); + count++; + break; default: assert(false); + state.append("Err:Expected("); + state.append(std::to_string(ikey.type)); + state.append(")"); + count++; break; } state.append("@"); @@ -96,8 +120,12 @@ jbyteArray Java_org_rocksdb_WriteBatchTest_getContents( } if (!s.ok()) { state.append(s.ToString()); - } else if (count != rocksdb::WriteBatchInternal::Count(b)) { - state.append("CountMismatch()"); + } else if (rocksdb::WriteBatchInternal::Count(b) != count) { + state.append("Err:CountMismatch(expected="); + state.append(std::to_string(rocksdb::WriteBatchInternal::Count(b))); + state.append(", actual="); + state.append(std::to_string(count)); + state.append(")"); } delete mem->Unref(); diff --git a/java/rocksjni/write_batch_with_index.cc b/java/rocksjni/write_batch_with_index.cc index bca85c1007b..5d619dedf34 100644 --- a/java/rocksjni/write_batch_with_index.cc +++ b/java/rocksjni/write_batch_with_index.cc @@ -39,19 +39,31 @@ jlong Java_org_rocksdb_WriteBatchWithIndex_newWriteBatchWithIndex__Z( /* * Class: org_rocksdb_WriteBatchWithIndex * Method: newWriteBatchWithIndex - * Signature: (JZIZ)J + * Signature: (JBIZ)J */ -jlong Java_org_rocksdb_WriteBatchWithIndex_newWriteBatchWithIndex__JZIZ( +jlong Java_org_rocksdb_WriteBatchWithIndex_newWriteBatchWithIndex__JBIZ( JNIEnv* env, jclass jcls, jlong jfallback_index_comparator_handle, - jboolean is_direct, jint jreserved_bytes, jboolean joverwrite_key) { -rocksdb::Comparator *fallback_comparator = nullptr; -if(is_direct) { - fallback_comparator = - reinterpret_cast(jfallback_index_comparator_handle); -} else { - fallback_comparator = - reinterpret_cast(jfallback_index_comparator_handle); -} + jbyte jcomparator_type, jint jreserved_bytes, jboolean joverwrite_key) { + rocksdb::Comparator *fallback_comparator = nullptr; + switch(jcomparator_type) { + // JAVA_COMPARATOR + case 0x0: + fallback_comparator = + reinterpret_cast(jfallback_index_comparator_handle); + break; + + // 
JAVA_DIRECT_COMPARATOR + case 0x1: + fallback_comparator = + reinterpret_cast(jfallback_index_comparator_handle); + break; + + // JAVA_NATIVE_COMPARATOR_WRAPPER + case 0x2: + fallback_comparator = + reinterpret_cast(jfallback_index_comparator_handle); + break; + } auto* wbwi = new rocksdb::WriteBatchWithIndex( fallback_comparator, @@ -83,10 +95,13 @@ void Java_org_rocksdb_WriteBatchWithIndex_put__J_3BI_3BI( auto* wbwi = reinterpret_cast(jwbwi_handle); assert(wbwi != nullptr); auto put = [&wbwi] (rocksdb::Slice key, rocksdb::Slice value) { - wbwi->Put(key, value); + return wbwi->Put(key, value); }; - rocksdb::JniUtil::kv_op(put, env, jobj, jkey, jkey_len, jentry_value, - jentry_value_len); + std::unique_ptr status = rocksdb::JniUtil::kv_op(put, env, + jobj, jkey, jkey_len, jentry_value, jentry_value_len); + if (status != nullptr && !status->ok()) { + rocksdb::RocksDBExceptionJni::ThrowNew(env, status); + } } /* @@ -103,10 +118,13 @@ void Java_org_rocksdb_WriteBatchWithIndex_put__J_3BI_3BIJ( auto* cf_handle = reinterpret_cast(jcf_handle); assert(cf_handle != nullptr); auto put = [&wbwi, &cf_handle] (rocksdb::Slice key, rocksdb::Slice value) { - wbwi->Put(cf_handle, key, value); + return wbwi->Put(cf_handle, key, value); }; - rocksdb::JniUtil::kv_op(put, env, jobj, jkey, jkey_len, jentry_value, - jentry_value_len); + std::unique_ptr status = rocksdb::JniUtil::kv_op(put, env, + jobj, jkey, jkey_len, jentry_value, jentry_value_len); + if (status != nullptr && !status->ok()) { + rocksdb::RocksDBExceptionJni::ThrowNew(env, status); + } } /* @@ -120,10 +138,13 @@ void Java_org_rocksdb_WriteBatchWithIndex_merge__J_3BI_3BI( auto* wbwi = reinterpret_cast(jwbwi_handle); assert(wbwi != nullptr); auto merge = [&wbwi] (rocksdb::Slice key, rocksdb::Slice value) { - wbwi->Merge(key, value); + return wbwi->Merge(key, value); }; - rocksdb::JniUtil::kv_op(merge, env, jobj, jkey, jkey_len, jentry_value, - jentry_value_len); + std::unique_ptr status = rocksdb::JniUtil::kv_op(merge, env, + jobj, jkey, jkey_len, jentry_value, jentry_value_len); + if (status != nullptr && !status->ok()) { + rocksdb::RocksDBExceptionJni::ThrowNew(env, status); + } } /* @@ -140,34 +161,41 @@ void Java_org_rocksdb_WriteBatchWithIndex_merge__J_3BI_3BIJ( auto* cf_handle = reinterpret_cast(jcf_handle); assert(cf_handle != nullptr); auto merge = [&wbwi, &cf_handle] (rocksdb::Slice key, rocksdb::Slice value) { - wbwi->Merge(cf_handle, key, value); + return wbwi->Merge(cf_handle, key, value); }; - rocksdb::JniUtil::kv_op(merge, env, jobj, jkey, jkey_len, jentry_value, - jentry_value_len); + std::unique_ptr status = rocksdb::JniUtil::kv_op(merge, env, + jobj, jkey, jkey_len, jentry_value, jentry_value_len); + if (status != nullptr && !status->ok()) { + rocksdb::RocksDBExceptionJni::ThrowNew(env, status); + } } /* * Class: org_rocksdb_WriteBatchWithIndex - * Method: remove + * Method: delete * Signature: (J[BI)V */ -void Java_org_rocksdb_WriteBatchWithIndex_remove__J_3BI( +void Java_org_rocksdb_WriteBatchWithIndex_delete__J_3BI( JNIEnv* env, jobject jobj, jlong jwbwi_handle, jbyteArray jkey, jint jkey_len) { auto* wbwi = reinterpret_cast(jwbwi_handle); assert(wbwi != nullptr); auto remove = [&wbwi] (rocksdb::Slice key) { - wbwi->Delete(key); + return wbwi->Delete(key); }; - rocksdb::JniUtil::k_op(remove, env, jobj, jkey, jkey_len); + std::unique_ptr status = rocksdb::JniUtil::k_op(remove, env, + jobj, jkey, jkey_len); + if (status != nullptr && !status->ok()) { + rocksdb::RocksDBExceptionJni::ThrowNew(env, status); + } } /* * Class: 
org_rocksdb_WriteBatchWithIndex - * Method: remove + * Method: delete * Signature: (J[BIJ)V */ -void Java_org_rocksdb_WriteBatchWithIndex_remove__J_3BIJ( +void Java_org_rocksdb_WriteBatchWithIndex_delete__J_3BIJ( JNIEnv* env, jobject jobj, jlong jwbwi_handle, jbyteArray jkey, jint jkey_len, jlong jcf_handle) { auto* wbwi = reinterpret_cast(jwbwi_handle); @@ -175,9 +203,55 @@ void Java_org_rocksdb_WriteBatchWithIndex_remove__J_3BIJ( auto* cf_handle = reinterpret_cast(jcf_handle); assert(cf_handle != nullptr); auto remove = [&wbwi, &cf_handle] (rocksdb::Slice key) { - wbwi->Delete(cf_handle, key); + return wbwi->Delete(cf_handle, key); }; - rocksdb::JniUtil::k_op(remove, env, jobj, jkey, jkey_len); + std::unique_ptr status = rocksdb::JniUtil::k_op(remove, env, + jobj, jkey, jkey_len); + if (status != nullptr && !status->ok()) { + rocksdb::RocksDBExceptionJni::ThrowNew(env, status); + } +} + +/* + * Class: org_rocksdb_WriteBatchWithIndex + * Method: singleDelete + * Signature: (J[BI)V + */ +void Java_org_rocksdb_WriteBatchWithIndex_singleDelete__J_3BI( + JNIEnv* env, jobject jobj, jlong jwbwi_handle, jbyteArray jkey, + jint jkey_len) { + auto* wbwi = reinterpret_cast(jwbwi_handle); + assert(wbwi != nullptr); + auto single_delete = [&wbwi] (rocksdb::Slice key) { + return wbwi->SingleDelete(key); + }; + std::unique_ptr status = rocksdb::JniUtil::k_op(single_delete, + env, jobj, jkey, jkey_len); + if (status != nullptr && !status->ok()) { + rocksdb::RocksDBExceptionJni::ThrowNew(env, status); + } +} + +/* + * Class: org_rocksdb_WriteBatchWithIndex + * Method: singleDelete + * Signature: (J[BIJ)V + */ +void Java_org_rocksdb_WriteBatchWithIndex_singleDelete__J_3BIJ( + JNIEnv* env, jobject jobj, jlong jwbwi_handle, jbyteArray jkey, + jint jkey_len, jlong jcf_handle) { + auto* wbwi = reinterpret_cast(jwbwi_handle); + assert(wbwi != nullptr); + auto* cf_handle = reinterpret_cast(jcf_handle); + assert(cf_handle != nullptr); + auto single_delete = [&wbwi, &cf_handle] (rocksdb::Slice key) { + return wbwi->SingleDelete(cf_handle, key); + }; + std::unique_ptr status = rocksdb::JniUtil::k_op(single_delete, + env, jobj, jkey, jkey_len); + if (status != nullptr && !status->ok()) { + rocksdb::RocksDBExceptionJni::ThrowNew(env, status); + } } /* @@ -191,10 +265,14 @@ void Java_org_rocksdb_WriteBatchWithIndex_deleteRange__J_3BI_3BI( auto* wbwi = reinterpret_cast(jwbwi_handle); assert(wbwi != nullptr); auto deleteRange = [&wbwi](rocksdb::Slice beginKey, rocksdb::Slice endKey) { - wbwi->DeleteRange(beginKey, endKey); + return wbwi->DeleteRange(beginKey, endKey); }; - rocksdb::JniUtil::kv_op(deleteRange, env, jobj, jbegin_key, jbegin_key_len, - jend_key, jend_key_len); + std::unique_ptr status = rocksdb::JniUtil::kv_op( + deleteRange, env, jobj, jbegin_key, jbegin_key_len, jend_key, + jend_key_len); + if (status != nullptr && !status->ok()) { + rocksdb::RocksDBExceptionJni::ThrowNew(env, status); + } } /* @@ -211,11 +289,15 @@ void Java_org_rocksdb_WriteBatchWithIndex_deleteRange__J_3BI_3BIJ( auto* cf_handle = reinterpret_cast(jcf_handle); assert(cf_handle != nullptr); auto deleteRange = [&wbwi, &cf_handle](rocksdb::Slice beginKey, - rocksdb::Slice endKey) { - wbwi->DeleteRange(cf_handle, beginKey, endKey); + rocksdb::Slice endKey) { + return wbwi->DeleteRange(cf_handle, beginKey, endKey); }; - rocksdb::JniUtil::kv_op(deleteRange, env, jobj, jbegin_key, jbegin_key_len, - jend_key, jend_key_len); + std::unique_ptr status = rocksdb::JniUtil::kv_op( + deleteRange, env, jobj, jbegin_key, jbegin_key_len, jend_key, + 
jend_key_len); + if (status != nullptr && !status->ok()) { + rocksdb::RocksDBExceptionJni::ThrowNew(env, status); + } } /* @@ -229,9 +311,13 @@ void Java_org_rocksdb_WriteBatchWithIndex_putLogData( auto* wbwi = reinterpret_cast(jwbwi_handle); assert(wbwi != nullptr); auto putLogData = [&wbwi] (rocksdb::Slice blob) { - wbwi->PutLogData(blob); + return wbwi->PutLogData(blob); }; - rocksdb::JniUtil::k_op(putLogData, env, jobj, jblob, jblob_len); + std::unique_ptr status = rocksdb::JniUtil::k_op(putLogData, + env, jobj, jblob, jblob_len); + if (status != nullptr && !status->ok()) { + rocksdb::RocksDBExceptionJni::ThrowNew(env, status); + } } /* @@ -279,6 +365,54 @@ void Java_org_rocksdb_WriteBatchWithIndex_rollbackToSavePoint0( rocksdb::RocksDBExceptionJni::ThrowNew(env, s); } +/* + * Class: org_rocksdb_WriteBatchWithIndex + * Method: popSavePoint + * Signature: (J)V + */ +void Java_org_rocksdb_WriteBatchWithIndex_popSavePoint( + JNIEnv* env, jobject jobj, jlong jwbwi_handle) { + auto* wbwi = reinterpret_cast(jwbwi_handle); + assert(wbwi != nullptr); + + auto s = wbwi->PopSavePoint(); + + if (s.ok()) { + return; + } + + rocksdb::RocksDBExceptionJni::ThrowNew(env, s); +} + +/* + * Class: org_rocksdb_WriteBatchWithIndex + * Method: setMaxBytes + * Signature: (JJ)V + */ +void Java_org_rocksdb_WriteBatchWithIndex_setMaxBytes( + JNIEnv* env, jobject jobj, jlong jwbwi_handle, jlong jmax_bytes) { + auto* wbwi = reinterpret_cast(jwbwi_handle); + assert(wbwi != nullptr); + + wbwi->SetMaxBytes(static_cast(jmax_bytes)); +} + +/* + * Class: org_rocksdb_WriteBatchWithIndex + * Method: getWriteBatch + * Signature: (J)Lorg/rocksdb/WriteBatch; + */ +jobject Java_org_rocksdb_WriteBatchWithIndex_getWriteBatch( + JNIEnv* env, jobject jobj, jlong jwbwi_handle) { + auto* wbwi = reinterpret_cast(jwbwi_handle); + assert(wbwi != nullptr); + + auto* wb = wbwi->GetWriteBatch(); + + // TODO(AR) is the `wb` object owned by us? 
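+  // (Assumption, not settled by this diff: GetWriteBatch() appears to return
+  // a pointer owned by the WriteBatchWithIndex itself, so the Java wrapper
+  // constructed below should be treated as non-owning and must not delete
+  // the underlying C++ WriteBatch when disposed.)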
+ return rocksdb::WriteBatchJni::construct(env, wb); +} + /* * Class: org_rocksdb_WriteBatchWithIndex * Method: iterator0 @@ -551,33 +685,15 @@ jlongArray Java_org_rocksdb_WBWIRocksIterator_entry1( jlong results[3]; - //set the type of the write entry - switch (we.type) { - case rocksdb::kPutRecord: - results[0] = 0x1; - break; - - case rocksdb::kMergeRecord: - results[0] = 0x2; - break; - - case rocksdb::kDeleteRecord: - results[0] = 0x4; - break; - - case rocksdb::kLogDataRecord: - results[0] = 0x8; - break; - - default: - results[0] = 0x0; - } + // set the type of the write entry + results[0] = rocksdb::WriteTypeJni::toJavaWriteType(we.type); - // key_slice and value_slice will be freed by org.rocksdb.DirectSlice#close + // NOTE: key_slice and value_slice will be freed by org.rocksdb.DirectSlice#close auto* key_slice = new rocksdb::Slice(we.key.data(), we.key.size()); results[1] = reinterpret_cast(key_slice); if (we.type == rocksdb::kDeleteRecord + || we.type == rocksdb::kSingleDeleteRecord || we.type == rocksdb::kLogDataRecord) { // set native handle of value slice to null if no value available results[2] = 0; diff --git a/java/rocksjni/writebatchhandlerjnicallback.cc b/java/rocksjni/writebatchhandlerjnicallback.cc index 47dc0f5966f..5173f64b522 100644 --- a/java/rocksjni/writebatchhandlerjnicallback.cc +++ b/java/rocksjni/writebatchhandlerjnicallback.cc @@ -14,24 +14,62 @@ WriteBatchHandlerJniCallback::WriteBatchHandlerJniCallback( JNIEnv* env, jobject jWriteBatchHandler) : JniCallback(env, jWriteBatchHandler), m_env(env) { + m_jPutCfMethodId = WriteBatchHandlerJni::getPutCfMethodId(env); + if(m_jPutCfMethodId == nullptr) { + // exception thrown + return; + } + m_jPutMethodId = WriteBatchHandlerJni::getPutMethodId(env); if(m_jPutMethodId == nullptr) { // exception thrown return; } + m_jMergeCfMethodId = WriteBatchHandlerJni::getMergeCfMethodId(env); + if(m_jMergeCfMethodId == nullptr) { + // exception thrown + return; + } + m_jMergeMethodId = WriteBatchHandlerJni::getMergeMethodId(env); if(m_jMergeMethodId == nullptr) { // exception thrown return; } + m_jDeleteCfMethodId = WriteBatchHandlerJni::getDeleteCfMethodId(env); + if(m_jDeleteCfMethodId == nullptr) { + // exception thrown + return; + } + m_jDeleteMethodId = WriteBatchHandlerJni::getDeleteMethodId(env); if(m_jDeleteMethodId == nullptr) { // exception thrown return; } + m_jSingleDeleteCfMethodId = + WriteBatchHandlerJni::getSingleDeleteCfMethodId(env); + if(m_jSingleDeleteCfMethodId == nullptr) { + // exception thrown + return; + } + + m_jSingleDeleteMethodId = WriteBatchHandlerJni::getSingleDeleteMethodId(env); + if(m_jSingleDeleteMethodId == nullptr) { + // exception thrown + return; + } + + m_jDeleteRangeCfMethodId = + WriteBatchHandlerJni::getDeleteRangeCfMethodId(env); + if (m_jDeleteRangeCfMethodId == nullptr) { + // exception thrown + return; + } + m_jDeleteRangeMethodId = WriteBatchHandlerJni::getDeleteRangeMethodId(env); if (m_jDeleteRangeMethodId == nullptr) { // exception thrown @@ -44,203 +82,318 @@ WriteBatchHandlerJniCallback::WriteBatchHandlerJniCallback( return; } - m_jContinueMethodId = WriteBatchHandlerJni::getContinueMethodId(env); - if(m_jContinueMethodId == nullptr) { + m_jPutBlobIndexCfMethodId = + WriteBatchHandlerJni::getPutBlobIndexCfMethodId(env); + if(m_jPutBlobIndexCfMethodId == nullptr) { // exception thrown return; } -} -void WriteBatchHandlerJniCallback::Put(const Slice& key, const Slice& value) { - const jbyteArray j_key = sliceToJArray(key); - if(j_key == nullptr) { + m_jMarkBeginPrepareMethodId 
= + WriteBatchHandlerJni::getMarkBeginPrepareMethodId(env); + if(m_jMarkBeginPrepareMethodId == nullptr) { // exception thrown - if(m_env->ExceptionCheck()) { - m_env->ExceptionDescribe(); - } return; } - const jbyteArray j_value = sliceToJArray(value); - if(j_value == nullptr) { + m_jMarkEndPrepareMethodId = + WriteBatchHandlerJni::getMarkEndPrepareMethodId(env); + if(m_jMarkEndPrepareMethodId == nullptr) { // exception thrown - if(m_env->ExceptionCheck()) { - m_env->ExceptionDescribe(); - } - if(j_key != nullptr) { - m_env->DeleteLocalRef(j_key); - } return; } - m_env->CallVoidMethod( - m_jcallback_obj, - m_jPutMethodId, - j_key, - j_value); - if(m_env->ExceptionCheck()) { + m_jMarkNoopMethodId = WriteBatchHandlerJni::getMarkNoopMethodId(env); + if(m_jMarkNoopMethodId == nullptr) { // exception thrown - m_env->ExceptionDescribe(); - if(j_value != nullptr) { - m_env->DeleteLocalRef(j_value); - } - if(j_key != nullptr) { - m_env->DeleteLocalRef(j_key); - } return; } - - if(j_value != nullptr) { - m_env->DeleteLocalRef(j_value); - } - if(j_key != nullptr) { - m_env->DeleteLocalRef(j_key); + + m_jMarkRollbackMethodId = WriteBatchHandlerJni::getMarkRollbackMethodId(env); + if(m_jMarkRollbackMethodId == nullptr) { + // exception thrown + return; } -} -void WriteBatchHandlerJniCallback::Merge(const Slice& key, const Slice& value) { - const jbyteArray j_key = sliceToJArray(key); - if(j_key == nullptr) { + m_jMarkCommitMethodId = WriteBatchHandlerJni::getMarkCommitMethodId(env); + if(m_jMarkCommitMethodId == nullptr) { // exception thrown - if(m_env->ExceptionCheck()) { - m_env->ExceptionDescribe(); - } return; } - const jbyteArray j_value = sliceToJArray(value); - if(j_value == nullptr) { + m_jContinueMethodId = WriteBatchHandlerJni::getContinueMethodId(env); + if(m_jContinueMethodId == nullptr) { // exception thrown - if(m_env->ExceptionCheck()) { - m_env->ExceptionDescribe(); - } - if(j_key != nullptr) { - m_env->DeleteLocalRef(j_key); - } return; } +} - m_env->CallVoidMethod( +rocksdb::Status WriteBatchHandlerJniCallback::PutCF(uint32_t column_family_id, + const Slice& key, const Slice& value) { + auto put = [this, column_family_id] ( + jbyteArray j_key, jbyteArray j_value) { + m_env->CallVoidMethod( m_jcallback_obj, - m_jMergeMethodId, + m_jPutCfMethodId, + static_cast(column_family_id), j_key, j_value); - if(m_env->ExceptionCheck()) { - // exception thrown - m_env->ExceptionDescribe(); - if(j_value != nullptr) { - m_env->DeleteLocalRef(j_value); - } - if(j_key != nullptr) { - m_env->DeleteLocalRef(j_key); - } - return; + }; + auto status = WriteBatchHandlerJniCallback::kv_op(key, value, put); + if(status == nullptr) { + return rocksdb::Status::OK(); // TODO(AR) what to do if there is an Exception but we don't know the rocksdb::Status? 
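+    // (Note on the branch above: returning OK() lets WriteBatch iteration
+    // continue even though the Java callback threw an exception that could
+    // not be converted to a rocksdb::Status; the same caveat applies to the
+    // other *CF handlers below.)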
+ } else { + return rocksdb::Status(*status); } +} - if(j_value != nullptr) { - m_env->DeleteLocalRef(j_value); - } - if(j_key != nullptr) { - m_env->DeleteLocalRef(j_key); +void WriteBatchHandlerJniCallback::Put(const Slice& key, const Slice& value) { + auto put = [this] ( + jbyteArray j_key, jbyteArray j_value) { + m_env->CallVoidMethod( + m_jcallback_obj, + m_jPutMethodId, + j_key, + j_value); + }; + WriteBatchHandlerJniCallback::kv_op(key, value, put); +} + +rocksdb::Status WriteBatchHandlerJniCallback::MergeCF(uint32_t column_family_id, + const Slice& key, const Slice& value) { + auto merge = [this, column_family_id] ( + jbyteArray j_key, jbyteArray j_value) { + m_env->CallVoidMethod( + m_jcallback_obj, + m_jMergeCfMethodId, + static_cast(column_family_id), + j_key, + j_value); + }; + auto status = WriteBatchHandlerJniCallback::kv_op(key, value, merge); + if(status == nullptr) { + return rocksdb::Status::OK(); // TODO(AR) what to do if there is an Exception but we don't know the rocksdb::Status? + } else { + return rocksdb::Status(*status); } } -void WriteBatchHandlerJniCallback::Delete(const Slice& key) { - const jbyteArray j_key = sliceToJArray(key); - if(j_key == nullptr) { - // exception thrown - if(m_env->ExceptionCheck()) { - m_env->ExceptionDescribe(); - } - return; +void WriteBatchHandlerJniCallback::Merge(const Slice& key, const Slice& value) { + auto merge = [this] ( + jbyteArray j_key, jbyteArray j_value) { + m_env->CallVoidMethod( + m_jcallback_obj, + m_jMergeMethodId, + j_key, + j_value); + }; + WriteBatchHandlerJniCallback::kv_op(key, value, merge); +} + +rocksdb::Status WriteBatchHandlerJniCallback::DeleteCF(uint32_t column_family_id, + const Slice& key) { + auto remove = [this, column_family_id] (jbyteArray j_key) { + m_env->CallVoidMethod( + m_jcallback_obj, + m_jDeleteCfMethodId, + static_cast(column_family_id), + j_key); + }; + auto status = WriteBatchHandlerJniCallback::k_op(key, remove); + if(status == nullptr) { + return rocksdb::Status::OK(); // TODO(AR) what to do if there is an Exception but we don't know the rocksdb::Status? + } else { + return rocksdb::Status(*status); } +} - m_env->CallVoidMethod( +void WriteBatchHandlerJniCallback::Delete(const Slice& key) { + auto remove = [this] (jbyteArray j_key) { + m_env->CallVoidMethod( m_jcallback_obj, m_jDeleteMethodId, j_key); - if(m_env->ExceptionCheck()) { - // exception thrown - m_env->ExceptionDescribe(); - if(j_key != nullptr) { - m_env->DeleteLocalRef(j_key); - } - return; + }; + WriteBatchHandlerJniCallback::k_op(key, remove); +} + +rocksdb::Status WriteBatchHandlerJniCallback::SingleDeleteCF(uint32_t column_family_id, + const Slice& key) { + auto singleDelete = [this, column_family_id] (jbyteArray j_key) { + m_env->CallVoidMethod( + m_jcallback_obj, + m_jSingleDeleteCfMethodId, + static_cast(column_family_id), + j_key); + }; + auto status = WriteBatchHandlerJniCallback::k_op(key, singleDelete); + if(status == nullptr) { + return rocksdb::Status::OK(); // TODO(AR) what to do if there is an Exception but we don't know the rocksdb::Status? 
+ } else { + return rocksdb::Status(*status); } +} - if(j_key != nullptr) { - m_env->DeleteLocalRef(j_key); +void WriteBatchHandlerJniCallback::SingleDelete(const Slice& key) { + auto singleDelete = [this] (jbyteArray j_key) { + m_env->CallVoidMethod( + m_jcallback_obj, + m_jSingleDeleteMethodId, + j_key); + }; + WriteBatchHandlerJniCallback::k_op(key, singleDelete); +} + +rocksdb::Status WriteBatchHandlerJniCallback::DeleteRangeCF(uint32_t column_family_id, + const Slice& beginKey, const Slice& endKey) { + auto deleteRange = [this, column_family_id] ( + jbyteArray j_beginKey, jbyteArray j_endKey) { + m_env->CallVoidMethod( + m_jcallback_obj, + m_jDeleteRangeCfMethodId, + static_cast(column_family_id), + j_beginKey, + j_endKey); + }; + auto status = WriteBatchHandlerJniCallback::kv_op(beginKey, endKey, deleteRange); + if(status == nullptr) { + return rocksdb::Status::OK(); // TODO(AR) what to do if there is an Exception but we don't know the rocksdb::Status? + } else { + return rocksdb::Status(*status); } } void WriteBatchHandlerJniCallback::DeleteRange(const Slice& beginKey, - const Slice& endKey) { - const jbyteArray j_beginKey = sliceToJArray(beginKey); - if (j_beginKey == nullptr) { - // exception thrown - if (m_env->ExceptionCheck()) { - m_env->ExceptionDescribe(); - } - return; - } + const Slice& endKey) { + auto deleteRange = [this] ( + jbyteArray j_beginKey, jbyteArray j_endKey) { + m_env->CallVoidMethod( + m_jcallback_obj, + m_jDeleteRangeMethodId, + j_beginKey, + j_endKey); + }; + WriteBatchHandlerJniCallback::kv_op(beginKey, endKey, deleteRange); +} - const jbyteArray j_endKey = sliceToJArray(beginKey); - if (j_endKey == nullptr) { - // exception thrown - if (m_env->ExceptionCheck()) { - m_env->ExceptionDescribe(); - } - return; +void WriteBatchHandlerJniCallback::LogData(const Slice& blob) { + auto logData = [this] (jbyteArray j_blob) { + m_env->CallVoidMethod( + m_jcallback_obj, + m_jLogDataMethodId, + j_blob); + }; + WriteBatchHandlerJniCallback::k_op(blob, logData); +} + +rocksdb::Status WriteBatchHandlerJniCallback::PutBlobIndexCF(uint32_t column_family_id, + const Slice& key, const Slice& value) { + auto putBlobIndex = [this, column_family_id] ( + jbyteArray j_key, jbyteArray j_value) { + m_env->CallVoidMethod( + m_jcallback_obj, + m_jPutBlobIndexCfMethodId, + static_cast(column_family_id), + j_key, + j_value); + }; + auto status = WriteBatchHandlerJniCallback::kv_op(key, value, putBlobIndex); + if(status == nullptr) { + return rocksdb::Status::OK(); // TODO(AR) what to do if there is an Exception but we don't know the rocksdb::Status? 
+ } else { + return rocksdb::Status(*status); } +} + +rocksdb::Status WriteBatchHandlerJniCallback::MarkBeginPrepare() { + m_env->CallVoidMethod(m_jcallback_obj, m_jMarkBeginPrepareMethodId); - m_env->CallVoidMethod(m_jcallback_obj, m_jDeleteRangeMethodId, - j_beginKey, j_endKey); + // check for Exception, in-particular RocksDBException if (m_env->ExceptionCheck()) { // exception thrown - m_env->ExceptionDescribe(); - if (j_beginKey != nullptr) { - m_env->DeleteLocalRef(j_beginKey); - } - if (j_endKey != nullptr) { - m_env->DeleteLocalRef(j_endKey); + jthrowable exception = m_env->ExceptionOccurred(); + std::unique_ptr status = rocksdb::RocksDBExceptionJni::toCppStatus(m_env, exception); + if (status == nullptr) { + // unkown status or exception occurred extracting status + m_env->ExceptionDescribe(); + return rocksdb::Status::OK(); // TODO(AR) probably need a better error code here + + } else { + m_env->ExceptionClear(); // clear the exception, as we have extracted the status + return rocksdb::Status(*status); } - return; } - if (j_beginKey != nullptr) { - m_env->DeleteLocalRef(j_beginKey); - } + return rocksdb::Status::OK(); +} - if (j_endKey != nullptr) { - m_env->DeleteLocalRef(j_endKey); +rocksdb::Status WriteBatchHandlerJniCallback::MarkEndPrepare(const Slice& xid) { + auto markEndPrepare = [this] ( + jbyteArray j_xid) { + m_env->CallVoidMethod( + m_jcallback_obj, + m_jMarkEndPrepareMethodId, + j_xid); + }; + auto status = WriteBatchHandlerJniCallback::k_op(xid, markEndPrepare); + if(status == nullptr) { + return rocksdb::Status::OK(); // TODO(AR) what to do if there is an Exception but we don't know the rocksdb::Status? + } else { + return rocksdb::Status(*status); } } -void WriteBatchHandlerJniCallback::LogData(const Slice& blob) { - const jbyteArray j_blob = sliceToJArray(blob); - if(j_blob == nullptr) { +rocksdb::Status WriteBatchHandlerJniCallback::MarkNoop(bool empty_batch) { + m_env->CallVoidMethod(m_jcallback_obj, m_jMarkNoopMethodId, static_cast(empty_batch)); + + // check for Exception, in-particular RocksDBException + if (m_env->ExceptionCheck()) { // exception thrown - if(m_env->ExceptionCheck()) { + jthrowable exception = m_env->ExceptionOccurred(); + std::unique_ptr status = rocksdb::RocksDBExceptionJni::toCppStatus(m_env, exception); + if (status == nullptr) { + // unkown status or exception occurred extracting status m_env->ExceptionDescribe(); + return rocksdb::Status::OK(); // TODO(AR) probably need a better error code here + + } else { + m_env->ExceptionClear(); // clear the exception, as we have extracted the status + return rocksdb::Status(*status); } - return; } - m_env->CallVoidMethod( + return rocksdb::Status::OK(); +} + +rocksdb::Status WriteBatchHandlerJniCallback::MarkRollback(const Slice& xid) { + auto markRollback = [this] ( + jbyteArray j_xid) { + m_env->CallVoidMethod( m_jcallback_obj, - m_jLogDataMethodId, - j_blob); - if(m_env->ExceptionCheck()) { - // exception thrown - m_env->ExceptionDescribe(); - if(j_blob != nullptr) { - m_env->DeleteLocalRef(j_blob); - } - return; + m_jMarkRollbackMethodId, + j_xid); + }; + auto status = WriteBatchHandlerJniCallback::k_op(xid, markRollback); + if(status == nullptr) { + return rocksdb::Status::OK(); // TODO(AR) what to do if there is an Exception but we don't know the rocksdb::Status? 
+ } else { + return rocksdb::Status(*status); } +} - if(j_blob != nullptr) { - m_env->DeleteLocalRef(j_blob); +rocksdb::Status WriteBatchHandlerJniCallback::MarkCommit(const Slice& xid) { + auto markCommit = [this] ( + jbyteArray j_xid) { + m_env->CallVoidMethod( + m_jcallback_obj, + m_jMarkCommitMethodId, + j_xid); + }; + auto status = WriteBatchHandlerJniCallback::k_op(xid, markCommit); + if(status == nullptr) { + return rocksdb::Status::OK(); // TODO(AR) what to do if there is an Exception but we don't know the rocksdb::Status? + } else { + return rocksdb::Status(*status); } } @@ -256,39 +409,101 @@ bool WriteBatchHandlerJniCallback::Continue() { return static_cast(jContinue == JNI_TRUE); } -/* - * Creates a Java Byte Array from the data in a Slice - * - * When calling this function - * you must remember to call env->DeleteLocalRef - * on the result after you have finished with it - * - * @param s A Slice to convery to a Java byte array - * - * @return A reference to a Java byte array, or a nullptr if an - * exception occurs - */ -jbyteArray WriteBatchHandlerJniCallback::sliceToJArray(const Slice& s) { - - // TODO(AR) move to JniUtil - - jbyteArray ja = m_env->NewByteArray(static_cast(s.size())); - if(ja == nullptr) { - // exception thrown: OutOfMemoryError +std::unique_ptr WriteBatchHandlerJniCallback::kv_op(const Slice& key, const Slice& value, std::function kvFn) { + const jbyteArray j_key = JniUtil::copyBytes(m_env, key); + if (j_key == nullptr) { + // exception thrown + if (m_env->ExceptionCheck()) { + m_env->ExceptionDescribe(); + } return nullptr; } - m_env->SetByteArrayRegion( - ja, 0, static_cast(s.size()), - const_cast(reinterpret_cast(s.data()))); - if(m_env->ExceptionCheck()) { - if(ja != nullptr) { - m_env->DeleteLocalRef(ja); + const jbyteArray j_value = JniUtil::copyBytes(m_env, value); + if (j_value == nullptr) { + // exception thrown + if (m_env->ExceptionCheck()) { + m_env->ExceptionDescribe(); + } + if (j_key != nullptr) { + m_env->DeleteLocalRef(j_key); + } + return nullptr; + } + + kvFn(j_key, j_value); + + // check for Exception, in-particular RocksDBException + if (m_env->ExceptionCheck()) { + if (j_value != nullptr) { + m_env->DeleteLocalRef(j_value); + } + if (j_key != nullptr) { + m_env->DeleteLocalRef(j_key); + } + + // exception thrown + jthrowable exception = m_env->ExceptionOccurred(); + std::unique_ptr status = rocksdb::RocksDBExceptionJni::toCppStatus(m_env, exception); + if (status == nullptr) { + // unkown status or exception occurred extracting status + m_env->ExceptionDescribe(); + return nullptr; + + } else { + m_env->ExceptionClear(); // clear the exception, as we have extracted the status + return status; + } + } + + if (j_value != nullptr) { + m_env->DeleteLocalRef(j_value); + } + if (j_key != nullptr) { + m_env->DeleteLocalRef(j_key); + } + + // all OK + return std::unique_ptr(new rocksdb::Status(rocksdb::Status::OK())); +} + +std::unique_ptr WriteBatchHandlerJniCallback::k_op(const Slice& key, std::function kFn) { + const jbyteArray j_key = JniUtil::copyBytes(m_env, key); + if (j_key == nullptr) { + // exception thrown + if (m_env->ExceptionCheck()) { + m_env->ExceptionDescribe(); } - // exception thrown: ArrayIndexOutOfBoundsException return nullptr; } - return ja; + kFn(j_key); + + // check for Exception, in-particular RocksDBException + if (m_env->ExceptionCheck()) { + if (j_key != nullptr) { + m_env->DeleteLocalRef(j_key); + } + + // exception thrown + jthrowable exception = m_env->ExceptionOccurred(); + std::unique_ptr status = 
rocksdb::RocksDBExceptionJni::toCppStatus(m_env, exception);
+    if (status == nullptr) {
+      // unknown status or exception occurred extracting status
+      m_env->ExceptionDescribe();
+      return nullptr;
+
+    } else {
+      m_env->ExceptionClear();  // clear the exception, as we have extracted the status
+      return status;
+    }
+  }
+
+  if (j_key != nullptr) {
+    m_env->DeleteLocalRef(j_key);
+  }
+
+  // all OK
+  return std::unique_ptr<rocksdb::Status>(new rocksdb::Status(rocksdb::Status::OK()));
 }
 }  // namespace rocksdb
diff --git a/java/rocksjni/writebatchhandlerjnicallback.h b/java/rocksjni/writebatchhandlerjnicallback.h
index 9132027dd6c..311a268db22 100644
--- a/java/rocksjni/writebatchhandlerjnicallback.h
+++ b/java/rocksjni/writebatchhandlerjnicallback.h
@@ -9,7 +9,9 @@
 #ifndef JAVA_ROCKSJNI_WRITEBATCHHANDLERJNICALLBACK_H_
 #define JAVA_ROCKSJNI_WRITEBATCHHANDLERJNICALLBACK_H_
 
+#include <functional>
 #include <jni.h>
+#include <memory>
 
 #include "rocksjni/jnicallback.h"
 #include "rocksdb/write_batch.h"
@@ -25,22 +27,57 @@ class WriteBatchHandlerJniCallback : public JniCallback, public WriteBatch::Hand
  public:
   WriteBatchHandlerJniCallback(
       JNIEnv* env, jobject jWriteBackHandler);
+  Status PutCF(uint32_t column_family_id, const Slice& key,
+      const Slice& value);
   void Put(const Slice& key, const Slice& value);
+  Status MergeCF(uint32_t column_family_id, const Slice& key,
+      const Slice& value);
   void Merge(const Slice& key, const Slice& value);
+  Status DeleteCF(uint32_t column_family_id, const Slice& key);
   void Delete(const Slice& key);
+  Status SingleDeleteCF(uint32_t column_family_id, const Slice& key);
+  void SingleDelete(const Slice& key);
+  Status DeleteRangeCF(uint32_t column_family_id, const Slice& beginKey,
+      const Slice& endKey);
   void DeleteRange(const Slice& beginKey, const Slice& endKey);
   void LogData(const Slice& blob);
+  Status PutBlobIndexCF(uint32_t column_family_id, const Slice& key,
+      const Slice& value);
+  Status MarkBeginPrepare();
+  Status MarkEndPrepare(const Slice& xid);
+  Status MarkNoop(bool empty_batch);
+  Status MarkRollback(const Slice& xid);
+  Status MarkCommit(const Slice& xid);
   bool Continue();
 
  private:
   JNIEnv* m_env;
-  jbyteArray sliceToJArray(const Slice& s);
+  jmethodID m_jPutCfMethodId;
   jmethodID m_jPutMethodId;
+  jmethodID m_jMergeCfMethodId;
   jmethodID m_jMergeMethodId;
+  jmethodID m_jDeleteCfMethodId;
   jmethodID m_jDeleteMethodId;
+  jmethodID m_jSingleDeleteCfMethodId;
+  jmethodID m_jSingleDeleteMethodId;
+  jmethodID m_jDeleteRangeCfMethodId;
   jmethodID m_jDeleteRangeMethodId;
   jmethodID m_jLogDataMethodId;
+  jmethodID m_jPutBlobIndexCfMethodId;
+  jmethodID m_jMarkBeginPrepareMethodId;
+  jmethodID m_jMarkEndPrepareMethodId;
+  jmethodID m_jMarkNoopMethodId;
+  jmethodID m_jMarkRollbackMethodId;
+  jmethodID m_jMarkCommitMethodId;
   jmethodID m_jContinueMethodId;
+  /**
+   * @return A pointer to a rocksdb::Status or nullptr if an unexpected
+   *     exception occurred
+   */
+  std::unique_ptr<rocksdb::Status> kv_op(const Slice& key, const Slice& value,
+      std::function<void(jbyteArray, jbyteArray)> kvFn);
+  /**
+   * @return A pointer to a rocksdb::Status or nullptr if an unexpected
+   *     exception occurred
+   */
+  std::unique_ptr<rocksdb::Status> k_op(const Slice& key,
+      std::function<void(jbyteArray)> kFn);
 };
 }  // namespace rocksdb
diff --git a/java/samples/src/main/java/OptimisticTransactionSample.java b/java/samples/src/main/java/OptimisticTransactionSample.java
new file mode 100644
index 00000000000..1633d1f2bd4
--- /dev/null
+++ b/java/samples/src/main/java/OptimisticTransactionSample.java
@@ -0,0 +1,184 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +import org.rocksdb.*; + +import static java.nio.charset.StandardCharsets.UTF_8; + +/** + * Demonstrates using Transactions on an OptimisticTransactionDB with + * varying isolation guarantees + */ +public class OptimisticTransactionSample { + private static final String dbPath = "/tmp/rocksdb_optimistic_transaction_example"; + + public static final void main(final String args[]) throws RocksDBException { + + try(final Options options = new Options() + .setCreateIfMissing(true); + final OptimisticTransactionDB txnDb = + OptimisticTransactionDB.open(options, dbPath)) { + + try (final WriteOptions writeOptions = new WriteOptions(); + final ReadOptions readOptions = new ReadOptions()) { + + //////////////////////////////////////////////////////// + // + // Simple OptimisticTransaction Example ("Read Committed") + // + //////////////////////////////////////////////////////// + readCommitted(txnDb, writeOptions, readOptions); + + + //////////////////////////////////////////////////////// + // + // "Repeatable Read" (Snapshot Isolation) Example + // -- Using a single Snapshot + // + //////////////////////////////////////////////////////// + repeatableRead(txnDb, writeOptions, readOptions); + + + //////////////////////////////////////////////////////// + // + // "Read Committed" (Monotonic Atomic Views) Example + // --Using multiple Snapshots + // + //////////////////////////////////////////////////////// + readCommitted_monotonicAtomicViews(txnDb, writeOptions, readOptions); + } + } + } + + /** + * Demonstrates "Read Committed" isolation + */ + private static void readCommitted(final OptimisticTransactionDB txnDb, + final WriteOptions writeOptions, final ReadOptions readOptions) + throws RocksDBException { + final byte key1[] = "abc".getBytes(UTF_8); + final byte value1[] = "def".getBytes(UTF_8); + + final byte key2[] = "xyz".getBytes(UTF_8); + final byte value2[] = "zzz".getBytes(UTF_8); + + // Start a transaction + try(final Transaction txn = txnDb.beginTransaction(writeOptions)) { + // Read a key in this transaction + byte[] value = txn.get(readOptions, key1); + assert(value == null); + + // Write a key in this transaction + txn.put(key1, value1); + + // Read a key OUTSIDE this transaction. Does not affect txn. + value = txnDb.get(readOptions, key1); + assert(value == null); + + // Write a key OUTSIDE of this transaction. + // Does not affect txn since this is an unrelated key. + // If we wrote key 'abc' here, the transaction would fail to commit. + txnDb.put(writeOptions, key2, value2); + + // Commit transaction + txn.commit(); + } + } + + /** + * Demonstrates "Repeatable Read" (Snapshot Isolation) isolation + */ + private static void repeatableRead(final OptimisticTransactionDB txnDb, + final WriteOptions writeOptions, final ReadOptions readOptions) + throws RocksDBException { + + final byte key1[] = "ghi".getBytes(UTF_8); + final byte value1[] = "jkl".getBytes(UTF_8); + + // Set a snapshot at start of transaction by setting setSnapshot(true) + try(final OptimisticTransactionOptions txnOptions = + new OptimisticTransactionOptions().setSetSnapshot(true); + final Transaction txn = + txnDb.beginTransaction(writeOptions, txnOptions)) { + + final Snapshot snapshot = txn.getSnapshot(); + + // Write a key OUTSIDE of transaction + txnDb.put(writeOptions, key1, value1); + + // Read a key using the snapshot. 
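+      // (getForUpdate also registers key1 in the transaction's read set, so
+      // the conflicting write above is detected when the transaction is
+      // validated at commit time -- optimistic transactions check for
+      // conflicts on commit, not on read.)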
+ readOptions.setSnapshot(snapshot); + final byte[] value = txn.getForUpdate(readOptions, key1, true); + assert(value == value1); + + try { + // Attempt to commit transaction + txn.commit(); + throw new IllegalStateException(); + } catch(final RocksDBException e) { + // Transaction could not commit since the write outside of the txn + // conflicted with the read! + assert(e.getStatus().getCode() == Status.Code.Busy); + } + + txn.rollback(); + } finally { + // Clear snapshot from read options since it is no longer valid + readOptions.setSnapshot(null); + } + } + + /** + * Demonstrates "Read Committed" (Monotonic Atomic Views) isolation + * + * In this example, we set the snapshot multiple times. This is probably + * only necessary if you have very strict isolation requirements to + * implement. + */ + private static void readCommitted_monotonicAtomicViews( + final OptimisticTransactionDB txnDb, final WriteOptions writeOptions, + final ReadOptions readOptions) throws RocksDBException { + + final byte keyX[] = "x".getBytes(UTF_8); + final byte valueX[] = "x".getBytes(UTF_8); + + final byte keyY[] = "y".getBytes(UTF_8); + final byte valueY[] = "y".getBytes(UTF_8); + + try (final OptimisticTransactionOptions txnOptions = + new OptimisticTransactionOptions().setSetSnapshot(true); + final Transaction txn = + txnDb.beginTransaction(writeOptions, txnOptions)) { + + // Do some reads and writes to key "x" + Snapshot snapshot = txnDb.getSnapshot(); + readOptions.setSnapshot(snapshot); + byte[] value = txn.get(readOptions, keyX); + txn.put(valueX, valueX); + + // Do a write outside of the transaction to key "y" + txnDb.put(writeOptions, keyY, valueY); + + // Set a new snapshot in the transaction + txn.setSnapshot(); + snapshot = txnDb.getSnapshot(); + readOptions.setSnapshot(snapshot); + + // Do some reads and writes to key "y" + // Since the snapshot was advanced, the write done outside of the + // transaction does not conflict. + value = txn.getForUpdate(readOptions, keyY, true); + txn.put(keyY, valueY); + + // Commit. Since the snapshot was advanced, the write done outside of the + // transaction does not prevent this transaction from Committing. + txn.commit(); + + } finally { + // Clear snapshot from read options since it is no longer valid + readOptions.setSnapshot(null); + } + } +} diff --git a/java/samples/src/main/java/TransactionSample.java b/java/samples/src/main/java/TransactionSample.java new file mode 100644 index 00000000000..b88a68f1233 --- /dev/null +++ b/java/samples/src/main/java/TransactionSample.java @@ -0,0 +1,183 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +import org.rocksdb.*; + +import static java.nio.charset.StandardCharsets.UTF_8; + +/** + * Demonstrates using Transactions on a TransactionDB with + * varying isolation guarantees + */ +public class TransactionSample { + private static final String dbPath = "/tmp/rocksdb_transaction_example"; + + public static final void main(final String args[]) throws RocksDBException { + + try(final Options options = new Options() + .setCreateIfMissing(true); + final TransactionDBOptions txnDbOptions = new TransactionDBOptions(); + final TransactionDB txnDb = + TransactionDB.open(options, txnDbOptions, dbPath)) { + + try (final WriteOptions writeOptions = new WriteOptions(); + final ReadOptions readOptions = new ReadOptions()) { + + //////////////////////////////////////////////////////// + // + // Simple Transaction Example ("Read Committed") + // + //////////////////////////////////////////////////////// + readCommitted(txnDb, writeOptions, readOptions); + + + //////////////////////////////////////////////////////// + // + // "Repeatable Read" (Snapshot Isolation) Example + // -- Using a single Snapshot + // + //////////////////////////////////////////////////////// + repeatableRead(txnDb, writeOptions, readOptions); + + + //////////////////////////////////////////////////////// + // + // "Read Committed" (Monotonic Atomic Views) Example + // --Using multiple Snapshots + // + //////////////////////////////////////////////////////// + readCommitted_monotonicAtomicViews(txnDb, writeOptions, readOptions); + } + } + } + + /** + * Demonstrates "Read Committed" isolation + */ + private static void readCommitted(final TransactionDB txnDb, + final WriteOptions writeOptions, final ReadOptions readOptions) + throws RocksDBException { + final byte key1[] = "abc".getBytes(UTF_8); + final byte value1[] = "def".getBytes(UTF_8); + + final byte key2[] = "xyz".getBytes(UTF_8); + final byte value2[] = "zzz".getBytes(UTF_8); + + // Start a transaction + try(final Transaction txn = txnDb.beginTransaction(writeOptions)) { + // Read a key in this transaction + byte[] value = txn.get(readOptions, key1); + assert(value == null); + + // Write a key in this transaction + txn.put(key1, value1); + + // Read a key OUTSIDE this transaction. Does not affect txn. + value = txnDb.get(readOptions, key1); + assert(value == null); + + // Write a key OUTSIDE of this transaction. + // Does not affect txn since this is an unrelated key. + // If we wrote key 'abc' here, the transaction would fail to commit. + txnDb.put(writeOptions, key2, value2); + + // Commit transaction + txn.commit(); + } + } + + /** + * Demonstrates "Repeatable Read" (Snapshot Isolation) isolation + */ + private static void repeatableRead(final TransactionDB txnDb, + final WriteOptions writeOptions, final ReadOptions readOptions) + throws RocksDBException { + + final byte key1[] = "ghi".getBytes(UTF_8); + final byte value1[] = "jkl".getBytes(UTF_8); + + // Set a snapshot at start of transaction by setting setSnapshot(true) + try(final TransactionOptions txnOptions = new TransactionOptions() + .setSetSnapshot(true); + final Transaction txn = + txnDb.beginTransaction(writeOptions, txnOptions)) { + + final Snapshot snapshot = txn.getSnapshot(); + + // Write a key OUTSIDE of transaction + txnDb.put(writeOptions, key1, value1); + + // Attempt to read a key using the snapshot. This will fail since + // the previous write outside this txn conflicts with this read. 
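+      // (Unlike the optimistic sample, a pessimistic TransactionDB detects
+      // the conflict here: getForUpdate locks the key and validates it
+      // against the snapshot, failing immediately rather than at commit.)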
+ readOptions.setSnapshot(snapshot); + + try { + final byte[] value = txn.getForUpdate(readOptions, key1, true); + throw new IllegalStateException(); + } catch(final RocksDBException e) { + assert(e.getStatus().getCode() == Status.Code.Busy); + } + + txn.rollback(); + } finally { + // Clear snapshot from read options since it is no longer valid + readOptions.setSnapshot(null); + } + } + + /** + * Demonstrates "Read Committed" (Monotonic Atomic Views) isolation + * + * In this example, we set the snapshot multiple times. This is probably + * only necessary if you have very strict isolation requirements to + * implement. + */ + private static void readCommitted_monotonicAtomicViews( + final TransactionDB txnDb, final WriteOptions writeOptions, + final ReadOptions readOptions) throws RocksDBException { + + final byte keyX[] = "x".getBytes(UTF_8); + final byte valueX[] = "x".getBytes(UTF_8); + + final byte keyY[] = "y".getBytes(UTF_8); + final byte valueY[] = "y".getBytes(UTF_8); + + try (final TransactionOptions txnOptions = new TransactionOptions() + .setSetSnapshot(true); + final Transaction txn = + txnDb.beginTransaction(writeOptions, txnOptions)) { + + // Do some reads and writes to key "x" + Snapshot snapshot = txnDb.getSnapshot(); + readOptions.setSnapshot(snapshot); + byte[] value = txn.get(readOptions, keyX); + txn.put(keyX, valueX); + + // Do a write outside of the transaction to key "y" + txnDb.put(writeOptions, keyY, valueY); + + // Set a new snapshot in the transaction + txn.setSnapshot(); + txn.setSavePoint(); + snapshot = txnDb.getSnapshot(); + readOptions.setSnapshot(snapshot); + + // Do some reads and writes to key "y" + // Since the snapshot was advanced, the write done outside of the + // transaction does not conflict. + value = txn.getForUpdate(readOptions, keyY, true); + txn.put(keyY, valueY); + + // Decide we want to revert the last write from this transaction. + txn.rollbackToSavePoint(); + + // Commit. + txn.commit(); + } finally { + // Clear snapshot from read options since it is no longer valid + readOptions.setSnapshot(null); + } + } +} diff --git a/java/src/main/java/org/rocksdb/AbstractComparator.java b/java/src/main/java/org/rocksdb/AbstractComparator.java index 00484236c0f..9310397b0cd 100644 --- a/java/src/main/java/org/rocksdb/AbstractComparator.java +++ b/java/src/main/java/org/rocksdb/AbstractComparator.java @@ -17,10 +17,23 @@ public abstract class AbstractComparator<T extends AbstractSlice<?>> extends RocksCallbackObject { + protected AbstractComparator() { + super(); + } + protected AbstractComparator(final ComparatorOptions copt) { super(copt.nativeHandle_); } + /** + * Get the type of this comparator. + * + * Used for determining the correct C++ cast in native code. + * + * @return The type of the comparator. + */ + abstract ComparatorType getComparatorType(); + /** * The name of the comparator. Used to check for comparator * mismatches (i.e., a DB created with one comparator is diff --git a/java/src/main/java/org/rocksdb/AbstractTransactionNotifier.java b/java/src/main/java/org/rocksdb/AbstractTransactionNotifier.java new file mode 100644 index 00000000000..cbb49836d1c --- /dev/null +++ b/java/src/main/java/org/rocksdb/AbstractTransactionNotifier.java @@ -0,0 +1,54 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory).
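For context on the `getComparatorType()` hook introduced above: user code still subclasses `org.rocksdb.Comparator` (or `DirectComparator`) exactly as before; the new package-private method only tells the JNI layer which C++ cast to apply. A plain user-side comparator, sketched for illustration (the class and its registered name are invented):

```java
import org.rocksdb.*;

// Bytewise comparator implemented in Java. getComparatorType() is
// supplied by the Comparator base class (JAVA_COMPARATOR), so user
// subclasses are unaffected by the new hook.
public class BytewiseJavaComparator extends Comparator {
  public BytewiseJavaComparator(final ComparatorOptions copt) {
    super(copt);
  }

  @Override
  public String name() {
    return "example.BytewiseJavaComparator";
  }

  @Override
  public int compare(final Slice a, final Slice b) {
    final byte[] x = a.data();
    final byte[] y = b.data();
    final int len = Math.min(x.length, y.length);
    for (int i = 0; i < len; i++) {
      // Compare unsigned byte values, as the C++ bytewise comparator does.
      final int diff = (x[i] & 0xff) - (y[i] & 0xff);
      if (diff != 0) {
        return diff;
      }
    }
    return x.length - y.length;
  }
}
```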
+ +package org.rocksdb; + +/** + * Provides notification to the caller of SetSnapshotOnNextOperation when + * the actual snapshot gets created. + */ +public abstract class AbstractTransactionNotifier + extends RocksCallbackObject { + + protected AbstractTransactionNotifier() { + super(); + } + + /** + * Implement this method to receive notification when a snapshot is + * requested via {@link Transaction#setSnapshotOnNextOperation()}. + * + * @param newSnapshot the snapshot that has been created. + */ + public abstract void snapshotCreated(final Snapshot newSnapshot); + + /** + * This is intentionally private as it is the callback hook + * from JNI + */ + private void snapshotCreated(final long snapshotHandle) { + snapshotCreated(new Snapshot(snapshotHandle)); + } + + @Override + protected long initializeNative(final long... nativeParameterHandles) { + return createNewTransactionNotifier(); + } + + private native long createNewTransactionNotifier(); + + /** + * Deletes underlying C++ TransactionNotifier pointer. + * + * Note that this function should be called only after all + * Transactions referencing the transaction notifier are closed. + * Otherwise undefined behavior will occur. + */ + @Override + protected void disposeInternal() { + disposeInternal(nativeHandle_); + } + protected final native void disposeInternal(final long handle); +} diff --git a/java/src/main/java/org/rocksdb/AbstractWriteBatch.java b/java/src/main/java/org/rocksdb/AbstractWriteBatch.java index b2e5571809a..9de0eb43c59 100644 --- a/java/src/main/java/org/rocksdb/AbstractWriteBatch.java +++ b/java/src/main/java/org/rocksdb/AbstractWriteBatch.java @@ -18,52 +18,80 @@ public int count() { } @Override - public void put(byte[] key, byte[] value) { + public void put(byte[] key, byte[] value) throws RocksDBException { put(nativeHandle_, key, key.length, value, value.length); } @Override public void put(ColumnFamilyHandle columnFamilyHandle, byte[] key, - byte[] value) { + byte[] value) throws RocksDBException { put(nativeHandle_, key, key.length, value, value.length, columnFamilyHandle.nativeHandle_); } @Override - public void merge(byte[] key, byte[] value) { + public void merge(byte[] key, byte[] value) throws RocksDBException { merge(nativeHandle_, key, key.length, value, value.length); } @Override public void merge(ColumnFamilyHandle columnFamilyHandle, byte[] key, - byte[] value) { + byte[] value) throws RocksDBException { merge(nativeHandle_, key, key.length, value, value.length, columnFamilyHandle.nativeHandle_); } @Override - public void remove(byte[] key) { - remove(nativeHandle_, key, key.length); + @Deprecated + public void remove(byte[] key) throws RocksDBException { + delete(nativeHandle_, key, key.length); } @Override - public void remove(ColumnFamilyHandle columnFamilyHandle, byte[] key) { - remove(nativeHandle_, key, key.length, columnFamilyHandle.nativeHandle_); + @Deprecated + public void remove(ColumnFamilyHandle columnFamilyHandle, byte[] key) + throws RocksDBException { + delete(nativeHandle_, key, key.length, columnFamilyHandle.nativeHandle_); } @Override - public void deleteRange(byte[] beginKey, byte[] endKey) { + public void delete(byte[] key) throws RocksDBException { + delete(nativeHandle_, key, key.length); + } + + @Override + public void delete(ColumnFamilyHandle columnFamilyHandle, byte[] key) + throws RocksDBException { + delete(nativeHandle_, key, key.length, columnFamilyHandle.nativeHandle_); + } + + + @Override + public void singleDelete(byte[] key) throws RocksDBException { +
singleDelete(nativeHandle_, key, key.length); + } + + @Override + public void singleDelete(ColumnFamilyHandle columnFamilyHandle, byte[] key) + throws RocksDBException { + singleDelete(nativeHandle_, key, key.length, columnFamilyHandle.nativeHandle_); + } + + @Override + public void deleteRange(byte[] beginKey, byte[] endKey) + throws RocksDBException { deleteRange(nativeHandle_, beginKey, beginKey.length, endKey, endKey.length); } @Override - public void deleteRange(ColumnFamilyHandle columnFamilyHandle, byte[] beginKey, byte[] endKey) { + public void deleteRange(ColumnFamilyHandle columnFamilyHandle, + byte[] beginKey, byte[] endKey) throws RocksDBException { deleteRange(nativeHandle_, beginKey, beginKey.length, endKey, endKey.length, columnFamilyHandle.nativeHandle_); } @Override - public void putLogData(byte[] blob) { + public void putLogData(byte[] blob) throws RocksDBException { putLogData(nativeHandle_, blob, blob.length); } @@ -82,38 +110,67 @@ public void rollbackToSavePoint() throws RocksDBException { rollbackToSavePoint0(nativeHandle_); } + @Override + public void popSavePoint() throws RocksDBException { + popSavePoint(nativeHandle_); + } + + @Override + public void setMaxBytes(final long maxBytes) { + setMaxBytes(nativeHandle_, maxBytes); + } + + @Override + public WriteBatch getWriteBatch() { + return getWriteBatch(nativeHandle_); + } + abstract int count0(final long handle); abstract void put(final long handle, final byte[] key, final int keyLen, - final byte[] value, final int valueLen); + final byte[] value, final int valueLen) throws RocksDBException; abstract void put(final long handle, final byte[] key, final int keyLen, - final byte[] value, final int valueLen, final long cfHandle); + final byte[] value, final int valueLen, final long cfHandle) + throws RocksDBException; abstract void merge(final long handle, final byte[] key, final int keyLen, - final byte[] value, final int valueLen); + final byte[] value, final int valueLen) throws RocksDBException; abstract void merge(final long handle, final byte[] key, final int keyLen, - final byte[] value, final int valueLen, final long cfHandle); + final byte[] value, final int valueLen, final long cfHandle) + throws RocksDBException; + + abstract void delete(final long handle, final byte[] key, + final int keyLen) throws RocksDBException; + + abstract void delete(final long handle, final byte[] key, + final int keyLen, final long cfHandle) throws RocksDBException; - abstract void remove(final long handle, final byte[] key, - final int keyLen); + abstract void singleDelete(final long handle, final byte[] key, + final int keyLen) throws RocksDBException; - abstract void remove(final long handle, final byte[] key, - final int keyLen, final long cfHandle); + abstract void singleDelete(final long handle, final byte[] key, + final int keyLen, final long cfHandle) throws RocksDBException; abstract void deleteRange(final long handle, final byte[] beginKey, final int beginKeyLen, - final byte[] endKey, final int endKeyLen); + final byte[] endKey, final int endKeyLen) throws RocksDBException; abstract void deleteRange(final long handle, final byte[] beginKey, final int beginKeyLen, - final byte[] endKey, final int endKeyLen, final long cfHandle); + final byte[] endKey, final int endKeyLen, final long cfHandle) throws RocksDBException; abstract void putLogData(final long handle, final byte[] blob, - final int blobLen); + final int blobLen) throws RocksDBException; abstract void clear0(final long handle); abstract void setSavePoint0(final 
long handle); abstract void rollbackToSavePoint0(final long handle); + + abstract void popSavePoint(final long handle) throws RocksDBException; + + abstract void setMaxBytes(final long handle, long maxBytes); + + abstract WriteBatch getWriteBatch(final long handle); } diff --git a/java/src/main/java/org/rocksdb/AdvancedColumnFamilyOptionsInterface.java b/java/src/main/java/org/rocksdb/AdvancedColumnFamilyOptionsInterface.java index d3908d1a379..ac8550f3ef7 100644 --- a/java/src/main/java/org/rocksdb/AdvancedColumnFamilyOptionsInterface.java +++ b/java/src/main/java/org/rocksdb/AdvancedColumnFamilyOptionsInterface.java @@ -441,7 +441,7 @@ T setOptimizeFiltersForHits( boolean optimizeFiltersForHits(); /** - * In debug mode, RocksDB run consistency checks on the LSM everytime the LSM + * In debug mode, RocksDB runs consistency checks on the LSM every time the LSM * changes (Flush, Compaction, AddFile). These checks are disabled in release * mode; use this option to enable them in release mode as well. * @@ -455,7 +455,7 @@ T setForceConsistencyChecks( boolean forceConsistencyChecks); /** - * In debug mode, RocksDB run consistency checks on the LSM everytime the LSM + * In debug mode, RocksDB runs consistency checks on the LSM every time the LSM * changes (Flush, Compaction, AddFile). These checks are disabled in release * mode. * diff --git a/java/src/main/java/org/rocksdb/BlockBasedTableConfig.java b/java/src/main/java/org/rocksdb/BlockBasedTableConfig.java index 2d847de29d3..2dbbc64d358 100644 --- a/java/src/main/java/org/rocksdb/BlockBasedTableConfig.java +++ b/java/src/main/java/org/rocksdb/BlockBasedTableConfig.java @@ -15,6 +15,7 @@ public BlockBasedTableConfig() { noBlockCache_ = false; blockCacheSize_ = 8 * 1024 * 1024; blockCacheNumShardBits_ = 0; + blockCache_ = null; blockSize_ = 4 * 1024; blockSizeDeviation_ = 10; blockRestartInterval_ = 16; @@ -71,6 +72,24 @@ public long blockCacheSize() { return blockCacheSize_; } + /** + * Use the specified cache for blocks. + * When not null this takes precedence even if the user sets a block cache size. + * + * The {@link org.rocksdb.Cache} should not be disposed before options instances + * using this cache are disposed. + * + * A {@link org.rocksdb.Cache} instance can be re-used in multiple options + * instances. + * + * @param cache {@link org.rocksdb.Cache} Cache Java instance (e.g. LRUCache). + * @return the reference to the current config. + */ + public BlockBasedTableConfig setBlockCache(final Cache cache) { + blockCache_ = cache; + return this; + } + /** + * Controls the number of shards for the block cache. + * This is applied only if cacheSize is set to non-negative.
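The new `setBlockCache(Cache)` above is what enables the HISTORY.md entry "Add `BlockBasedTableConfig.setBlockCache` to allow sharing a block cache across DB instances." A hedged usage sketch (paths and capacity are illustrative); the single `LRUCache` must outlive every options/DB instance that references it:

```java
import org.rocksdb.*;

public class SharedBlockCacheSample {
  public static void main(final String[] args) throws RocksDBException {
    // One cache shared by two databases instead of two private caches.
    try (final Cache cache = new LRUCache(64 * 1024 * 1024)) {
      final BlockBasedTableConfig tableConfig = new BlockBasedTableConfig()
          .setBlockCache(cache); // takes precedence over blockCacheSize
      try (final Options options = new Options()
               .setCreateIfMissing(true)
               .setTableFormatConfig(tableConfig);
           final RocksDB db1 = RocksDB.open(options, "/tmp/shared_cache_db1");
           final RocksDB db2 = RocksDB.open(options, "/tmp/shared_cache_db2")) {
        // Uncompressed data blocks read by either DB now land in `cache`.
      }
    }
  }
}
```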
@@ -413,25 +432,25 @@ public int formatVersion() { filterHandle = filter_.nativeHandle_; } - return newTableFactoryHandle(noBlockCache_, blockCacheSize_, - blockCacheNumShardBits_, blockSize_, blockSizeDeviation_, - blockRestartInterval_, wholeKeyFiltering_, - filterHandle, cacheIndexAndFilterBlocks_, - pinL0FilterAndIndexBlocksInCache_, - hashIndexAllowCollision_, blockCacheCompressedSize_, - blockCacheCompressedNumShardBits_, - checksumType_.getValue(), indexType_.getValue(), + long blockCacheHandle = 0; + if (blockCache_ != null) { + blockCacheHandle = blockCache_.nativeHandle_; + } + + return newTableFactoryHandle(noBlockCache_, blockCacheSize_, blockCacheNumShardBits_, + blockCacheHandle, blockSize_, blockSizeDeviation_, blockRestartInterval_, + wholeKeyFiltering_, filterHandle, cacheIndexAndFilterBlocks_, + pinL0FilterAndIndexBlocksInCache_, hashIndexAllowCollision_, blockCacheCompressedSize_, + blockCacheCompressedNumShardBits_, checksumType_.getValue(), indexType_.getValue(), formatVersion_); } - private native long newTableFactoryHandle( - boolean noBlockCache, long blockCacheSize, int blockCacheNumShardBits, - long blockSize, int blockSizeDeviation, int blockRestartInterval, - boolean wholeKeyFiltering, long filterPolicyHandle, + private native long newTableFactoryHandle(boolean noBlockCache, long blockCacheSize, + int blockCacheNumShardBits, long blockCacheHandle, long blockSize, int blockSizeDeviation, + int blockRestartInterval, boolean wholeKeyFiltering, long filterPolicyHandle, boolean cacheIndexAndFilterBlocks, boolean pinL0FilterAndIndexBlocksInCache, boolean hashIndexAllowCollision, long blockCacheCompressedSize, - int blockCacheCompressedNumShardBits, byte checkSumType, - byte indexType, int formatVersion); + int blockCacheCompressedNumShardBits, byte checkSumType, byte indexType, int formatVersion); private boolean cacheIndexAndFilterBlocks_; private boolean pinL0FilterAndIndexBlocksInCache_; @@ -442,6 +461,7 @@ private native long newTableFactoryHandle( private long blockSize_; private long blockCacheSize_; private int blockCacheNumShardBits_; + private Cache blockCache_; private long blockCacheCompressedSize_; private int blockCacheCompressedNumShardBits_; private int blockSizeDeviation_; diff --git a/java/src/main/java/org/rocksdb/ColumnFamilyDescriptor.java b/java/src/main/java/org/rocksdb/ColumnFamilyDescriptor.java index d932fd9a927..8bb570e5d30 100644 --- a/java/src/main/java/org/rocksdb/ColumnFamilyDescriptor.java +++ b/java/src/main/java/org/rocksdb/ColumnFamilyDescriptor.java @@ -5,6 +5,8 @@ package org.rocksdb; +import java.util.Arrays; + /** *
<p>Describes a column family with a * name and respective Options.</p> */
@@ -32,7 +34,7 @@ public ColumnFamilyDescriptor(final byte[] columnFamilyName) { * @since 3.10.0 */ public ColumnFamilyDescriptor(final byte[] columnFamilyName, - final ColumnFamilyOptions columnFamilyOptions) { + final ColumnFamilyOptions columnFamilyOptions) { columnFamilyName_ = columnFamilyName; columnFamilyOptions_ = columnFamilyOptions; } @@ -43,19 +45,65 @@ public ColumnFamilyDescriptor(final byte[] columnFamilyName, * @return column family name. * @since 3.10.0 */ - public byte[] columnFamilyName() { + public byte[] getName() { return columnFamilyName_; } + /** + * Retrieve name of column family. + * + * @return column family name. + * @since 3.10.0 + * + * @deprecated Use {@link #getName()} instead. + */ + @Deprecated + public byte[] columnFamilyName() { + return getName(); + } + /** * Retrieve assigned options instance. * * @return Options instance assigned to this instance. */ - public ColumnFamilyOptions columnFamilyOptions() { + public ColumnFamilyOptions getOptions() { return columnFamilyOptions_; } + /** + * Retrieve assigned options instance. + * + * @return Options instance assigned to this instance. + * + * @deprecated Use {@link #getOptions()} instead. + */ + @Deprecated + public ColumnFamilyOptions columnFamilyOptions() { + return getOptions(); + } + + @Override + public boolean equals(final Object o) { + if (this == o) { + return true; + } + if (o == null || getClass() != o.getClass()) { + return false; + } + + final ColumnFamilyDescriptor that = (ColumnFamilyDescriptor) o; + return Arrays.equals(columnFamilyName_, that.columnFamilyName_) + && columnFamilyOptions_.nativeHandle_ == that.columnFamilyOptions_.nativeHandle_; + } + + @Override + public int hashCode() { + int result = (int) (columnFamilyOptions_.nativeHandle_ ^ (columnFamilyOptions_.nativeHandle_ >>> 32)); + result = 31 * result + Arrays.hashCode(columnFamilyName_); + return result; + } + private final byte[] columnFamilyName_; private final ColumnFamilyOptions columnFamilyOptions_; } diff --git a/java/src/main/java/org/rocksdb/ColumnFamilyHandle.java b/java/src/main/java/org/rocksdb/ColumnFamilyHandle.java index 7726cc62d79..16b9c609b94 100644 --- a/java/src/main/java/org/rocksdb/ColumnFamilyHandle.java +++ b/java/src/main/java/org/rocksdb/ColumnFamilyHandle.java @@ -5,6 +5,9 @@ package org.rocksdb; +import java.util.Arrays; +import java.util.Objects; + /** * ColumnFamilyHandle class to hold handles to underlying rocksdb * ColumnFamily Pointers. @@ -21,6 +24,63 @@ public class ColumnFamilyHandle extends RocksObject { this.rocksDB_ = rocksDB; } + /** + * Gets the name of the Column Family. + * + * @return The name of the Column Family. + */ + public byte[] getName() { + return getName(nativeHandle_); + } + + /** + * Gets the ID of the Column Family. + * + * @return the ID of the Column Family. + */ + public int getID() { + return getID(nativeHandle_); + } + + /** + * Gets the up-to-date descriptor of the column family + * associated with this handle. Since it fills "*desc" with the up-to-date + * information, this call might internally lock and release DB mutex to + * access the up-to-date CF options. In addition, all the pointer-typed + * options cannot be referenced any longer than the original options exist. + * + * Note that this function is not supported in RocksDBLite. + * + * @return the up-to-date descriptor. + * + * @throws RocksDBException if an error occurs whilst retrieving the + * descriptor. 
+ */ + public ColumnFamilyDescriptor getDescriptor() throws RocksDBException { + assert(isOwningHandle()); + return getDescriptor(nativeHandle_); + } + + @Override + public boolean equals(final Object o) { + if (this == o) { + return true; + } + if (o == null || getClass() != o.getClass()) { + return false; + } + + final ColumnFamilyHandle that = (ColumnFamilyHandle) o; + return rocksDB_.nativeHandle_ == that.rocksDB_.nativeHandle_ && + getID() == that.getID() && + Arrays.equals(getName(), that.getName()); + } + + @Override + public int hashCode() { + // Hash the name via Arrays.hashCode so that hashCode stays consistent + // with the Arrays.equals comparison performed in equals(Object). + int result = Objects.hash(getID(), rocksDB_.nativeHandle_); + result = 31 * result + Arrays.hashCode(getName()); + return result; + } + /** *
<p>Deletes underlying C++ column family handle pointer.</p>
* @@ -36,6 +96,9 @@ protected void disposeInternal() { } } + private native byte[] getName(final long handle); + private native int getID(final long handle); + private native ColumnFamilyDescriptor getDescriptor(final long handle) throws RocksDBException; @Override protected final native void disposeInternal(final long handle); private final RocksDB rocksDB_; diff --git a/java/src/main/java/org/rocksdb/ColumnFamilyOptions.java b/java/src/main/java/org/rocksdb/ColumnFamilyOptions.java index b3890ed815a..3cdf9569b2a 100644 --- a/java/src/main/java/org/rocksdb/ColumnFamilyOptions.java +++ b/java/src/main/java/org/rocksdb/ColumnFamilyOptions.java @@ -53,6 +53,18 @@ public ColumnFamilyOptions(ColumnFamilyOptions other) { this.compressionOptions_ = other.compressionOptions_; } + /** + *
<p>Constructor to be used by + * {@link #getColumnFamilyOptionsFromProps(java.util.Properties)}, + * {@link ColumnFamilyDescriptor#columnFamilyOptions()} + * and also called via JNI.</p>
+ * + * @param handle native handle to ColumnFamilyOptions instance. + */ + ColumnFamilyOptions(final long handle) { + super(handle); + } + /** *
<p>Method to get an options instance by using pre-configured * property values. If one or many values are undefined in @@ -151,7 +163,7 @@ public ColumnFamilyOptions setComparator( final AbstractComparator<? extends AbstractSlice<?>> comparator) { assert (isOwningHandle()); setComparatorHandle(nativeHandle_, comparator.nativeHandle_, - comparator instanceof DirectComparator); + comparator.getComparatorType().getValue()); comparator_ = comparator; return this; } @@ -788,17 +800,6 @@ public boolean forceConsistencyChecks() { return forceConsistencyChecks(nativeHandle_); } - /** -
<p>Constructor to be used by - * {@link #getColumnFamilyOptionsFromProps(java.util.Properties)}</p>
- * and also called via JNI. - * - * @param handle native handle to ColumnFamilyOptions instance. - */ - public ColumnFamilyOptions(final long handle) { - super(handle); - } - private static native long getColumnFamilyOptionsFromProps( String optString); @@ -815,7 +816,7 @@ private native void optimizeUniversalStyleCompaction(long handle, long memtableMemoryBudget); private native void setComparatorHandle(long handle, int builtinComparator); private native void setComparatorHandle(long optHandle, - long comparatorHandle, boolean isDirect); + long comparatorHandle, byte comparatorType); private native void setMergeOperatorName(long handle, String name); private native void setMergeOperator(long handle, long mergeOperatorHandle); private native void setCompactionFilterHandle(long handle, diff --git a/java/src/main/java/org/rocksdb/Comparator.java b/java/src/main/java/org/rocksdb/Comparator.java index ec5f4652d4b..4d06073f26a 100644 --- a/java/src/main/java/org/rocksdb/Comparator.java +++ b/java/src/main/java/org/rocksdb/Comparator.java @@ -25,5 +25,10 @@ protected long initializeNative(final long... nativeParameterHandles) { return createNewComparator0(nativeParameterHandles[0]); } + @Override + final ComparatorType getComparatorType() { + return ComparatorType.JAVA_COMPARATOR; + } + private native long createNewComparator0(final long comparatorOptionsHandle); } diff --git a/java/src/main/java/org/rocksdb/ComparatorType.java b/java/src/main/java/org/rocksdb/ComparatorType.java new file mode 100644 index 00000000000..df8b4759078 --- /dev/null +++ b/java/src/main/java/org/rocksdb/ComparatorType.java @@ -0,0 +1,49 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +package org.rocksdb; + +enum ComparatorType { + JAVA_COMPARATOR((byte)0x0), + JAVA_DIRECT_COMPARATOR((byte)0x1), + JAVA_NATIVE_COMPARATOR_WRAPPER((byte)0x2); + + private final byte value; + + ComparatorType(final byte value) { + this.value = value; + } + + /** + *
<p>Returns the byte value of the enumerations value.</p>
+ * + * @return byte representation + */ + byte getValue() { + return value; + } + + /** + *
<p>Get the ComparatorType enumeration value by + * passing the byte identifier to this method.</p>
+ * + * @param byteIdentifier of ComparatorType. + * + * @return ComparatorType instance. + * + * @throws IllegalArgumentException if the comparator type for the byteIdentifier + * cannot be found + */ + static ComparatorType getComparatorType(final byte byteIdentifier) { + for (final ComparatorType comparatorType : ComparatorType.values()) { + if (comparatorType.getValue() == byteIdentifier) { + return comparatorType; + } + } + + throw new IllegalArgumentException( + "Illegal value provided for ComparatorType."); + } +} diff --git a/java/src/main/java/org/rocksdb/DBOptionsInterface.java b/java/src/main/java/org/rocksdb/DBOptionsInterface.java index f426701903e..57213119939 100644 --- a/java/src/main/java/org/rocksdb/DBOptionsInterface.java +++ b/java/src/main/java/org/rocksdb/DBOptionsInterface.java @@ -269,7 +269,10 @@ public interface DBOptionsInterface { * Statistics objects should not be shared between DB instances as * it does not use any locks to prevent concurrent updates.
</p> * + * @param statistics The statistics to set + * + * @return the instance of the current object. + * + * @see RocksDB#open(org.rocksdb.Options, String) */ T setStatistics(final Statistics statistics); @@ -277,7 +280,9 @@ public interface DBOptionsInterface { /** * <p>Returns statistics object.</p>
* - * @return the instance of the statistics object or null if there is no statistics object. + * @return the instance of the statistics object or null if there is no + * statistics object. + * * @see #setStatistics(Statistics) */ Statistics statistics(); diff --git a/java/src/main/java/org/rocksdb/DirectComparator.java b/java/src/main/java/org/rocksdb/DirectComparator.java index 347eb26441d..e33004f5d80 100644 --- a/java/src/main/java/org/rocksdb/DirectComparator.java +++ b/java/src/main/java/org/rocksdb/DirectComparator.java @@ -25,6 +25,11 @@ protected long initializeNative(final long... nativeParameterHandles) { return createNewDirectComparator0(nativeParameterHandles[0]); } + @Override + final ComparatorType getComparatorType() { + return ComparatorType.JAVA_DIRECT_COMPARATOR; + } + private native long createNewDirectComparator0( final long comparatorOptionsHandle); } diff --git a/java/src/main/java/org/rocksdb/NativeComparatorWrapper.java b/java/src/main/java/org/rocksdb/NativeComparatorWrapper.java new file mode 100644 index 00000000000..28a427aaa75 --- /dev/null +++ b/java/src/main/java/org/rocksdb/NativeComparatorWrapper.java @@ -0,0 +1,57 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +package org.rocksdb; + +/** + * A simple abstraction to allow a Java class to wrap a custom comparator + * implemented in C++. + * + * The native comparator must directly extend rocksdb::Comparator. + */ +public abstract class NativeComparatorWrapper + extends AbstractComparator { + + @Override + final ComparatorType getComparatorType() { + return ComparatorType.JAVA_NATIVE_COMPARATOR_WRAPPER; + } + + @Override + public final String name() { + throw new IllegalStateException("This should not be called. " + + "Implementation is in Native code"); + } + + @Override + public final int compare(final Slice s1, final Slice s2) { + throw new IllegalStateException("This should not be called. " + + "Implementation is in Native code"); + } + + @Override + public final String findShortestSeparator(final String start, final Slice limit) { + throw new IllegalStateException("This should not be called. " + + "Implementation is in Native code"); + } + + @Override + public final String findShortSuccessor(final String key) { + throw new IllegalStateException("This should not be called. " + + "Implementation is in Native code"); + } + + /** + * We override {@link RocksCallbackObject#disposeInternal()} + * as disposing of a native rocksd::Comparator extension requires + * a slightly different approach as it is not really a RocksCallbackObject + */ + @Override + protected void disposeInternal() { + disposeInternal(nativeHandle_); + } + + private native void disposeInternal(final long handle); +} diff --git a/java/src/main/java/org/rocksdb/OptimisticTransactionDB.java b/java/src/main/java/org/rocksdb/OptimisticTransactionDB.java new file mode 100644 index 00000000000..1610dc73901 --- /dev/null +++ b/java/src/main/java/org/rocksdb/OptimisticTransactionDB.java @@ -0,0 +1,175 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
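A concrete `NativeComparatorWrapper` subclass only has to produce the handle of a C++ comparator that directly extends `rocksdb::Comparator`; every callback then stays on the native side. A sketch under that assumption; the subclass and its native factory method are hypothetical, and the matching JNI/C++ half is omitted:

```java
import org.rocksdb.*;

// Hypothetical binding to a C++ reverse-bytewise comparator.
// No Java callbacks: name()/compare() run entirely in native code.
public class ReverseBytewiseNativeComparator extends NativeComparatorWrapper {
  @Override
  protected long initializeNative(final long... nativeParameterHandles) {
    // Would return a pointer to a C++ object extending rocksdb::Comparator.
    return newReverseBytewiseComparator(); // hypothetical native factory
  }

  private native long newReverseBytewiseComparator();
}
```

Compared with a Java `Comparator`, this avoids a JNI round-trip per key comparison, which is the point of adding `JAVA_NATIVE_COMPARATOR_WRAPPER` as a distinct `ComparatorType`.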
+ +package org.rocksdb; + +import java.util.List; + +/** + * Database with Transaction support. + */ +public class OptimisticTransactionDB extends RocksDB + implements TransactionalDB { + + /** + * Private constructor. + * + * @param nativeHandle The native handle of the C++ OptimisticTransactionDB + * object + */ + private OptimisticTransactionDB(final long nativeHandle) { + super(nativeHandle); + } + + /** + * Open an OptimisticTransactionDB similar to + * {@link RocksDB#open(Options, String)}. + * + * @param options {@link org.rocksdb.Options} instance. + * @param path the path to the rocksdb. + * + * @return a {@link OptimisticTransactionDB} instance on success, null if the + * specified {@link OptimisticTransactionDB} can not be opened. + * + * @throws RocksDBException if an error occurs whilst opening the database. + */ + public static OptimisticTransactionDB open(final Options options, + final String path) throws RocksDBException { + final OptimisticTransactionDB otdb = new OptimisticTransactionDB(open( + options.nativeHandle_, path)); + + // when non-default Options is used, keeping an Options reference + // in RocksDB can prevent Java to GC during the life-time of + // the currently-created RocksDB. + otdb.storeOptionsInstance(options); + + return otdb; + } + + /** + * Open an OptimisticTransactionDB similar to + * {@link RocksDB#open(DBOptions, String, List, List)}. + * + * @param dbOptions {@link org.rocksdb.DBOptions} instance. + * @param path the path to the rocksdb. + * @param columnFamilyDescriptors list of column family descriptors + * @param columnFamilyHandles will be filled with ColumnFamilyHandle instances + * + * @return a {@link OptimisticTransactionDB} instance on success, null if the + * specified {@link OptimisticTransactionDB} can not be opened. + * + * @throws RocksDBException if an error occurs whilst opening the database. + */ + public static OptimisticTransactionDB open(final DBOptions dbOptions, + final String path, + final List columnFamilyDescriptors, + final List columnFamilyHandles) + throws RocksDBException { + + final byte[][] cfNames = new byte[columnFamilyDescriptors.size()][]; + final long[] cfOptionHandles = new long[columnFamilyDescriptors.size()]; + for (int i = 0; i < columnFamilyDescriptors.size(); i++) { + final ColumnFamilyDescriptor cfDescriptor = columnFamilyDescriptors + .get(i); + cfNames[i] = cfDescriptor.columnFamilyName(); + cfOptionHandles[i] = cfDescriptor.columnFamilyOptions().nativeHandle_; + } + + final long[] handles = open(dbOptions.nativeHandle_, path, cfNames, + cfOptionHandles); + final OptimisticTransactionDB otdb = + new OptimisticTransactionDB(handles[0]); + + // when non-default Options is used, keeping an Options reference + // in RocksDB can prevent Java to GC during the life-time of + // the currently-created RocksDB. 
+ otdb.storeOptionsInstance(dbOptions); + + for (int i = 1; i < handles.length; i++) { + columnFamilyHandles.add(new ColumnFamilyHandle(otdb, handles[i])); + } + + return otdb; + } + + @Override + public Transaction beginTransaction(final WriteOptions writeOptions) { + return new Transaction(this, beginTransaction(nativeHandle_, + writeOptions.nativeHandle_)); + } + + @Override + public Transaction beginTransaction(final WriteOptions writeOptions, + final OptimisticTransactionOptions optimisticTransactionOptions) { + return new Transaction(this, beginTransaction(nativeHandle_, + writeOptions.nativeHandle_, + optimisticTransactionOptions.nativeHandle_)); + } + + // TODO(AR) consider having beginTransaction(... oldTransaction) set a + // reference count inside Transaction, so that we can always call + // Transaction#close but the object is only disposed when there are as many + // closes as beginTransaction. Makes the try-with-resources paradigm easier for + // Java developers + + @Override + public Transaction beginTransaction(final WriteOptions writeOptions, + final Transaction oldTransaction) { + final long jtxn_handle = beginTransaction_withOld(nativeHandle_, + writeOptions.nativeHandle_, oldTransaction.nativeHandle_); + + // RocksJava relies on the assumption that + // we do not allocate a new Transaction object + // when providing an old_txn + assert(jtxn_handle == oldTransaction.nativeHandle_); + + return oldTransaction; + } + + @Override + public Transaction beginTransaction(final WriteOptions writeOptions, + final OptimisticTransactionOptions optimisticTransactionOptions, + final Transaction oldTransaction) { + final long jtxn_handle = beginTransaction_withOld(nativeHandle_, + writeOptions.nativeHandle_, optimisticTransactionOptions.nativeHandle_, + oldTransaction.nativeHandle_); + + // RocksJava relies on the assumption that + // we do not allocate a new Transaction object + // when providing an old_txn + assert(jtxn_handle == oldTransaction.nativeHandle_); + + return oldTransaction; + } + + /** + * Get the underlying database that was opened. + * + * @return The underlying database that was opened.
+ */ + public RocksDB getBaseDB() { + final RocksDB db = new RocksDB(getBaseDB(nativeHandle_)); + db.disOwnNativeHandle(); + return db; + } + + protected static native long open(final long optionsHandle, + final String path) throws RocksDBException; + protected static native long[] open(final long handle, final String path, + final byte[][] columnFamilyNames, final long[] columnFamilyOptions); + private native long beginTransaction(final long handle, + final long writeOptionsHandle); + private native long beginTransaction(final long handle, + final long writeOptionsHandle, + final long optimisticTransactionOptionsHandle); + private native long beginTransaction_withOld(final long handle, + final long writeOptionsHandle, final long oldTransactionHandle); + private native long beginTransaction_withOld(final long handle, + final long writeOptionsHandle, + final long optimisticTransactionOptionsHandle, + final long oldTransactionHandle); + private native long getBaseDB(final long handle); + @Override protected final native void disposeInternal(final long handle); +} diff --git a/java/src/main/java/org/rocksdb/OptimisticTransactionOptions.java b/java/src/main/java/org/rocksdb/OptimisticTransactionOptions.java new file mode 100644 index 00000000000..650ee22550a --- /dev/null +++ b/java/src/main/java/org/rocksdb/OptimisticTransactionOptions.java @@ -0,0 +1,53 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +package org.rocksdb; + +public class OptimisticTransactionOptions extends RocksObject + implements TransactionalOptions { + + public OptimisticTransactionOptions() { + super(newOptimisticTransactionOptions()); + } + + @Override + public boolean isSetSnapshot() { + assert(isOwningHandle()); + return isSetSnapshot(nativeHandle_); + } + + @Override + public OptimisticTransactionOptions setSetSnapshot( + final boolean setSnapshot) { + assert(isOwningHandle()); + setSetSnapshot(nativeHandle_, setSnapshot); + return this; + } + + /** + * Should be set if the DB has a non-default comparator. + * See comment in + * {@link WriteBatchWithIndex#WriteBatchWithIndex(AbstractComparator, int, boolean)} + * constructor. + * + * @param comparator The comparator to use for the transaction. 
+ * + * @return this OptimisticTransactionOptions instance + */ + public OptimisticTransactionOptions setComparator( + final AbstractComparator> comparator) { + assert(isOwningHandle()); + setComparator(nativeHandle_, comparator.nativeHandle_); + return this; + } + + private native static long newOptimisticTransactionOptions(); + private native boolean isSetSnapshot(final long handle); + private native void setSetSnapshot(final long handle, + final boolean setSnapshot); + private native void setComparator(final long handle, + final long comparatorHandle); + @Override protected final native void disposeInternal(final long handle); +} diff --git a/java/src/main/java/org/rocksdb/Options.java b/java/src/main/java/org/rocksdb/Options.java index 4393caa75eb..22179a1c757 100644 --- a/java/src/main/java/org/rocksdb/Options.java +++ b/java/src/main/java/org/rocksdb/Options.java @@ -191,7 +191,7 @@ public Options setComparator( final AbstractComparator> comparator) { assert(isOwningHandle()); setComparatorHandle(nativeHandle_, comparator.nativeHandle_, - comparator instanceof DirectComparator); + comparator.getComparatorType().getValue()); comparator_ = comparator; return this; } @@ -1756,7 +1756,7 @@ private native void optimizeUniversalStyleCompaction(long handle, long memtableMemoryBudget); private native void setComparatorHandle(long handle, int builtinComparator); private native void setComparatorHandle(long optHandle, - long comparatorHandle, boolean isDirect); + long comparatorHandle, byte comparatorType); private native void setMergeOperatorName( long handle, String name); private native void setMergeOperator( diff --git a/java/src/main/java/org/rocksdb/RocksDB.java b/java/src/main/java/org/rocksdb/RocksDB.java index 592c7f9ad25..3b398631d78 100644 --- a/java/src/main/java/org/rocksdb/RocksDB.java +++ b/java/src/main/java/org/rocksdb/RocksDB.java @@ -435,7 +435,7 @@ public static List listColumnFamilies(final Options options, path)); } - private void storeOptionsInstance(DBOptionsInterface options) { + protected void storeOptionsInstance(DBOptionsInterface options) { options_ = options; } @@ -1683,7 +1683,7 @@ public List newIterators( * @return The handle of the default column family */ public ColumnFamilyHandle getDefaultColumnFamily() { - ColumnFamilyHandle cfHandle = new ColumnFamilyHandle(this, + final ColumnFamilyHandle cfHandle = new ColumnFamilyHandle(this, getDefaultColumnFamily(nativeHandle_)); cfHandle.disOwnNativeHandle(); return cfHandle; @@ -2359,8 +2359,9 @@ protected native long[] iterators(final long handle, final long[] columnFamilyHandles, final long readOptHandle) throws RocksDBException; protected native long getSnapshot(long nativeHandle); - protected native void releaseSnapshot(long nativeHandle, long snapshotHandle); - @Override protected final native void disposeInternal(final long handle); + protected native void releaseSnapshot( + long nativeHandle, long snapshotHandle); + @Override protected native void disposeInternal(final long handle); private native long getDefaultColumnFamily(long handle); private native long createColumnFamily(final long handle, final byte[] columnFamilyName, final long columnFamilyOptions) diff --git a/java/src/main/java/org/rocksdb/Snapshot.java b/java/src/main/java/org/rocksdb/Snapshot.java index a6b53f495f2..39cdf0c2d27 100644 --- a/java/src/main/java/org/rocksdb/Snapshot.java +++ b/java/src/main/java/org/rocksdb/Snapshot.java @@ -11,6 +11,10 @@ public class Snapshot extends RocksObject { Snapshot(final long nativeHandle) { 
super(nativeHandle); + + // The pointer to the snapshot is always released + // by the database instance. + disOwnNativeHandle(); } /** @@ -20,17 +24,17 @@ public class Snapshot extends RocksObject { * this snapshot. */ public long getSequenceNumber() { - assert(isOwningHandle()); return getSequenceNumber(nativeHandle_); } - /** - * Dont release C++ Snapshot pointer. The pointer - * to the snapshot is released by the database - * instance. - */ @Override protected final void disposeInternal(final long handle) { + /** + * Nothing to release, we never own the pointer for a + * Snapshot. The pointer + * to the snapshot is released by the database + * instance. + */ } private native long getSequenceNumber(long handle); diff --git a/java/src/main/java/org/rocksdb/SstFileWriter.java b/java/src/main/java/org/rocksdb/SstFileWriter.java index 57879f94b86..447e41ea9db 100644 --- a/java/src/main/java/org/rocksdb/SstFileWriter.java +++ b/java/src/main/java/org/rocksdb/SstFileWriter.java @@ -31,7 +31,7 @@ public SstFileWriter(final EnvOptions envOptions, final Options options, final AbstractComparator> comparator) { super(newSstFileWriter( envOptions.nativeHandle_, options.nativeHandle_, comparator.nativeHandle_, - comparator instanceof DirectComparator)); + comparator.getComparatorType().getValue())); } /** @@ -225,7 +225,7 @@ public void finish() throws RocksDBException { private native static long newSstFileWriter( final long envOptionsHandle, final long optionsHandle, - final long userComparatorHandle, final boolean isDirect); + final long userComparatorHandle, final byte comparatorType); private native static long newSstFileWriter(final long envOptionsHandle, final long optionsHandle); diff --git a/java/src/main/java/org/rocksdb/Statistics.java b/java/src/main/java/org/rocksdb/Statistics.java index 10c072c897e..0938a6d5834 100644 --- a/java/src/main/java/org/rocksdb/Statistics.java +++ b/java/src/main/java/org/rocksdb/Statistics.java @@ -117,6 +117,8 @@ public String getHistogramString(final HistogramType histogramType) { /** * Resets all ticker and histogram stats. + * + * @throws RocksDBException if an error occurs when resetting the statistics. */ public void reset() throws RocksDBException { assert(isOwningHandle()); @@ -126,6 +128,7 @@ public void reset() throws RocksDBException { /** * String representation of the statistic object. */ + @Override public String toString() { assert(isOwningHandle()); return toString(nativeHandle_); diff --git a/java/src/main/java/org/rocksdb/Status.java b/java/src/main/java/org/rocksdb/Status.java index df575289f45..e633940c297 100644 --- a/java/src/main/java/org/rocksdb/Status.java +++ b/java/src/main/java/org/rocksdb/Status.java @@ -87,6 +87,15 @@ public static Code getCode(final byte value) { throw new IllegalArgumentException( "Illegal value provided for Code (" + value + ")."); } + + /** + * Returns the byte value of the enumerations value. + * + * @return byte representation + */ + public byte getValue() { + return value; + } } // should stay in sync with /include/rocksdb/status.h:SubCode and /java/rocksjni/portal.h:toJavaStatusSubCode @@ -116,5 +125,14 @@ public static SubCode getSubCode(final byte value) { throw new IllegalArgumentException( "Illegal value provided for SubCode (" + value + ")."); } + + /** + * Returns the byte value of the enumerations value. 
+ * + * @return byte representation + */ + public byte getValue() { + return value; + } } } diff --git a/java/src/main/java/org/rocksdb/StringAppendOperator.java b/java/src/main/java/org/rocksdb/StringAppendOperator.java index 85c36adc7c1..978cad6ccfe 100644 --- a/java/src/main/java/org/rocksdb/StringAppendOperator.java +++ b/java/src/main/java/org/rocksdb/StringAppendOperator.java @@ -11,9 +11,13 @@ */ public class StringAppendOperator extends MergeOperator { public StringAppendOperator() { - super(newSharedStringAppendOperator()); + this(','); } - private native static long newSharedStringAppendOperator(); + public StringAppendOperator(char delim) { + super(newSharedStringAppendOperator(delim)); + } + + private native static long newSharedStringAppendOperator(final char delim); @Override protected final native void disposeInternal(final long handle); } diff --git a/java/src/main/java/org/rocksdb/Transaction.java b/java/src/main/java/org/rocksdb/Transaction.java new file mode 100644 index 00000000000..c619bb1053f --- /dev/null +++ b/java/src/main/java/org/rocksdb/Transaction.java @@ -0,0 +1,1761 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +package org.rocksdb; + +import java.util.List; + +/** + * Provides BEGIN/COMMIT/ROLLBACK transactions. + * + * To use transactions, you must first create either an + * {@link OptimisticTransactionDB} or a {@link TransactionDB} + * + * To create a transaction, use + * {@link OptimisticTransactionDB#beginTransaction(org.rocksdb.WriteOptions)} or + * {@link TransactionDB#beginTransaction(org.rocksdb.WriteOptions)} + * + * It is up to the caller to synchronize access to this object. + * + * See samples/src/main/java/OptimisticTransactionSample.java and + * samples/src/main/java/TransactionSample.java for some simple + * examples. + */ +public class Transaction extends RocksObject { + + private final RocksDB parent; + + /** + * Intentionally package private + * as this is called from + * {@link OptimisticTransactionDB#beginTransaction(org.rocksdb.WriteOptions)} + * or {@link TransactionDB#beginTransaction(org.rocksdb.WriteOptions)} + * + * @param parent This must be either {@link TransactionDB} or + * {@link OptimisticTransactionDB} + * @param transactionHandle The native handle to the underlying C++ + * transaction object + */ + Transaction(final RocksDB parent, final long transactionHandle) { + super(transactionHandle); + this.parent = parent; + } + + /** + * If a transaction has a snapshot set, the transaction will ensure that + * any keys successfully written(or fetched via {@link #getForUpdate}) have + * not been modified outside of this transaction since the time the snapshot + * was set. + * + * If a snapshot has not been set, the transaction guarantees that keys have + * not been modified since the time each key was first written (or fetched via + * {@link #getForUpdate}). + * + * Using {@link #setSnapshot()} will provide stricter isolation guarantees + * at the expense of potentially more transaction failures due to conflicts + * with other writes. + * + * Calling {@link #setSnapshot()} has no effect on keys written before this + * function has been called. + * + * {@link #setSnapshot()} may be called multiple times if you would like to + * change the snapshot used for different operations in this transaction. 
+ * + * Calling {@link #setSnapshot()} will not affect the version of Data returned + * by get(...) methods. See {@link #get} for more details. + */ + public void setSnapshot() { + assert(isOwningHandle()); + setSnapshot(nativeHandle_); + } + + /** + * Similar to {@link #setSnapshot()}, but will not change the current snapshot + * until put/merge/delete/getForUpdate/multiGetForUpdate is called. + * By calling this function, the transaction will essentially call + * {@link #setSnapshot()} for you right before performing the next + * write/getForUpdate. + * + * Calling {@link #setSnapshotOnNextOperation()} will not affect what + * snapshot is returned by {@link #getSnapshot} until the next + * write/getForUpdate is executed. + * + * When the snapshot is created the notifier's snapshotCreated method will + * be called so that the caller can get access to the snapshot. + * + * This is an optimization to reduce the likelihood of conflicts that + * could occur in between the time {@link #setSnapshot()} is called and the + * first write/getForUpdate operation. i.e. this prevents the following + * race-condition: + * + * txn1->setSnapshot(); + * txn2->put("A", ...); + * txn2->commit(); + * txn1->getForUpdate(opts, "A", ...); * FAIL! + */ + public void setSnapshotOnNextOperation() { + assert(isOwningHandle()); + setSnapshotOnNextOperation(nativeHandle_); + } + + /** + * Similar to {@link #setSnapshot()}, but will not change the current snapshot + * until put/merge/delete/getForUpdate/multiGetForUpdate is called. + * By calling this function, the transaction will essentially call + * {@link #setSnapshot()} for you right before performing the next + * write/getForUpdate. + * + * Calling {@link #setSnapshotOnNextOperation()} will not affect what + * snapshot is returned by {@link #getSnapshot} until the next + * write/getForUpdate is executed. + * + * When the snapshot is created the + * {@link AbstractTransactionNotifier#snapshotCreated(Snapshot)} method will + * be called so that the caller can get access to the snapshot. + * + * This is an optimization to reduce the likelihood of conflicts that + * could occur in between the time {@link #setSnapshot()} is called and the + * first write/getForUpdate operation. i.e. this prevents the following + * race-condition: + * + * txn1->setSnapshot(); + * txn2->put("A", ...); + * txn2->commit(); + * txn1->getForUpdate(opts, "A", ...); * FAIL! + * + * @param transactionNotifier A handler for receiving snapshot notifications + * for the transaction + * + */ + public void setSnapshotOnNextOperation( + final AbstractTransactionNotifier transactionNotifier) { + assert(isOwningHandle()); + setSnapshotOnNextOperation(nativeHandle_, transactionNotifier.nativeHandle_); + } + + /** + * Returns the Snapshot created by the last call to {@link #setSnapshot()}. + * + * REQUIRED: The returned Snapshot is only valid up until the next time + * {@link #setSnapshot()}/{@link #setSnapshotOnNextOperation()} is called, + * {@link #clearSnapshot()} is called, or the Transaction is deleted. + * + * @return The snapshot or null if there is no snapshot + */ + public Snapshot getSnapshot() { + assert(isOwningHandle()); + final long snapshotNativeHandle = getSnapshot(nativeHandle_); + if(snapshotNativeHandle == 0) { + return null; + } else { + final Snapshot snapshot = new Snapshot(snapshotNativeHandle); + return snapshot; + } + } + + /** + * Clears the current snapshot (i.e. 
no snapshot will be 'set') + * + * This removes any snapshot that currently exists or is set to be created + * on the next update operation ({@link #setSnapshotOnNextOperation()}). + * + * Calling {@link #clearSnapshot()} has no effect on keys written before this + * function has been called. + * + * If a reference to a snapshot was retrieved via {@link #getSnapshot()}, it + * will no longer be valid and should be discarded after a call to + * {@link #clearSnapshot()}. + */ + public void clearSnapshot() { + assert(isOwningHandle()); + clearSnapshot(nativeHandle_); + } + + /** + * Prepare the current transaction for 2PC + */ + void prepare() throws RocksDBException { + //TODO(AR) consider a Java'ish version of this function, which returns an AutoCloseable (commit) + assert(isOwningHandle()); + prepare(nativeHandle_); + } + + /** + * Write all batched keys to the db atomically. + * + * Returns OK on success. + * + * May return any error status that could be returned by DB:Write(). + * + * If this transaction was created by an {@link OptimisticTransactionDB} + * Status::Busy() may be returned if the transaction could not guarantee + * that there are no write conflicts. Status::TryAgain() may be returned + * if the memtable history size is not large enough + * (See max_write_buffer_number_to_maintain). + * + * If this transaction was created by a {@link TransactionDB}, + * Status::Expired() may be returned if this transaction has lived for + * longer than {@link TransactionOptions#getExpiration()}. + * + * @throws RocksDBException if an error occurs when committing the transaction + */ + public void commit() throws RocksDBException { + assert(isOwningHandle()); + commit(nativeHandle_); + } + + /** + * Discard all batched writes in this transaction. + * + * @throws RocksDBException if an error occurs when rolling back the transaction + */ + public void rollback() throws RocksDBException { + assert(isOwningHandle()); + rollback(nativeHandle_); + } + + /** + * Records the state of the transaction for future calls to + * {@link #rollbackToSavePoint()}. + * + * May be called multiple times to set multiple save points. + * + * @throws RocksDBException if an error occurs whilst setting a save point + */ + public void setSavePoint() throws RocksDBException { + assert(isOwningHandle()); + setSavePoint(nativeHandle_); + } + + /** + * Undo all operations in this transaction (put, merge, delete, putLogData) + * since the most recent call to {@link #setSavePoint()} and removes the most + * recent {@link #setSavePoint()}. + * + * If there is no previous call to {@link #setSavePoint()}, + * returns Status::NotFound() + * + * @throws RocksDBException if an error occurs when rolling back to a save point + */ + public void rollbackToSavePoint() throws RocksDBException { + assert(isOwningHandle()); + rollbackToSavePoint(nativeHandle_); + } + + /** + * This function is similar to + * {@link RocksDB#get(ColumnFamilyHandle, ReadOptions, byte[])} except it will + * also read pending changes in this transaction. + * Currently, this function will return Status::MergeInProgress if the most + * recent write to the queried key in this batch is a Merge. + * + * If {@link ReadOptions#snapshot()} is not set, the current version of the + * key will be read. Calling {@link #setSnapshot()} does not affect the + * version of the data returned. 
+ * + * Note that setting {@link ReadOptions#setSnapshot(Snapshot)} will affect + * what is read from the DB but will NOT change which keys are read from this + * transaction (the keys in this transaction do not yet belong to any snapshot + * and will be fetched regardless). + * + * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle} instance + * @param readOptions Read options. + * @param key the key to retrieve the value for. + * + * @return a byte array storing the value associated with the input key if + * any. null if it does not find the specified key. + * + * @throws RocksDBException thrown if error happens in underlying native + * library. + */ + public byte[] get(final ColumnFamilyHandle columnFamilyHandle, + final ReadOptions readOptions, final byte[] key) throws RocksDBException { + assert(isOwningHandle()); + return get(nativeHandle_, readOptions.nativeHandle_, key, key.length, + columnFamilyHandle.nativeHandle_); + } + + /** + * This function is similar to + * {@link RocksDB#get(ReadOptions, byte[])} except it will + * also read pending changes in this transaction. + * Currently, this function will return Status::MergeInProgress if the most + * recent write to the queried key in this batch is a Merge. + * + * If {@link ReadOptions#snapshot()} is not set, the current version of the + * key will be read. Calling {@link #setSnapshot()} does not affect the + * version of the data returned. + * + * Note that setting {@link ReadOptions#setSnapshot(Snapshot)} will affect + * what is read from the DB but will NOT change which keys are read from this + * transaction (the keys in this transaction do not yet belong to any snapshot + * and will be fetched regardless). + * + * @param readOptions Read options. + * @param key the key to retrieve the value for. + * + * @return a byte array storing the value associated with the input key if + * any. null if it does not find the specified key. + * + * @throws RocksDBException thrown if error happens in underlying native + * library. + */ + public byte[] get(final ReadOptions readOptions, final byte[] key) + throws RocksDBException { + assert(isOwningHandle()); + return get(nativeHandle_, readOptions.nativeHandle_, key, key.length); + } + + /** + * This function is similar to + * {@link RocksDB#multiGet(ReadOptions, List, List)} except it will + * also read pending changes in this transaction. + * Currently, this function will return Status::MergeInProgress if the most + * recent write to the queried key in this batch is a Merge. + * + * If {@link ReadOptions#snapshot()} is not set, the current version of the + * key will be read. Calling {@link #setSnapshot()} does not affect the + * version of the data returned. + * + * Note that setting {@link ReadOptions#setSnapshot(Snapshot)} will affect + * what is read from the DB but will NOT change which keys are read from this + * transaction (the keys in this transaction do not yet belong to any snapshot + * and will be fetched regardless). + * + * @param readOptions Read options. + * @param columnFamilyHandles {@link java.util.List} containing + * {@link org.rocksdb.ColumnFamilyHandle} instances. + * @param keys of keys for which values need to be retrieved. + * + * @return Array of values, one for each key + * + * @throws RocksDBException thrown if error happens in underlying + * native library. + * @throws IllegalArgumentException thrown if the size of passed keys is not + * equal to the amount of passed column family handles. 
+   */
+  public byte[][] multiGet(final ReadOptions readOptions,
+      final List<ColumnFamilyHandle> columnFamilyHandles,
+      final byte[][] keys) throws RocksDBException {
+    assert(isOwningHandle());
+    // Check that there is one column family handle per key; otherwise the
+    // native call would cause a segmentation fault, so throw an exception
+    // instead.
+    if (keys.length != columnFamilyHandles.size()) {
+      throw new IllegalArgumentException(
+          "For each key there must be a ColumnFamilyHandle.");
+    }
+    if (keys.length == 0) {
+      return new byte[0][0];
+    }
+    final long[] cfHandles = new long[columnFamilyHandles.size()];
+    for (int i = 0; i < columnFamilyHandles.size(); i++) {
+      cfHandles[i] = columnFamilyHandles.get(i).nativeHandle_;
+    }
+
+    return multiGet(nativeHandle_, readOptions.nativeHandle_,
+        keys, cfHandles);
+  }
+
+  /**
+   * This function is similar to
+   * {@link RocksDB#multiGet(ReadOptions, List)} except it will
+   * also read pending changes in this transaction.
+   * Currently, this function will return Status::MergeInProgress if the most
+   * recent write to the queried key in this batch is a Merge.
+   *
+   * If {@link ReadOptions#snapshot()} is not set, the current version of the
+   * key will be read. Calling {@link #setSnapshot()} does not affect the
+   * version of the data returned.
+   *
+   * Note that setting {@link ReadOptions#setSnapshot(Snapshot)} will affect
+   * what is read from the DB but will NOT change which keys are read from this
+   * transaction (the keys in this transaction do not yet belong to any snapshot
+   * and will be fetched regardless).
+   *
+   * @param readOptions Read options.
+   * @param keys the keys for which values need to be retrieved.
+   *
+   * @return Array of values, one for each key
+   *
+   * @throws RocksDBException thrown if error happens in underlying
+   *     native library.
+   */
+  public byte[][] multiGet(final ReadOptions readOptions,
+      final byte[][] keys) throws RocksDBException {
+    assert(isOwningHandle());
+    if (keys.length == 0) {
+      return new byte[0][0];
+    }
+
+    return multiGet(nativeHandle_, readOptions.nativeHandle_,
+        keys);
+  }
+
+  /**
+   * Read this key and ensure that this transaction will only
+   * be able to be committed if this key is not written outside this
+   * transaction after it has first been read (or after the snapshot if a
+   * snapshot is set in this transaction). The transaction behavior is the
+   * same regardless of whether the key exists or not.
+   *
+   * Note: Currently, this function will return Status::MergeInProgress
+   * if the most recent write to the queried key in this batch is a Merge.
+   *
+   * The values returned by this function are similar to
+   * {@link RocksDB#get(ColumnFamilyHandle, ReadOptions, byte[])}.
+   * If value==nullptr, then this function will not read any data, but will
+   * still ensure that this key cannot be written to by outside of this
+   * transaction.
+   *
+   * If this transaction was created by an {@link OptimisticTransactionDB},
+   * {@link #getForUpdate(ReadOptions, ColumnFamilyHandle, byte[], boolean)}
+   * could cause {@link #commit()} to fail. Otherwise, it could return any error
+   * that could be returned by
+   * {@link RocksDB#get(ColumnFamilyHandle, ReadOptions, byte[])}.
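+   *
+   * <p>A minimal read-modify-write sketch ({@code txn}, {@code cfHandle},
+   * {@code key} and the helper {@code newValueFrom} are illustrative
+   * assumptions):
+   * <pre>{@code
+   * try (final ReadOptions readOptions = new ReadOptions()) {
+   *   // take an exclusive lock on the key before rewriting it
+   *   final byte[] oldValue = txn.getForUpdate(readOptions, cfHandle, key, true);
+   *   txn.put(cfHandle, key, newValueFrom(oldValue));
+   *   txn.commit();
+   * }
+   * }</pre>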
+ * + * If this transaction was created on a {@link TransactionDB}, an + * {@link RocksDBException} may be thrown with an accompanying {@link Status} + * when: + * {@link Status.Code#Busy} if there is a write conflict, + * {@link Status.Code#TimedOut} if a lock could not be acquired, + * {@link Status.Code#TryAgain} if the memtable history size is not large + * enough. See + * {@link ColumnFamilyOptions#maxWriteBufferNumberToMaintain()} + * {@link Status.Code#MergeInProgress} if merge operations cannot be + * resolved. + * + * @param readOptions Read options. + * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle} + * instance + * @param key the key to retrieve the value for. + * @param exclusive true if the transaction should have exclusive access to + * the key, otherwise false for shared access. + * + * @return a byte array storing the value associated with the input key if + * any. null if it does not find the specified key. + * + * @throws RocksDBException thrown if error happens in underlying + * native library. + */ + public byte[] getForUpdate(final ReadOptions readOptions, + final ColumnFamilyHandle columnFamilyHandle, final byte[] key, + final boolean exclusive) throws RocksDBException { + assert(isOwningHandle()); + return getForUpdate(nativeHandle_, readOptions.nativeHandle_, key, + key.length, columnFamilyHandle.nativeHandle_, exclusive); + } + + /** + * Read this key and ensure that this transaction will only + * be able to be committed if this key is not written outside this + * transaction after it has first been read (or after the snapshot if a + * snapshot is set in this transaction). The transaction behavior is the + * same regardless of whether the key exists or not. + * + * Note: Currently, this function will return Status::MergeInProgress + * if the most recent write to the queried key in this batch is a Merge. + * + * The values returned by this function are similar to + * {@link RocksDB#get(ReadOptions, byte[])}. + * If value==nullptr, then this function will not read any data, but will + * still ensure that this key cannot be written to by outside of this + * transaction. + * + * If this transaction was created on an {@link OptimisticTransactionDB}, + * {@link #getForUpdate(ReadOptions, ColumnFamilyHandle, byte[], boolean)} + * could cause {@link #commit()} to fail. Otherwise, it could return any error + * that could be returned by + * {@link RocksDB#get(ReadOptions, byte[])}. + * + * If this transaction was created on a {@link TransactionDB}, an + * {@link RocksDBException} may be thrown with an accompanying {@link Status} + * when: + * {@link Status.Code#Busy} if there is a write conflict, + * {@link Status.Code#TimedOut} if a lock could not be acquired, + * {@link Status.Code#TryAgain} if the memtable history size is not large + * enough. See + * {@link ColumnFamilyOptions#maxWriteBufferNumberToMaintain()} + * {@link Status.Code#MergeInProgress} if merge operations cannot be + * resolved. + * + * @param readOptions Read options. + * @param key the key to retrieve the value for. + * @param exclusive true if the transaction should have exclusive access to + * the key, otherwise false for shared access. + * + * @return a byte array storing the value associated with the input key if + * any. null if it does not find the specified key. + * + * @throws RocksDBException thrown if error happens in underlying + * native library. 
+   */
+  public byte[] getForUpdate(final ReadOptions readOptions, final byte[] key,
+      final boolean exclusive) throws RocksDBException {
+    assert(isOwningHandle());
+    return getForUpdate(nativeHandle_, readOptions.nativeHandle_, key,
+        key.length, exclusive);
+  }
+
+  /**
+   * A multi-key version of
+   * {@link #getForUpdate(ReadOptions, ColumnFamilyHandle, byte[], boolean)}.
+   *
+   * @param readOptions Read options.
+   * @param columnFamilyHandles {@link org.rocksdb.ColumnFamilyHandle}
+   *     instances
+   * @param keys the keys to retrieve the values for.
+   *
+   * @return Array of values, one for each key
+   *
+   * @throws RocksDBException thrown if error happens in underlying
+   *     native library.
+   */
+  public byte[][] multiGetForUpdate(final ReadOptions readOptions,
+      final List<ColumnFamilyHandle> columnFamilyHandles,
+      final byte[][] keys) throws RocksDBException {
+    assert(isOwningHandle());
+    // Check that there is one column family handle per key; otherwise the
+    // native call would cause a segmentation fault, so throw an exception
+    // instead.
+    if (keys.length != columnFamilyHandles.size()) {
+      throw new IllegalArgumentException(
+          "For each key there must be a ColumnFamilyHandle.");
+    }
+    if (keys.length == 0) {
+      return new byte[0][0];
+    }
+    final long[] cfHandles = new long[columnFamilyHandles.size()];
+    for (int i = 0; i < columnFamilyHandles.size(); i++) {
+      cfHandles[i] = columnFamilyHandles.get(i).nativeHandle_;
+    }
+    return multiGetForUpdate(nativeHandle_, readOptions.nativeHandle_,
+        keys, cfHandles);
+  }
+
+  /**
+   * A multi-key version of {@link #getForUpdate(ReadOptions, byte[], boolean)}.
+   *
+   * @param readOptions Read options.
+   * @param keys the keys to retrieve the values for.
+   *
+   * @return Array of values, one for each key
+   *
+   * @throws RocksDBException thrown if error happens in underlying
+   *     native library.
+   */
+  public byte[][] multiGetForUpdate(final ReadOptions readOptions,
+      final byte[][] keys) throws RocksDBException {
+    assert(isOwningHandle());
+    if (keys.length == 0) {
+      return new byte[0][0];
+    }
+
+    return multiGetForUpdate(nativeHandle_,
+        readOptions.nativeHandle_, keys);
+  }
+
+  /**
+   * Returns an iterator that will iterate on all keys in the default
+   * column family including both keys in the DB and uncommitted keys in this
+   * transaction.
+   *
+   * Setting {@link ReadOptions#setSnapshot(Snapshot)} will affect what is read
+   * from the DB but will NOT change which keys are read from this transaction
+   * (the keys in this transaction do not yet belong to any snapshot and will be
+   * fetched regardless).
+   *
+   * Caller is responsible for calling {@link RocksIterator#close()} on
+   * the returned Iterator.
+   *
+   * The returned iterator is only valid until {@link #commit()},
+   * {@link #rollback()}, or {@link #rollbackToSavePoint()} is called.
+   *
+   * @param readOptions Read options.
+   *
+   * @return instance of iterator object.
+   */
+  public RocksIterator getIterator(final ReadOptions readOptions) {
+    assert(isOwningHandle());
+    return new RocksIterator(parent, getIterator(nativeHandle_,
+        readOptions.nativeHandle_));
+  }
+
+  /**
+   * Returns an iterator that will iterate on all keys in the column family
+   * specified by {@code columnFamilyHandle}, including both keys in the DB
+   * and uncommitted keys in this transaction.
+   *
+   * Setting {@link ReadOptions#setSnapshot(Snapshot)} will affect what is read
+   * from the DB but will NOT change which keys are read from this transaction
+   * (the keys in this transaction do not yet belong to any snapshot and will be
+   * fetched regardless).
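+   *
+   * <p>A minimal iteration sketch over both committed and uncommitted keys
+   * ({@code txn} and {@code cfHandle} are illustrative names):
+   * <pre>{@code
+   * try (final ReadOptions readOptions = new ReadOptions();
+   *      final RocksIterator it = txn.getIterator(readOptions, cfHandle)) {
+   *   for (it.seekToFirst(); it.isValid(); it.next()) {
+   *     // it.key() / it.value() include this transaction's pending writes
+   *   }
+   * }
+   * }</pre>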
+ * + * Caller is responsible for calling {@link RocksIterator#close()} on + * the returned Iterator. + * + * The returned iterator is only valid until {@link #commit()}, + * {@link #rollback()}, or {@link #rollbackToSavePoint()} is called. + * + * @param readOptions Read options. + * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle} + * instance + * + * @return instance of iterator object. + */ + public RocksIterator getIterator(final ReadOptions readOptions, + final ColumnFamilyHandle columnFamilyHandle) { + assert(isOwningHandle()); + return new RocksIterator(parent, getIterator(nativeHandle_, + readOptions.nativeHandle_, columnFamilyHandle.nativeHandle_)); + } + + /** + * Similar to {@link RocksDB#put(ColumnFamilyHandle, byte[], byte[])}, but + * will also perform conflict checking on the keys be written. + * + * If this Transaction was created on an {@link OptimisticTransactionDB}, + * these functions should always succeed. + * + * If this Transaction was created on a {@link TransactionDB}, an + * {@link RocksDBException} may be thrown with an accompanying {@link Status} + * when: + * {@link Status.Code#Busy} if there is a write conflict, + * {@link Status.Code#TimedOut} if a lock could not be acquired, + * {@link Status.Code#TryAgain} if the memtable history size is not large + * enough. See + * {@link ColumnFamilyOptions#maxWriteBufferNumberToMaintain()} + * + * @param columnFamilyHandle The column family to put the key/value into + * @param key the specified key to be inserted. + * @param value the value associated with the specified key. + * + * @throws RocksDBException when one of the TransactionalDB conditions + * described above occurs, or in the case of an unexpected error + */ + public void put(final ColumnFamilyHandle columnFamilyHandle, final byte[] key, + final byte[] value) throws RocksDBException { + assert(isOwningHandle()); + put(nativeHandle_, key, key.length, value, value.length, + columnFamilyHandle.nativeHandle_); + } + + /** + * Similar to {@link RocksDB#put(byte[], byte[])}, but + * will also perform conflict checking on the keys be written. + * + * If this Transaction was created on an {@link OptimisticTransactionDB}, + * these functions should always succeed. + * + * If this Transaction was created on a {@link TransactionDB}, an + * {@link RocksDBException} may be thrown with an accompanying {@link Status} + * when: + * {@link Status.Code#Busy} if there is a write conflict, + * {@link Status.Code#TimedOut} if a lock could not be acquired, + * {@link Status.Code#TryAgain} if the memtable history size is not large + * enough. See + * {@link ColumnFamilyOptions#maxWriteBufferNumberToMaintain()} + * + * @param key the specified key to be inserted. + * @param value the value associated with the specified key. + * + * @throws RocksDBException when one of the TransactionalDB conditions + * described above occurs, or in the case of an unexpected error + */ + public void put(final byte[] key, final byte[] value) + throws RocksDBException { + assert(isOwningHandle()); + put(nativeHandle_, key, key.length, value, value.length); + } + + //TODO(AR) refactor if we implement org.rocksdb.SliceParts in future + /** + * Similar to {@link #put(ColumnFamilyHandle, byte[], byte[])} but allows + * you to specify the key and value in several parts that will be + * concatenated together. + * + * @param columnFamilyHandle The column family to put the key/value into + * @param keyParts the specified key to be inserted. 
+ * @param valueParts the value associated with the specified key. + * + * @throws RocksDBException when one of the TransactionalDB conditions + * described above occurs, or in the case of an unexpected error + */ + public void put(final ColumnFamilyHandle columnFamilyHandle, + final byte[][] keyParts, final byte[][] valueParts) + throws RocksDBException { + assert(isOwningHandle()); + put(nativeHandle_, keyParts, keyParts.length, valueParts, valueParts.length, + columnFamilyHandle.nativeHandle_); + } + + //TODO(AR) refactor if we implement org.rocksdb.SliceParts in future + /** + * Similar to {@link #put(byte[], byte[])} but allows + * you to specify the key and value in several parts that will be + * concatenated together + * + * @param keyParts the specified key to be inserted. + * @param valueParts the value associated with the specified key. + * + * @throws RocksDBException when one of the TransactionalDB conditions + * described above occurs, or in the case of an unexpected error + */ + public void put(final byte[][] keyParts, final byte[][] valueParts) + throws RocksDBException { + assert(isOwningHandle()); + put(nativeHandle_, keyParts, keyParts.length, valueParts, + valueParts.length); + } + + /** + * Similar to {@link RocksDB#merge(ColumnFamilyHandle, byte[], byte[])}, but + * will also perform conflict checking on the keys be written. + * + * If this Transaction was created on an {@link OptimisticTransactionDB}, + * these functions should always succeed. + * + * If this Transaction was created on a {@link TransactionDB}, an + * {@link RocksDBException} may be thrown with an accompanying {@link Status} + * when: + * {@link Status.Code#Busy} if there is a write conflict, + * {@link Status.Code#TimedOut} if a lock could not be acquired, + * {@link Status.Code#TryAgain} if the memtable history size is not large + * enough. See + * {@link ColumnFamilyOptions#maxWriteBufferNumberToMaintain()} + * + * @param columnFamilyHandle The column family to merge the key/value into + * @param key the specified key to be merged. + * @param value the value associated with the specified key. + * + * @throws RocksDBException when one of the TransactionalDB conditions + * described above occurs, or in the case of an unexpected error + */ + public void merge(final ColumnFamilyHandle columnFamilyHandle, + final byte[] key, final byte[] value) throws RocksDBException { + assert(isOwningHandle()); + merge(nativeHandle_, key, key.length, value, value.length, + columnFamilyHandle.nativeHandle_); + } + + /** + * Similar to {@link RocksDB#merge(byte[], byte[])}, but + * will also perform conflict checking on the keys be written. + * + * If this Transaction was created on an {@link OptimisticTransactionDB}, + * these functions should always succeed. + * + * If this Transaction was created on a {@link TransactionDB}, an + * {@link RocksDBException} may be thrown with an accompanying {@link Status} + * when: + * {@link Status.Code#Busy} if there is a write conflict, + * {@link Status.Code#TimedOut} if a lock could not be acquired, + * {@link Status.Code#TryAgain} if the memtable history size is not large + * enough. See + * {@link ColumnFamilyOptions#maxWriteBufferNumberToMaintain()} + * + * @param key the specified key to be merged. + * @param value the value associated with the specified key. 
+   *
+   * @throws RocksDBException when one of the TransactionalDB conditions
+   *     described above occurs, or in the case of an unexpected error
+   */
+  public void merge(final byte[] key, final byte[] value)
+      throws RocksDBException {
+    assert(isOwningHandle());
+    merge(nativeHandle_, key, key.length, value, value.length);
+  }
+
+  /**
+   * Similar to {@link RocksDB#delete(ColumnFamilyHandle, byte[])}, but
+   * will also perform conflict checking on the keys being written.
+   *
+   * If this Transaction was created on an {@link OptimisticTransactionDB},
+   * these functions should always succeed.
+   *
+   * If this Transaction was created on a {@link TransactionDB}, an
+   * {@link RocksDBException} may be thrown with an accompanying {@link Status}
+   * when:
+   * {@link Status.Code#Busy} if there is a write conflict,
+   * {@link Status.Code#TimedOut} if a lock could not be acquired,
+   * {@link Status.Code#TryAgain} if the memtable history size is not large
+   * enough (see
+   * {@link ColumnFamilyOptions#maxWriteBufferNumberToMaintain()}).
+   *
+   * @param columnFamilyHandle The column family to delete the key/value from
+   * @param key the specified key to be deleted.
+   *
+   * @throws RocksDBException when one of the TransactionalDB conditions
+   *     described above occurs, or in the case of an unexpected error
+   */
+  public void delete(final ColumnFamilyHandle columnFamilyHandle,
+      final byte[] key) throws RocksDBException {
+    assert(isOwningHandle());
+    delete(nativeHandle_, key, key.length, columnFamilyHandle.nativeHandle_);
+  }
+
+  /**
+   * Similar to {@link RocksDB#delete(byte[])}, but
+   * will also perform conflict checking on the keys being written.
+   *
+   * If this Transaction was created on an {@link OptimisticTransactionDB},
+   * these functions should always succeed.
+   *
+   * If this Transaction was created on a {@link TransactionDB}, an
+   * {@link RocksDBException} may be thrown with an accompanying {@link Status}
+   * when:
+   * {@link Status.Code#Busy} if there is a write conflict,
+   * {@link Status.Code#TimedOut} if a lock could not be acquired,
+   * {@link Status.Code#TryAgain} if the memtable history size is not large
+   * enough (see
+   * {@link ColumnFamilyOptions#maxWriteBufferNumberToMaintain()}).
+   *
+   * @param key the specified key to be deleted.
+   *
+   * @throws RocksDBException when one of the TransactionalDB conditions
+   *     described above occurs, or in the case of an unexpected error
+   */
+  public void delete(final byte[] key) throws RocksDBException {
+    assert(isOwningHandle());
+    delete(nativeHandle_, key, key.length);
+  }
+
+  //TODO(AR) refactor if we implement org.rocksdb.SliceParts in future
+  /**
+   * Similar to {@link #delete(ColumnFamilyHandle, byte[])} but allows
+   * you to specify the key in several parts that will be
+   * concatenated together.
+   *
+   * @param columnFamilyHandle The column family to delete the key/value from
+   * @param keyParts the specified key to be deleted.
+   *
+   * @throws RocksDBException when one of the TransactionalDB conditions
+   *     described above occurs, or in the case of an unexpected error
+   */
+  public void delete(final ColumnFamilyHandle columnFamilyHandle,
+      final byte[][] keyParts) throws RocksDBException {
+    assert(isOwningHandle());
+    delete(nativeHandle_, keyParts, keyParts.length,
+        columnFamilyHandle.nativeHandle_);
+  }
+
+  //TODO(AR) refactor if we implement org.rocksdb.SliceParts in future
+  /**
+   * Similar to {@link #delete(byte[])} but allows
+   * you to specify the key in several parts that will be
+   * concatenated together.
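+   *
+   * <p>A minimal sketch ({@code txn} is an illustrative, already-open
+   * transaction); the two parts below are concatenated into the single key
+   * {@code key1}:
+   * <pre>{@code
+   * final byte[][] keyParts = new byte[][] {
+   *     "key".getBytes(StandardCharsets.UTF_8),
+   *     "1".getBytes(StandardCharsets.UTF_8)
+   * };
+   * txn.delete(keyParts);
+   * }</pre>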
+ * + * @param keyParts the specified key to be deleted + * + * @throws RocksDBException when one of the TransactionalDB conditions + * described above occurs, or in the case of an unexpected error + */ + public void delete(final byte[][] keyParts) throws RocksDBException { + assert(isOwningHandle()); + delete(nativeHandle_, keyParts, keyParts.length); + } + + /** + * Similar to {@link RocksDB#singleDelete(ColumnFamilyHandle, byte[])}, but + * will also perform conflict checking on the keys be written. + * + * If this Transaction was created on an {@link OptimisticTransactionDB}, + * these functions should always succeed. + * + * If this Transaction was created on a {@link TransactionDB}, an + * {@link RocksDBException} may be thrown with an accompanying {@link Status} + * when: + * {@link Status.Code#Busy} if there is a write conflict, + * {@link Status.Code#TimedOut} if a lock could not be acquired, + * {@link Status.Code#TryAgain} if the memtable history size is not large + * enough. See + * {@link ColumnFamilyOptions#maxWriteBufferNumberToMaintain()} + * + * @param columnFamilyHandle The column family to delete the key/value from + * @param key the specified key to be deleted. + * + * @throws RocksDBException when one of the TransactionalDB conditions + * described above occurs, or in the case of an unexpected error + */ + @Experimental("Performance optimization for a very specific workload") + public void singleDelete(final ColumnFamilyHandle columnFamilyHandle, + final byte[] key) throws RocksDBException { + assert(isOwningHandle()); + singleDelete(nativeHandle_, key, key.length, + columnFamilyHandle.nativeHandle_); + } + + /** + * Similar to {@link RocksDB#singleDelete(byte[])}, but + * will also perform conflict checking on the keys be written. + * + * If this Transaction was created on an {@link OptimisticTransactionDB}, + * these functions should always succeed. + * + * If this Transaction was created on a {@link TransactionDB}, an + * {@link RocksDBException} may be thrown with an accompanying {@link Status} + * when: + * {@link Status.Code#Busy} if there is a write conflict, + * {@link Status.Code#TimedOut} if a lock could not be acquired, + * {@link Status.Code#TryAgain} if the memtable history size is not large + * enough. See + * {@link ColumnFamilyOptions#maxWriteBufferNumberToMaintain()} + * + * @param key the specified key to be deleted. + * + * @throws RocksDBException when one of the TransactionalDB conditions + * described above occurs, or in the case of an unexpected error + */ + @Experimental("Performance optimization for a very specific workload") + public void singleDelete(final byte[] key) throws RocksDBException { + assert(isOwningHandle()); + singleDelete(nativeHandle_, key, key.length); + } + + //TODO(AR) refactor if we implement org.rocksdb.SliceParts in future + /** + * Similar to {@link #singleDelete(ColumnFamilyHandle, byte[])} but allows + * you to specify the key in several parts that will be + * concatenated together. + * + * @param columnFamilyHandle The column family to delete the key/value from + * @param keyParts the specified key to be deleted. 
+ * + * @throws RocksDBException when one of the TransactionalDB conditions + * described above occurs, or in the case of an unexpected error + */ + @Experimental("Performance optimization for a very specific workload") + public void singleDelete(final ColumnFamilyHandle columnFamilyHandle, + final byte[][] keyParts) throws RocksDBException { + assert(isOwningHandle()); + singleDelete(nativeHandle_, keyParts, keyParts.length, + columnFamilyHandle.nativeHandle_); + } + + //TODO(AR) refactor if we implement org.rocksdb.SliceParts in future + /** + * Similar to {@link #singleDelete(byte[])} but allows + * you to specify the key in several parts that will be + * concatenated together. + * + * @param keyParts the specified key to be deleted. + * + * @throws RocksDBException when one of the TransactionalDB conditions + * described above occurs, or in the case of an unexpected error + */ + @Experimental("Performance optimization for a very specific workload") + public void singleDelete(final byte[][] keyParts) throws RocksDBException { + assert(isOwningHandle()); + singleDelete(nativeHandle_, keyParts, keyParts.length); + } + + /** + * Similar to {@link RocksDB#put(ColumnFamilyHandle, byte[], byte[])}, + * but operates on the transactions write batch. This write will only happen + * if this transaction gets committed successfully. + * + * Unlike {@link #put(ColumnFamilyHandle, byte[], byte[])} no conflict + * checking will be performed for this key. + * + * If this Transaction was created on a {@link TransactionDB}, this function + * will still acquire locks necessary to make sure this write doesn't cause + * conflicts in other transactions; This may cause a {@link RocksDBException} + * with associated {@link Status.Code#Busy}. + * + * @param columnFamilyHandle The column family to put the key/value into + * @param key the specified key to be inserted. + * @param value the value associated with the specified key. + * + * @throws RocksDBException when one of the TransactionalDB conditions + * described above occurs, or in the case of an unexpected error + */ + public void putUntracked(final ColumnFamilyHandle columnFamilyHandle, + final byte[] key, final byte[] value) throws RocksDBException { + assert(isOwningHandle()); + putUntracked(nativeHandle_, key, key.length, value, value.length, + columnFamilyHandle.nativeHandle_); + } + + /** + * Similar to {@link RocksDB#put(byte[], byte[])}, + * but operates on the transactions write batch. This write will only happen + * if this transaction gets committed successfully. + * + * Unlike {@link #put(byte[], byte[])} no conflict + * checking will be performed for this key. + * + * If this Transaction was created on a {@link TransactionDB}, this function + * will still acquire locks necessary to make sure this write doesn't cause + * conflicts in other transactions; This may cause a {@link RocksDBException} + * with associated {@link Status.Code#Busy}. + * + * @param key the specified key to be inserted. + * @param value the value associated with the specified key. 
+   *
+   * @throws RocksDBException when one of the TransactionalDB conditions
+   *     described above occurs, or in the case of an unexpected error
+   */
+  public void putUntracked(final byte[] key, final byte[] value)
+      throws RocksDBException {
+    assert(isOwningHandle());
+    putUntracked(nativeHandle_, key, key.length, value, value.length);
+  }
+
+  //TODO(AR) refactor if we implement org.rocksdb.SliceParts in future
+  /**
+   * Similar to {@link #putUntracked(ColumnFamilyHandle, byte[], byte[])} but
+   * allows you to specify the key and value in several parts that will be
+   * concatenated together.
+   *
+   * @param columnFamilyHandle The column family to put the key/value into
+   * @param keyParts the specified key to be inserted.
+   * @param valueParts the value associated with the specified key.
+   *
+   * @throws RocksDBException when one of the TransactionalDB conditions
+   *     described above occurs, or in the case of an unexpected error
+   */
+  public void putUntracked(final ColumnFamilyHandle columnFamilyHandle,
+      final byte[][] keyParts, final byte[][] valueParts)
+      throws RocksDBException {
+    assert(isOwningHandle());
+    putUntracked(nativeHandle_, keyParts, keyParts.length, valueParts,
+        valueParts.length, columnFamilyHandle.nativeHandle_);
+  }
+
+  //TODO(AR) refactor if we implement org.rocksdb.SliceParts in future
+  /**
+   * Similar to {@link #putUntracked(byte[], byte[])} but
+   * allows you to specify the key and value in several parts that will be
+   * concatenated together.
+   *
+   * @param keyParts the specified key to be inserted.
+   * @param valueParts the value associated with the specified key.
+   *
+   * @throws RocksDBException when one of the TransactionalDB conditions
+   *     described above occurs, or in the case of an unexpected error
+   */
+  public void putUntracked(final byte[][] keyParts, final byte[][] valueParts)
+      throws RocksDBException {
+    assert(isOwningHandle());
+    putUntracked(nativeHandle_, keyParts, keyParts.length, valueParts,
+        valueParts.length);
+  }
+
+  /**
+   * Similar to {@link RocksDB#merge(ColumnFamilyHandle, byte[], byte[])},
+   * but operates on the transaction's write batch. This write will only happen
+   * if this transaction gets committed successfully.
+   *
+   * Unlike {@link #merge(ColumnFamilyHandle, byte[], byte[])}, no conflict
+   * checking will be performed for this key.
+   *
+   * If this Transaction was created on a {@link TransactionDB}, this function
+   * will still acquire locks necessary to make sure this write doesn't cause
+   * conflicts in other transactions; this may cause a {@link RocksDBException}
+   * with associated {@link Status.Code#Busy}.
+   *
+   * @param columnFamilyHandle The column family to merge the key/value into
+   * @param key the specified key to be merged.
+   * @param value the value associated with the specified key.
+   *
+   * @throws RocksDBException when one of the TransactionalDB conditions
+   *     described above occurs, or in the case of an unexpected error
+   */
+  public void mergeUntracked(final ColumnFamilyHandle columnFamilyHandle,
+      final byte[] key, final byte[] value) throws RocksDBException {
+    assert(isOwningHandle());
+    mergeUntracked(nativeHandle_, key, key.length, value, value.length,
+        columnFamilyHandle.nativeHandle_);
+  }
+
+  /**
+   * Similar to {@link RocksDB#merge(byte[], byte[])},
+   * but operates on the transaction's write batch. This write will only happen
+   * if this transaction gets committed successfully.
+   *
+   * Unlike {@link #merge(byte[], byte[])}, no conflict
+   * checking will be performed for this key.
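+   *
+   * <p>A minimal sketch ({@code txn}, {@code key} and {@code delta} are
+   * illustrative, and a merge operator is assumed to be configured on the DB):
+   * <pre>{@code
+   * // recorded in the write batch without tracking the key for conflicts
+   * txn.mergeUntracked(key, delta);
+   * txn.commit();
+   * }</pre>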
+ * + * If this Transaction was created on a {@link TransactionDB}, this function + * will still acquire locks necessary to make sure this write doesn't cause + * conflicts in other transactions; This may cause a {@link RocksDBException} + * with associated {@link Status.Code#Busy}. + * + * @param key the specified key to be merged. + * @param value the value associated with the specified key. + * + * @throws RocksDBException when one of the TransactionalDB conditions + * described above occurs, or in the case of an unexpected error + */ + public void mergeUntracked(final byte[] key, final byte[] value) + throws RocksDBException { + assert(isOwningHandle()); + mergeUntracked(nativeHandle_, key, key.length, value, value.length); + } + + /** + * Similar to {@link RocksDB#delete(ColumnFamilyHandle, byte[])}, + * but operates on the transactions write batch. This write will only happen + * if this transaction gets committed successfully. + * + * Unlike {@link #delete(ColumnFamilyHandle, byte[])} no conflict + * checking will be performed for this key. + * + * If this Transaction was created on a {@link TransactionDB}, this function + * will still acquire locks necessary to make sure this write doesn't cause + * conflicts in other transactions; This may cause a {@link RocksDBException} + * with associated {@link Status.Code#Busy}. + * + * @param columnFamilyHandle The column family to delete the key/value from + * @param key the specified key to be deleted. + * + * @throws RocksDBException when one of the TransactionalDB conditions + * described above occurs, or in the case of an unexpected error + */ + public void deleteUntracked(final ColumnFamilyHandle columnFamilyHandle, + final byte[] key) throws RocksDBException { + assert(isOwningHandle()); + deleteUntracked(nativeHandle_, key, key.length, + columnFamilyHandle.nativeHandle_); + } + + /** + * Similar to {@link RocksDB#delete(byte[])}, + * but operates on the transactions write batch. This write will only happen + * if this transaction gets committed successfully. + * + * Unlike {@link #delete(byte[])} no conflict + * checking will be performed for this key. + * + * If this Transaction was created on a {@link TransactionDB}, this function + * will still acquire locks necessary to make sure this write doesn't cause + * conflicts in other transactions; This may cause a {@link RocksDBException} + * with associated {@link Status.Code#Busy}. + * + * @param key the specified key to be deleted. + * + * @throws RocksDBException when one of the TransactionalDB conditions + * described above occurs, or in the case of an unexpected error + */ + public void deleteUntracked(final byte[] key) throws RocksDBException { + assert(isOwningHandle()); + deleteUntracked(nativeHandle_, key, key.length); + } + + //TODO(AR) refactor if we implement org.rocksdb.SliceParts in future + /** + * Similar to {@link #deleteUntracked(ColumnFamilyHandle, byte[])} but allows + * you to specify the key in several parts that will be + * concatenated together. + * + * @param columnFamilyHandle The column family to delete the key/value from + * @param keyParts the specified key to be deleted. 
+ * + * @throws RocksDBException when one of the TransactionalDB conditions + * described above occurs, or in the case of an unexpected error + */ + public void deleteUntracked(final ColumnFamilyHandle columnFamilyHandle, + final byte[][] keyParts) throws RocksDBException { + assert(isOwningHandle()); + deleteUntracked(nativeHandle_, keyParts, keyParts.length, + columnFamilyHandle.nativeHandle_); + } + + //TODO(AR) refactor if we implement org.rocksdb.SliceParts in future + /** + * Similar to {@link #deleteUntracked(byte[])} but allows + * you to specify the key in several parts that will be + * concatenated together. + * + * @param keyParts the specified key to be deleted. + * + * @throws RocksDBException when one of the TransactionalDB conditions + * described above occurs, or in the case of an unexpected error + */ + public void deleteUntracked(final byte[][] keyParts) throws RocksDBException { + assert(isOwningHandle()); + deleteUntracked(nativeHandle_, keyParts, keyParts.length); + } + + /** + * Similar to {@link WriteBatch#putLogData(byte[])} + * + * @param blob binary object to be inserted + */ + public void putLogData(final byte[] blob) { + assert(isOwningHandle()); + putLogData(nativeHandle_, blob, blob.length); + } + + /** + * By default, all put/merge/delete operations will be indexed in the + * transaction so that get/getForUpdate/getIterator can search for these + * keys. + * + * If the caller does not want to fetch the keys about to be written, + * they may want to avoid indexing as a performance optimization. + * Calling {@link #disableIndexing()} will turn off indexing for all future + * put/merge/delete operations until {@link #enableIndexing()} is called. + * + * If a key is put/merge/deleted after {@link #disableIndexing()} is called + * and then is fetched via get/getForUpdate/getIterator, the result of the + * fetch is undefined. + */ + public void disableIndexing() { + assert(isOwningHandle()); + disableIndexing(nativeHandle_); + } + + /** + * Re-enables indexing after a previous call to {@link #disableIndexing()} + */ + public void enableIndexing() { + assert(isOwningHandle()); + enableIndexing(nativeHandle_); + } + + /** + * Returns the number of distinct Keys being tracked by this transaction. + * If this transaction was created by a {@link TransactionDB}, this is the + * number of keys that are currently locked by this transaction. + * If this transaction was created by an {@link OptimisticTransactionDB}, + * this is the number of keys that need to be checked for conflicts at commit + * time. + * + * @return the number of distinct Keys being tracked by this transaction + */ + public long getNumKeys() { + assert(isOwningHandle()); + return getNumKeys(nativeHandle_); + } + + /** + * Returns the number of puts that have been applied to this + * transaction so far. + * + * @return the number of puts that have been applied to this transaction + */ + public long getNumPuts() { + assert(isOwningHandle()); + return getNumPuts(nativeHandle_); + } + + /** + * Returns the number of deletes that have been applied to this + * transaction so far. + * + * @return the number of deletes that have been applied to this transaction + */ + public long getNumDeletes() { + assert(isOwningHandle()); + return getNumDeletes(nativeHandle_); + } + + /** + * Returns the number of merges that have been applied to this + * transaction so far. 
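+   *
+   * <p>For example, in a fresh transaction {@code txn} (an illustrative
+   * name), after {@code txn.put(k1, v)}, {@code txn.merge(k2, v)} and
+   * {@code txn.delete(k3)}, the counters {@code getNumPuts()},
+   * {@code getNumMerges()} and {@code getNumDeletes()} each return 1.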
+ * + * @return the number of merges that have been applied to this transaction + */ + public long getNumMerges() { + assert(isOwningHandle()); + return getNumMerges(nativeHandle_); + } + + /** + * Returns the elapsed time in milliseconds since this Transaction began. + * + * @return the elapsed time in milliseconds since this transaction began. + */ + public long getElapsedTime() { + assert(isOwningHandle()); + return getElapsedTime(nativeHandle_); + } + + /** + * Fetch the underlying write batch that contains all pending changes to be + * committed. + * + * Note: You should not write or delete anything from the batch directly and + * should only use the functions in the {@link Transaction} class to + * write to this transaction. + * + * @return The write batch + */ + public WriteBatchWithIndex getWriteBatch() { + assert(isOwningHandle()); + final WriteBatchWithIndex writeBatchWithIndex = + new WriteBatchWithIndex(getWriteBatch(nativeHandle_)); + return writeBatchWithIndex; + } + + /** + * Change the value of {@link TransactionOptions#getLockTimeout()} + * (in milliseconds) for this transaction. + * + * Has no effect on OptimisticTransactions. + * + * @param lockTimeout the timeout (in milliseconds) for locks used by this + * transaction. + */ + public void setLockTimeout(final long lockTimeout) { + assert(isOwningHandle()); + setLockTimeout(nativeHandle_, lockTimeout); + } + + /** + * Return the WriteOptions that will be used during {@link #commit()}. + * + * @return the WriteOptions that will be used + */ + public WriteOptions getWriteOptions() { + assert(isOwningHandle()); + final WriteOptions writeOptions = + new WriteOptions(getWriteOptions(nativeHandle_)); + return writeOptions; + } + + /** + * Reset the WriteOptions that will be used during {@link #commit()}. + * + * @param writeOptions The new WriteOptions + */ + public void setWriteOptions(final WriteOptions writeOptions) { + assert(isOwningHandle()); + setWriteOptions(nativeHandle_, writeOptions.nativeHandle_); + } + + /** + * If this key was previously fetched in this transaction using + * {@link #getForUpdate(ReadOptions, ColumnFamilyHandle, byte[], boolean)}/ + * {@link #multiGetForUpdate(ReadOptions, List, byte[][])}, calling + * {@link #undoGetForUpdate(ColumnFamilyHandle, byte[])} will tell + * the transaction that it no longer needs to do any conflict checking + * for this key. + * + * If a key has been fetched N times via + * {@link #getForUpdate(ReadOptions, ColumnFamilyHandle, byte[], boolean)}/ + * {@link #multiGetForUpdate(ReadOptions, List, byte[][])}, then + * {@link #undoGetForUpdate(ColumnFamilyHandle, byte[])} will only have an + * effect if it is also called N times. If this key has been written to in + * this transaction, {@link #undoGetForUpdate(ColumnFamilyHandle, byte[])} + * will have no effect. + * + * If {@link #setSavePoint()} has been called after the + * {@link #getForUpdate(ReadOptions, ColumnFamilyHandle, byte[], boolean)}, + * {@link #undoGetForUpdate(ColumnFamilyHandle, byte[])} will not have any + * effect. + * + * If this Transaction was created by an {@link OptimisticTransactionDB}, + * calling {@link #undoGetForUpdate(ColumnFamilyHandle, byte[])} can affect + * whether this key is conflict checked at commit time. + * If this Transaction was created by a {@link TransactionDB}, + * calling {@link #undoGetForUpdate(ColumnFamilyHandle, byte[])} may release + * any held locks for this key. 
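+   *
+   * <p>A minimal sketch ({@code txn}, {@code cfHandle} and {@code key} are
+   * illustrative):
+   * <pre>{@code
+   * try (final ReadOptions readOptions = new ReadOptions()) {
+   *   txn.getForUpdate(readOptions, cfHandle, key, true);
+   *   // later: this key no longer needs conflict protection
+   *   txn.undoGetForUpdate(cfHandle, key);
+   * }
+   * }</pre>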
+ * + * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle} + * instance + * @param key the key to retrieve the value for. + */ + public void undoGetForUpdate(final ColumnFamilyHandle columnFamilyHandle, + final byte[] key) { + assert(isOwningHandle()); + undoGetForUpdate(nativeHandle_, key, key.length, columnFamilyHandle.nativeHandle_); + } + + /** + * If this key was previously fetched in this transaction using + * {@link #getForUpdate(ReadOptions, byte[], boolean)}/ + * {@link #multiGetForUpdate(ReadOptions, List, byte[][])}, calling + * {@link #undoGetForUpdate(byte[])} will tell + * the transaction that it no longer needs to do any conflict checking + * for this key. + * + * If a key has been fetched N times via + * {@link #getForUpdate(ReadOptions, byte[], boolean)}/ + * {@link #multiGetForUpdate(ReadOptions, List, byte[][])}, then + * {@link #undoGetForUpdate(byte[])} will only have an + * effect if it is also called N times. If this key has been written to in + * this transaction, {@link #undoGetForUpdate(byte[])} + * will have no effect. + * + * If {@link #setSavePoint()} has been called after the + * {@link #getForUpdate(ReadOptions, byte[], boolean)}, + * {@link #undoGetForUpdate(byte[])} will not have any + * effect. + * + * If this Transaction was created by an {@link OptimisticTransactionDB}, + * calling {@link #undoGetForUpdate(byte[])} can affect + * whether this key is conflict checked at commit time. + * If this Transaction was created by a {@link TransactionDB}, + * calling {@link #undoGetForUpdate(byte[])} may release + * any held locks for this key. + * + * @param key the key to retrieve the value for. + */ + public void undoGetForUpdate(final byte[] key) { + assert(isOwningHandle()); + undoGetForUpdate(nativeHandle_, key, key.length); + } + + /** + * Adds the keys from the WriteBatch to the transaction + * + * @param writeBatch The write batch to read from + * + * @throws RocksDBException if an error occurs whilst rebuilding from the + * write batch. + */ + public void rebuildFromWriteBatch(final WriteBatch writeBatch) + throws RocksDBException { + assert(isOwningHandle()); + rebuildFromWriteBatch(nativeHandle_, writeBatch.nativeHandle_); + } + + /** + * Get the Commit time Write Batch. + * + * @return the commit time write batch. + */ + public WriteBatch getCommitTimeWriteBatch() { + assert(isOwningHandle()); + final WriteBatch writeBatch = + new WriteBatch(getCommitTimeWriteBatch(nativeHandle_)); + return writeBatch; + } + + /** + * Set the log number. + * + * @param logNumber the log number + */ + public void setLogNumber(final long logNumber) { + assert(isOwningHandle()); + setLogNumber(nativeHandle_, logNumber); + } + + /** + * Get the log number. + * + * @return the log number + */ + public long getLogNumber() { + assert(isOwningHandle()); + return getLogNumber(nativeHandle_); + } + + /** + * Set the name of the transaction. + * + * @param transactionName the name of the transaction + * + * @throws RocksDBException if an error occurs when setting the transaction + * name. + */ + public void setName(final String transactionName) throws RocksDBException { + assert(isOwningHandle()); + setName(nativeHandle_, transactionName); + } + + /** + * Get the name of the transaction. + * + * @return the name of the transaction + */ + public String getName() { + assert(isOwningHandle()); + return getName(nativeHandle_); + } + + /** + * Get the ID of the transaction. + * + * @return the ID of the transaction. 
+ */ + public long getID() { + assert(isOwningHandle()); + return getID(nativeHandle_); + } + + /** + * Determine if a deadlock has been detected. + * + * @return true if a deadlock has been detected. + */ + public boolean isDeadlockDetect() { + assert(isOwningHandle()); + return isDeadlockDetect(nativeHandle_); + } + + /** + * Get the list of waiting transactions. + * + * @return The list of waiting transactions. + */ + public WaitingTransactions getWaitingTxns() { + assert(isOwningHandle()); + return getWaitingTxns(nativeHandle_); + } + + /** + * Get the execution status of the transaction. + * + * NOTE: The execution status of an Optimistic Transaction + * never changes. This is only useful for non-optimistic transactions! + * + * @return The execution status of the transaction + */ + public TransactionState getState() { + assert(isOwningHandle()); + return TransactionState.getTransactionState( + getState(nativeHandle_)); + } + + /** + * The globally unique id with which the transaction is identified. This id + * might or might not be set depending on the implementation. Similarly the + * implementation decides the point in lifetime of a transaction at which it + * assigns the id. Although currently it is the case, the id is not guaranteed + * to remain the same across restarts. + * + * @return the transaction id. + */ + @Experimental("NOTE: Experimental feature") + public long getId() { + assert(isOwningHandle()); + return getId(nativeHandle_); + } + + public enum TransactionState { + STARTED((byte)0), + AWAITING_PREPARE((byte)1), + PREPARED((byte)2), + AWAITING_COMMIT((byte)3), + COMMITED((byte)4), + AWAITING_ROLLBACK((byte)5), + ROLLEDBACK((byte)6), + LOCKS_STOLEN((byte)7); + + private final byte value; + + TransactionState(final byte value) { + this.value = value; + } + + /** + * Get TransactionState by byte value. + * + * @param value byte representation of TransactionState. + * + * @return {@link org.rocksdb.Transaction.TransactionState} instance or null. + * @throws java.lang.IllegalArgumentException if an invalid + * value is provided. + */ + public static TransactionState getTransactionState(final byte value) { + for (final TransactionState transactionState : TransactionState.values()) { + if (transactionState.value == value){ + return transactionState; + } + } + throw new IllegalArgumentException( + "Illegal value provided for TransactionState."); + } + } + + /** + * Called from C++ native method {@link #getWaitingTxns(long)} + * to construct a WaitingTransactions object. + * + * @param columnFamilyId The id of the {@link ColumnFamilyHandle} + * @param key The key + * @param transactionIds The transaction ids + * + * @return The waiting transactions + */ + private WaitingTransactions newWaitingTransactions( + final long columnFamilyId, final String key, + final long[] transactionIds) { + return new WaitingTransactions(columnFamilyId, key, transactionIds); + } + + public static class WaitingTransactions { + private final long columnFamilyId; + private final String key; + private final long[] transactionIds; + + private WaitingTransactions(final long columnFamilyId, final String key, + final long[] transactionIds) { + this.columnFamilyId = columnFamilyId; + this.key = key; + this.transactionIds = transactionIds; + } + + /** + * Get the Column Family ID. + * + * @return The column family ID + */ + public long getColumnFamilyId() { + return columnFamilyId; + } + + /** + * Get the key on which the transactions are waiting. 
+ * + * @return The key + */ + public String getKey() { + return key; + } + + /** + * Get the IDs of the waiting transactions. + * + * @return The IDs of the waiting transactions + */ + public long[] getTransactionIds() { + return transactionIds; + } + } + + private native void setSnapshot(final long handle); + private native void setSnapshotOnNextOperation(final long handle); + private native void setSnapshotOnNextOperation(final long handle, + final long transactionNotifierHandle); + private native long getSnapshot(final long handle); + private native void clearSnapshot(final long handle); + private native void prepare(final long handle) throws RocksDBException; + private native void commit(final long handle) throws RocksDBException; + private native void rollback(final long handle) throws RocksDBException; + private native void setSavePoint(final long handle) throws RocksDBException; + private native void rollbackToSavePoint(final long handle) + throws RocksDBException; + private native byte[] get(final long handle, final long readOptionsHandle, + final byte key[], final int keyLength, final long columnFamilyHandle) + throws RocksDBException; + private native byte[] get(final long handle, final long readOptionsHandle, + final byte key[], final int keyLen) throws RocksDBException; + private native byte[][] multiGet(final long handle, + final long readOptionsHandle, final byte[][] keys, + final long[] columnFamilyHandles) throws RocksDBException; + private native byte[][] multiGet(final long handle, + final long readOptionsHandle, final byte[][] keys) + throws RocksDBException; + private native byte[] getForUpdate(final long handle, + final long readOptionsHandle, final byte key[], final int keyLength, + final long columnFamilyHandle, final boolean exclusive) + throws RocksDBException; + private native byte[] getForUpdate(final long handle, + final long readOptionsHandle, final byte key[], final int keyLen, + final boolean exclusive) throws RocksDBException; + private native byte[][] multiGetForUpdate(final long handle, + final long readOptionsHandle, final byte[][] keys, + final long[] columnFamilyHandles) throws RocksDBException; + private native byte[][] multiGetForUpdate(final long handle, + final long readOptionsHandle, final byte[][] keys) + throws RocksDBException; + private native long getIterator(final long handle, + final long readOptionsHandle); + private native long getIterator(final long handle, + final long readOptionsHandle, final long columnFamilyHandle); + private native void put(final long handle, final byte[] key, + final int keyLength, final byte[] value, final int valueLength, + final long columnFamilyHandle) throws RocksDBException; + private native void put(final long handle, final byte[] key, + final int keyLength, final byte[] value, final int valueLength) + throws RocksDBException; + private native void put(final long handle, final byte[][] keys, + final int keysLength, final byte[][] values, final int valuesLength, + final long columnFamilyHandle) throws RocksDBException; + private native void put(final long handle, final byte[][] keys, + final int keysLength, final byte[][] values, final int valuesLength) + throws RocksDBException; + private native void merge(final long handle, final byte[] key, + final int keyLength, final byte[] value, final int valueLength, + final long columnFamilyHandle) throws RocksDBException; + private native void merge(final long handle, final byte[] key, + final int keyLength, final byte[] value, final int valueLength) + throws 
RocksDBException; + private native void delete(final long handle, final byte[] key, + final int keyLength, final long columnFamilyHandle) + throws RocksDBException; + private native void delete(final long handle, final byte[] key, + final int keyLength) throws RocksDBException; + private native void delete(final long handle, final byte[][] keys, + final int keysLength, final long columnFamilyHandle) + throws RocksDBException; + private native void delete(final long handle, final byte[][] keys, + final int keysLength) throws RocksDBException; + private native void singleDelete(final long handle, final byte[] key, + final int keyLength, final long columnFamilyHandle) + throws RocksDBException; + private native void singleDelete(final long handle, final byte[] key, + final int keyLength) throws RocksDBException; + private native void singleDelete(final long handle, final byte[][] keys, + final int keysLength, final long columnFamilyHandle) + throws RocksDBException; + private native void singleDelete(final long handle, final byte[][] keys, + final int keysLength) throws RocksDBException; + private native void putUntracked(final long handle, final byte[] key, + final int keyLength, final byte[] value, final int valueLength, + final long columnFamilyHandle) throws RocksDBException; + private native void putUntracked(final long handle, final byte[] key, + final int keyLength, final byte[] value, final int valueLength) + throws RocksDBException; + private native void putUntracked(final long handle, final byte[][] keys, + final int keysLength, final byte[][] values, final int valuesLength, + final long columnFamilyHandle) throws RocksDBException; + private native void putUntracked(final long handle, final byte[][] keys, + final int keysLength, final byte[][] values, final int valuesLength) + throws RocksDBException; + private native void mergeUntracked(final long handle, final byte[] key, + final int keyLength, final byte[] value, final int valueLength, + final long columnFamilyHandle) throws RocksDBException; + private native void mergeUntracked(final long handle, final byte[] key, + final int keyLength, final byte[] value, final int valueLength) + throws RocksDBException; + private native void deleteUntracked(final long handle, final byte[] key, + final int keyLength, final long columnFamilyHandle) + throws RocksDBException; + private native void deleteUntracked(final long handle, final byte[] key, + final int keyLength) throws RocksDBException; + private native void deleteUntracked(final long handle, final byte[][] keys, + final int keysLength, final long columnFamilyHandle) + throws RocksDBException; + private native void deleteUntracked(final long handle, final byte[][] keys, + final int keysLength) throws RocksDBException; + private native void putLogData(final long handle, final byte[] blob, + final int blobLength); + private native void disableIndexing(final long handle); + private native void enableIndexing(final long handle); + private native long getNumKeys(final long handle); + private native long getNumPuts(final long handle); + private native long getNumDeletes(final long handle); + private native long getNumMerges(final long handle); + private native long getElapsedTime(final long handle); + private native long getWriteBatch(final long handle); + private native void setLockTimeout(final long handle, final long lockTimeout); + private native long getWriteOptions(final long handle); + private native void setWriteOptions(final long handle, + final long writeOptionsHandle); + private 
native void undoGetForUpdate(final long handle, final byte[] key,
+      final int keyLength, final long columnFamilyHandle);
+  private native void undoGetForUpdate(final long handle, final byte[] key,
+      final int keyLength);
+  private native void rebuildFromWriteBatch(final long handle,
+      final long writeBatchHandle) throws RocksDBException;
+  private native long getCommitTimeWriteBatch(final long handle);
+  private native void setLogNumber(final long handle, final long logNumber);
+  private native long getLogNumber(final long handle);
+  private native void setName(final long handle, final String name)
+      throws RocksDBException;
+  private native String getName(final long handle);
+  private native long getID(final long handle);
+  private native boolean isDeadlockDetect(final long handle);
+  private native WaitingTransactions getWaitingTxns(final long handle);
+  private native byte getState(final long handle);
+  private native long getId(final long handle);
+
+  @Override protected final native void disposeInternal(final long handle);
+}
diff --git a/java/src/main/java/org/rocksdb/TransactionDB.java b/java/src/main/java/org/rocksdb/TransactionDB.java
new file mode 100644
index 00000000000..fcecf3faffe
--- /dev/null
+++ b/java/src/main/java/org/rocksdb/TransactionDB.java
@@ -0,0 +1,354 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+/**
+ * Database with Transaction support.
+ */
+public class TransactionDB extends RocksDB
+    implements TransactionalDB<TransactionOptions> {
+
+  private TransactionDBOptions transactionDbOptions_;
+
+  /**
+   * Private constructor.
+   *
+   * @param nativeHandle The native handle of the C++ TransactionDB object
+   */
+  private TransactionDB(final long nativeHandle) {
+    super(nativeHandle);
+  }
+
+  /**
+   * Open a TransactionDB, similar to {@link RocksDB#open(Options, String)}.
+   *
+   * @param options {@link org.rocksdb.Options} instance.
+   * @param transactionDbOptions {@link org.rocksdb.TransactionDBOptions}
+   *     instance.
+   * @param path the path to the RocksDB database.
+   *
+   * @return a {@link TransactionDB} instance on success, null if the specified
+   *     {@link TransactionDB} cannot be opened.
+   *
+   * @throws RocksDBException if an error occurs whilst opening the database.
+   */
+  public static TransactionDB open(final Options options,
+      final TransactionDBOptions transactionDbOptions, final String path)
+      throws RocksDBException {
+    final TransactionDB tdb = new TransactionDB(open(options.nativeHandle_,
+        transactionDbOptions.nativeHandle_, path));
+
+    // when a non-default Options is used, keeping an Options reference
+    // in RocksDB prevents the Java garbage collector from reclaiming it
+    // during the lifetime of the currently-created RocksDB
+    tdb.storeOptionsInstance(options);
+    tdb.storeTransactionDbOptions(transactionDbOptions);
+
+    return tdb;
+  }
+
+  /**
+   * Open a TransactionDB, similar to
+   * {@link RocksDB#open(DBOptions, String, List, List)}.
+   *
+   * @param dbOptions {@link org.rocksdb.DBOptions} instance.
+   * @param transactionDbOptions {@link org.rocksdb.TransactionDBOptions}
+   *     instance.
+   * @param path the path to the RocksDB database.
+  /**
+   * Open a TransactionDB, similar to
+   * {@link RocksDB#open(DBOptions, String, List, List)}.
+   *
+   * @param dbOptions {@link org.rocksdb.DBOptions} instance.
+   * @param transactionDbOptions {@link org.rocksdb.TransactionDBOptions}
+   *     instance.
+   * @param path the path to the rocksdb.
+   * @param columnFamilyDescriptors list of column family descriptors
+   * @param columnFamilyHandles will be filled with ColumnFamilyHandle instances
+   *
+   * @return a {@link TransactionDB} instance on success, null if the specified
+   *     {@link TransactionDB} cannot be opened.
+   *
+   * @throws RocksDBException if an error occurs whilst opening the database.
+   */
+  public static TransactionDB open(final DBOptions dbOptions,
+      final TransactionDBOptions transactionDbOptions,
+      final String path,
+      final List<ColumnFamilyDescriptor> columnFamilyDescriptors,
+      final List<ColumnFamilyHandle> columnFamilyHandles)
+      throws RocksDBException {
+
+    final byte[][] cfNames = new byte[columnFamilyDescriptors.size()][];
+    final long[] cfOptionHandles = new long[columnFamilyDescriptors.size()];
+    for (int i = 0; i < columnFamilyDescriptors.size(); i++) {
+      final ColumnFamilyDescriptor cfDescriptor = columnFamilyDescriptors
+          .get(i);
+      cfNames[i] = cfDescriptor.columnFamilyName();
+      cfOptionHandles[i] = cfDescriptor.columnFamilyOptions().nativeHandle_;
+    }
+
+    final long[] handles = open(dbOptions.nativeHandle_,
+        transactionDbOptions.nativeHandle_, path, cfNames, cfOptionHandles);
+    final TransactionDB tdb = new TransactionDB(handles[0]);
+
+    // when a non-default Options is used, keeping an Options reference
+    // in RocksDB prevents the Java garbage collector from reclaiming it
+    // during the life-time of the currently-created RocksDB.
+    tdb.storeOptionsInstance(dbOptions);
+    tdb.storeTransactionDbOptions(transactionDbOptions);
+
+    for (int i = 1; i < handles.length; i++) {
+      columnFamilyHandles.add(new ColumnFamilyHandle(tdb, handles[i]));
+    }
+
+    return tdb;
+  }
+
+  @Override
+  public Transaction beginTransaction(final WriteOptions writeOptions) {
+    return new Transaction(this, beginTransaction(nativeHandle_,
+        writeOptions.nativeHandle_));
+  }
+
+  @Override
+  public Transaction beginTransaction(final WriteOptions writeOptions,
+      final TransactionOptions transactionOptions) {
+    return new Transaction(this, beginTransaction(nativeHandle_,
+        writeOptions.nativeHandle_, transactionOptions.nativeHandle_));
+  }
+
+  // TODO(AR) consider having beginTransaction(... oldTransaction) set a
+  // reference count inside Transaction, so that we can always call
+  // Transaction#close but the object is only disposed when there are as many
+  // closes as beginTransaction. Makes the try-with-resources paradigm easier
+  // for Java developers.
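The two overloads that follow reinitialize an existing Transaction object in place rather than allocating a new one. A hedged sketch of the loop pattern this enables (the loop bound and key names are hypothetical):

    Transaction txn = txnDb.beginTransaction(writeOptions);
    try {
      for (int i = 0; i < 100; i++) {
        // returns the same, reinitialized object rather than allocating
        txn = txnDb.beginTransaction(writeOptions, txn);
        txn.put(("key" + i).getBytes(), ("value" + i).getBytes());
        txn.commit();
      }
    } finally {
      txn.close();
    }
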
+
+  @Override
+  public Transaction beginTransaction(final WriteOptions writeOptions,
+      final Transaction oldTransaction) {
+    final long jtxnHandle = beginTransaction_withOld(nativeHandle_,
+        writeOptions.nativeHandle_, oldTransaction.nativeHandle_);
+
+    // RocksJava relies on the assumption that
+    // we do not allocate a new Transaction object
+    // when providing an old_txn
+    assert(jtxnHandle == oldTransaction.nativeHandle_);
+
+    return oldTransaction;
+  }
+
+  @Override
+  public Transaction beginTransaction(final WriteOptions writeOptions,
+      final TransactionOptions transactionOptions,
+      final Transaction oldTransaction) {
+    final long jtxnHandle = beginTransaction_withOld(nativeHandle_,
+        writeOptions.nativeHandle_, transactionOptions.nativeHandle_,
+        oldTransaction.nativeHandle_);
+
+    // RocksJava relies on the assumption that
+    // we do not allocate a new Transaction object
+    // when providing an old_txn
+    assert(jtxnHandle == oldTransaction.nativeHandle_);
+
+    return oldTransaction;
+  }
+
+  public Transaction getTransactionByName(final String transactionName) {
+    final long jtxnHandle = getTransactionByName(nativeHandle_, transactionName);
+    if (jtxnHandle == 0) {
+      return null;
+    }
+
+    final Transaction txn = new Transaction(this, jtxnHandle);
+
+    // this instance doesn't own the underlying C++ object
+    txn.disOwnNativeHandle();
+
+    return txn;
+  }
+
+  public List<Transaction> getAllPreparedTransactions() {
+    final long[] jtxnHandles = getAllPreparedTransactions(nativeHandle_);
+
+    final List<Transaction> txns = new ArrayList<>();
+    for (final long jtxnHandle : jtxnHandles) {
+      final Transaction txn = new Transaction(this, jtxnHandle);
+
+      // this instance doesn't own the underlying C++ object
+      txn.disOwnNativeHandle();
+
+      txns.add(txn);
+    }
+    return txns;
+  }
+
+  public static class KeyLockInfo {
+    private final String key;
+    private final long[] transactionIDs;
+    private final boolean exclusive;
+
+    public KeyLockInfo(final String key, final long[] transactionIDs,
+        final boolean exclusive) {
+      this.key = key;
+      this.transactionIDs = transactionIDs;
+      this.exclusive = exclusive;
+    }
+
+    /**
+     * Get the key.
+     *
+     * @return the key
+     */
+    public String getKey() {
+      return key;
+    }
+
+    /**
+     * Get the Transaction IDs.
+     *
+     * @return the Transaction IDs.
+     */
+    public long[] getTransactionIDs() {
+      return transactionIDs;
+    }
+
+    /**
+     * Get the Lock status.
+     *
+     * @return true if the lock is exclusive, false if the lock is shared.
+     */
+    public boolean isExclusive() {
+      return exclusive;
+    }
+  }
+
+  /**
+   * Returns a map of all the locks held.
+   *
+   * @return a map of all the locks held.
+   */
+  public Map<Long, KeyLockInfo> getLockStatusData() {
+    return getLockStatusData(nativeHandle_);
+  }
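Given the accessor above, a diagnostic dump of the lock table might look like the following sketch; the printing is illustrative, and the Long key is assumed to be the id of the column family holding the lock:

    final Map<Long, TransactionDB.KeyLockInfo> locks = txnDb.getLockStatusData();
    for (final Map.Entry<Long, TransactionDB.KeyLockInfo> e : locks.entrySet()) {
      final TransactionDB.KeyLockInfo info = e.getValue();
      System.out.println("cf=" + e.getKey()
          + " key=" + info.getKey()
          + " exclusive=" + info.isExclusive()
          + " txns=" + java.util.Arrays.toString(info.getTransactionIDs()));
    }
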
+
+  /**
+   * Called from C++ native method {@link #getDeadlockInfoBuffer(long)}
+   * to construct a DeadlockInfo object.
+   *
+   * @param transactionID The transaction id
+   * @param columnFamilyId The id of the {@link ColumnFamilyHandle}
+   * @param waitingKey the key that we are waiting on
+   * @param exclusive true if the lock is exclusive, false if the lock is shared
+   *
+   * @return The waiting transactions
+   */
+  private DeadlockInfo newDeadlockInfo(
+      final long transactionID, final long columnFamilyId,
+      final String waitingKey, final boolean exclusive) {
+    return new DeadlockInfo(transactionID, columnFamilyId,
+        waitingKey, exclusive);
+  }
+
+  public static class DeadlockInfo {
+    private final long transactionID;
+    private final long columnFamilyId;
+    private final String waitingKey;
+    private final boolean exclusive;
+
+    private DeadlockInfo(final long transactionID, final long columnFamilyId,
+        final String waitingKey, final boolean exclusive) {
+      this.transactionID = transactionID;
+      this.columnFamilyId = columnFamilyId;
+      this.waitingKey = waitingKey;
+      this.exclusive = exclusive;
+    }
+
+    /**
+     * Get the Transaction ID.
+     *
+     * @return the transaction ID
+     */
+    public long getTransactionID() {
+      return transactionID;
+    }
+
+    /**
+     * Get the Column Family ID.
+     *
+     * @return The column family ID
+     */
+    public long getColumnFamilyId() {
+      return columnFamilyId;
+    }
+
+    /**
+     * Get the key that we are waiting on.
+     *
+     * @return the key that we are waiting on
+     */
+    public String getWaitingKey() {
+      return waitingKey;
+    }
+
+    /**
+     * Get the Lock status.
+     *
+     * @return true if the lock is exclusive, false if the lock is shared.
+     */
+    public boolean isExclusive() {
+      return exclusive;
+    }
+  }
+
+  public static class DeadlockPath {
+    final DeadlockInfo[] path;
+    final boolean limitExceeded;
+
+    public DeadlockPath(final DeadlockInfo[] path, final boolean limitExceeded) {
+      this.path = path;
+      this.limitExceeded = limitExceeded;
+    }
+
+    public boolean isEmpty() {
+      return path.length == 0 && !limitExceeded;
+    }
+  }
+
+  public DeadlockPath[] getDeadlockInfoBuffer() {
+    return getDeadlockInfoBuffer(nativeHandle_);
+  }
+
+  public void setDeadlockInfoBufferSize(final int targetSize) {
+    setDeadlockInfoBufferSize(nativeHandle_, targetSize);
+  }
+
+  private void storeTransactionDbOptions(
+      final TransactionDBOptions transactionDbOptions) {
+    this.transactionDbOptions_ = transactionDbOptions;
+  }
+
+  private static native long open(final long optionsHandle,
+      final long transactionDbOptionsHandle, final String path)
+      throws RocksDBException;
+  private static native long[] open(final long dbOptionsHandle,
+      final long transactionDbOptionsHandle, final String path,
+      final byte[][] columnFamilyNames, final long[] columnFamilyOptions);
+  private native long beginTransaction(final long handle,
+      final long writeOptionsHandle);
+  private native long beginTransaction(final long handle,
+      final long writeOptionsHandle, final long transactionOptionsHandle);
+  private native long beginTransaction_withOld(final long handle,
+      final long writeOptionsHandle, final long oldTransactionHandle);
+  private native long beginTransaction_withOld(final long handle,
+      final long writeOptionsHandle, final long transactionOptionsHandle,
+      final long oldTransactionHandle);
+  private native long getTransactionByName(final long handle,
+      final String name);
+  private native long[] getAllPreparedTransactions(final long handle);
+  private native Map<Long, KeyLockInfo> getLockStatusData(
+      final long handle);
+  private native DeadlockPath[] getDeadlockInfoBuffer(final long handle);
+  private native void setDeadlockInfoBufferSize(final long handle,
+      final
int targetSize); + @Override protected final native void disposeInternal(final long handle); +} diff --git a/java/src/main/java/org/rocksdb/TransactionDBOptions.java b/java/src/main/java/org/rocksdb/TransactionDBOptions.java new file mode 100644 index 00000000000..76f545cde6a --- /dev/null +++ b/java/src/main/java/org/rocksdb/TransactionDBOptions.java @@ -0,0 +1,217 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +package org.rocksdb; + +public class TransactionDBOptions extends RocksObject { + + public TransactionDBOptions() { + super(newTransactionDBOptions()); + } + + /** + * Specifies the maximum number of keys that can be locked at the same time + * per column family. + * + * If the number of locked keys is greater than {@link #getMaxNumLocks()}, + * transaction writes (or GetForUpdate) will return an error. + * + * @return The maximum number of keys that can be locked + */ + public long getMaxNumLocks() { + assert(isOwningHandle()); + return getMaxNumLocks(nativeHandle_); + } + + /** + * Specifies the maximum number of keys that can be locked at the same time + * per column family. + * + * If the number of locked keys is greater than {@link #getMaxNumLocks()}, + * transaction writes (or GetForUpdate) will return an error. + * + * @param maxNumLocks The maximum number of keys that can be locked; + * If this value is not positive, no limit will be enforced. + * + * @return this TransactionDBOptions instance + */ + public TransactionDBOptions setMaxNumLocks(final long maxNumLocks) { + assert(isOwningHandle()); + setMaxNumLocks(nativeHandle_, maxNumLocks); + return this; + } + + /** + * The number of sub-tables per lock table (per column family) + * + * @return The number of sub-tables + */ + public long getNumStripes() { + assert(isOwningHandle()); + return getNumStripes(nativeHandle_); + } + + /** + * Increasing this value will increase the concurrency by dividing the lock + * table (per column family) into more sub-tables, each with their own + * separate mutex. + * + * Default: 16 + * + * @param numStripes The number of sub-tables + * + * @return this TransactionDBOptions instance + */ + public TransactionDBOptions setNumStripes(final long numStripes) { + assert(isOwningHandle()); + setNumStripes(nativeHandle_, numStripes); + return this; + } + + /** + * The default wait timeout in milliseconds when + * a transaction attempts to lock a key if not specified by + * {@link TransactionOptions#setLockTimeout(long)} + * + * If 0, no waiting is done if a lock cannot instantly be acquired. + * If negative, there is no timeout. + * + * @return the default wait timeout in milliseconds + */ + public long getTransactionLockTimeout() { + assert(isOwningHandle()); + return getTransactionLockTimeout(nativeHandle_); + } + + /** + * If positive, specifies the default wait timeout in milliseconds when + * a transaction attempts to lock a key if not specified by + * {@link TransactionOptions#setLockTimeout(long)} + * + * If 0, no waiting is done if a lock cannot instantly be acquired. + * If negative, there is no timeout. Not using a timeout is not recommended + * as it can lead to deadlocks. Currently, there is no deadlock-detection to + * recover from a deadlock. 
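+   * <p>As a non-normative sketch, a DB-wide one second lock wait would be
+   * configured like so (the options object is assumed to be passed to
+   * {@code TransactionDB.open(...)} afterwards):</p>
+   * <pre>{@code
+   * TransactionDBOptions txnDbOptions = new TransactionDBOptions()
+   *     .setTransactionLockTimeout(1000); // milliseconds
+   * }</pre>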
+ * + * Default: 1000 + * + * @param transactionLockTimeout the default wait timeout in milliseconds + * + * @return this TransactionDBOptions instance + */ + public TransactionDBOptions setTransactionLockTimeout( + final long transactionLockTimeout) { + assert(isOwningHandle()); + setTransactionLockTimeout(nativeHandle_, transactionLockTimeout); + return this; + } + + /** + * The wait timeout in milliseconds when writing a key + * OUTSIDE of a transaction (ie by calling {@link RocksDB#put}, + * {@link RocksDB#merge}, {@link RocksDB#remove} or {@link RocksDB#write} + * directly). + * + * If 0, no waiting is done if a lock cannot instantly be acquired. + * If negative, there is no timeout and will block indefinitely when acquiring + * a lock. + * + * @return the timeout in milliseconds when writing a key OUTSIDE of a + * transaction + */ + public long getDefaultLockTimeout() { + assert(isOwningHandle()); + return getDefaultLockTimeout(nativeHandle_); + } + + /** + * If positive, specifies the wait timeout in milliseconds when writing a key + * OUTSIDE of a transaction (ie by calling {@link RocksDB#put}, + * {@link RocksDB#merge}, {@link RocksDB#remove} or {@link RocksDB#write} + * directly). + * + * If 0, no waiting is done if a lock cannot instantly be acquired. + * If negative, there is no timeout and will block indefinitely when acquiring + * a lock. + * + * Not using a timeout can lead to deadlocks. Currently, there + * is no deadlock-detection to recover from a deadlock. While DB writes + * cannot deadlock with other DB writes, they can deadlock with a transaction. + * A negative timeout should only be used if all transactions have a small + * expiration set. + * + * Default: 1000 + * + * @param defaultLockTimeout the timeout in milliseconds when writing a key + * OUTSIDE of a transaction + * @return this TransactionDBOptions instance + */ + public TransactionDBOptions setDefaultLockTimeout( + final long defaultLockTimeout) { + assert(isOwningHandle()); + setDefaultLockTimeout(nativeHandle_, defaultLockTimeout); + return this; + } + +// /** +// * If set, the {@link TransactionDB} will use this implementation of a mutex +// * and condition variable for all transaction locking instead of the default +// * mutex/condvar implementation. +// * +// * @param transactionDbMutexFactory the mutex factory for the transactions +// * +// * @return this TransactionDBOptions instance +// */ +// public TransactionDBOptions setCustomMutexFactory( +// final TransactionDBMutexFactory transactionDbMutexFactory) { +// +// } + + /** + * The policy for when to write the data into the DB. The default policy is to + * write only the committed data {@link TxnDBWritePolicy#WRITE_COMMITTED}. + * The data could be written before the commit phase. The DB then needs to + * provide the mechanisms to tell apart committed from uncommitted data. + * + * @return The write policy. + */ + public TxnDBWritePolicy getWritePolicy() { + assert(isOwningHandle()); + return TxnDBWritePolicy.getTxnDBWritePolicy(getWritePolicy(nativeHandle_)); + } + + /** + * The policy for when to write the data into the DB. The default policy is to + * write only the committed data {@link TxnDBWritePolicy#WRITE_COMMITTED}. + * The data could be written before the commit phase. The DB then needs to + * provide the mechanisms to tell apart committed from uncommitted data. + * + * @param writePolicy The write policy. 
+ * + * @return this TransactionDBOptions instance + */ + public TransactionDBOptions setWritePolicy( + final TxnDBWritePolicy writePolicy) { + assert(isOwningHandle()); + setWritePolicy(nativeHandle_, writePolicy.getValue()); + return this; + } + + private native static long newTransactionDBOptions(); + private native long getMaxNumLocks(final long handle); + private native void setMaxNumLocks(final long handle, + final long maxNumLocks); + private native long getNumStripes(final long handle); + private native void setNumStripes(final long handle, final long numStripes); + private native long getTransactionLockTimeout(final long handle); + private native void setTransactionLockTimeout(final long handle, + final long transactionLockTimeout); + private native long getDefaultLockTimeout(final long handle); + private native void setDefaultLockTimeout(final long handle, + final long transactionLockTimeout); + private native byte getWritePolicy(final long handle); + private native void setWritePolicy(final long handle, final byte writePolicy); + @Override protected final native void disposeInternal(final long handle); +} diff --git a/java/src/main/java/org/rocksdb/TransactionOptions.java b/java/src/main/java/org/rocksdb/TransactionOptions.java new file mode 100644 index 00000000000..1cd936ae649 --- /dev/null +++ b/java/src/main/java/org/rocksdb/TransactionOptions.java @@ -0,0 +1,189 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +package org.rocksdb; + +public class TransactionOptions extends RocksObject + implements TransactionalOptions { + + public TransactionOptions() { + super(newTransactionOptions()); + } + + @Override + public boolean isSetSnapshot() { + assert(isOwningHandle()); + return isSetSnapshot(nativeHandle_); + } + + @Override + public TransactionOptions setSetSnapshot(final boolean setSnapshot) { + assert(isOwningHandle()); + setSetSnapshot(nativeHandle_, setSnapshot); + return this; + } + + /** + * True means that before acquiring locks, this transaction will + * check if doing so will cause a deadlock. If so, it will return with + * {@link Status.Code#Busy}. The user should retry their transaction. + * + * @return true if a deadlock is detected. + */ + public boolean isDeadlockDetect() { + assert(isOwningHandle()); + return isDeadlockDetect(nativeHandle_); + } + + /** + * Setting to true means that before acquiring locks, this transaction will + * check if doing so will cause a deadlock. If so, it will return with + * {@link Status.Code#Busy}. The user should retry their transaction. + * + * @param deadlockDetect true if we should detect deadlocks. + * + * @return this TransactionOptions instance + */ + public TransactionOptions setDeadlockDetect(final boolean deadlockDetect) { + assert(isOwningHandle()); + setDeadlockDetect(nativeHandle_, deadlockDetect); + return this; + } + + /** + * The wait timeout in milliseconds when a transaction attempts to lock a key. + * + * If 0, no waiting is done if a lock cannot instantly be acquired. 
+ * If negative, {@link TransactionDBOptions#getTransactionLockTimeout(long)} + * will be used + * + * @return the lock timeout in milliseconds + */ + public long getLockTimeout() { + assert(isOwningHandle()); + return getLockTimeout(nativeHandle_); + } + + /** + * If positive, specifies the wait timeout in milliseconds when + * a transaction attempts to lock a key. + * + * If 0, no waiting is done if a lock cannot instantly be acquired. + * If negative, {@link TransactionDBOptions#getTransactionLockTimeout(long)} + * will be used + * + * Default: -1 + * + * @param lockTimeout the lock timeout in milliseconds + * + * @return this TransactionOptions instance + */ + public TransactionOptions setLockTimeout(final long lockTimeout) { + assert(isOwningHandle()); + setLockTimeout(nativeHandle_, lockTimeout); + return this; + } + + /** + * Expiration duration in milliseconds. + * + * If non-negative, transactions that last longer than this many milliseconds + * will fail to commit. If not set, a forgotten transaction that is never + * committed, rolled back, or deleted will never relinquish any locks it + * holds. This could prevent keys from being written by other writers. + * + * @return expiration the expiration duration in milliseconds + */ + public long getExpiration() { + assert(isOwningHandle()); + return getExpiration(nativeHandle_); + } + + /** + * Expiration duration in milliseconds. + * + * If non-negative, transactions that last longer than this many milliseconds + * will fail to commit. If not set, a forgotten transaction that is never + * committed, rolled back, or deleted will never relinquish any locks it + * holds. This could prevent keys from being written by other writers. + * + * Default: -1 + * + * @param expiration the expiration duration in milliseconds + * + * @return this TransactionOptions instance + */ + public TransactionOptions setExpiration(final long expiration) { + assert(isOwningHandle()); + setExpiration(nativeHandle_, expiration); + return this; + } + + /** + * Gets the number of traversals to make during deadlock detection. + * + * @return the number of traversals to make during + * deadlock detection + */ + public long getDeadlockDetectDepth() { + return getDeadlockDetectDepth(nativeHandle_); + } + + /** + * Sets the number of traversals to make during deadlock detection. + * + * Default: 50 + * + * @param deadlockDetectDepth the number of traversals to make during + * deadlock detection + * + * @return this TransactionOptions instance + */ + public TransactionOptions setDeadlockDetectDepth( + final long deadlockDetectDepth) { + setDeadlockDetectDepth(nativeHandle_, deadlockDetectDepth); + return this; + } + + /** + * Get the maximum number of bytes that may be used for the write batch. + * + * @return the maximum number of bytes, 0 means no limit. + */ + public long getMaxWriteBatchSize() { + return getMaxWriteBatchSize(nativeHandle_); + } + + /** + * Set the maximum number of bytes that may be used for the write batch. + * + * @param maxWriteBatchSize the maximum number of bytes, 0 means no limit. 
+   *
+   * @return this TransactionOptions instance
+   */
+  public TransactionOptions setMaxWriteBatchSize(final long maxWriteBatchSize) {
+    setMaxWriteBatchSize(nativeHandle_, maxWriteBatchSize);
+    return this;
+  }
+
+  private native static long newTransactionOptions();
+  private native boolean isSetSnapshot(final long handle);
+  private native void setSetSnapshot(final long handle,
+      final boolean setSnapshot);
+  private native boolean isDeadlockDetect(final long handle);
+  private native void setDeadlockDetect(final long handle,
+      final boolean deadlockDetect);
+  private native long getLockTimeout(final long handle);
+  private native void setLockTimeout(final long handle, final long lockTimeout);
+  private native long getExpiration(final long handle);
+  private native void setExpiration(final long handle, final long expiration);
+  private native long getDeadlockDetectDepth(final long handle);
+  private native void setDeadlockDetectDepth(final long handle,
+      final long deadlockDetectDepth);
+  private native long getMaxWriteBatchSize(final long handle);
+  private native void setMaxWriteBatchSize(final long handle,
+      final long maxWriteBatchSize);
+  @Override protected final native void disposeInternal(final long handle);
+}
diff --git a/java/src/main/java/org/rocksdb/TransactionalDB.java b/java/src/main/java/org/rocksdb/TransactionalDB.java
new file mode 100644
index 00000000000..3f0eceda855
--- /dev/null
+++ b/java/src/main/java/org/rocksdb/TransactionalDB.java
@@ -0,0 +1,68 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+
+interface TransactionalDB<T extends TransactionalOptions>
+    extends AutoCloseable {
+
+  /**
+   * Starts a new Transaction.
+   *
+   * Caller is responsible for calling {@link #close()} on the returned
+   * transaction when it is no longer needed.
+   *
+   * @param writeOptions Any write options for the transaction
+   * @return a new transaction
+   */
+  Transaction beginTransaction(final WriteOptions writeOptions);
+
+  /**
+   * Starts a new Transaction.
+   *
+   * Caller is responsible for calling {@link #close()} on the returned
+   * transaction when it is no longer needed.
+   *
+   * @param writeOptions Any write options for the transaction
+   * @param transactionOptions Any options for the transaction
+   * @return a new transaction
+   */
+  Transaction beginTransaction(final WriteOptions writeOptions,
+      final T transactionOptions);
+
+  /**
+   * Starts a new Transaction.
+   *
+   * Caller is responsible for calling {@link #close()} on the returned
+   * transaction when it is no longer needed.
+   *
+   * @param writeOptions Any write options for the transaction
+   * @param oldTransaction this Transaction will be reused instead of allocating
+   *     a new one. This is an optimization to avoid extra allocations
+   *     when repeatedly creating transactions.
+   * @return The oldTransaction which has been reinitialized as a new
+   *     transaction
+   */
+  Transaction beginTransaction(final WriteOptions writeOptions,
+      final Transaction oldTransaction);
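A minimal sketch of the calling contract these declarations imply, assuming db is any TransactionalDB implementation such as TransactionDB:

    try (final WriteOptions writeOptions = new WriteOptions();
         final Transaction txn = db.beginTransaction(writeOptions)) {
      txn.put("k".getBytes(), "v".getBytes());
      txn.commit(); // staged writes are only applied if commit() succeeds
    } // close() releases the native handle even if commit() threw
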
+
+  /**
+   * Starts a new Transaction.
+   *
+   * Caller is responsible for calling {@link #close()} on the returned
+   * transaction when it is no longer needed.
+   *
+   * @param writeOptions Any write options for the transaction
+   * @param transactionOptions Any options for the transaction
+   * @param oldTransaction this Transaction will be reused instead of allocating
+   *     a new one. This is an optimization to avoid extra allocations
+   *     when repeatedly creating transactions.
+   * @return The oldTransaction which has been reinitialized as a new
+   *     transaction
+   */
+  Transaction beginTransaction(final WriteOptions writeOptions,
+      final T transactionOptions, final Transaction oldTransaction);
+}
diff --git a/java/src/main/java/org/rocksdb/TransactionalOptions.java b/java/src/main/java/org/rocksdb/TransactionalOptions.java
new file mode 100644
index 00000000000..87aaa7986fd
--- /dev/null
+++ b/java/src/main/java/org/rocksdb/TransactionalOptions.java
@@ -0,0 +1,31 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+
+interface TransactionalOptions extends AutoCloseable {
+
+  /**
+   * True indicates snapshots will be set, just like if
+   * {@link Transaction#setSnapshot()} had been called.
+   *
+   * @return whether a snapshot will be set
+   */
+  boolean isSetSnapshot();
+
+  /**
+   * Setting the setSnapshot to true is the same as calling
+   * {@link Transaction#setSnapshot()}.
+   *
+   * Default: false
+   *
+   * @param <T> The type of transactional options.
+   * @param setSnapshot Whether to set a snapshot
+   *
+   * @return this TransactionalOptions instance
+   */
+  <T extends TransactionalOptions> T setSetSnapshot(final boolean setSnapshot);
+}
diff --git a/java/src/main/java/org/rocksdb/TxnDBWritePolicy.java b/java/src/main/java/org/rocksdb/TxnDBWritePolicy.java
new file mode 100644
index 00000000000..837ce6157f7
--- /dev/null
+++ b/java/src/main/java/org/rocksdb/TxnDBWritePolicy.java
@@ -0,0 +1,62 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+package org.rocksdb;
+
+/**
+ * The transaction db write policy.
+ */
+public enum TxnDBWritePolicy {
+  /**
+   * Write only the committed data.
+   */
+  WRITE_COMMITTED((byte)0x00),
+
+  /**
+   * Write data after the prepare phase of 2pc.
+   */
+  WRITE_PREPARED((byte)0x1),
+
+  /**
+   * Write data before the prepare phase of 2pc.
+   */
+  WRITE_UNPREPARED((byte)0x2);
+
+  private byte value;
+
+  TxnDBWritePolicy(final byte value) {
+    this.value = value;
+  }
+
+  /**
+   *

<p>Returns the byte value of the enumeration value.</p>

+ * + * @return byte representation + */ + public byte getValue() { + return value; + } + + /** + *

<p>Get the TxnDBWritePolicy enumeration value by + * passing the byte identifier to this method.</p>
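+   * <p>For example, round-tripping the {@code WRITE_PREPARED} identifier
+   * (a non-normative sketch):</p>
+   * <pre>{@code
+   * byte b = TxnDBWritePolicy.WRITE_PREPARED.getValue(); // 0x1
+   * TxnDBWritePolicy p = TxnDBWritePolicy.getTxnDBWritePolicy(b);
+   * assert p == TxnDBWritePolicy.WRITE_PREPARED;
+   * }</pre>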

+ * + * @param byteIdentifier of TxnDBWritePolicy. + * + * @return TxnDBWritePolicy instance. + * + * @throws IllegalArgumentException If TxnDBWritePolicy cannot be found for + * the provided byteIdentifier + */ + public static TxnDBWritePolicy getTxnDBWritePolicy(final byte byteIdentifier) { + for (final TxnDBWritePolicy txnDBWritePolicy : TxnDBWritePolicy.values()) { + if (txnDBWritePolicy.getValue() == byteIdentifier) { + return txnDBWritePolicy; + } + } + + throw new IllegalArgumentException( + "Illegal value provided for TxnDBWritePolicy."); + } +} diff --git a/java/src/main/java/org/rocksdb/WALRecoveryMode.java b/java/src/main/java/org/rocksdb/WALRecoveryMode.java index d3fc47b631f..d8b9eeceda0 100644 --- a/java/src/main/java/org/rocksdb/WALRecoveryMode.java +++ b/java/src/main/java/org/rocksdb/WALRecoveryMode.java @@ -65,7 +65,7 @@ public byte getValue() { * * @param byteIdentifier of WALRecoveryMode. * - * @return CompressionType instance. + * @return WALRecoveryMode instance. * * @throws IllegalArgumentException If WALRecoveryMode cannot be found for the * provided byteIdentifier diff --git a/java/src/main/java/org/rocksdb/WBWIRocksIterator.java b/java/src/main/java/org/rocksdb/WBWIRocksIterator.java index 17e78f62d8b..482351e996d 100644 --- a/java/src/main/java/org/rocksdb/WBWIRocksIterator.java +++ b/java/src/main/java/org/rocksdb/WBWIRocksIterator.java @@ -55,10 +55,13 @@ public WriteEntry entry() { * that created the record in the Write Batch */ public enum WriteType { - PUT((byte)0x1), - MERGE((byte)0x2), - DELETE((byte)0x4), - LOG((byte)0x8); + PUT((byte)0x0), + MERGE((byte)0x1), + DELETE((byte)0x2), + SINGLE_DELETE((byte)0x3), + DELETE_RANGE((byte)0x4), + LOG((byte)0x5), + XID((byte)0x6); final byte id; WriteType(final byte id) { diff --git a/java/src/main/java/org/rocksdb/WriteBatch.java b/java/src/main/java/org/rocksdb/WriteBatch.java index 2f7d0f12a3d..5e20daf14cb 100644 --- a/java/src/main/java/org/rocksdb/WriteBatch.java +++ b/java/src/main/java/org/rocksdb/WriteBatch.java @@ -39,6 +39,14 @@ public WriteBatch(final int reserved_bytes) { super(newWriteBatch(reserved_bytes)); } + /** + * Constructs a WriteBatch instance from a serialized representation + * as returned by {@link #data()}. + */ + public WriteBatch(final byte[] serialized) { + super(newWriteBatch(serialized, serialized.length)); + } + /** * Support for iterating over the contents of a batch. * @@ -51,6 +59,134 @@ public void iterate(final Handler handler) throws RocksDBException { iterate(nativeHandle_, handler.nativeHandle_); } + /** + * Retrieve the serialized version of this batch. + * + * @return the serialized representation of this write batch. + */ + public byte[] data() { + return data(nativeHandle_); + } + + /** + * Retrieve data size of the batch. + * + * @return the serialized data size of the batch. + */ + public long getDataSize() { + return getDataSize(nativeHandle_); + } + + /** + * Returns true if PutCF will be called during Iterate. + * + * Return true if PutCF will be called during Iterate. + */ + public boolean hasPut() { + return hasPut(nativeHandle_); + } + + /** + * Returns true if DeleteCF will be called during Iterate. + * + * Return true if DeleteCF will be called during Iterate. + */ + public boolean hasDelete() { + return hasDelete(nativeHandle_); + } + + /** + * Returns true if SingleDeleteCF will be called during Iterate. + * + * Return true if SingleDeleteCF will be called during Iterate. 
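+   * <p>A sketch of how the has* probes combine with iteration (the
+   * {@code batch} and {@code handler} instances are assumed to exist
+   * elsewhere):</p>
+   * <pre>{@code
+   * if (batch.hasSingleDelete() || batch.hasDelete()) {
+   *   batch.iterate(handler); // the handler's delete callbacks will fire
+   * }
+   * }</pre>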
+ */ + public boolean hasSingleDelete() { + return hasSingleDelete(nativeHandle_); + } + + /** + * Returns true if DeleteRangeCF will be called during Iterate. + * + * Return true if DeleteRangeCF will be called during Iterate. + */ + public boolean hasDeleteRange() { + return hasDeleteRange(nativeHandle_); + } + + /** + * Returns true if MergeCF will be called during Iterate. + * + * Return true if MergeCF will be called during Iterate. + */ + public boolean hasMerge() { + return hasMerge(nativeHandle_); + } + + /** + * Returns true if MarkBeginPrepare will be called during Iterate. + * + * Return true if MarkBeginPrepare will be called during Iterate. + */ + public boolean hasBeginPrepare() { + return hasBeginPrepare(nativeHandle_); + } + + /** + * Returns true if MarkEndPrepare will be called during Iterate. + * + * Return true if MarkEndPrepare will be called during Iterate. + */ + public boolean hasEndPrepare() { + return hasEndPrepare(nativeHandle_); + } + + /** + * Returns true if MarkCommit will be called during Iterate. + * + * Return true if MarkCommit will be called during Iterate. + */ + public boolean hasCommit() { + return hasCommit(nativeHandle_); + } + + /** + * Returns true if MarkRollback will be called during Iterate. + * + * Return true if MarkRollback will be called during Iterate. + */ + public boolean hasRollback() { + return hasRollback(nativeHandle_); + } + + @Override + public WriteBatch getWriteBatch() { + return this; + } + + /** + * Marks this point in the WriteBatch as the last record to + * be inserted into the WAL, provided the WAL is enabled. + */ + public void markWalTerminationPoint() { + markWalTerminationPoint(nativeHandle_); + } + + /** + * Gets the WAL termination point. + * + * See {@link #markWalTerminationPoint()} + * + * @return the WAL termination point + */ + public SavePoint getWalTerminationPoint() { + return getWalTerminationPoint(nativeHandle_); + } + + @Override + WriteBatch getWriteBatch(final long handle) { + return this; + } + /** *
<p>
Private WriteBatch constructor which is used to construct * WriteBatch instances from C++ side. As the reference to this @@ -87,10 +223,14 @@ public void iterate(final Handler handler) throws RocksDBException { @Override final native void merge(final long handle, final byte[] key, final int keyLen, final byte[] value, final int valueLen, final long cfHandle); - @Override final native void remove(final long handle, final byte[] key, - final int keyLen); - @Override final native void remove(final long handle, final byte[] key, - final int keyLen, final long cfHandle); + @Override final native void delete(final long handle, final byte[] key, + final int keyLen) throws RocksDBException; + @Override final native void delete(final long handle, final byte[] key, + final int keyLen, final long cfHandle) throws RocksDBException; + @Override final native void singleDelete(final long handle, final byte[] key, + final int keyLen) throws RocksDBException; + @Override final native void singleDelete(final long handle, final byte[] key, + final int keyLen, final long cfHandle) throws RocksDBException; @Override final native void deleteRange(final long handle, final byte[] beginKey, final int beginKeyLen, final byte[] endKey, final int endKeyLen); @@ -98,15 +238,32 @@ final native void deleteRange(final long handle, final byte[] beginKey, final in final native void deleteRange(final long handle, final byte[] beginKey, final int beginKeyLen, final byte[] endKey, final int endKeyLen, final long cfHandle); @Override final native void putLogData(final long handle, - final byte[] blob, final int blobLen); + final byte[] blob, final int blobLen) throws RocksDBException; @Override final native void clear0(final long handle); @Override final native void setSavePoint0(final long handle); @Override final native void rollbackToSavePoint0(final long handle); + @Override final native void popSavePoint(final long handle) throws RocksDBException; + @Override final native void setMaxBytes(final long nativeHandle, + final long maxBytes); private native static long newWriteBatch(final int reserved_bytes); + private native static long newWriteBatch(final byte[] serialized, + final int serializedLength); private native void iterate(final long handle, final long handlerHandle) throws RocksDBException; - + private native byte[] data(final long nativeHandle); + private native long getDataSize(final long nativeHandle); + private native boolean hasPut(final long nativeHandle); + private native boolean hasDelete(final long nativeHandle); + private native boolean hasSingleDelete(final long nativeHandle); + private native boolean hasDeleteRange(final long nativeHandle); + private native boolean hasMerge(final long nativeHandle); + private native boolean hasBeginPrepare(final long nativeHandle); + private native boolean hasEndPrepare(final long nativeHandle); + private native boolean hasCommit(final long nativeHandle); + private native boolean hasRollback(final long nativeHandle); + private native void markWalTerminationPoint(final long nativeHandle); + private native SavePoint getWalTerminationPoint(final long nativeHandle); /** * Handler callback for iterating over the contents of a batch. @@ -122,15 +279,38 @@ protected long initializeNative(final long... 
nativeParameterHandles) { return createNewHandler0(); } - public abstract void put(byte[] key, byte[] value); - public abstract void merge(byte[] key, byte[] value); - public abstract void delete(byte[] key); - public abstract void deleteRange(byte[] beginKey, byte[] endKey); - public abstract void logData(byte[] blob); + public abstract void put(final int columnFamilyId, final byte[] key, + final byte[] value) throws RocksDBException; + public abstract void put(final byte[] key, final byte[] value); + public abstract void merge(final int columnFamilyId, final byte[] key, + final byte[] value) throws RocksDBException; + public abstract void merge(final byte[] key, final byte[] value); + public abstract void delete(final int columnFamilyId, final byte[] key) + throws RocksDBException; + public abstract void delete(final byte[] key); + public abstract void singleDelete(final int columnFamilyId, + final byte[] key) throws RocksDBException; + public abstract void singleDelete(final byte[] key); + public abstract void deleteRange(final int columnFamilyId, + final byte[] beginKey, final byte[] endKey) throws RocksDBException; + public abstract void deleteRange(final byte[] beginKey, + final byte[] endKey); + public abstract void logData(final byte[] blob); + public abstract void putBlobIndex(final int columnFamilyId, + final byte[] key, final byte[] value) throws RocksDBException; + public abstract void markBeginPrepare() throws RocksDBException; + public abstract void markEndPrepare(final byte[] xid) + throws RocksDBException; + public abstract void markNoop(final boolean emptyBatch) + throws RocksDBException; + public abstract void markRollback(final byte[] xid) + throws RocksDBException; + public abstract void markCommit(final byte[] xid) + throws RocksDBException; /** * shouldContinue is called by the underlying iterator - * WriteBatch::Iterate. If it returns false, + * {@link WriteBatch#iterate(Handler)}. If it returns false, * iteration is halted. Otherwise, it continues * iterating. The default implementation always * returns true. @@ -144,4 +324,57 @@ public boolean shouldContinue() { private native long createNewHandler0(); } + + /** + * A structure for describing the save point in the Write Batch. + */ + public static class SavePoint { + private long size; + private long count; + private long contentFlags; + + public SavePoint(final long size, final long count, + final long contentFlags) { + this.size = size; + this.count = count; + this.contentFlags = contentFlags; + } + + public void clear() { + this.size = 0; + this.count = 0; + this.contentFlags = 0; + } + + /** + * Get the size of the serialized representation. + * + * @return the size of the serialized representation. + */ + public long getSize() { + return size; + } + + /** + * Get the number of elements. + * + * @return the number of elements. + */ + public long getCount() { + return count; + } + + /** + * Get the content flags. + * + * @return the content flags. + */ + public long getContentFlags() { + return contentFlags; + } + + public boolean isCleared() { + return (size | count | contentFlags) == 0; + } + } } diff --git a/java/src/main/java/org/rocksdb/WriteBatchInterface.java b/java/src/main/java/org/rocksdb/WriteBatchInterface.java index cd024ad58d4..21c8b6fae06 100644 --- a/java/src/main/java/org/rocksdb/WriteBatchInterface.java +++ b/java/src/main/java/org/rocksdb/WriteBatchInterface.java @@ -24,7 +24,7 @@ public interface WriteBatchInterface { * @param key the specified key to be inserted. 
* @param value the value associated with the specified key. */ - void put(byte[] key, byte[] value); + void put(byte[] key, byte[] value) throws RocksDBException; /** *
<p>
Store the mapping "key->value" within given column @@ -36,7 +36,7 @@ public interface WriteBatchInterface { * @param value the value associated with the specified key. */ void put(ColumnFamilyHandle columnFamilyHandle, - byte[] key, byte[] value); + byte[] key, byte[] value) throws RocksDBException; /** *
<p>
Merge "value" with the existing value of "key" in the database. @@ -46,7 +46,7 @@ void put(ColumnFamilyHandle columnFamilyHandle, * @param value the value to be merged with the current value for * the specified key. */ - void merge(byte[] key, byte[] value); + void merge(byte[] key, byte[] value) throws RocksDBException; /** *
<p>
Merge "value" with the existing value of "key" in given column family. @@ -58,14 +58,36 @@ void put(ColumnFamilyHandle columnFamilyHandle, * the specified key. */ void merge(ColumnFamilyHandle columnFamilyHandle, - byte[] key, byte[] value); + byte[] key, byte[] value) throws RocksDBException; + + /** + *

<p>If the database contains a mapping for "key", erase it. Else do nothing.</p>
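+   * <p>Migrating off this deprecated method is a one-line change
+   * (a sketch):</p>
+   * <pre>{@code
+   * batch.remove(key);  // deprecated
+   * batch.delete(key);  // equivalent replacement
+   * }</pre>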

+ * + * @param key Key to delete within database + * + * @deprecated Use {@link #delete(byte[])} + */ + @Deprecated + void remove(byte[] key) throws RocksDBException; + + /** + *

<p>If column family contains a mapping for "key", erase it. Else do nothing.</p>

+ * + * @param columnFamilyHandle {@link ColumnFamilyHandle} instance + * @param key Key to delete within database + * + * @deprecated Use {@link #delete(ColumnFamilyHandle, byte[])} + */ + @Deprecated + void remove(ColumnFamilyHandle columnFamilyHandle, byte[] key) + throws RocksDBException; /** *

<p>If the database contains a mapping for "key", erase it. Else do nothing.</p>

* * @param key Key to delete within database */ - void remove(byte[] key); + void delete(byte[] key) throws RocksDBException; /** *

<p>If column family contains a mapping for "key", erase it. Else do nothing.</p>

@@ -73,7 +95,58 @@ void merge(ColumnFamilyHandle columnFamilyHandle, * @param columnFamilyHandle {@link ColumnFamilyHandle} instance * @param key Key to delete within database */ - void remove(ColumnFamilyHandle columnFamilyHandle, byte[] key); + void delete(ColumnFamilyHandle columnFamilyHandle, byte[] key) + throws RocksDBException; + + /** + * Remove the database entry for {@code key}. Requires that the key exists + * and was not overwritten. It is not an error if the key did not exist + * in the database. + * + * If a key is overwritten (by calling {@link #put(byte[], byte[])} multiple + * times), then the result of calling SingleDelete() on this key is undefined. + * SingleDelete() only behaves correctly if there has been only one Put() + * for this key since the previous call to SingleDelete() for this key. + * + * This feature is currently an experimental performance optimization + * for a very specific workload. It is up to the caller to ensure that + * SingleDelete is only used for a key that is not deleted using Delete() or + * written using Merge(). Mixing SingleDelete operations with Deletes and + * Merges can result in undefined behavior. + * + * @param key Key to delete within database + * + * @throws RocksDBException thrown if error happens in underlying + * native library. + */ + @Experimental("Performance optimization for a very specific workload") + void singleDelete(final byte[] key) throws RocksDBException; + + /** + * Remove the database entry for {@code key}. Requires that the key exists + * and was not overwritten. It is not an error if the key did not exist + * in the database. + * + * If a key is overwritten (by calling {@link #put(byte[], byte[])} multiple + * times), then the result of calling SingleDelete() on this key is undefined. + * SingleDelete() only behaves correctly if there has been only one Put() + * for this key since the previous call to SingleDelete() for this key. + * + * This feature is currently an experimental performance optimization + * for a very specific workload. It is up to the caller to ensure that + * SingleDelete is only used for a key that is not deleted using Delete() or + * written using Merge(). Mixing SingleDelete operations with Deletes and + * Merges can result in undefined behavior. + * + * @param columnFamilyHandle The column family to delete the key from + * @param key Key to delete within database + * + * @throws RocksDBException thrown if error happens in underlying + * native library. + */ + @Experimental("Performance optimization for a very specific workload") + void singleDelete(final ColumnFamilyHandle columnFamilyHandle, + final byte[] key) throws RocksDBException; /** * Removes the database entries in the range ["beginKey", "endKey"), i.e., @@ -89,7 +162,7 @@ void merge(ColumnFamilyHandle columnFamilyHandle, * @param endKey * Last key to delete within database (excluded) */ - void deleteRange(byte[] beginKey, byte[] endKey); + void deleteRange(byte[] beginKey, byte[] endKey) throws RocksDBException; /** * Removes the database entries in the range ["beginKey", "endKey"), i.e., @@ -106,7 +179,8 @@ void merge(ColumnFamilyHandle columnFamilyHandle, * @param endKey * Last key to delete within database (excluded) */ - void deleteRange(ColumnFamilyHandle columnFamilyHandle, byte[] beginKey, byte[] endKey); + void deleteRange(ColumnFamilyHandle columnFamilyHandle, byte[] beginKey, + byte[] endKey) throws RocksDBException; /** * Append a blob of arbitrary size to the records in this batch. 
The blob will @@ -122,7 +196,7 @@ void merge(ColumnFamilyHandle columnFamilyHandle, * * @param blob binary object to be inserted */ - void putLogData(byte[] blob); + void putLogData(byte[] blob) throws RocksDBException; /** * Clear all updates buffered in this batch @@ -143,4 +217,30 @@ void merge(ColumnFamilyHandle columnFamilyHandle, * @throws RocksDBException if there is no previous call to SetSavePoint() */ void rollbackToSavePoint() throws RocksDBException; + + /** + * Pop the most recent save point. + * + * That is to say that it removes the last save point, + * which was set by {@link #setSavePoint()}. + * + * @throws RocksDBException If there is no previous call to + * {@link #setSavePoint()}, an exception with + * {@link Status.Code#NotFound} will be thrown. + */ + void popSavePoint() throws RocksDBException; + + /** + * Set the maximum size of the write batch. + * + * @param maxBytes the maximum size in bytes. + */ + void setMaxBytes(long maxBytes); + + /** + * Get the underlying Write Batch. + * + * @return the underlying WriteBatch. + */ + WriteBatch getWriteBatch(); } diff --git a/java/src/main/java/org/rocksdb/WriteBatchWithIndex.java b/java/src/main/java/org/rocksdb/WriteBatchWithIndex.java index f3d49c92ede..dc6b0ba60fa 100644 --- a/java/src/main/java/org/rocksdb/WriteBatchWithIndex.java +++ b/java/src/main/java/org/rocksdb/WriteBatchWithIndex.java @@ -61,7 +61,20 @@ public WriteBatchWithIndex( fallbackIndexComparator, final int reservedBytes, final boolean overwriteKey) { super(newWriteBatchWithIndex(fallbackIndexComparator.nativeHandle_, - fallbackIndexComparator instanceof DirectComparator, reservedBytes, overwriteKey)); + fallbackIndexComparator.getComparatorType().getValue(), reservedBytes, + overwriteKey)); + } + + /** + *

<p>Private WriteBatchWithIndex constructor which is used to construct + WriteBatchWithIndex instances from C++ side. As the reference to this + object is also managed from C++ side the handle will be disowned.</p>

+ * + * @param nativeHandle address of native instance. + */ + WriteBatchWithIndex(final long nativeHandle) { + super(nativeHandle); + disOwnNativeHandle(); } /** @@ -244,10 +257,14 @@ public byte[] getFromBatchAndDB(final RocksDB db, final ReadOptions options, @Override final native void merge(final long handle, final byte[] key, final int keyLen, final byte[] value, final int valueLen, final long cfHandle); - @Override final native void remove(final long handle, final byte[] key, - final int keyLen); - @Override final native void remove(final long handle, final byte[] key, - final int keyLen, final long cfHandle); + @Override final native void delete(final long handle, final byte[] key, + final int keyLen) throws RocksDBException; + @Override final native void delete(final long handle, final byte[] key, + final int keyLen, final long cfHandle) throws RocksDBException; + @Override final native void singleDelete(final long handle, final byte[] key, + final int keyLen) throws RocksDBException; + @Override final native void singleDelete(final long handle, final byte[] key, + final int keyLen, final long cfHandle) throws RocksDBException; @Override final native void deleteRange(final long handle, final byte[] beginKey, final int beginKeyLen, final byte[] endKey, final int endKeyLen); @@ -255,15 +272,20 @@ final native void deleteRange(final long handle, final byte[] beginKey, final in final native void deleteRange(final long handle, final byte[] beginKey, final int beginKeyLen, final byte[] endKey, final int endKeyLen, final long cfHandle); @Override final native void putLogData(final long handle, final byte[] blob, - final int blobLen); + final int blobLen) throws RocksDBException; @Override final native void clear0(final long handle); @Override final native void setSavePoint0(final long handle); @Override final native void rollbackToSavePoint0(final long handle); + @Override final native void popSavePoint(final long handle) throws RocksDBException; + @Override final native void setMaxBytes(final long nativeHandle, + final long maxBytes); + @Override final native WriteBatch getWriteBatch(final long handle); private native static long newWriteBatchWithIndex(); private native static long newWriteBatchWithIndex(final boolean overwriteKey); private native static long newWriteBatchWithIndex( - final long fallbackIndexComparatorHandle, final boolean isDirect, final int reservedBytes, + final long fallbackIndexComparatorHandle, + final byte comparatorType, final int reservedBytes, final boolean overwriteKey); private native long iterator0(final long handle); private native long iterator1(final long handle, final long cfHandle); diff --git a/java/src/main/java/org/rocksdb/WriteOptions.java b/java/src/main/java/org/rocksdb/WriteOptions.java index f3c5aa6675c..db662aa5095 100644 --- a/java/src/main/java/org/rocksdb/WriteOptions.java +++ b/java/src/main/java/org/rocksdb/WriteOptions.java @@ -20,6 +20,12 @@ public WriteOptions() { } + // TODO(AR) consider ownership + WriteOptions(final long nativeHandle) { + super(nativeHandle); + disOwnNativeHandle(); + } + /** * Copy constructor for WriteOptions. 
* diff --git a/java/src/main/java/org/rocksdb/util/Environment.java b/java/src/main/java/org/rocksdb/util/Environment.java index 36a4b4a2079..7e1b50cfff1 100644 --- a/java/src/main/java/org/rocksdb/util/Environment.java +++ b/java/src/main/java/org/rocksdb/util/Environment.java @@ -33,6 +33,10 @@ public static boolean isSolaris() { return OS.contains("sunos"); } + public static boolean isOpenBSD() { + return (OS.contains("openbsd")); + } + public static boolean is64Bit() { if (ARCH.indexOf("sparcv9") >= 0) { return true; @@ -67,6 +71,8 @@ public static String getJniLibraryName(final String name) { return String.format("%sjni-solaris%s", name, arch); } else if (isWindows() && is64Bit()) { return String.format("%sjni-win64", name); + } else if (isOpenBSD()) { + return String.format("%sjni-openbsd%s", name, is64Bit() ? "64" : "32"); } throw new UnsupportedOperationException(String.format("Cannot determine JNI library name for ARCH='%s' OS='%s' name='%s'", ARCH, OS, name)); @@ -77,7 +83,7 @@ public static String getJniLibraryFileName(final String name) { } private static String appendLibOsSuffix(final String libraryFileName, final boolean shared) { - if (isUnix() || isAix() || isSolaris() || isFreeBSD()) { + if (isUnix() || isAix() || isSolaris() || isFreeBSD() || isOpenBSD()) { return libraryFileName + ".so"; } else if (isMac()) { return libraryFileName + (shared ? ".dylib" : ".jnilib"); diff --git a/java/src/test/java/org/rocksdb/AbstractTransactionTest.java b/java/src/test/java/org/rocksdb/AbstractTransactionTest.java new file mode 100644 index 00000000000..08f3dbf5857 --- /dev/null +++ b/java/src/test/java/org/rocksdb/AbstractTransactionTest.java @@ -0,0 +1,903 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +package org.rocksdb; + +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.TemporaryFolder; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.Random; + +import static java.nio.charset.StandardCharsets.UTF_8; +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.fail; + +/** + * Base class of {@link TransactionTest} and {@link OptimisticTransactionTest} + */ +public abstract class AbstractTransactionTest { + + protected final static byte[] TXN_TEST_COLUMN_FAMILY = "txn_test_cf" + .getBytes(); + + protected static final Random rand = PlatformRandomHelper. 
+ getPlatformSpecificRandomFactory(); + + @Rule + public TemporaryFolder dbFolder = new TemporaryFolder(); + + public abstract DBContainer startDb() + throws RocksDBException; + + @Test + public void setSnapshot() throws RocksDBException { + try(final DBContainer dbContainer = startDb(); + final Transaction txn = dbContainer.beginTransaction()) { + txn.setSnapshot(); + } + } + + @Test + public void setSnapshotOnNextOperation() throws RocksDBException { + try(final DBContainer dbContainer = startDb(); + final Transaction txn = dbContainer.beginTransaction()) { + txn.setSnapshotOnNextOperation(); + txn.put("key1".getBytes(), "value1".getBytes()); + } + } + + @Test + public void setSnapshotOnNextOperation_transactionNotifier() throws RocksDBException { + try(final DBContainer dbContainer = startDb(); + final Transaction txn = dbContainer.beginTransaction()) { + + try(final TestTransactionNotifier notifier = new TestTransactionNotifier()) { + txn.setSnapshotOnNextOperation(notifier); + txn.put("key1".getBytes(), "value1".getBytes()); + + txn.setSnapshotOnNextOperation(notifier); + txn.put("key2".getBytes(), "value2".getBytes()); + + assertThat(notifier.getCreatedSnapshots().size()).isEqualTo(2); + } + } + } + + @Test + public void getSnapshot() throws RocksDBException { + try(final DBContainer dbContainer = startDb(); + final Transaction txn = dbContainer.beginTransaction()) { + txn.setSnapshot(); + final Snapshot snapshot = txn.getSnapshot(); + assertThat(snapshot.isOwningHandle()).isFalse(); + } + } + + @Test + public void getSnapshot_null() throws RocksDBException { + try(final DBContainer dbContainer = startDb(); + final Transaction txn = dbContainer.beginTransaction()) { + final Snapshot snapshot = txn.getSnapshot(); + assertThat(snapshot).isNull(); + } + } + + @Test + public void clearSnapshot() throws RocksDBException { + try(final DBContainer dbContainer = startDb(); + final Transaction txn = dbContainer.beginTransaction()) { + txn.setSnapshot(); + txn.clearSnapshot(); + } + } + + @Test + public void clearSnapshot_none() throws RocksDBException { + try(final DBContainer dbContainer = startDb(); + final Transaction txn = dbContainer.beginTransaction()) { + txn.clearSnapshot(); + } + } + + @Test + public void commit() throws RocksDBException { + final byte k1[] = "rollback-key1".getBytes(UTF_8); + final byte v1[] = "rollback-value1".getBytes(UTF_8); + try(final DBContainer dbContainer = startDb()) { + try(final Transaction txn = dbContainer.beginTransaction()) { + txn.put(k1, v1); + txn.commit(); + } + + try(final ReadOptions readOptions = new ReadOptions(); + final Transaction txn2 = dbContainer.beginTransaction()) { + assertThat(txn2.get(readOptions, k1)).isEqualTo(v1); + } + } + } + + @Test + public void rollback() throws RocksDBException { + final byte k1[] = "rollback-key1".getBytes(UTF_8); + final byte v1[] = "rollback-value1".getBytes(UTF_8); + try(final DBContainer dbContainer = startDb()) { + try(final Transaction txn = dbContainer.beginTransaction()) { + txn.put(k1, v1); + txn.rollback(); + } + + try(final ReadOptions readOptions = new ReadOptions(); + final Transaction txn2 = dbContainer.beginTransaction()) { + assertThat(txn2.get(readOptions, k1)).isNull(); + } + } + } + + @Test + public void savePoint() throws RocksDBException { + final byte k1[] = "savePoint-key1".getBytes(UTF_8); + final byte v1[] = "savePoint-value1".getBytes(UTF_8); + final byte k2[] = "savePoint-key2".getBytes(UTF_8); + final byte v2[] = "savePoint-value2".getBytes(UTF_8); + + try(final DBContainer 
dbContainer = startDb(); + final ReadOptions readOptions = new ReadOptions()) { + + + try(final Transaction txn = dbContainer.beginTransaction()) { + txn.put(k1, v1); + + assertThat(txn.get(readOptions, k1)).isEqualTo(v1); + + txn.setSavePoint(); + + txn.put(k2, v2); + + assertThat(txn.get(readOptions, k1)).isEqualTo(v1); + assertThat(txn.get(readOptions, k2)).isEqualTo(v2); + + txn.rollbackToSavePoint(); + + assertThat(txn.get(readOptions, k1)).isEqualTo(v1); + assertThat(txn.get(readOptions, k2)).isNull(); + + txn.commit(); + } + + try(final Transaction txn2 = dbContainer.beginTransaction()) { + assertThat(txn2.get(readOptions, k1)).isEqualTo(v1); + assertThat(txn2.get(readOptions, k2)).isNull(); + } + } + } + + @Test + public void getPut_cf() throws RocksDBException { + final byte k1[] = "key1".getBytes(UTF_8); + final byte v1[] = "value1".getBytes(UTF_8); + try(final DBContainer dbContainer = startDb(); + final ReadOptions readOptions = new ReadOptions(); + final Transaction txn = dbContainer.beginTransaction()) { + final ColumnFamilyHandle testCf = dbContainer.getTestColumnFamily(); + assertThat(txn.get(testCf, readOptions, k1)).isNull(); + txn.put(testCf, k1, v1); + assertThat(txn.get(testCf, readOptions, k1)).isEqualTo(v1); + } + } + + @Test + public void getPut() throws RocksDBException { + final byte k1[] = "key1".getBytes(UTF_8); + final byte v1[] = "value1".getBytes(UTF_8); + try(final DBContainer dbContainer = startDb(); + final ReadOptions readOptions = new ReadOptions(); + final Transaction txn = dbContainer.beginTransaction()) { + assertThat(txn.get(readOptions, k1)).isNull(); + txn.put(k1, v1); + assertThat(txn.get(readOptions, k1)).isEqualTo(v1); + } + } + + @Test + public void multiGetPut_cf() throws RocksDBException { + final byte keys[][] = new byte[][] { + "key1".getBytes(UTF_8), + "key2".getBytes(UTF_8)}; + final byte values[][] = new byte[][] { + "value1".getBytes(UTF_8), + "value2".getBytes(UTF_8)}; + + try(final DBContainer dbContainer = startDb(); + final ReadOptions readOptions = new ReadOptions(); + final Transaction txn = dbContainer.beginTransaction()) { + final ColumnFamilyHandle testCf = dbContainer.getTestColumnFamily(); + final List<ColumnFamilyHandle> cfList = Arrays.asList(testCf, testCf); + + assertThat(txn.multiGet(readOptions, cfList, keys)).isEqualTo(new byte[][] { null, null }); + + txn.put(testCf, keys[0], values[0]); + txn.put(testCf, keys[1], values[1]); + assertThat(txn.multiGet(readOptions, cfList, keys)).isEqualTo(values); + } + } + + @Test + public void multiGetPut() throws RocksDBException { + final byte keys[][] = new byte[][] { + "key1".getBytes(UTF_8), + "key2".getBytes(UTF_8)}; + final byte values[][] = new byte[][] { + "value1".getBytes(UTF_8), + "value2".getBytes(UTF_8)}; + + try(final DBContainer dbContainer = startDb(); + final ReadOptions readOptions = new ReadOptions(); + final Transaction txn = dbContainer.beginTransaction()) { + + assertThat(txn.multiGet(readOptions, keys)).isEqualTo(new byte[][] { null, null }); + + txn.put(keys[0], values[0]); + txn.put(keys[1], values[1]); + assertThat(txn.multiGet(readOptions, keys)).isEqualTo(values); + } + } + + @Test + public void getForUpdate_cf() throws RocksDBException { + final byte k1[] = "key1".getBytes(UTF_8); + final byte v1[] = "value1".getBytes(UTF_8); + try(final DBContainer dbContainer = startDb(); + final ReadOptions readOptions = new ReadOptions(); + final Transaction txn = dbContainer.beginTransaction()) { + final ColumnFamilyHandle testCf = dbContainer.getTestColumnFamily(); +
assertThat(txn.getForUpdate(readOptions, testCf, k1, true)).isNull(); + txn.put(testCf, k1, v1); + assertThat(txn.getForUpdate(readOptions, testCf, k1, true)).isEqualTo(v1); + } + } + + @Test + public void getForUpdate() throws RocksDBException { + final byte k1[] = "key1".getBytes(UTF_8); + final byte v1[] = "value1".getBytes(UTF_8); + try(final DBContainer dbContainer = startDb(); + final ReadOptions readOptions = new ReadOptions(); + final Transaction txn = dbContainer.beginTransaction()) { + assertThat(txn.getForUpdate(readOptions, k1, true)).isNull(); + txn.put(k1, v1); + assertThat(txn.getForUpdate(readOptions, k1, true)).isEqualTo(v1); + } + } + + @Test + public void multiGetForUpdate_cf() throws RocksDBException { + final byte keys[][] = new byte[][] { + "key1".getBytes(UTF_8), + "key2".getBytes(UTF_8)}; + final byte values[][] = new byte[][] { + "value1".getBytes(UTF_8), + "value2".getBytes(UTF_8)}; + + try(final DBContainer dbContainer = startDb(); + final ReadOptions readOptions = new ReadOptions(); + final Transaction txn = dbContainer.beginTransaction()) { + final ColumnFamilyHandle testCf = dbContainer.getTestColumnFamily(); + final List<ColumnFamilyHandle> cfList = Arrays.asList(testCf, testCf); + + assertThat(txn.multiGetForUpdate(readOptions, cfList, keys)) + .isEqualTo(new byte[][] { null, null }); + + txn.put(testCf, keys[0], values[0]); + txn.put(testCf, keys[1], values[1]); + assertThat(txn.multiGetForUpdate(readOptions, cfList, keys)) + .isEqualTo(values); + } + } + + @Test + public void multiGetForUpdate() throws RocksDBException { + final byte keys[][] = new byte[][]{ + "key1".getBytes(UTF_8), + "key2".getBytes(UTF_8)}; + final byte values[][] = new byte[][]{ + "value1".getBytes(UTF_8), + "value2".getBytes(UTF_8)}; + + try (final DBContainer dbContainer = startDb(); + final ReadOptions readOptions = new ReadOptions(); + final Transaction txn = dbContainer.beginTransaction()) { + assertThat(txn.multiGetForUpdate(readOptions, keys)).isEqualTo(new byte[][]{null, null}); + + txn.put(keys[0], values[0]); + txn.put(keys[1], values[1]); + assertThat(txn.multiGetForUpdate(readOptions, keys)).isEqualTo(values); + } + } + + @Test + public void getIterator() throws RocksDBException { + try(final DBContainer dbContainer = startDb(); + final ReadOptions readOptions = new ReadOptions(); + final Transaction txn = dbContainer.beginTransaction()) { + + final byte[] k1 = "key1".getBytes(UTF_8); + final byte[] v1 = "value1".getBytes(UTF_8); + + txn.put(k1, v1); + + try(final RocksIterator iterator = txn.getIterator(readOptions)) { + iterator.seek(k1); + assertThat(iterator.isValid()).isTrue(); + assertThat(iterator.key()).isEqualTo(k1); + assertThat(iterator.value()).isEqualTo(v1); + } + } + } + + @Test + public void getIterator_cf() throws RocksDBException { + try(final DBContainer dbContainer = startDb(); + final ReadOptions readOptions = new ReadOptions(); + final Transaction txn = dbContainer.beginTransaction()) { + final ColumnFamilyHandle testCf = dbContainer.getTestColumnFamily(); + + final byte[] k1 = "key1".getBytes(UTF_8); + final byte[] v1 = "value1".getBytes(UTF_8); + + txn.put(testCf, k1, v1); + + try(final RocksIterator iterator = txn.getIterator(readOptions, testCf)) { + iterator.seek(k1); + assertThat(iterator.isValid()).isTrue(); + assertThat(iterator.key()).isEqualTo(k1); + assertThat(iterator.value()).isEqualTo(v1); + } + } + } + + @Test + public void merge_cf() throws RocksDBException { + final byte[] k1 = "key1".getBytes(UTF_8); + final byte[] v1 = "value1".getBytes(UTF_8); + +
try(final DBContainer dbContainer = startDb(); + final Transaction txn = dbContainer.beginTransaction()) { + final ColumnFamilyHandle testCf = dbContainer.getTestColumnFamily(); + txn.merge(testCf, k1, v1); + } + } + + @Test + public void merge() throws RocksDBException { + final byte[] k1 = "key1".getBytes(UTF_8); + final byte[] v1 = "value1".getBytes(UTF_8); + + try(final DBContainer dbContainer = startDb(); + final Transaction txn = dbContainer.beginTransaction()) { + txn.merge(k1, v1); + } + } + + + @Test + public void delete_cf() throws RocksDBException { + final byte[] k1 = "key1".getBytes(UTF_8); + final byte[] v1 = "value1".getBytes(UTF_8); + + try(final DBContainer dbContainer = startDb(); + final ReadOptions readOptions = new ReadOptions(); + final Transaction txn = dbContainer.beginTransaction()) { + final ColumnFamilyHandle testCf = dbContainer.getTestColumnFamily(); + txn.put(testCf, k1, v1); + assertThat(txn.get(testCf, readOptions, k1)).isEqualTo(v1); + + txn.delete(testCf, k1); + assertThat(txn.get(testCf, readOptions, k1)).isNull(); + } + } + + @Test + public void delete() throws RocksDBException { + final byte[] k1 = "key1".getBytes(UTF_8); + final byte[] v1 = "value1".getBytes(UTF_8); + + try(final DBContainer dbContainer = startDb(); + final ReadOptions readOptions = new ReadOptions(); + final Transaction txn = dbContainer.beginTransaction()) { + txn.put(k1, v1); + assertThat(txn.get(readOptions, k1)).isEqualTo(v1); + + txn.delete(k1); + assertThat(txn.get(readOptions, k1)).isNull(); + } + } + + @Test + public void delete_parts_cf() throws RocksDBException { + final byte keyParts[][] = new byte[][] { + "ke".getBytes(UTF_8), + "y1".getBytes(UTF_8)}; + final byte valueParts[][] = new byte[][] { + "val".getBytes(UTF_8), + "ue1".getBytes(UTF_8)}; + final byte[] key = concat(keyParts); + final byte[] value = concat(valueParts); + + try(final DBContainer dbContainer = startDb(); + final ReadOptions readOptions = new ReadOptions(); + final Transaction txn = dbContainer.beginTransaction()) { + final ColumnFamilyHandle testCf = dbContainer.getTestColumnFamily(); + txn.put(testCf, keyParts, valueParts); + assertThat(txn.get(testCf, readOptions, key)).isEqualTo(value); + + txn.delete(testCf, keyParts); + + assertThat(txn.get(testCf, readOptions, key)) + .isNull(); + } + } + + @Test + public void delete_parts() throws RocksDBException { + final byte keyParts[][] = new byte[][] { + "ke".getBytes(UTF_8), + "y1".getBytes(UTF_8)}; + final byte valueParts[][] = new byte[][] { + "val".getBytes(UTF_8), + "ue1".getBytes(UTF_8)}; + final byte[] key = concat(keyParts); + final byte[] value = concat(valueParts); + + try(final DBContainer dbContainer = startDb(); + final ReadOptions readOptions = new ReadOptions(); + final Transaction txn = dbContainer.beginTransaction()) { + + txn.put(keyParts, valueParts); + + assertThat(txn.get(readOptions, key)).isEqualTo(value); + + txn.delete(keyParts); + + assertThat(txn.get(readOptions, key)).isNull(); + } + } + + @Test + public void getPutUntracked_cf() throws RocksDBException { + final byte k1[] = "key1".getBytes(UTF_8); + final byte v1[] = "value1".getBytes(UTF_8); + try(final DBContainer dbContainer = startDb(); + final ReadOptions readOptions = new ReadOptions(); + final Transaction txn = dbContainer.beginTransaction()) { + final ColumnFamilyHandle testCf = dbContainer.getTestColumnFamily(); + assertThat(txn.get(testCf, readOptions, k1)).isNull(); + txn.putUntracked(testCf, k1, v1); + assertThat(txn.get(testCf, readOptions, k1)).isEqualTo(v1); + } 
+ } + + @Test + public void getPutUntracked() throws RocksDBException { + final byte k1[] = "key1".getBytes(UTF_8); + final byte v1[] = "value1".getBytes(UTF_8); + try(final DBContainer dbContainer = startDb(); + final ReadOptions readOptions = new ReadOptions(); + final Transaction txn = dbContainer.beginTransaction()) { + assertThat(txn.get(readOptions, k1)).isNull(); + txn.putUntracked(k1, v1); + assertThat(txn.get(readOptions, k1)).isEqualTo(v1); + } + } + + @Test + public void multiGetPutUntracked_cf() throws RocksDBException { + final byte keys[][] = new byte[][] { + "key1".getBytes(UTF_8), + "key2".getBytes(UTF_8)}; + final byte values[][] = new byte[][] { + "value1".getBytes(UTF_8), + "value2".getBytes(UTF_8)}; + + try(final DBContainer dbContainer = startDb(); + final ReadOptions readOptions = new ReadOptions(); + final Transaction txn = dbContainer.beginTransaction()) { + final ColumnFamilyHandle testCf = dbContainer.getTestColumnFamily(); + + final List<ColumnFamilyHandle> cfList = Arrays.asList(testCf, testCf); + + assertThat(txn.multiGet(readOptions, cfList, keys)).isEqualTo(new byte[][] { null, null }); + txn.putUntracked(testCf, keys[0], values[0]); + txn.putUntracked(testCf, keys[1], values[1]); + assertThat(txn.multiGet(readOptions, cfList, keys)).isEqualTo(values); + } + } + + @Test + public void multiGetPutUntracked() throws RocksDBException { + final byte keys[][] = new byte[][] { + "key1".getBytes(UTF_8), + "key2".getBytes(UTF_8)}; + final byte values[][] = new byte[][] { + "value1".getBytes(UTF_8), + "value2".getBytes(UTF_8)}; + + try(final DBContainer dbContainer = startDb(); + final ReadOptions readOptions = new ReadOptions(); + final Transaction txn = dbContainer.beginTransaction()) { + + assertThat(txn.multiGet(readOptions, keys)).isEqualTo(new byte[][] { null, null }); + txn.putUntracked(keys[0], values[0]); + txn.putUntracked(keys[1], values[1]); + assertThat(txn.multiGet(readOptions, keys)).isEqualTo(values); + } + } + + @Test + public void mergeUntracked_cf() throws RocksDBException { + final byte[] k1 = "key1".getBytes(UTF_8); + final byte[] v1 = "value1".getBytes(UTF_8); + + try(final DBContainer dbContainer = startDb(); + final Transaction txn = dbContainer.beginTransaction()) { + final ColumnFamilyHandle testCf = dbContainer.getTestColumnFamily(); + txn.mergeUntracked(testCf, k1, v1); + } + } + + @Test + public void mergeUntracked() throws RocksDBException { + final byte[] k1 = "key1".getBytes(UTF_8); + final byte[] v1 = "value1".getBytes(UTF_8); + + try(final DBContainer dbContainer = startDb(); + final Transaction txn = dbContainer.beginTransaction()) { + txn.mergeUntracked(k1, v1); + } + } + + @Test + public void deleteUntracked_cf() throws RocksDBException { + final byte[] k1 = "key1".getBytes(UTF_8); + final byte[] v1 = "value1".getBytes(UTF_8); + + try(final DBContainer dbContainer = startDb(); + final ReadOptions readOptions = new ReadOptions(); + final Transaction txn = dbContainer.beginTransaction()) { + final ColumnFamilyHandle testCf = dbContainer.getTestColumnFamily(); + txn.put(testCf, k1, v1); + assertThat(txn.get(testCf, readOptions, k1)).isEqualTo(v1); + + txn.deleteUntracked(testCf, k1); + assertThat(txn.get(testCf, readOptions, k1)).isNull(); + } + } + + @Test + public void deleteUntracked() throws RocksDBException { + final byte[] k1 = "key1".getBytes(UTF_8); + final byte[] v1 = "value1".getBytes(UTF_8); + + try(final DBContainer dbContainer = startDb(); + final ReadOptions readOptions = new ReadOptions(); + final Transaction txn =
dbContainer.beginTransaction()) { + txn.put(k1, v1); + assertThat(txn.get(readOptions, k1)).isEqualTo(v1); + + txn.deleteUntracked(k1); + assertThat(txn.get(readOptions, k1)).isNull(); + } + } + + @Test + public void deleteUntracked_parts_cf() throws RocksDBException { + final byte keyParts[][] = new byte[][] { + "ke".getBytes(UTF_8), + "y1".getBytes(UTF_8)}; + final byte valueParts[][] = new byte[][] { + "val".getBytes(UTF_8), + "ue1".getBytes(UTF_8)}; + final byte[] key = concat(keyParts); + final byte[] value = concat(valueParts); + + try(final DBContainer dbContainer = startDb(); + final ReadOptions readOptions = new ReadOptions(); + final Transaction txn = dbContainer.beginTransaction()) { + final ColumnFamilyHandle testCf = dbContainer.getTestColumnFamily(); + txn.put(testCf, keyParts, valueParts); + assertThat(txn.get(testCf, readOptions, key)).isEqualTo(value); + + txn.deleteUntracked(testCf, keyParts); + assertThat(txn.get(testCf, readOptions, key)).isNull(); + } + } + + @Test + public void deleteUntracked_parts() throws RocksDBException { + final byte keyParts[][] = new byte[][] { + "ke".getBytes(UTF_8), + "y1".getBytes(UTF_8)}; + final byte valueParts[][] = new byte[][] { + "val".getBytes(UTF_8), + "ue1".getBytes(UTF_8)}; + final byte[] key = concat(keyParts); + final byte[] value = concat(valueParts); + + try(final DBContainer dbContainer = startDb(); + final ReadOptions readOptions = new ReadOptions(); + final Transaction txn = dbContainer.beginTransaction()) { + txn.put(keyParts, valueParts); + assertThat(txn.get(readOptions, key)).isEqualTo(value); + + txn.deleteUntracked(keyParts); + assertThat(txn.get(readOptions, key)).isNull(); + } + } + + @Test + public void putLogData() throws RocksDBException { + final byte[] blob = "blobby".getBytes(UTF_8); + try(final DBContainer dbContainer = startDb(); + final Transaction txn = dbContainer.beginTransaction()) { + txn.putLogData(blob); + } + } + + @Test + public void enabledDisableIndexing() throws RocksDBException { + try(final DBContainer dbContainer = startDb(); + final Transaction txn = dbContainer.beginTransaction()) { + txn.disableIndexing(); + txn.enableIndexing(); + txn.disableIndexing(); + txn.enableIndexing(); + } + } + + @Test + public void numKeys() throws RocksDBException { + final byte k1[] = "key1".getBytes(UTF_8); + final byte v1[] = "value1".getBytes(UTF_8); + final byte k2[] = "key2".getBytes(UTF_8); + final byte v2[] = "value2".getBytes(UTF_8); + final byte k3[] = "key3".getBytes(UTF_8); + final byte v3[] = "value3".getBytes(UTF_8); + + try(final DBContainer dbContainer = startDb(); + final Transaction txn = dbContainer.beginTransaction()) { + final ColumnFamilyHandle testCf = dbContainer.getTestColumnFamily(); + txn.put(k1, v1); + txn.put(testCf, k2, v2); + txn.merge(k3, v3); + txn.delete(testCf, k2); + + assertThat(txn.getNumKeys()).isEqualTo(3); + assertThat(txn.getNumPuts()).isEqualTo(2); + assertThat(txn.getNumMerges()).isEqualTo(1); + assertThat(txn.getNumDeletes()).isEqualTo(1); + } + } + + @Test + public void elapsedTime() throws RocksDBException, InterruptedException { + final long preStartTxnTime = System.currentTimeMillis(); + try(final DBContainer dbContainer = startDb(); + final Transaction txn = dbContainer.beginTransaction()) { + Thread.sleep(1); + + final long txnElapsedTime = txn.getElapsedTime(); + assertThat(txnElapsedTime).isLessThan(System.currentTimeMillis() + - preStartTxnTime); + assertThat(txnElapsedTime).isGreaterThan(0); + } + } + + @Test + public void getWriteBatch() throws 
RocksDBException { + final byte k1[] = "key1".getBytes(UTF_8); + final byte v1[] = "value1".getBytes(UTF_8); + + try(final DBContainer dbContainer = startDb(); + final Transaction txn = dbContainer.beginTransaction()) { + + txn.put(k1, v1); + + final WriteBatchWithIndex writeBatch = txn.getWriteBatch(); + assertThat(writeBatch).isNotNull(); + assertThat(writeBatch.isOwningHandle()).isFalse(); + assertThat(writeBatch.count()).isEqualTo(1); + } + } + + @Test + public void setLockTimeout() throws RocksDBException { + try(final DBContainer dbContainer = startDb(); + final Transaction txn = dbContainer.beginTransaction()) { + txn.setLockTimeout(1000); + } + } + + @Test + public void writeOptions() throws RocksDBException { + final byte k1[] = "key1".getBytes(UTF_8); + final byte v1[] = "value1".getBytes(UTF_8); + + try(final DBContainer dbContainer = startDb(); + final WriteOptions writeOptions = new WriteOptions() + .setDisableWAL(true) + .setSync(true); + final Transaction txn = dbContainer.beginTransaction(writeOptions)) { + + txn.put(k1, v1); + + WriteOptions txnWriteOptions = txn.getWriteOptions(); + assertThat(txnWriteOptions).isNotNull(); + assertThat(txnWriteOptions.isOwningHandle()).isFalse(); + assertThat(txnWriteOptions).isNotSameAs(writeOptions); + assertThat(txnWriteOptions.disableWAL()).isTrue(); + assertThat(txnWriteOptions.sync()).isTrue(); + + txn.setWriteOptions(txnWriteOptions.setSync(false)); + txnWriteOptions = txn.getWriteOptions(); + assertThat(txnWriteOptions).isNotNull(); + assertThat(txnWriteOptions.isOwningHandle()).isFalse(); + assertThat(txnWriteOptions).isNotSameAs(writeOptions); + assertThat(txnWriteOptions.disableWAL()).isTrue(); + assertThat(txnWriteOptions.sync()).isFalse(); + } + } + + @Test + public void undoGetForUpdate_cf() throws RocksDBException { + final byte k1[] = "key1".getBytes(UTF_8); + final byte v1[] = "value1".getBytes(UTF_8); + try(final DBContainer dbContainer = startDb(); + final ReadOptions readOptions = new ReadOptions(); + final Transaction txn = dbContainer.beginTransaction()) { + final ColumnFamilyHandle testCf = dbContainer.getTestColumnFamily(); + assertThat(txn.getForUpdate(readOptions, testCf, k1, true)).isNull(); + txn.put(testCf, k1, v1); + assertThat(txn.getForUpdate(readOptions, testCf, k1, true)).isEqualTo(v1); + txn.undoGetForUpdate(testCf, k1); + } + } + + @Test + public void undoGetForUpdate() throws RocksDBException { + final byte k1[] = "key1".getBytes(UTF_8); + final byte v1[] = "value1".getBytes(UTF_8); + try(final DBContainer dbContainer = startDb(); + final ReadOptions readOptions = new ReadOptions(); + final Transaction txn = dbContainer.beginTransaction()) { + assertThat(txn.getForUpdate(readOptions, k1, true)).isNull(); + txn.put(k1, v1); + assertThat(txn.getForUpdate(readOptions, k1, true)).isEqualTo(v1); + txn.undoGetForUpdate(k1); + } + } + + @Test + public void rebuildFromWriteBatch() throws RocksDBException { + final byte k1[] = "key1".getBytes(UTF_8); + final byte v1[] = "value1".getBytes(UTF_8); + final byte k2[] = "key2".getBytes(UTF_8); + final byte v2[] = "value2".getBytes(UTF_8); + final byte k3[] = "key3".getBytes(UTF_8); + final byte v3[] = "value3".getBytes(UTF_8); + + try(final DBContainer dbContainer = startDb(); + final ReadOptions readOptions = new ReadOptions(); + final Transaction txn = dbContainer.beginTransaction()) { + + txn.put(k1, v1); + + assertThat(txn.get(readOptions, k1)).isEqualTo(v1); + assertThat(txn.getNumKeys()).isEqualTo(1); + + try(final WriteBatch writeBatch = new WriteBatch()) { + 
writeBatch.put(k2, v2); + writeBatch.put(k3, v3); + txn.rebuildFromWriteBatch(writeBatch); + + assertThat(txn.get(readOptions, k1)).isEqualTo(v1); + assertThat(txn.get(readOptions, k2)).isEqualTo(v2); + assertThat(txn.get(readOptions, k3)).isEqualTo(v3); + assertThat(txn.getNumKeys()).isEqualTo(3); + } + } + } + + @Test + public void getCommitTimeWriteBatch() throws RocksDBException { + final byte k1[] = "key1".getBytes(UTF_8); + final byte v1[] = "value1".getBytes(UTF_8); + + try(final DBContainer dbContainer = startDb(); + final Transaction txn = dbContainer.beginTransaction()) { + + txn.put(k1, v1); + final WriteBatch writeBatch = txn.getCommitTimeWriteBatch(); + + assertThat(writeBatch).isNotNull(); + assertThat(writeBatch.isOwningHandle()).isFalse(); + assertThat(writeBatch.count()).isEqualTo(0); + } + } + + @Test + public void logNumber() throws RocksDBException { + try(final DBContainer dbContainer = startDb(); + final Transaction txn = dbContainer.beginTransaction()) { + assertThat(txn.getLogNumber()).isEqualTo(0); + final long logNumber = rand.nextLong(); + txn.setLogNumber(logNumber); + assertThat(txn.getLogNumber()).isEqualTo(logNumber); + } + } + + private static byte[] concat(final byte[][] bufs) { + int resultLength = 0; + for(final byte[] buf : bufs) { + resultLength += buf.length; + } + + final byte[] result = new byte[resultLength]; + int resultOffset = 0; + for(final byte[] buf : bufs) { + final int srcLength = buf.length; + System.arraycopy(buf, 0, result, resultOffset, srcLength); + resultOffset += srcLength; + } + + return result; + } + + private static class TestTransactionNotifier + extends AbstractTransactionNotifier { + private final List<Snapshot> createdSnapshots = new ArrayList<>(); + + @Override + public void snapshotCreated(final Snapshot newSnapshot) { + createdSnapshots.add(newSnapshot); + } + + public List<Snapshot> getCreatedSnapshots() { + return createdSnapshots; + } + } + + protected static abstract class DBContainer + implements AutoCloseable { + protected final WriteOptions writeOptions; + protected final List<ColumnFamilyHandle> columnFamilyHandles; + protected final ColumnFamilyOptions columnFamilyOptions; + protected final DBOptions options; + + public DBContainer(final WriteOptions writeOptions, + final List<ColumnFamilyHandle> columnFamilyHandles, + final ColumnFamilyOptions columnFamilyOptions, + final DBOptions options) { + this.writeOptions = writeOptions; + this.columnFamilyHandles = columnFamilyHandles; + this.columnFamilyOptions = columnFamilyOptions; + this.options = options; + } + + public abstract Transaction beginTransaction(); + + public abstract Transaction beginTransaction( + final WriteOptions writeOptions); + + public ColumnFamilyHandle getTestColumnFamily() { + return columnFamilyHandles.get(1); + } + + @Override + public abstract void close(); + } +} diff --git a/java/src/test/java/org/rocksdb/BlockBasedTableConfigTest.java b/java/src/test/java/org/rocksdb/BlockBasedTableConfigTest.java index 8edc8b89fd6..2b15b69f812 100644 --- a/java/src/test/java/org/rocksdb/BlockBasedTableConfigTest.java +++ b/java/src/test/java/org/rocksdb/BlockBasedTableConfigTest.java @@ -6,7 +6,11 @@ package org.rocksdb; import org.junit.ClassRule; +import org.junit.Rule; import org.junit.Test; +import org.junit.rules.TemporaryFolder; + +import java.nio.charset.StandardCharsets; import static org.assertj.core.api.Assertions.assertThat; @@ -16,6 +20,8 @@ public class BlockBasedTableConfigTest { public static final RocksMemoryResource rocksMemoryResource = new RocksMemoryResource(); + @Rule public TemporaryFolder
dbFolder = new TemporaryFolder(); + @Test public void noBlockCache() { BlockBasedTableConfig blockBasedTableConfig = new BlockBasedTableConfig(); @@ -31,6 +37,31 @@ public void blockCacheSize() { isEqualTo(8 * 1024); } + @Test + public void sharedBlockCache() throws RocksDBException { + try (final Cache cache = new LRUCache(8 * 1024 * 1024); + final Statistics statistics = new Statistics()) { + for (int shard = 0; shard < 8; shard++) { + try (final Options options = + new Options() + .setCreateIfMissing(true) + .setStatistics(statistics) + .setTableFormatConfig(new BlockBasedTableConfig().setBlockCache(cache)); + final RocksDB db = + RocksDB.open(options, dbFolder.getRoot().getAbsolutePath() + "/" + shard)) { + final byte[] key = "some-key".getBytes(StandardCharsets.UTF_8); + final byte[] value = "some-value".getBytes(StandardCharsets.UTF_8); + + db.put(key, value); + db.flush(new FlushOptions()); + db.get(key); + + assertThat(statistics.getTickerCount(TickerType.BLOCK_CACHE_ADD)).isEqualTo(shard + 1); + } + } + } + } + @Test public void blockSizeDeviation() { BlockBasedTableConfig blockBasedTableConfig = new BlockBasedTableConfig(); @@ -148,6 +179,14 @@ public void blockBasedTableWithoutFilter() { } } + @Test + public void blockBasedTableWithBlockCache() { + try (final Options options = new Options().setTableFormatConfig( + new BlockBasedTableConfig().setBlockCache(new LRUCache(17 * 1024 * 1024)))) { + assertThat(options.tableFactoryName()).isEqualTo("BlockBasedTable"); + } + } + @Test public void blockBasedTableFormatVersion() { BlockBasedTableConfig config = new BlockBasedTableConfig(); diff --git a/java/src/test/java/org/rocksdb/ColumnFamilyTest.java b/java/src/test/java/org/rocksdb/ColumnFamilyTest.java index 19fe332df97..3df63c65ff1 100644 --- a/java/src/test/java/org/rocksdb/ColumnFamilyTest.java +++ b/java/src/test/java/org/rocksdb/ColumnFamilyTest.java @@ -12,6 +12,7 @@ import org.junit.Test; import org.junit.rules.TemporaryFolder; +import static java.nio.charset.StandardCharsets.UTF_8; import static org.assertj.core.api.Assertions.assertThat; public class ColumnFamilyTest { @@ -23,6 +24,31 @@ public class ColumnFamilyTest { @Rule public TemporaryFolder dbFolder = new TemporaryFolder(); + @Test + public void columnFamilyDescriptorName() throws RocksDBException { + final byte[] cfName = "some_name".getBytes(UTF_8); + + try(final ColumnFamilyOptions cfOptions = new ColumnFamilyOptions()) { + final ColumnFamilyDescriptor cfDescriptor = + new ColumnFamilyDescriptor(cfName, cfOptions); + assertThat(cfDescriptor.getName()).isEqualTo(cfName); + } + } + + @Test + public void columnFamilyDescriptorOptions() throws RocksDBException { + final byte[] cfName = "some_name".getBytes(UTF_8); + + try(final ColumnFamilyOptions cfOptions = new ColumnFamilyOptions() + .setCompressionType(CompressionType.BZLIB2_COMPRESSION)) { + final ColumnFamilyDescriptor cfDescriptor = + new ColumnFamilyDescriptor(cfName, cfOptions); + + assertThat(cfDescriptor.getOptions().compressionType()) + .isEqualTo(CompressionType.BZLIB2_COMPRESSION); + } + } + @Test public void listColumnFamilies() throws RocksDBException { try (final Options options = new Options().setCreateIfMissing(true); @@ -47,6 +73,9 @@ public void defaultColumnFamily() throws RocksDBException { try { assertThat(cfh).isNotNull(); + assertThat(cfh.getName()).isEqualTo("default".getBytes(UTF_8)); + assertThat(cfh.getID()).isEqualTo(0); + final byte[] key = "key".getBytes(); final byte[] value = "value".getBytes(); @@ -64,15 +93,25 @@ public void 
defaultColumnFamily() throws RocksDBException { @Test public void createColumnFamily() throws RocksDBException { + final byte[] cfName = "new_cf".getBytes(UTF_8); + final ColumnFamilyDescriptor cfDescriptor = new ColumnFamilyDescriptor(cfName, + new ColumnFamilyOptions()); + try (final Options options = new Options().setCreateIfMissing(true); final RocksDB db = RocksDB.open(options, - dbFolder.getRoot().getAbsolutePath())) { - final ColumnFamilyHandle columnFamilyHandle = db.createColumnFamily( - new ColumnFamilyDescriptor("new_cf".getBytes(), - new ColumnFamilyOptions())); + dbFolder.getRoot().getAbsolutePath())) { + + final ColumnFamilyHandle columnFamilyHandle = db.createColumnFamily(cfDescriptor); + try { + assertThat(columnFamilyHandle.getName()).isEqualTo(cfName); + assertThat(columnFamilyHandle.getID()).isEqualTo(1); + + final ColumnFamilyDescriptor latestDescriptor = columnFamilyHandle.getDescriptor(); + assertThat(latestDescriptor.getName()).isEqualTo(cfName); + final List<byte[]> columnFamilyNames = RocksDB.listColumnFamilies( - options, dbFolder.getRoot().getAbsolutePath()); + options, dbFolder.getRoot().getAbsolutePath()); assertThat(columnFamilyNames).isNotNull(); assertThat(columnFamilyNames.size()).isGreaterThan(0); assertThat(columnFamilyNames.size()).isEqualTo(2); diff --git a/java/src/test/java/org/rocksdb/NativeComparatorWrapperTest.java b/java/src/test/java/org/rocksdb/NativeComparatorWrapperTest.java new file mode 100644 index 00000000000..d1bdf0f8844 --- /dev/null +++ b/java/src/test/java/org/rocksdb/NativeComparatorWrapperTest.java @@ -0,0 +1,92 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory).
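The new NativeComparatorWrapperTest below round-trips randomly generated string keys through a database ordered by a native (C++-side) comparator, then checks that iteration order matches a plain Java String sort. One small aside on the test: its retry-on-duplicate trick (decrementing i when a key already exists) could equivalently collect distinct keys up front; a minimal sketch, with illustrative names that are not part of this patch:

import java.util.HashSet;
import java.util.Random;
import java.util.Set;

// Sketch only: gather `count` distinct random keys before writing them,
// instead of retrying inside the write loop. Names are illustrative.
final class UniqueKeys {
  private static final Random random = new Random();

  static Set<String> uniqueRandomKeys(final int count) {
    final Set<String> keys = new HashSet<>();
    while (keys.size() < count) {
      keys.add(randomString()); // a duplicate simply does not grow the set
    }
    return keys;
  }

  private static String randomString() {
    final char[] chars = new char[12];
    for (int i = 0; i < 12; i++) {
      chars[i] = (char) ('a' + random.nextInt(24)); // letters 'a'..'x'
    }
    return String.copyValueOf(chars);
  }
}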
+ +package org.rocksdb; + +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.TemporaryFolder; + +import java.util.*; +import java.util.Comparator; + +import static org.junit.Assert.assertEquals; + +public class NativeComparatorWrapperTest { + + @Rule + public TemporaryFolder dbFolder = new TemporaryFolder(); + + private static final Random random = new Random(); + + @Test + public void roundtrip() throws RocksDBException { + final String dbPath = dbFolder.getRoot().getAbsolutePath(); + final int ITERATIONS = 1_000; + + final String[] storedKeys = new String[ITERATIONS]; + try (final NativeStringComparatorWrapper comparator = new NativeStringComparatorWrapper(); + final Options opt = new Options() + .setCreateIfMissing(true) + .setComparator(comparator)) { + + // store random string keys + try (final RocksDB db = RocksDB.open(opt, dbPath)) { + for (int i = 0; i < ITERATIONS; i++) { + final String strKey = randomString(); + final byte key[] = strKey.getBytes(); + // check whether the key already exists (avoid duplicates) + if (i > 0 && db.get(key) != null) { + i--; // generate a different key + } else { + db.put(key, "value".getBytes()); + storedKeys[i] = strKey; + } + } + } + + // sort the stored keys into ascending alpha-numeric order + Arrays.sort(storedKeys, new Comparator<String>() { + @Override + public int compare(final String o1, final String o2) { + return o1.compareTo(o2); + } + }); + + // re-open db and read from start to end + // string keys should be in ascending + // order + try (final RocksDB db = RocksDB.open(opt, dbPath); + final RocksIterator it = db.newIterator()) { + int count = 0; + for (it.seekToFirst(); it.isValid(); it.next()) { + final String strKey = new String(it.key()); + assertEquals(storedKeys[count++], strKey); + } + } + } + } + + private String randomString() { + final char[] chars = new char[12]; + for(int i = 0; i < 12; i++) { + final int letterCode = random.nextInt(24); + final char letter = (char) (((int) 'a') + letterCode); + chars[i] = letter; + } + return String.copyValueOf(chars); + } + + public static class NativeStringComparatorWrapper + extends NativeComparatorWrapper { + + @Override + protected long initializeNative(final long... nativeParameterHandles) { + return newStringComparator(); + } + + private native long newStringComparator(); + } +} diff --git a/java/src/test/java/org/rocksdb/OptimisticTransactionDBTest.java b/java/src/test/java/org/rocksdb/OptimisticTransactionDBTest.java new file mode 100644 index 00000000000..519b70b1d2f --- /dev/null +++ b/java/src/test/java/org/rocksdb/OptimisticTransactionDBTest.java @@ -0,0 +1,131 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory).
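The OptimisticTransactionDBTest that follows covers opening the database (with and without column families), the beginTransaction overloads including handle reuse via the withOld variants, and getBaseDB. For orientation, a minimal usage sketch of the API under test; the wrapper method and the path are placeholders, not part of the patch:

import org.rocksdb.*;
import static java.nio.charset.StandardCharsets.UTF_8;

// Sketch: open an OptimisticTransactionDB, write inside a transaction, commit.
static void optimisticPutCommit() throws RocksDBException {
  try (final Options options = new Options().setCreateIfMissing(true);
       final OptimisticTransactionDB otdb =
           OptimisticTransactionDB.open(options, "/tmp/otdb-example");
       final WriteOptions writeOptions = new WriteOptions()) {
    try (final Transaction txn = otdb.beginTransaction(writeOptions)) {
      txn.put("k".getBytes(UTF_8), "v".getBytes(UTF_8));
      txn.commit(); // optimistic conflict validation happens here
    }
  }
}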
+ +package org.rocksdb; + +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.TemporaryFolder; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +import static org.assertj.core.api.Assertions.assertThat; + +public class OptimisticTransactionDBTest { + + @Rule + public TemporaryFolder dbFolder = new TemporaryFolder(); + + @Test + public void open() throws RocksDBException { + try (final Options options = new Options().setCreateIfMissing(true); + final OptimisticTransactionDB otdb = OptimisticTransactionDB.open(options, + dbFolder.getRoot().getAbsolutePath())) { + assertThat(otdb).isNotNull(); + } + } + + @Test + public void open_columnFamilies() throws RocksDBException { + try(final DBOptions dbOptions = new DBOptions().setCreateIfMissing(true) + .setCreateMissingColumnFamilies(true); + final ColumnFamilyOptions myCfOpts = new ColumnFamilyOptions()) { + + final List<ColumnFamilyDescriptor> columnFamilyDescriptors = + Arrays.asList( + new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY), + new ColumnFamilyDescriptor("myCf".getBytes(), myCfOpts)); + + final List<ColumnFamilyHandle> columnFamilyHandles = new ArrayList<>(); + + try (final OptimisticTransactionDB otdb = OptimisticTransactionDB.open(dbOptions, + dbFolder.getRoot().getAbsolutePath(), + columnFamilyDescriptors, columnFamilyHandles)) { + try { + assertThat(otdb).isNotNull(); + } finally { + for (final ColumnFamilyHandle handle : columnFamilyHandles) { + handle.close(); + } + } + } + } + } + + @Test + public void beginTransaction() throws RocksDBException { + try (final Options options = new Options().setCreateIfMissing(true); + final OptimisticTransactionDB otdb = OptimisticTransactionDB.open( + options, dbFolder.getRoot().getAbsolutePath()); + final WriteOptions writeOptions = new WriteOptions()) { + + try(final Transaction txn = otdb.beginTransaction(writeOptions)) { + assertThat(txn).isNotNull(); + } + } + } + + @Test + public void beginTransaction_transactionOptions() throws RocksDBException { + try (final Options options = new Options().setCreateIfMissing(true); + final OptimisticTransactionDB otdb = OptimisticTransactionDB.open( + options, dbFolder.getRoot().getAbsolutePath()); + final WriteOptions writeOptions = new WriteOptions(); + final OptimisticTransactionOptions optimisticTxnOptions = + new OptimisticTransactionOptions()) { + + try(final Transaction txn = otdb.beginTransaction(writeOptions, + optimisticTxnOptions)) { + assertThat(txn).isNotNull(); + } + } + } + + @Test + public void beginTransaction_withOld() throws RocksDBException { + try (final Options options = new Options().setCreateIfMissing(true); + final OptimisticTransactionDB otdb = OptimisticTransactionDB.open( + options, dbFolder.getRoot().getAbsolutePath()); + final WriteOptions writeOptions = new WriteOptions()) { + + try(final Transaction txn = otdb.beginTransaction(writeOptions)) { + final Transaction txnReused = otdb.beginTransaction(writeOptions, txn); + assertThat(txnReused).isSameAs(txn); + } + } + } + + @Test + public void beginTransaction_withOld_transactionOptions() + throws RocksDBException { + try (final Options options = new Options().setCreateIfMissing(true); + final OptimisticTransactionDB otdb = OptimisticTransactionDB.open( + options, dbFolder.getRoot().getAbsolutePath()); + final WriteOptions writeOptions = new WriteOptions(); + final OptimisticTransactionOptions optimisticTxnOptions = + new OptimisticTransactionOptions()) { + + try(final Transaction txn = otdb.beginTransaction(writeOptions)) { + final Transaction txnReused =
otdb.beginTransaction(writeOptions, + optimisticTxnOptions, txn); + assertThat(txnReused).isSameAs(txn); + } + } + } + + @Test + public void baseDB() throws RocksDBException { + try (final Options options = new Options().setCreateIfMissing(true); + final OptimisticTransactionDB otdb = OptimisticTransactionDB.open(options, + dbFolder.getRoot().getAbsolutePath())) { + assertThat(otdb).isNotNull(); + final RocksDB db = otdb.getBaseDB(); + assertThat(db).isNotNull(); + assertThat(db.isOwningHandle()).isFalse(); + } + } +} diff --git a/java/src/test/java/org/rocksdb/OptimisticTransactionOptionsTest.java b/java/src/test/java/org/rocksdb/OptimisticTransactionOptionsTest.java new file mode 100644 index 00000000000..4a57e33568c --- /dev/null +++ b/java/src/test/java/org/rocksdb/OptimisticTransactionOptionsTest.java @@ -0,0 +1,37 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +package org.rocksdb; + +import org.junit.Test; +import org.rocksdb.util.DirectBytewiseComparator; + +import java.util.Random; + +import static org.assertj.core.api.Assertions.assertThat; + +public class OptimisticTransactionOptionsTest { + + private static final Random rand = PlatformRandomHelper. + getPlatformSpecificRandomFactory(); + + @Test + public void setSnapshot() { + try (final OptimisticTransactionOptions opt = new OptimisticTransactionOptions()) { + final boolean boolValue = rand.nextBoolean(); + opt.setSetSnapshot(boolValue); + assertThat(opt.isSetSnapshot()).isEqualTo(boolValue); + } + } + + @Test + public void comparator() { + try (final OptimisticTransactionOptions opt = new OptimisticTransactionOptions(); + final ComparatorOptions copt = new ComparatorOptions(); + final DirectComparator comparator = new DirectBytewiseComparator(copt)) { + opt.setComparator(comparator); + } + } +} diff --git a/java/src/test/java/org/rocksdb/OptimisticTransactionTest.java b/java/src/test/java/org/rocksdb/OptimisticTransactionTest.java new file mode 100644 index 00000000000..f44816e64b7 --- /dev/null +++ b/java/src/test/java/org/rocksdb/OptimisticTransactionTest.java @@ -0,0 +1,350 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
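The conflict tests in OptimisticTransactionTest below all share one shape: txn3 reads a key via getForUpdate/multiGetForUpdate, txn2 then writes the same key and commits, and txn3's own commit fails with Status.Code.Busy, because optimistic transactions validate their tracked reads only at commit time (the undoGetForUpdate variants show that dropping the tracked read removes the conflict). A common consumer-side pattern is to retry the whole transaction on Busy; a sketch, assuming the transaction body is safe to re-run (maxAttempts and the method itself are illustrative, not part of the patch):

// Sketch: retry an optimistic transaction when commit reports a conflict.
static void commitWithRetry(final OptimisticTransactionDB otdb,
    final WriteOptions writeOptions, final byte[] key, final byte[] value)
    throws RocksDBException {
  final int maxAttempts = 3;
  for (int attempt = 1; ; attempt++) {
    try (final Transaction txn = otdb.beginTransaction(writeOptions)) {
      txn.put(key, value);
      txn.commit();
      return; // success
    } catch (final RocksDBException e) {
      if (e.getStatus() == null || e.getStatus().getCode() != Status.Code.Busy
          || attempt == maxAttempts) {
        throw e; // not a write conflict, or out of retries
      }
      // Busy: another transaction won the conflict; start over.
    }
  }
}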
+ +package org.rocksdb; + +import org.junit.Test; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +import static java.nio.charset.StandardCharsets.UTF_8; +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.fail; + +public class OptimisticTransactionTest extends AbstractTransactionTest { + + @Test + public void getForUpdate_cf_conflict() throws RocksDBException { + final byte k1[] = "key1".getBytes(UTF_8); + final byte v1[] = "value1".getBytes(UTF_8); + final byte v12[] = "value12".getBytes(UTF_8); + try(final DBContainer dbContainer = startDb(); + final ReadOptions readOptions = new ReadOptions()) { + final ColumnFamilyHandle testCf = dbContainer.getTestColumnFamily(); + + try(final Transaction txn = dbContainer.beginTransaction()) { + txn.put(testCf, k1, v1); + assertThat(txn.get(testCf, readOptions, k1)).isEqualTo(v1); + txn.commit(); + } + + try(final Transaction txn2 = dbContainer.beginTransaction()) { + try(final Transaction txn3 = dbContainer.beginTransaction()) { + assertThat(txn3.getForUpdate(readOptions, testCf, k1, true)).isEqualTo(v1); + + // NOTE: txn2 updates k1, during txn3 + txn2.put(testCf, k1, v12); + assertThat(txn2.get(testCf, readOptions, k1)).isEqualTo(v12); + txn2.commit(); + + try { + txn3.commit(); // should cause an exception! + } catch(final RocksDBException e) { + assertThat(e.getStatus().getCode()).isSameAs(Status.Code.Busy); + return; + } + } + } + + fail("Expected an exception for put after getForUpdate from conflicting " + + "transactions"); + } + } + + @Test + public void getForUpdate_conflict() throws RocksDBException { + final byte k1[] = "key1".getBytes(UTF_8); + final byte v1[] = "value1".getBytes(UTF_8); + final byte v12[] = "value12".getBytes(UTF_8); + try(final DBContainer dbContainer = startDb(); + final ReadOptions readOptions = new ReadOptions()) { + + try(final Transaction txn = dbContainer.beginTransaction()) { + txn.put(k1, v1); + assertThat(txn.get(readOptions, k1)).isEqualTo(v1); + txn.commit(); + } + + try(final Transaction txn2 = dbContainer.beginTransaction()) { + try(final Transaction txn3 = dbContainer.beginTransaction()) { + assertThat(txn3.getForUpdate(readOptions, k1, true)).isEqualTo(v1); + + // NOTE: txn2 updates k1, during txn3 + txn2.put(k1, v12); + assertThat(txn2.get(readOptions, k1)).isEqualTo(v12); + txn2.commit(); + + try { + txn3.commit(); // should cause an exception!
+ } catch(final RocksDBException e) { + assertThat(e.getStatus().getCode()).isSameAs(Status.Code.Busy); + return; + } + } + } + + fail("Expected an exception for put after getForUpdate from conflicting " + + "transactions"); + } + } + + @Test + public void multiGetForUpdate_cf_conflict() throws RocksDBException { + final byte keys[][] = new byte[][] { + "key1".getBytes(UTF_8), + "key2".getBytes(UTF_8)}; + final byte values[][] = new byte[][] { + "value1".getBytes(UTF_8), + "value2".getBytes(UTF_8)}; + final byte[] otherValue = "otherValue".getBytes(UTF_8); + + try(final DBContainer dbContainer = startDb(); + final ReadOptions readOptions = new ReadOptions()) { + final ColumnFamilyHandle testCf = dbContainer.getTestColumnFamily(); + final List<ColumnFamilyHandle> cfList = Arrays.asList(testCf, testCf); + + try(final Transaction txn = dbContainer.beginTransaction()) { + txn.put(testCf, keys[0], values[0]); + txn.put(testCf, keys[1], values[1]); + assertThat(txn.multiGet(readOptions, cfList, keys)).isEqualTo(values); + txn.commit(); + } + + try(final Transaction txn2 = dbContainer.beginTransaction()) { + try(final Transaction txn3 = dbContainer.beginTransaction()) { + assertThat(txn3.multiGetForUpdate(readOptions, cfList, keys)) + .isEqualTo(values); + + // NOTE: txn2 updates k1, during txn3 + txn2.put(testCf, keys[0], otherValue); + assertThat(txn2.get(testCf, readOptions, keys[0])) + .isEqualTo(otherValue); + txn2.commit(); + + try { + txn3.commit(); // should cause an exception! + } catch(final RocksDBException e) { + assertThat(e.getStatus().getCode()).isSameAs(Status.Code.Busy); + return; + } + } + } + + fail("Expected an exception for put after getForUpdate from conflicting " + + "transactions"); + } + } + + @Test + public void multiGetForUpdate_conflict() throws RocksDBException { + final byte keys[][] = new byte[][] { + "key1".getBytes(UTF_8), + "key2".getBytes(UTF_8)}; + final byte values[][] = new byte[][] { + "value1".getBytes(UTF_8), + "value2".getBytes(UTF_8)}; + final byte[] otherValue = "otherValue".getBytes(UTF_8); + + try(final DBContainer dbContainer = startDb(); + final ReadOptions readOptions = new ReadOptions()) { + try(final Transaction txn = dbContainer.beginTransaction()) { + txn.put(keys[0], values[0]); + txn.put(keys[1], values[1]); + assertThat(txn.multiGet(readOptions, keys)).isEqualTo(values); + txn.commit(); + } + + try(final Transaction txn2 = dbContainer.beginTransaction()) { + try(final Transaction txn3 = dbContainer.beginTransaction()) { + assertThat(txn3.multiGetForUpdate(readOptions, keys)) + .isEqualTo(values); + + // NOTE: txn2 updates k1, during txn3 + txn2.put(keys[0], otherValue); + assertThat(txn2.get(readOptions, keys[0])) + .isEqualTo(otherValue); + txn2.commit(); + + try { + txn3.commit(); // should cause an exception!
+ } catch(final RocksDBException e) { + assertThat(e.getStatus().getCode()).isSameAs(Status.Code.Busy); + return; + } + } + } + + fail("Expected an exception for put after getForUpdate from conflicting " + + "transactions"); + } + } + + @Test + public void undoGetForUpdate_cf_conflict() throws RocksDBException { + final byte k1[] = "key1".getBytes(UTF_8); + final byte v1[] = "value1".getBytes(UTF_8); + final byte v12[] = "value12".getBytes(UTF_8); + try(final DBContainer dbContainer = startDb(); + final ReadOptions readOptions = new ReadOptions()) { + final ColumnFamilyHandle testCf = dbContainer.getTestColumnFamily(); + + try(final Transaction txn = dbContainer.beginTransaction()) { + txn.put(testCf, k1, v1); + assertThat(txn.get(testCf, readOptions, k1)).isEqualTo(v1); + txn.commit(); + } + + try(final Transaction txn2 = dbContainer.beginTransaction()) { + try(final Transaction txn3 = dbContainer.beginTransaction()) { + assertThat(txn3.getForUpdate(readOptions, testCf, k1, true)).isEqualTo(v1); + + // undo the getForUpdate + txn3.undoGetForUpdate(testCf, k1); + + // NOTE: txn2 updates k1, during txn3 + txn2.put(testCf, k1, v12); + assertThat(txn2.get(testCf, readOptions, k1)).isEqualTo(v12); + txn2.commit(); + + // should not cause an exception + // because we undid the getForUpdate above! + txn3.commit(); + } + } + } + } + + @Test + public void undoGetForUpdate_conflict() throws RocksDBException { + final byte k1[] = "key1".getBytes(UTF_8); + final byte v1[] = "value1".getBytes(UTF_8); + final byte v12[] = "value12".getBytes(UTF_8); + try(final DBContainer dbContainer = startDb(); + final ReadOptions readOptions = new ReadOptions()) { + + try(final Transaction txn = dbContainer.beginTransaction()) { + txn.put(k1, v1); + assertThat(txn.get(readOptions, k1)).isEqualTo(v1); + txn.commit(); + } + + try(final Transaction txn2 = dbContainer.beginTransaction()) { + try(final Transaction txn3 = dbContainer.beginTransaction()) { + assertThat(txn3.getForUpdate(readOptions, k1, true)).isEqualTo(v1); + + // undo the getForUpdate + txn3.undoGetForUpdate(k1); + + // NOTE: txn2 updates k1, during txn3 + txn2.put(k1, v12); + assertThat(txn2.get(readOptions, k1)).isEqualTo(v12); + txn2.commit(); + + // should not cause an exception + // because we undid the getForUpdate above!
+ txn3.commit(); + } + } + } + } + + @Test + public void name() throws RocksDBException { + try(final DBContainer dbContainer = startDb(); + final Transaction txn = dbContainer.beginTransaction()) { + assertThat(txn.getName()).isEmpty(); + final String name = "my-transaction-" + rand.nextLong(); + + try { + txn.setName(name); + } catch(final RocksDBException e) { + assertThat(e.getStatus().getCode()).isSameAs(Status.Code.InvalidArgument); + return; + } + + fail("Optimistic transactions cannot be named."); + } + } + + @Override + public OptimisticTransactionDBContainer startDb() + throws RocksDBException { + final DBOptions options = new DBOptions() + .setCreateIfMissing(true) + .setCreateMissingColumnFamilies(true); + + final ColumnFamilyOptions columnFamilyOptions = new ColumnFamilyOptions(); + final List<ColumnFamilyDescriptor> columnFamilyDescriptors = + Arrays.asList( + new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY), + new ColumnFamilyDescriptor(TXN_TEST_COLUMN_FAMILY, + columnFamilyOptions)); + final List<ColumnFamilyHandle> columnFamilyHandles = new ArrayList<>(); + + final OptimisticTransactionDB optimisticTxnDb; + try { + optimisticTxnDb = OptimisticTransactionDB.open( + options, dbFolder.getRoot().getAbsolutePath(), + columnFamilyDescriptors, columnFamilyHandles); + } catch(final RocksDBException e) { + columnFamilyOptions.close(); + options.close(); + throw e; + } + + final WriteOptions writeOptions = new WriteOptions(); + final OptimisticTransactionOptions optimisticTxnOptions = + new OptimisticTransactionOptions(); + + return new OptimisticTransactionDBContainer(optimisticTxnOptions, + writeOptions, columnFamilyHandles, optimisticTxnDb, columnFamilyOptions, + options); + } + + private static class OptimisticTransactionDBContainer + extends DBContainer { + + private final OptimisticTransactionOptions optimisticTxnOptions; + private final OptimisticTransactionDB optimisticTxnDb; + + public OptimisticTransactionDBContainer( + final OptimisticTransactionOptions optimisticTxnOptions, + final WriteOptions writeOptions, + final List<ColumnFamilyHandle> columnFamilyHandles, + final OptimisticTransactionDB optimisticTxnDb, + final ColumnFamilyOptions columnFamilyOptions, + final DBOptions options) { + super(writeOptions, columnFamilyHandles, columnFamilyOptions, + options); + this.optimisticTxnOptions = optimisticTxnOptions; + this.optimisticTxnDb = optimisticTxnDb; + } + + @Override + public Transaction beginTransaction() { + return optimisticTxnDb.beginTransaction(writeOptions, + optimisticTxnOptions); + } + + @Override + public Transaction beginTransaction(final WriteOptions writeOptions) { + return optimisticTxnDb.beginTransaction(writeOptions, + optimisticTxnOptions); + } + + @Override + public void close() { + optimisticTxnOptions.close(); + writeOptions.close(); + for(final ColumnFamilyHandle columnFamilyHandle : columnFamilyHandles) { + columnFamilyHandle.close(); + } + optimisticTxnDb.close(); + options.close(); + } + } +} diff --git a/java/src/test/java/org/rocksdb/TransactionDBOptionsTest.java b/java/src/test/java/org/rocksdb/TransactionDBOptionsTest.java new file mode 100644 index 00000000000..7eaa6b16cdf --- /dev/null +++ b/java/src/test/java/org/rocksdb/TransactionDBOptionsTest.java @@ -0,0 +1,64 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory).
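TransactionDBOptionsTest below simply round-trips each setter/getter pair with random values. In practice these options are set once and passed to TransactionDB.open; a sketch with arbitrary example values, assuming the usual fluent setters (the wrapper method is a placeholder, not part of the patch):

import org.rocksdb.*;

// Sketch: tune lock bookkeeping before opening a TransactionDB.
// The numbers are arbitrary illustrations; the caller still owns and must
// eventually close the options objects and the database handle.
static TransactionDB openTunedTxnDb(final String path) throws RocksDBException {
  final Options options = new Options().setCreateIfMissing(true);
  final TransactionDBOptions txnDbOptions = new TransactionDBOptions()
      .setMaxNumLocks(10_000)           // bound the size of the lock table
      .setTransactionLockTimeout(100)   // ms a transaction waits for a key lock
      .setDefaultLockTimeout(1_000);    // ms fallback for non-transactional writes
  return TransactionDB.open(options, txnDbOptions, path);
}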
+ +package org.rocksdb; + +import org.junit.Test; + +import java.util.Random; + +import static org.assertj.core.api.Assertions.assertThat; + +public class TransactionDBOptionsTest { + + private static final Random rand = PlatformRandomHelper. + getPlatformSpecificRandomFactory(); + + @Test + public void maxNumLocks() { + try (final TransactionDBOptions opt = new TransactionDBOptions()) { + final long longValue = rand.nextLong(); + opt.setMaxNumLocks(longValue); + assertThat(opt.getMaxNumLocks()).isEqualTo(longValue); + } + } + + @Test + public void maxNumStripes() { + try (final TransactionDBOptions opt = new TransactionDBOptions()) { + final long longValue = rand.nextLong(); + opt.setNumStripes(longValue); + assertThat(opt.getNumStripes()).isEqualTo(longValue); + } + } + + @Test + public void transactionLockTimeout() { + try (final TransactionDBOptions opt = new TransactionDBOptions()) { + final long longValue = rand.nextLong(); + opt.setTransactionLockTimeout(longValue); + assertThat(opt.getTransactionLockTimeout()).isEqualTo(longValue); + } + } + + @Test + public void defaultLockTimeout() { + try (final TransactionDBOptions opt = new TransactionDBOptions()) { + final long longValue = rand.nextLong(); + opt.setDefaultLockTimeout(longValue); + assertThat(opt.getDefaultLockTimeout()).isEqualTo(longValue); + } + } + + @Test + public void writePolicy() { + try (final TransactionDBOptions opt = new TransactionDBOptions()) { + final TxnDBWritePolicy writePolicy = TxnDBWritePolicy.WRITE_UNPREPARED; // non-default + opt.setWritePolicy(writePolicy); + assertThat(opt.getWritePolicy()).isEqualTo(writePolicy); + } + } + +} diff --git a/java/src/test/java/org/rocksdb/TransactionDBTest.java b/java/src/test/java/org/rocksdb/TransactionDBTest.java new file mode 100644 index 00000000000..b0ea813ff5e --- /dev/null +++ b/java/src/test/java/org/rocksdb/TransactionDBTest.java @@ -0,0 +1,178 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
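TransactionDBTest below mirrors the optimistic suite but for the pessimistic TransactionDB, which takes key locks eagerly: in TransactionTest further down, the conflicting writer fails at put() with Status.Code.TimedOut instead of at commit. The lockStatusData test also introspects held locks through getLockStatusData(); a small sketch of walking that map (types as in the test, assuming tdb is an open TransactionDB with an active transaction holding locks):

// Sketch: dump currently held transaction locks.
final Map<Long, TransactionDB.KeyLockInfo> lockStatus = tdb.getLockStatusData();
for (final Map.Entry<Long, TransactionDB.KeyLockInfo> entry : lockStatus.entrySet()) {
  final TransactionDB.KeyLockInfo info = entry.getValue();
  System.out.printf("cf=%d key=%s exclusive=%b txns=%s%n",
      entry.getKey(), info.getKey(), info.isExclusive(),
      java.util.Arrays.toString(info.getTransactionIDs()));
}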
+ +package org.rocksdb; + +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.TemporaryFolder; + +import java.util.*; + +import static org.assertj.core.api.Assertions.assertThat; +import static java.nio.charset.StandardCharsets.UTF_8; + +public class TransactionDBTest { + + @Rule + public TemporaryFolder dbFolder = new TemporaryFolder(); + + @Test + public void open() throws RocksDBException { + try (final Options options = new Options().setCreateIfMissing(true); + final TransactionDBOptions txnDbOptions = new TransactionDBOptions(); + final TransactionDB tdb = TransactionDB.open(options, txnDbOptions, + dbFolder.getRoot().getAbsolutePath())) { + assertThat(tdb).isNotNull(); + } + } + + @Test + public void open_columnFamilies() throws RocksDBException { + try(final DBOptions dbOptions = new DBOptions().setCreateIfMissing(true) + .setCreateMissingColumnFamilies(true); + final ColumnFamilyOptions myCfOpts = new ColumnFamilyOptions()) { + + final List<ColumnFamilyDescriptor> columnFamilyDescriptors = + Arrays.asList( + new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY), + new ColumnFamilyDescriptor("myCf".getBytes(), myCfOpts)); + + final List<ColumnFamilyHandle> columnFamilyHandles = new ArrayList<>(); + + try (final TransactionDBOptions txnDbOptions = new TransactionDBOptions(); + final TransactionDB tdb = TransactionDB.open(dbOptions, txnDbOptions, + dbFolder.getRoot().getAbsolutePath(), + columnFamilyDescriptors, columnFamilyHandles)) { + try { + assertThat(tdb).isNotNull(); + } finally { + for (final ColumnFamilyHandle handle : columnFamilyHandles) { + handle.close(); + } + } + } + } + } + + @Test + public void beginTransaction() throws RocksDBException { + try (final Options options = new Options().setCreateIfMissing(true); + final TransactionDBOptions txnDbOptions = new TransactionDBOptions(); + final TransactionDB tdb = TransactionDB.open(options, txnDbOptions, + dbFolder.getRoot().getAbsolutePath()); + final WriteOptions writeOptions = new WriteOptions()) { + + try(final Transaction txn = tdb.beginTransaction(writeOptions)) { + assertThat(txn).isNotNull(); + } + } + } + + @Test + public void beginTransaction_transactionOptions() throws RocksDBException { + try (final Options options = new Options().setCreateIfMissing(true); + final TransactionDBOptions txnDbOptions = new TransactionDBOptions(); + final TransactionDB tdb = TransactionDB.open(options, txnDbOptions, + dbFolder.getRoot().getAbsolutePath()); + final WriteOptions writeOptions = new WriteOptions(); + final TransactionOptions txnOptions = new TransactionOptions()) { + + try(final Transaction txn = tdb.beginTransaction(writeOptions, + txnOptions)) { + assertThat(txn).isNotNull(); + } + } + } + + @Test + public void beginTransaction_withOld() throws RocksDBException { + try (final Options options = new Options().setCreateIfMissing(true); + final TransactionDBOptions txnDbOptions = new TransactionDBOptions(); + final TransactionDB tdb = TransactionDB.open(options, txnDbOptions, + dbFolder.getRoot().getAbsolutePath()); + final WriteOptions writeOptions = new WriteOptions()) { + + try(final Transaction txn = tdb.beginTransaction(writeOptions)) { + final Transaction txnReused = tdb.beginTransaction(writeOptions, txn); + assertThat(txnReused).isSameAs(txn); + } + } + } + + @Test + public void beginTransaction_withOld_transactionOptions() + throws RocksDBException { + try (final Options options = new Options().setCreateIfMissing(true); + final TransactionDBOptions txnDbOptions = new TransactionDBOptions(); + final TransactionDB tdb =
TransactionDB.open(options, txnDbOptions, + dbFolder.getRoot().getAbsolutePath()); + final WriteOptions writeOptions = new WriteOptions(); + final TransactionOptions txnOptions = new TransactionOptions()) { + + try(final Transaction txn = tdb.beginTransaction(writeOptions)) { + final Transaction txnReused = tdb.beginTransaction(writeOptions, + txnOptions, txn); + assertThat(txnReused).isSameAs(txn); + } + } + } + + @Test + public void lockStatusData() throws RocksDBException { + try (final Options options = new Options().setCreateIfMissing(true); + final TransactionDBOptions txnDbOptions = new TransactionDBOptions(); + final TransactionDB tdb = TransactionDB.open(options, txnDbOptions, + dbFolder.getRoot().getAbsolutePath()); + final WriteOptions writeOptions = new WriteOptions(); + final ReadOptions readOptions = new ReadOptions()) { + + try (final Transaction txn = tdb.beginTransaction(writeOptions)) { + + final byte key[] = "key".getBytes(UTF_8); + final byte value[] = "value".getBytes(UTF_8); + + txn.put(key, value); + assertThat(txn.getForUpdate(readOptions, key, true)).isEqualTo(value); + + final Map<Long, TransactionDB.KeyLockInfo> lockStatus = + tdb.getLockStatusData(); + + assertThat(lockStatus.size()).isEqualTo(1); + final Set<Map.Entry<Long, TransactionDB.KeyLockInfo>> entrySet = lockStatus.entrySet(); + final Map.Entry<Long, TransactionDB.KeyLockInfo> entry = entrySet.iterator().next(); + final long columnFamilyId = entry.getKey(); + assertThat(columnFamilyId).isEqualTo(0); + final TransactionDB.KeyLockInfo keyLockInfo = entry.getValue(); + assertThat(keyLockInfo.getKey()).isEqualTo(new String(key, UTF_8)); + assertThat(keyLockInfo.getTransactionIDs().length).isEqualTo(1); + assertThat(keyLockInfo.getTransactionIDs()[0]).isEqualTo(txn.getId()); + assertThat(keyLockInfo.isExclusive()).isTrue(); + } + } + } + + @Test + public void deadlockInfoBuffer() throws RocksDBException { + try (final Options options = new Options().setCreateIfMissing(true); + final TransactionDBOptions txnDbOptions = new TransactionDBOptions(); + final TransactionDB tdb = TransactionDB.open(options, txnDbOptions, + dbFolder.getRoot().getAbsolutePath())) { + + // TODO(AR) can we cause a deadlock so that we can test the output here? + assertThat(tdb.getDeadlockInfoBuffer()).isEmpty(); + } + } + + @Test + public void setDeadlockInfoBufferSize() throws RocksDBException { + try (final Options options = new Options().setCreateIfMissing(true); + final TransactionDBOptions txnDbOptions = new TransactionDBOptions(); + final TransactionDB tdb = TransactionDB.open(options, txnDbOptions, + dbFolder.getRoot().getAbsolutePath())) { + tdb.setDeadlockInfoBufferSize(123); + } + } +} diff --git a/java/src/test/java/org/rocksdb/TransactionOptionsTest.java b/java/src/test/java/org/rocksdb/TransactionOptionsTest.java new file mode 100644 index 00000000000..add0439e03a --- /dev/null +++ b/java/src/test/java/org/rocksdb/TransactionOptionsTest.java @@ -0,0 +1,72 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +package org.rocksdb; + +import org.junit.Test; + +import java.util.Random; + +import static org.assertj.core.api.Assertions.assertThat; + +public class TransactionOptionsTest { + + private static final Random rand = PlatformRandomHelper.
+ getPlatformSpecificRandomFactory(); + + @Test + public void snapshot() { + try (final TransactionOptions opt = new TransactionOptions()) { + final boolean boolValue = rand.nextBoolean(); + opt.setSetSnapshot(boolValue); + assertThat(opt.isSetSnapshot()).isEqualTo(boolValue); + } + } + + @Test + public void deadlockDetect() { + try (final TransactionOptions opt = new TransactionOptions()) { + final boolean boolValue = rand.nextBoolean(); + opt.setDeadlockDetect(boolValue); + assertThat(opt.isDeadlockDetect()).isEqualTo(boolValue); + } + } + + @Test + public void lockTimeout() { + try (final TransactionOptions opt = new TransactionOptions()) { + final long longValue = rand.nextLong(); + opt.setLockTimeout(longValue); + assertThat(opt.getLockTimeout()).isEqualTo(longValue); + } + } + + @Test + public void expiration() { + try (final TransactionOptions opt = new TransactionOptions()) { + final long longValue = rand.nextLong(); + opt.setExpiration(longValue); + assertThat(opt.getExpiration()).isEqualTo(longValue); + } + } + + @Test + public void deadlockDetectDepth() { + try (final TransactionOptions opt = new TransactionOptions()) { + final long longValue = rand.nextLong(); + opt.setDeadlockDetectDepth(longValue); + assertThat(opt.getDeadlockDetectDepth()).isEqualTo(longValue); + } + } + + @Test + public void maxWriteBatchSize() { + try (final TransactionOptions opt = new TransactionOptions()) { + final long longValue = rand.nextLong(); + opt.setMaxWriteBatchSize(longValue); + assertThat(opt.getMaxWriteBatchSize()).isEqualTo(longValue); + } + } +} diff --git a/java/src/test/java/org/rocksdb/TransactionTest.java b/java/src/test/java/org/rocksdb/TransactionTest.java new file mode 100644 index 00000000000..57a05c9e3af --- /dev/null +++ b/java/src/test/java/org/rocksdb/TransactionTest.java @@ -0,0 +1,308 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +package org.rocksdb; + +import org.junit.Test; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +import static java.nio.charset.StandardCharsets.UTF_8; +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.fail; + +public class TransactionTest extends AbstractTransactionTest { + + @Test + public void getForUpdate_cf_conflict() throws RocksDBException { + final byte k1[] = "key1".getBytes(UTF_8); + final byte v1[] = "value1".getBytes(UTF_8); + final byte v12[] = "value12".getBytes(UTF_8); + try(final DBContainer dbContainer = startDb(); + final ReadOptions readOptions = new ReadOptions()) { + final ColumnFamilyHandle testCf = dbContainer.getTestColumnFamily(); + + try(final Transaction txn = dbContainer.beginTransaction()) { + txn.put(testCf, k1, v1); + assertThat(txn.getForUpdate(readOptions, testCf, k1, true)).isEqualTo(v1); + txn.commit(); + } + + try(final Transaction txn2 = dbContainer.beginTransaction()) { + try(final Transaction txn3 = dbContainer.beginTransaction()) { + assertThat(txn3.getForUpdate(readOptions, testCf, k1, true)).isEqualTo(v1); + + // NOTE: txn2 updates k1, during txn3 + try { + txn2.put(testCf, k1, v12); // should cause an exception! 
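// The put above is expected to fail with Status.Code.TimedOut: txn3 still
// holds the exclusive lock on k1 that it acquired via getForUpdate, and a
// pessimistic TransactionDB makes a conflicting writer wait until its lock
// timeout expires rather than silently overwriting.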
+ } catch(final RocksDBException e) { + assertThat(e.getStatus().getCode()).isSameAs(Status.Code.TimedOut); + return; + } + } + } + + fail("Expected an exception for put after getForUpdate from conflicting" + + "transactions"); + } + } + + @Test + public void getForUpdate_conflict() throws RocksDBException { + final byte k1[] = "key1".getBytes(UTF_8); + final byte v1[] = "value1".getBytes(UTF_8); + final byte v12[] = "value12".getBytes(UTF_8); + try(final DBContainer dbContainer = startDb(); + final ReadOptions readOptions = new ReadOptions()) { + + try(final Transaction txn = dbContainer.beginTransaction()) { + txn.put(k1, v1); + assertThat(txn.getForUpdate(readOptions, k1, true)).isEqualTo(v1); + txn.commit(); + } + + try(final Transaction txn2 = dbContainer.beginTransaction()) { + try(final Transaction txn3 = dbContainer.beginTransaction()) { + assertThat(txn3.getForUpdate(readOptions, k1, true)).isEqualTo(v1); + + // NOTE: txn2 updates k1, during txn3 + try { + txn2.put(k1, v12); // should cause an exception! + } catch(final RocksDBException e) { + assertThat(e.getStatus().getCode()).isSameAs(Status.Code.TimedOut); + return; + } + } + } + + fail("Expected an exception for put after getForUpdate from conflicting" + + "transactions"); + } + } + + @Test + public void multiGetForUpdate_cf_conflict() throws RocksDBException { + final byte keys[][] = new byte[][] { + "key1".getBytes(UTF_8), + "key2".getBytes(UTF_8)}; + final byte values[][] = new byte[][] { + "value1".getBytes(UTF_8), + "value2".getBytes(UTF_8)}; + final byte[] otherValue = "otherValue".getBytes(UTF_8); + + try(final DBContainer dbContainer = startDb(); + final ReadOptions readOptions = new ReadOptions()) { + final ColumnFamilyHandle testCf = dbContainer.getTestColumnFamily(); + final List cfList = Arrays.asList(testCf, testCf); + + try(final Transaction txn = dbContainer.beginTransaction()) { + txn.put(testCf, keys[0], values[0]); + txn.put(testCf, keys[1], values[1]); + assertThat(txn.multiGet(readOptions, cfList, keys)).isEqualTo(values); + txn.commit(); + } + + try(final Transaction txn2 = dbContainer.beginTransaction()) { + try(final Transaction txn3 = dbContainer.beginTransaction()) { + assertThat(txn3.multiGetForUpdate(readOptions, cfList, keys)) + .isEqualTo(values); + + // NOTE: txn2 updates k1, during txn3 + try { + txn2.put(testCf, keys[0], otherValue); // should cause an exception! 
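// Same conflict shape as getForUpdate_cf_conflict, but via the batch API:
// multiGetForUpdate takes an exclusive lock on every key it reads, so txn2's
// write to keys[0] is likewise expected to time out while txn3 is open.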
+ } catch(final RocksDBException e) { + assertThat(e.getStatus().getCode()).isSameAs(Status.Code.TimedOut); + return; + } + } + } + + fail("Expected an exception for put after getForUpdate from conflicting" + + "transactions"); + } + } + + @Test + public void multiGetForUpdate_conflict() throws RocksDBException { + final byte keys[][] = new byte[][] { + "key1".getBytes(UTF_8), + "key2".getBytes(UTF_8)}; + final byte values[][] = new byte[][] { + "value1".getBytes(UTF_8), + "value2".getBytes(UTF_8)}; + final byte[] otherValue = "otherValue".getBytes(UTF_8); + + try(final DBContainer dbContainer = startDb(); + final ReadOptions readOptions = new ReadOptions()) { + try(final Transaction txn = dbContainer.beginTransaction()) { + txn.put(keys[0], values[0]); + txn.put(keys[1], values[1]); + assertThat(txn.multiGet(readOptions, keys)).isEqualTo(values); + txn.commit(); + } + + try(final Transaction txn2 = dbContainer.beginTransaction()) { + try(final Transaction txn3 = dbContainer.beginTransaction()) { + assertThat(txn3.multiGetForUpdate(readOptions, keys)) + .isEqualTo(values); + + // NOTE: txn2 updates k1, during txn3 + try { + txn2.put(keys[0], otherValue); // should cause an exception! + } catch(final RocksDBException e) { + assertThat(e.getStatus().getCode()).isSameAs(Status.Code.TimedOut); + return; + } + } + } + + fail("Expected an exception for put after getForUpdate from conflicting" + + "transactions"); + } + } + + @Test + public void name() throws RocksDBException { + try(final DBContainer dbContainer = startDb(); + final Transaction txn = dbContainer.beginTransaction()) { + assertThat(txn.getName()).isEmpty(); + final String name = "my-transaction-" + rand.nextLong(); + txn.setName(name); + assertThat(txn.getName()).isEqualTo(name); + } + } + + @Test + public void ID() throws RocksDBException { + try(final DBContainer dbContainer = startDb(); + final Transaction txn = dbContainer.beginTransaction()) { + assertThat(txn.getID()).isGreaterThan(0); + } + } + + @Test + public void deadlockDetect() throws RocksDBException { + try(final DBContainer dbContainer = startDb(); + final Transaction txn = dbContainer.beginTransaction()) { + assertThat(txn.isDeadlockDetect()).isFalse(); + } + } + + @Test + public void waitingTxns() throws RocksDBException { + try(final DBContainer dbContainer = startDb(); + final Transaction txn = dbContainer.beginTransaction()) { + assertThat(txn.getWaitingTxns().getTransactionIds().length).isEqualTo(0); + } + } + + @Test + public void state() throws RocksDBException { + try(final DBContainer dbContainer = startDb()) { + + try(final Transaction txn = dbContainer.beginTransaction()) { + assertThat(txn.getState()) + .isSameAs(Transaction.TransactionState.STARTED); + txn.commit(); + assertThat(txn.getState()) + .isSameAs(Transaction.TransactionState.COMMITED); + } + + try(final Transaction txn = dbContainer.beginTransaction()) { + assertThat(txn.getState()) + .isSameAs(Transaction.TransactionState.STARTED); + txn.rollback(); + assertThat(txn.getState()) + .isSameAs(Transaction.TransactionState.STARTED); + } + } + } + + @Test + public void Id() throws RocksDBException { + try(final DBContainer dbContainer = startDb(); + final Transaction txn = dbContainer.beginTransaction()) { + assertThat(txn.getId()).isNotNull(); + } + } + + @Override + public TransactionDBContainer startDb() throws RocksDBException { + final DBOptions options = new DBOptions() + .setCreateIfMissing(true) + .setCreateMissingColumnFamilies(true); + final TransactionDBOptions txnDbOptions = new 
TransactionDBOptions(); + final ColumnFamilyOptions columnFamilyOptions = new ColumnFamilyOptions(); + final List columnFamilyDescriptors = + Arrays.asList( + new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY), + new ColumnFamilyDescriptor(TXN_TEST_COLUMN_FAMILY, + columnFamilyOptions)); + final List columnFamilyHandles = new ArrayList<>(); + + final TransactionDB txnDb; + try { + txnDb = TransactionDB.open(options, txnDbOptions, + dbFolder.getRoot().getAbsolutePath(), columnFamilyDescriptors, + columnFamilyHandles); + } catch(final RocksDBException e) { + columnFamilyOptions.close(); + txnDbOptions.close(); + options.close(); + throw e; + } + + final WriteOptions writeOptions = new WriteOptions(); + final TransactionOptions txnOptions = new TransactionOptions(); + + return new TransactionDBContainer(txnOptions, writeOptions, + columnFamilyHandles, txnDb, txnDbOptions, columnFamilyOptions, options); + } + + private static class TransactionDBContainer + extends DBContainer { + private final TransactionOptions txnOptions; + private final TransactionDB txnDb; + private final TransactionDBOptions txnDbOptions; + + public TransactionDBContainer( + final TransactionOptions txnOptions, final WriteOptions writeOptions, + final List columnFamilyHandles, + final TransactionDB txnDb, final TransactionDBOptions txnDbOptions, + final ColumnFamilyOptions columnFamilyOptions, + final DBOptions options) { + super(writeOptions, columnFamilyHandles, columnFamilyOptions, + options); + this.txnOptions = txnOptions; + this.txnDb = txnDb; + this.txnDbOptions = txnDbOptions; + } + + @Override + public Transaction beginTransaction() { + return txnDb.beginTransaction(writeOptions, txnOptions); + } + + @Override + public Transaction beginTransaction(final WriteOptions writeOptions) { + return txnDb.beginTransaction(writeOptions, txnOptions); + } + + @Override + public void close() { + txnOptions.close(); + writeOptions.close(); + for(final ColumnFamilyHandle columnFamilyHandle : columnFamilyHandles) { + columnFamilyHandle.close(); + } + txnDb.close(); + txnDbOptions.close(); + options.close(); + } + } + +} diff --git a/java/src/test/java/org/rocksdb/WriteBatchHandlerTest.java b/java/src/test/java/org/rocksdb/WriteBatchHandlerTest.java index 646a31ce7c7..0c7b0d3cad3 100644 --- a/java/src/test/java/org/rocksdb/WriteBatchHandlerTest.java +++ b/java/src/test/java/org/rocksdb/WriteBatchHandlerTest.java @@ -5,15 +5,16 @@ package org.rocksdb; -import java.io.IOException; -import java.util.ArrayList; import java.util.Arrays; import java.util.List; import org.junit.ClassRule; import org.junit.Test; +import org.rocksdb.util.CapturingWriteBatchHandler; +import org.rocksdb.util.CapturingWriteBatchHandler.Event; import static org.assertj.core.api.Assertions.assertThat; +import static org.rocksdb.util.CapturingWriteBatchHandler.Action.*; public class WriteBatchHandlerTest { @@ -22,45 +23,37 @@ public class WriteBatchHandlerTest { new RocksMemoryResource(); @Test - public void writeBatchHandler() throws IOException, RocksDBException { + public void writeBatchHandler() throws RocksDBException { // setup test data - final List>> testEvents = Arrays.asList( - new Tuple<>(Action.DELETE, - new Tuple("k0".getBytes(), null)), - new Tuple<>(Action.PUT, - new Tuple<>("k1".getBytes(), "v1".getBytes())), - new Tuple<>(Action.PUT, - new Tuple<>("k2".getBytes(), "v2".getBytes())), - new Tuple<>(Action.PUT, - new Tuple<>("k3".getBytes(), "v3".getBytes())), - new Tuple<>(Action.LOG, - new Tuple(null, "log1".getBytes())), - new 
Tuple<>(Action.MERGE, - new Tuple<>("k2".getBytes(), "v22".getBytes())), - new Tuple<>(Action.DELETE, - new Tuple("k3".getBytes(), null)) + final List testEvents = Arrays.asList( + new Event(DELETE, "k0".getBytes(), null), + new Event(PUT, "k1".getBytes(), "v1".getBytes()), + new Event(PUT, "k2".getBytes(), "v2".getBytes()), + new Event(PUT, "k3".getBytes(), "v3".getBytes()), + new Event(LOG, null, "log1".getBytes()), + new Event(MERGE, "k2".getBytes(), "v22".getBytes()), + new Event(DELETE, "k3".getBytes(), null) ); // load test data to the write batch try (final WriteBatch batch = new WriteBatch()) { - for (final Tuple> testEvent : testEvents) { - final Tuple data = testEvent.value; - switch (testEvent.key) { + for (final Event testEvent : testEvents) { + switch (testEvent.action) { case PUT: - batch.put(data.key, data.value); + batch.put(testEvent.key, testEvent.value); break; case MERGE: - batch.merge(data.key, data.value); + batch.merge(testEvent.key, testEvent.value); break; case DELETE: - batch.remove(data.key); + batch.remove(testEvent.key); break; case LOG: - batch.putLogData(data.value); + batch.putLogData(testEvent.value); break; } } @@ -72,98 +65,12 @@ public void writeBatchHandler() throws IOException, RocksDBException { batch.iterate(handler); // compare the results to the test data - final List>> actualEvents = + final List actualEvents = handler.getEvents(); assertThat(testEvents.size()).isSameAs(actualEvents.size()); - for (int i = 0; i < testEvents.size(); i++) { - assertThat(equals(testEvents.get(i), actualEvents.get(i))).isTrue(); - } + assertThat(testEvents).isEqualTo(actualEvents); } } } - - private static boolean equals( - final Tuple> expected, - final Tuple> actual) { - if (!expected.key.equals(actual.key)) { - return false; - } - - final Tuple expectedData = expected.value; - final Tuple actualData = actual.value; - - return equals(expectedData.key, actualData.key) - && equals(expectedData.value, actualData.value); - } - - private static boolean equals(byte[] expected, byte[] actual) { - if (expected != null) { - return Arrays.equals(expected, actual); - } else { - return actual == null; - } - } - - private static class Tuple { - public final K key; - public final V value; - - public Tuple(final K key, final V value) { - this.key = key; - this.value = value; - } - } - - /** - * Enumeration of Write Batch - * event actions - */ - private enum Action { PUT, MERGE, DELETE, DELETE_RANGE, LOG } - - /** - * A simple WriteBatch Handler which adds a record - * of each event that it receives to a list - */ - private static class CapturingWriteBatchHandler extends WriteBatch.Handler { - - private final List>> events - = new ArrayList<>(); - - /** - * Returns a copy of the current events list - * - * @return a list of the events which have happened upto now - */ - public List>> getEvents() { - return new ArrayList<>(events); - } - - @Override - public void put(final byte[] key, final byte[] value) { - events.add(new Tuple<>(Action.PUT, new Tuple<>(key, value))); - } - - @Override - public void merge(final byte[] key, final byte[] value) { - events.add(new Tuple<>(Action.MERGE, new Tuple<>(key, value))); - } - - @Override - public void delete(final byte[] key) { - events.add(new Tuple<>(Action.DELETE, - new Tuple(key, null))); - } - - @Override - public void deleteRange(final byte[] beginKey, final byte[] endKey) { - events.add(new Tuple<>(Action.DELETE_RANGE, new Tuple(beginKey, endKey))); - } - - @Override - public void logData(final byte[] blob) { - events.add(new 
Tuple<>(Action.LOG, - new Tuple(null, blob))); - } - } } diff --git a/java/src/test/java/org/rocksdb/WriteBatchTest.java b/java/src/test/java/org/rocksdb/WriteBatchTest.java index 83f90c8eb45..1e3e50b7e47 100644 --- a/java/src/test/java/org/rocksdb/WriteBatchTest.java +++ b/java/src/test/java/org/rocksdb/WriteBatchTest.java @@ -12,20 +12,17 @@ import org.junit.Rule; import org.junit.Test; import org.junit.rules.TemporaryFolder; - -import java.io.UnsupportedEncodingException; -import java.util.Arrays; +import org.rocksdb.util.CapturingWriteBatchHandler; +import org.rocksdb.util.CapturingWriteBatchHandler.Event; +import org.rocksdb.util.WriteBatchGetter; import static org.assertj.core.api.Assertions.assertThat; +import static org.rocksdb.util.CapturingWriteBatchHandler.Action.*; +import static java.nio.charset.StandardCharsets.UTF_8; /** * This class mimics the db/write_batch_test.cc * in the c++ rocksdb library. - *
<p> - * Not ported yet: - * <p>
- * Continue(); - * PutGatherSlices(); */ public class WriteBatchTest { @ClassRule @@ -44,27 +41,45 @@ public void emptyWriteBatch() { @Test public void multipleBatchOperations() - throws UnsupportedEncodingException { - try (WriteBatch batch = new WriteBatch()) { - batch.put("foo".getBytes("US-ASCII"), "bar".getBytes("US-ASCII")); - batch.remove("box".getBytes("US-ASCII")); - batch.put("baz".getBytes("US-ASCII"), "boo".getBytes("US-ASCII")); - - WriteBatchTestInternalHelper.setSequence(batch, 100); - assertThat(WriteBatchTestInternalHelper.sequence(batch)). - isNotNull(). - isEqualTo(100); - assertThat(batch.count()).isEqualTo(3); - assertThat(new String(getContents(batch), "US-ASCII")). - isEqualTo("Put(baz, boo)@102" + - "Delete(box)@101" + - "Put(foo, bar)@100"); + throws RocksDBException { + + final byte[] foo = "foo".getBytes(UTF_8); + final byte[] bar = "bar".getBytes(UTF_8); + final byte[] box = "box".getBytes(UTF_8); + final byte[] baz = "baz".getBytes(UTF_8); + final byte[] boo = "boo".getBytes(UTF_8); + final byte[] hoo = "hoo".getBytes(UTF_8); + final byte[] hello = "hello".getBytes(UTF_8); + + try (final WriteBatch batch = new WriteBatch()) { + batch.put(foo, bar); + batch.delete(box); + batch.put(baz, boo); + batch.merge(baz, hoo); + batch.singleDelete(foo); + batch.deleteRange(baz, foo); + batch.putLogData(hello); + + try(final CapturingWriteBatchHandler handler = + new CapturingWriteBatchHandler()) { + batch.iterate(handler); + + assertThat(handler.getEvents().size()).isEqualTo(7); + + assertThat(handler.getEvents().get(0)).isEqualTo(new Event(PUT, foo, bar)); + assertThat(handler.getEvents().get(1)).isEqualTo(new Event(DELETE, box, null)); + assertThat(handler.getEvents().get(2)).isEqualTo(new Event(PUT, baz, boo)); + assertThat(handler.getEvents().get(3)).isEqualTo(new Event(MERGE, baz, hoo)); + assertThat(handler.getEvents().get(4)).isEqualTo(new Event(SINGLE_DELETE, foo, null)); + assertThat(handler.getEvents().get(5)).isEqualTo(new Event(DELETE_RANGE, baz, foo)); + assertThat(handler.getEvents().get(6)).isEqualTo(new Event(LOG, null, hello)); + } } } @Test public void testAppendOperation() - throws UnsupportedEncodingException { + throws RocksDBException { try (final WriteBatch b1 = new WriteBatch(); final WriteBatch b2 = new WriteBatch()) { WriteBatchTestInternalHelper.setSequence(b1, 200); @@ -72,67 +87,66 @@ public void testAppendOperation() WriteBatchTestInternalHelper.append(b1, b2); assertThat(getContents(b1).length).isEqualTo(0); assertThat(b1.count()).isEqualTo(0); - b2.put("a".getBytes("US-ASCII"), "va".getBytes("US-ASCII")); + b2.put("a".getBytes(UTF_8), "va".getBytes(UTF_8)); WriteBatchTestInternalHelper.append(b1, b2); assertThat("Put(a, va)@200".equals(new String(getContents(b1), - "US-ASCII"))); + UTF_8))); assertThat(b1.count()).isEqualTo(1); b2.clear(); - b2.put("b".getBytes("US-ASCII"), "vb".getBytes("US-ASCII")); + b2.put("b".getBytes(UTF_8), "vb".getBytes(UTF_8)); WriteBatchTestInternalHelper.append(b1, b2); assertThat(("Put(a, va)@200" + "Put(b, vb)@201") - .equals(new String(getContents(b1), "US-ASCII"))); + .equals(new String(getContents(b1), UTF_8))); assertThat(b1.count()).isEqualTo(2); - b2.remove("foo".getBytes("US-ASCII")); + b2.delete("foo".getBytes(UTF_8)); WriteBatchTestInternalHelper.append(b1, b2); assertThat(("Put(a, va)@200" + "Put(b, vb)@202" + "Put(b, vb)@201" + "Delete(foo)@203") - .equals(new String(getContents(b1), "US-ASCII"))); + .equals(new String(getContents(b1), UTF_8))); assertThat(b1.count()).isEqualTo(4); } } @Test public 
void blobOperation() - throws UnsupportedEncodingException { + throws RocksDBException { try (final WriteBatch batch = new WriteBatch()) { - batch.put("k1".getBytes("US-ASCII"), "v1".getBytes("US-ASCII")); - batch.put("k2".getBytes("US-ASCII"), "v2".getBytes("US-ASCII")); - batch.put("k3".getBytes("US-ASCII"), "v3".getBytes("US-ASCII")); - batch.putLogData("blob1".getBytes("US-ASCII")); - batch.remove("k2".getBytes("US-ASCII")); - batch.putLogData("blob2".getBytes("US-ASCII")); - batch.merge("foo".getBytes("US-ASCII"), "bar".getBytes("US-ASCII")); + batch.put("k1".getBytes(UTF_8), "v1".getBytes(UTF_8)); + batch.put("k2".getBytes(UTF_8), "v2".getBytes(UTF_8)); + batch.put("k3".getBytes(UTF_8), "v3".getBytes(UTF_8)); + batch.putLogData("blob1".getBytes(UTF_8)); + batch.delete("k2".getBytes(UTF_8)); + batch.putLogData("blob2".getBytes(UTF_8)); + batch.merge("foo".getBytes(UTF_8), "bar".getBytes(UTF_8)); assertThat(batch.count()).isEqualTo(5); assertThat(("Merge(foo, bar)@4" + "Put(k1, v1)@0" + "Delete(k2)@3" + "Put(k2, v2)@1" + "Put(k3, v3)@2") - .equals(new String(getContents(batch), "US-ASCII"))); + .equals(new String(getContents(batch), UTF_8))); } } @Test public void savePoints() - throws UnsupportedEncodingException, RocksDBException { + throws RocksDBException { try (final WriteBatch batch = new WriteBatch()) { - batch.put("k1".getBytes("US-ASCII"), "v1".getBytes("US-ASCII")); - batch.put("k2".getBytes("US-ASCII"), "v2".getBytes("US-ASCII")); - batch.put("k3".getBytes("US-ASCII"), "v3".getBytes("US-ASCII")); + batch.put("k1".getBytes(UTF_8), "v1".getBytes(UTF_8)); + batch.put("k2".getBytes(UTF_8), "v2".getBytes(UTF_8)); + batch.put("k3".getBytes(UTF_8), "v3".getBytes(UTF_8)); assertThat(getFromWriteBatch(batch, "k1")).isEqualTo("v1"); assertThat(getFromWriteBatch(batch, "k2")).isEqualTo("v2"); assertThat(getFromWriteBatch(batch, "k3")).isEqualTo("v3"); - batch.setSavePoint(); - batch.remove("k2".getBytes("US-ASCII")); - batch.put("k3".getBytes("US-ASCII"), "v3-2".getBytes("US-ASCII")); + batch.delete("k2".getBytes(UTF_8)); + batch.put("k3".getBytes(UTF_8), "v3-2".getBytes(UTF_8)); assertThat(getFromWriteBatch(batch, "k2")).isNull(); assertThat(getFromWriteBatch(batch, "k3")).isEqualTo("v3-2"); @@ -140,8 +154,8 @@ public void savePoints() batch.setSavePoint(); - batch.put("k3".getBytes("US-ASCII"), "v3-3".getBytes("US-ASCII")); - batch.put("k4".getBytes("US-ASCII"), "v4".getBytes("US-ASCII")); + batch.put("k3".getBytes(UTF_8), "v3-3".getBytes(UTF_8)); + batch.put("k4".getBytes(UTF_8), "v4".getBytes(UTF_8)); assertThat(getFromWriteBatch(batch, "k3")).isEqualTo("v3-3"); assertThat(getFromWriteBatch(batch, "k4")).isEqualTo("v4"); @@ -187,6 +201,30 @@ public void deleteRange() throws RocksDBException { } } + @Test + public void restorePoints() throws RocksDBException { + try (final WriteBatch batch = new WriteBatch()) { + + batch.put("k1".getBytes(), "v1".getBytes()); + batch.put("k2".getBytes(), "v2".getBytes()); + + batch.setSavePoint(); + + batch.put("k1".getBytes(), "123456789".getBytes()); + batch.delete("k2".getBytes()); + + batch.rollbackToSavePoint(); + + try(final CapturingWriteBatchHandler handler = new CapturingWriteBatchHandler()) { + batch.iterate(handler); + + assertThat(handler.getEvents().size()).isEqualTo(2); + assertThat(handler.getEvents().get(0)).isEqualTo(new Event(PUT, "k1".getBytes(), "v1".getBytes())); + assertThat(handler.getEvents().get(1)).isEqualTo(new Event(PUT, "k2".getBytes(), "v2".getBytes())); + } + } + } + @Test(expected = RocksDBException.class) public void 
restorePoints_withoutSavePoints() throws RocksDBException { try (final WriteBatch batch = new WriteBatch()) { @@ -206,67 +244,222 @@ public void restorePoints_withoutSavePoints_nested() throws RocksDBException { } } - static byte[] getContents(final WriteBatch wb) { - return getContents(wb.nativeHandle_); + @Test + public void popSavePoint() throws RocksDBException { + try (final WriteBatch batch = new WriteBatch()) { + + batch.put("k1".getBytes(), "v1".getBytes()); + batch.put("k2".getBytes(), "v2".getBytes()); + + batch.setSavePoint(); + + batch.put("k1".getBytes(), "123456789".getBytes()); + batch.delete("k2".getBytes()); + + batch.setSavePoint(); + + batch.popSavePoint(); + + batch.rollbackToSavePoint(); + + try(final CapturingWriteBatchHandler handler = new CapturingWriteBatchHandler()) { + batch.iterate(handler); + + assertThat(handler.getEvents().size()).isEqualTo(2); + assertThat(handler.getEvents().get(0)).isEqualTo(new Event(PUT, "k1".getBytes(), "v1".getBytes())); + assertThat(handler.getEvents().get(1)).isEqualTo(new Event(PUT, "k2".getBytes(), "v2".getBytes())); + } + } } - static String getFromWriteBatch(final WriteBatch wb, final String key) - throws RocksDBException, UnsupportedEncodingException { - final WriteBatchGetter getter = - new WriteBatchGetter(key.getBytes("US-ASCII")); - wb.iterate(getter); - if(getter.getValue() != null) { - return new String(getter.getValue(), "US-ASCII"); - } else { - return null; + @Test(expected = RocksDBException.class) + public void popSavePoint_withoutSavePoints() throws RocksDBException { + try (final WriteBatch batch = new WriteBatch()) { + batch.popSavePoint(); } } - private static native byte[] getContents(final long writeBatchHandle); + @Test(expected = RocksDBException.class) + public void popSavePoint_withoutSavePoints_nested() throws RocksDBException { + try (final WriteBatch batch = new WriteBatch()) { - private static class WriteBatchGetter extends WriteBatch.Handler { + batch.setSavePoint(); + batch.popSavePoint(); + + // without previous corresponding setSavePoint + batch.popSavePoint(); + } + } - private final byte[] key; - private byte[] value; + @Test + public void maxBytes() throws RocksDBException { + try (final WriteBatch batch = new WriteBatch()) { + batch.setMaxBytes(19); - public WriteBatchGetter(final byte[] key) { - this.key = key; + batch.put("k1".getBytes(), "v1".getBytes()); } + } - public byte[] getValue() { - return value; + @Test(expected = RocksDBException.class) + public void maxBytes_over() throws RocksDBException { + try (final WriteBatch batch = new WriteBatch()) { + batch.setMaxBytes(1); + + batch.put("k1".getBytes(), "v1".getBytes()); } + } - @Override - public void put(final byte[] key, final byte[] value) { - if(Arrays.equals(this.key, key)) { - this.value = value; + @Test + public void data() throws RocksDBException { + try (final WriteBatch batch1 = new WriteBatch()) { + batch1.delete("k0".getBytes()); + batch1.put("k1".getBytes(), "v1".getBytes()); + batch1.put("k2".getBytes(), "v2".getBytes()); + batch1.put("k3".getBytes(), "v3".getBytes()); + batch1.putLogData("log1".getBytes()); + batch1.merge("k2".getBytes(), "v22".getBytes()); + batch1.delete("k3".getBytes()); + + final byte[] serialized = batch1.data(); + + try(final WriteBatch batch2 = new WriteBatch(serialized)) { + assertThat(batch2.count()).isEqualTo(batch1.count()); + + try(final CapturingWriteBatchHandler handler1 = new CapturingWriteBatchHandler()) { + batch1.iterate(handler1); + + try (final CapturingWriteBatchHandler handler2 = new 
CapturingWriteBatchHandler()) { + batch2.iterate(handler2); + + assertThat(handler1.getEvents().equals(handler2.getEvents())).isTrue(); + } + } } } + } - @Override - public void merge(final byte[] key, final byte[] value) { - if(Arrays.equals(this.key, key)) { - throw new UnsupportedOperationException(); - } + @Test + public void dataSize() throws RocksDBException { + try (final WriteBatch batch = new WriteBatch()) { + batch.put("k1".getBytes(), "v1".getBytes()); + + assertThat(batch.getDataSize()).isEqualTo(19); } + } - @Override - public void delete(final byte[] key) { - if(Arrays.equals(this.key, key)) { - this.value = null; - } + @Test + public void hasPut() throws RocksDBException { + try (final WriteBatch batch = new WriteBatch()) { + assertThat(batch.hasPut()).isFalse(); + + batch.put("k1".getBytes(), "v1".getBytes()); + + assertThat(batch.hasPut()).isTrue(); + } + } + + @Test + public void hasDelete() throws RocksDBException { + try (final WriteBatch batch = new WriteBatch()) { + assertThat(batch.hasDelete()).isFalse(); + + batch.delete("k1".getBytes()); + + assertThat(batch.hasDelete()).isTrue(); + } + } + + @Test + public void hasSingleDelete() throws RocksDBException { + try (final WriteBatch batch = new WriteBatch()) { + assertThat(batch.hasSingleDelete()).isFalse(); + + batch.singleDelete("k1".getBytes()); + + assertThat(batch.hasSingleDelete()).isTrue(); + } + } + + @Test + public void hasDeleteRange() throws RocksDBException { + try (final WriteBatch batch = new WriteBatch()) { + assertThat(batch.hasDeleteRange()).isFalse(); + + batch.deleteRange("k1".getBytes(), "k2".getBytes()); + + assertThat(batch.hasDeleteRange()).isTrue(); + } + } + + @Test + public void hasBeginPrepareRange() throws RocksDBException { + try (final WriteBatch batch = new WriteBatch()) { + assertThat(batch.hasBeginPrepare()).isFalse(); + } + } + + @Test + public void hasEndrepareRange() throws RocksDBException { + try (final WriteBatch batch = new WriteBatch()) { + assertThat(batch.hasEndPrepare()).isFalse(); } + } + + @Test + public void hasCommit() throws RocksDBException { + try (final WriteBatch batch = new WriteBatch()) { + assertThat(batch.hasCommit()).isFalse(); + } + } + + @Test + public void hasRollback() throws RocksDBException { + try (final WriteBatch batch = new WriteBatch()) { + assertThat(batch.hasRollback()).isFalse(); + } + } + + @Test + public void walTerminationPoint() throws RocksDBException { + try (final WriteBatch batch = new WriteBatch()) { + WriteBatch.SavePoint walTerminationPoint = batch.getWalTerminationPoint(); + assertThat(walTerminationPoint.isCleared()).isTrue(); + + batch.put("k1".getBytes(UTF_8), "v1".getBytes(UTF_8)); + + batch.markWalTerminationPoint(); + + walTerminationPoint = batch.getWalTerminationPoint(); + assertThat(walTerminationPoint.getSize()).isEqualTo(19); + assertThat(walTerminationPoint.getCount()).isEqualTo(1); + assertThat(walTerminationPoint.getContentFlags()).isEqualTo(2); + } + } - @Override - public void deleteRange(final byte[] beginKey, final byte[] endKey) { - throw new UnsupportedOperationException(); + @Test + public void getWriteBatch() { + try (final WriteBatch batch = new WriteBatch()) { + assertThat(batch.getWriteBatch()).isEqualTo(batch); } + } + + static byte[] getContents(final WriteBatch wb) { + return getContents(wb.nativeHandle_); + } - @Override - public void logData(final byte[] blob) { + static String getFromWriteBatch(final WriteBatch wb, final String key) + throws RocksDBException { + final WriteBatchGetter getter = + new 
WriteBatchGetter(key.getBytes(UTF_8)); + wb.iterate(getter); + if(getter.getValue() != null) { + return new String(getter.getValue(), UTF_8); + } else { + return null; } } + + private static native byte[] getContents(final long writeBatchHandle); } /** diff --git a/java/src/test/java/org/rocksdb/WriteBatchWithIndexTest.java b/java/src/test/java/org/rocksdb/WriteBatchWithIndexTest.java index 1c5e34234e8..061af2b8fdc 100644 --- a/java/src/test/java/org/rocksdb/WriteBatchWithIndexTest.java +++ b/java/src/test/java/org/rocksdb/WriteBatchWithIndexTest.java @@ -14,11 +14,11 @@ import org.junit.Test; import org.junit.rules.TemporaryFolder; -import java.io.UnsupportedEncodingException; import java.nio.ByteBuffer; import java.util.Arrays; import static org.assertj.core.api.Assertions.assertThat; +import static java.nio.charset.StandardCharsets.UTF_8; public class WriteBatchWithIndexTest { @@ -75,8 +75,8 @@ public void readYourOwnWrites() throws RocksDBException { assertThat(it.key()).isEqualTo(k2); assertThat(it.value()).isEqualTo(v2Other); - //remove k1 and make sure we can read back the write - wbwi.remove(k1); + //delete k1 and make sure we can read back the write + wbwi.delete(k1); it.seek(k1); assertThat(it.key()).isNotEqualTo(k1); @@ -87,6 +87,19 @@ public void readYourOwnWrites() throws RocksDBException { assertThat(it.isValid()).isTrue(); assertThat(it.key()).isEqualTo(k1); assertThat(it.value()).isEqualTo(v1Other); + + //single remove k3 and make sure we can read back the write + wbwi.singleDelete(k3); + it.seek(k3); + assertThat(it.isValid()).isEqualTo(false); + + //reinsert k3 and make sure we see the new value + final byte[] v3Other = "otherValue3".getBytes(); + wbwi.put(k3, v3Other); + it.seek(k3); + assertThat(it.isValid()).isTrue(); + assertThat(it.key()).isEqualTo(k3); + assertThat(it.value()).isEqualTo(v3Other); } } } @@ -124,22 +137,39 @@ public void iterator() throws RocksDBException { final String v2 = "value2"; final String k3 = "key3"; final String v3 = "value3"; - final byte[] k1b = k1.getBytes(); - final byte[] v1b = v1.getBytes(); - final byte[] k2b = k2.getBytes(); - final byte[] v2b = v2.getBytes(); - final byte[] k3b = k3.getBytes(); - final byte[] v3b = v3.getBytes(); - - //add put records + final String k4 = "key4"; + final String k5 = "key5"; + final String k6 = "key6"; + final String k7 = "key7"; + final String v8 = "value8"; + final byte[] k1b = k1.getBytes(UTF_8); + final byte[] v1b = v1.getBytes(UTF_8); + final byte[] k2b = k2.getBytes(UTF_8); + final byte[] v2b = v2.getBytes(UTF_8); + final byte[] k3b = k3.getBytes(UTF_8); + final byte[] v3b = v3.getBytes(UTF_8); + final byte[] k4b = k4.getBytes(UTF_8); + final byte[] k5b = k5.getBytes(UTF_8); + final byte[] k6b = k6.getBytes(UTF_8); + final byte[] k7b = k7.getBytes(UTF_8); + final byte[] v8b = v8.getBytes(UTF_8); + + // add put records wbwi.put(k1b, v1b); wbwi.put(k2b, v2b); wbwi.put(k3b, v3b); - //add a deletion record - final String k4 = "key4"; - final byte[] k4b = k4.getBytes(); - wbwi.remove(k4b); + // add a deletion record + wbwi.delete(k4b); + + // add a single deletion record + wbwi.singleDelete(k5b); + + // add a delete range record + wbwi.deleteRange(k6b, k7b); + + // add a log record + wbwi.putLogData(v8b); final WBWIRocksIterator.WriteEntry[] expected = { new WBWIRocksIterator.WriteEntry(WBWIRocksIterator.WriteType.PUT, @@ -149,12 +179,16 @@ public void iterator() throws RocksDBException { new WBWIRocksIterator.WriteEntry(WBWIRocksIterator.WriteType.PUT, new DirectSlice(k3), new DirectSlice(v3)), new 
WBWIRocksIterator.WriteEntry(WBWIRocksIterator.WriteType.DELETE, - new DirectSlice(k4), DirectSlice.NONE) + new DirectSlice(k4), DirectSlice.NONE), + new WBWIRocksIterator.WriteEntry(WBWIRocksIterator.WriteType.SINGLE_DELETE, + new DirectSlice(k5), DirectSlice.NONE), + new WBWIRocksIterator.WriteEntry(WBWIRocksIterator.WriteType.DELETE_RANGE, + new DirectSlice(k6), new DirectSlice(k7)), }; try (final WBWIRocksIterator it = wbwi.newIterator()) { //direct access - seek to key offsets - final int[] testOffsets = {2, 0, 1, 3}; + final int[] testOffsets = {2, 0, 3, 4, 1, 5}; for (int i = 0; i < testOffsets.length; i++) { final int testOffset = testOffsets[i]; @@ -164,26 +198,26 @@ public void iterator() throws RocksDBException { assertThat(it.isValid()).isTrue(); final WBWIRocksIterator.WriteEntry entry = it.entry(); - assertThat(entry.equals(expected[testOffset])).isTrue(); + assertThat(entry).isEqualTo(expected[testOffset]); } //forward iterative access int i = 0; for (it.seekToFirst(); it.isValid(); it.next()) { - assertThat(it.entry().equals(expected[i++])).isTrue(); + assertThat(it.entry()).isEqualTo(expected[i++]); } //reverse iterative access i = expected.length - 1; for (it.seekToLast(); it.isValid(); it.prev()) { - assertThat(it.entry().equals(expected[i--])).isTrue(); + assertThat(it.entry()).isEqualTo(expected[i--]); } } } } @Test - public void zeroByteTests() { + public void zeroByteTests() throws RocksDBException { try (final WriteBatchWithIndex wbwi = new WriteBatchWithIndex(true)) { final byte[] zeroByteValue = new byte[]{0, 0}; //add zero byte value @@ -207,8 +241,7 @@ public void zeroByteTests() { } @Test - public void savePoints() - throws UnsupportedEncodingException, RocksDBException { + public void savePoints() throws RocksDBException { try (final Options options = new Options().setCreateIfMissing(true); final RocksDB db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath())) { @@ -228,7 +261,7 @@ public void savePoints() wbwi.setSavePoint(); - wbwi.remove("k2".getBytes()); + wbwi.delete("k2".getBytes()); wbwi.put("k3".getBytes(), "v3-2".getBytes()); assertThat(getFromWriteBatchWithIndex(db, readOptions, wbwi, "k2")) @@ -272,6 +305,27 @@ public void savePoints() } } + @Test + public void restorePoints() throws RocksDBException { + try (final WriteBatchWithIndex wbwi = new WriteBatchWithIndex()) { + + wbwi.put("k1".getBytes(UTF_8), "v1".getBytes(UTF_8)); + wbwi.put("k2".getBytes(UTF_8), "v2".getBytes(UTF_8)); + + wbwi.setSavePoint(); + + wbwi.put("k1".getBytes(UTF_8), "123456789".getBytes(UTF_8)); + wbwi.delete("k2".getBytes(UTF_8)); + + wbwi.rollbackToSavePoint(); + + try(final DBOptions options = new DBOptions()) { + assertThat(wbwi.getFromBatch(options,"k1".getBytes(UTF_8))).isEqualTo("v1".getBytes()); + assertThat(wbwi.getFromBatch(options,"k2".getBytes(UTF_8))).isEqualTo("v2".getBytes()); + } + } + } + @Test(expected = RocksDBException.class) public void restorePoints_withoutSavePoints() throws RocksDBException { try (final WriteBatchWithIndex wbwi = new WriteBatchWithIndex()) { @@ -291,6 +345,78 @@ public void restorePoints_withoutSavePoints_nested() throws RocksDBException { } } + @Test + public void popSavePoint() throws RocksDBException { + try (final WriteBatchWithIndex wbwi = new WriteBatchWithIndex()) { + + wbwi.put("k1".getBytes(), "v1".getBytes()); + wbwi.put("k2".getBytes(), "v2".getBytes()); + + wbwi.setSavePoint(); + + wbwi.put("k1".getBytes(), "123456789".getBytes()); + wbwi.delete("k2".getBytes()); + + wbwi.setSavePoint(); + + wbwi.popSavePoint(); + 
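// popSavePoint() discards the most recent save point without applying it,
// so the rollbackToSavePoint() below unwinds all the way to the first save
// point, restoring k1 -> "v1" and k2 -> "v2" as asserted afterwards.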
+ wbwi.rollbackToSavePoint(); + + try(final DBOptions options = new DBOptions()) { + assertThat(wbwi.getFromBatch(options,"k1".getBytes(UTF_8))).isEqualTo("v1".getBytes()); + assertThat(wbwi.getFromBatch(options,"k2".getBytes(UTF_8))).isEqualTo("v2".getBytes()); + } + } + } + + @Test(expected = RocksDBException.class) + public void popSavePoint_withoutSavePoints() throws RocksDBException { + try (final WriteBatchWithIndex wbwi = new WriteBatchWithIndex()) { + wbwi.popSavePoint(); + } + } + + @Test(expected = RocksDBException.class) + public void popSavePoint_withoutSavePoints_nested() throws RocksDBException { + try (final WriteBatchWithIndex wbwi = new WriteBatchWithIndex()) { + + wbwi.setSavePoint(); + wbwi.popSavePoint(); + + // without previous corresponding setSavePoint + wbwi.popSavePoint(); + } + } + + @Test + public void maxBytes() throws RocksDBException { + try (final WriteBatchWithIndex wbwi = new WriteBatchWithIndex()) { + wbwi.setMaxBytes(19); + + wbwi.put("k1".getBytes(), "v1".getBytes()); + } + } + + @Test(expected = RocksDBException.class) + public void maxBytes_over() throws RocksDBException { + try (final WriteBatchWithIndex wbwi = new WriteBatchWithIndex()) { + wbwi.setMaxBytes(1); + + wbwi.put("k1".getBytes(), "v1".getBytes()); + } + } + + @Test + public void getWriteBatch() { + try (final WriteBatchWithIndex wbwi = new WriteBatchWithIndex()) { + + final WriteBatch wb = wbwi.getWriteBatch(); + assertThat(wb).isNotNull(); + assertThat(wb.isOwningHandle()).isFalse(); + } + } + private static String getFromWriteBatchWithIndex(final RocksDB db, final ReadOptions readOptions, final WriteBatchWithIndex wbwi, final String skey) { @@ -329,7 +455,7 @@ public void getFromBatch() throws RocksDBException { assertThat(wbwi.getFromBatch(dbOptions, k3)).isEqualTo(v3); assertThat(wbwi.getFromBatch(dbOptions, k4)).isNull(); - wbwi.remove(k2); + wbwi.delete(k2); assertThat(wbwi.getFromBatch(dbOptions, k2)).isNull(); } @@ -372,7 +498,7 @@ public void getFromBatchAndDB() throws RocksDBException { assertThat(wbwi.getFromBatchAndDB(db, readOptions, k3)).isEqualTo(v3); assertThat(wbwi.getFromBatchAndDB(db, readOptions, k4)).isEqualTo(v4); - wbwi.remove(k4); + wbwi.delete(k4); assertThat(wbwi.getFromBatchAndDB(db, readOptions, k4)).isNull(); } diff --git a/java/src/test/java/org/rocksdb/test/RocksJunitRunner.java b/java/src/test/java/org/rocksdb/test/RocksJunitRunner.java index 02ad0380ee9..42d3148ef27 100644 --- a/java/src/test/java/org/rocksdb/test/RocksJunitRunner.java +++ b/java/src/test/java/org/rocksdb/test/RocksJunitRunner.java @@ -10,10 +10,17 @@ import org.junit.runner.Description; import org.junit.runner.JUnitCore; import org.junit.runner.Result; +import org.junit.runner.notification.Failure; +import org.rocksdb.RocksDB; +import java.io.PrintStream; +import java.text.DecimalFormat; +import java.text.NumberFormat; import java.util.ArrayList; import java.util.List; +import static org.rocksdb.test.RocksJunitRunner.RocksJunitListener.Status.*; + /** * Custom Junit Runner to print also Test classes * and executed methods to command prompt. 
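As an aside on how the reworked listener can be driven outside of RocksJunitRunner's own main(): a minimal, hypothetical sketch (ListenerDemo is not part of this change; it assumes same-package access to the package-private RocksJunitListener and uses its new PrintStream constructor):

```java
package org.rocksdb.test;

import org.junit.runner.JUnitCore;
import org.rocksdb.WriteBatchTest;

public class ListenerDemo {
  public static void main(final String[] args) {
    final JUnitCore core = new JUnitCore();
    // The new PrintStream constructor decouples the listener from JUnitSystem,
    // so output could be redirected (e.g. to a log file) if desired.
    core.addListener(new RocksJunitRunner.RocksJunitListener(System.out));
    // Per-class summaries with timings are printed as each test class completes.
    core.run(WriteBatchTest.class);
  }
}
```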
@@ -26,20 +33,117 @@ public class RocksJunitRunner { */ static class RocksJunitListener extends TextListener { + private final static NumberFormat secsFormat = + new DecimalFormat("###,###.###"); + + private final PrintStream writer; + + private String currentClassName = null; + private String currentMethodName = null; + private Status currentStatus = null; + private long currentTestsStartTime; + private int currentTestsCount = 0; + private int currentTestsIgnoredCount = 0; + private int currentTestsFailureCount = 0; + private int currentTestsErrorCount = 0; + + enum Status { + IGNORED, + FAILURE, + ERROR, + OK + } + /** * RocksJunitListener constructor * * @param system JUnitSystem */ public RocksJunitListener(final JUnitSystem system) { - super(system); + this(system.out()); + } + + public RocksJunitListener(final PrintStream writer) { + super(writer); + this.writer = writer; + } + + @Override + public void testRunStarted(final Description description) { + writer.format("Starting RocksJava Tests...%n"); + } @Override public void testStarted(final Description description) { - System.out.format("Run: %s testing now -> %s \n", - description.getClassName(), - description.getMethodName()); + if(currentClassName == null + || !currentClassName.equals(description.getClassName())) { + if(currentClassName != null) { + printTestsSummary(); + } else { + currentTestsStartTime = System.currentTimeMillis(); + } + writer.format("%nRunning: %s%n", description.getClassName()); + currentClassName = description.getClassName(); + } + currentMethodName = description.getMethodName(); + currentStatus = OK; + currentTestsCount++; + } + + private void printTestsSummary() { + // print summary of last test set + writer.format("Tests run: %d, Failures: %d, Errors: %d, Ignored: %d, Time elapsed: %s sec%n", + currentTestsCount, + currentTestsFailureCount, + currentTestsErrorCount, + currentTestsIgnoredCount, + formatSecs(System.currentTimeMillis() - currentTestsStartTime)); + + // reset counters + currentTestsCount = 0; + currentTestsFailureCount = 0; + currentTestsErrorCount = 0; + currentTestsIgnoredCount = 0; + currentTestsStartTime = System.currentTimeMillis(); + } + + private static String formatSecs(final double milliseconds) { + final double seconds = milliseconds / 1000; + return secsFormat.format(seconds); + } + + @Override + public void testFailure(final Failure failure) { + if (failure.getException() != null + && failure.getException() instanceof AssertionError) { + currentStatus = FAILURE; + currentTestsFailureCount++; + } else { + currentStatus = ERROR; + currentTestsErrorCount++; + } + } + + @Override + public void testIgnored(final Description description) { + currentStatus = IGNORED; + currentTestsIgnoredCount++; + } + + @Override + public void testFinished(final Description description) { + if(currentStatus == OK) { + writer.format("\t%s OK%n",currentMethodName); + } else { + writer.format(" [%s] %s%n", currentStatus.name(), currentMethodName); + } + } + + @Override + public void testRunFinished(final Result result) { + printTestsSummary(); + super.testRunFinished(result); } } diff --git a/java/src/test/java/org/rocksdb/util/CapturingWriteBatchHandler.java b/java/src/test/java/org/rocksdb/util/CapturingWriteBatchHandler.java new file mode 100644 index 00000000000..83ac5d3d27f --- /dev/null +++ b/java/src/test/java/org/rocksdb/util/CapturingWriteBatchHandler.java @@ -0,0 +1,171 @@ +package org.rocksdb.util; + +import org.rocksdb.RocksDBException; +import org.rocksdb.WriteBatch; + +import 
java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.Objects; + +/** + * A simple WriteBatch Handler which adds a record + * of each event that it receives to a list + */ +public class CapturingWriteBatchHandler extends WriteBatch.Handler { + + private final List events = new ArrayList<>(); + + /** + * Returns a copy of the current events list + * + * @return a list of the events which have happened upto now + */ + public List getEvents() { + return new ArrayList<>(events); + } + + @Override + public void put(final int columnFamilyId, final byte[] key, + final byte[] value) { + events.add(new Event(Action.PUT, columnFamilyId, key, value)); + } + + @Override + public void put(final byte[] key, final byte[] value) { + events.add(new Event(Action.PUT, key, value)); + } + + @Override + public void merge(final int columnFamilyId, final byte[] key, + final byte[] value) { + events.add(new Event(Action.MERGE, columnFamilyId, key, value)); + } + + @Override + public void merge(final byte[] key, final byte[] value) { + events.add(new Event(Action.MERGE, key, value)); + } + + @Override + public void delete(final int columnFamilyId, final byte[] key) { + events.add(new Event(Action.DELETE, columnFamilyId, key, (byte[])null)); + } + + @Override + public void delete(final byte[] key) { + events.add(new Event(Action.DELETE, key, (byte[])null)); + } + + @Override + public void singleDelete(final int columnFamilyId, final byte[] key) { + events.add(new Event(Action.SINGLE_DELETE, + columnFamilyId, key, (byte[])null)); + } + + @Override + public void singleDelete(final byte[] key) { + events.add(new Event(Action.SINGLE_DELETE, key, (byte[])null)); + } + + @Override + public void deleteRange(final int columnFamilyId, final byte[] beginKey, + final byte[] endKey) { + events.add(new Event(Action.DELETE_RANGE, columnFamilyId, beginKey, + endKey)); + } + + @Override + public void deleteRange(final byte[] beginKey, final byte[] endKey) { + events.add(new Event(Action.DELETE_RANGE, beginKey, endKey)); + } + + @Override + public void logData(final byte[] blob) { + events.add(new Event(Action.LOG, (byte[])null, blob)); + } + + @Override + public void putBlobIndex(final int columnFamilyId, final byte[] key, + final byte[] value) { + events.add(new Event(Action.PUT_BLOB_INDEX, key, value)); + } + + @Override + public void markBeginPrepare() throws RocksDBException { + events.add(new Event(Action.MARK_BEGIN_PREPARE, (byte[])null, + (byte[])null)); + } + + @Override + public void markEndPrepare(final byte[] xid) throws RocksDBException { + events.add(new Event(Action.MARK_END_PREPARE, (byte[])null, + (byte[])null)); + } + + @Override + public void markNoop(final boolean emptyBatch) throws RocksDBException { + events.add(new Event(Action.MARK_NOOP, (byte[])null, (byte[])null)); + } + + @Override + public void markRollback(final byte[] xid) throws RocksDBException { + events.add(new Event(Action.MARK_ROLLBACK, (byte[])null, (byte[])null)); + } + + @Override + public void markCommit(final byte[] xid) throws RocksDBException { + events.add(new Event(Action.MARK_COMMIT, (byte[])null, (byte[])null)); + } + + public static class Event { + public final Action action; + public final int columnFamilyId; + public final byte[] key; + public final byte[] value; + + public Event(final Action action, final byte[] key, final byte[] value) { + this(action, 0, key, value); + } + + public Event(final Action action, final int columnFamilyId, final byte[] key, + final byte[] value) { + this.action = 
action; + this.columnFamilyId = columnFamilyId; + this.key = key; + this.value = value; + } + + @Override + public boolean equals(final Object o) { + if (this == o) { + return true; + } + if (o == null || getClass() != o.getClass()) { + return false; + } + final Event event = (Event) o; + return columnFamilyId == event.columnFamilyId && + action == event.action && + ((key == null && event.key == null) + || Arrays.equals(key, event.key)) && + ((value == null && event.value == null) + || Arrays.equals(value, event.value)); + } + + @Override + public int hashCode() { + + return Objects.hash(action, columnFamilyId, key, value); + } + } + + /** + * Enumeration of Write Batch + * event actions + */ + public enum Action { + PUT, MERGE, DELETE, SINGLE_DELETE, DELETE_RANGE, LOG, PUT_BLOB_INDEX, + MARK_BEGIN_PREPARE, MARK_END_PREPARE, MARK_NOOP, MARK_COMMIT, + MARK_ROLLBACK } +} diff --git a/java/src/test/java/org/rocksdb/util/WriteBatchGetter.java b/java/src/test/java/org/rocksdb/util/WriteBatchGetter.java new file mode 100644 index 00000000000..a0d8d669d23 --- /dev/null +++ b/java/src/test/java/org/rocksdb/util/WriteBatchGetter.java @@ -0,0 +1,133 @@ +package org.rocksdb.util; + +import org.rocksdb.RocksDBException; +import org.rocksdb.WriteBatch; + +import java.util.Arrays; + +public class WriteBatchGetter extends WriteBatch.Handler { + + private int columnFamilyId = -1; + private final byte[] key; + private byte[] value; + + public WriteBatchGetter(final byte[] key) { + this.key = key; + } + + public byte[] getValue() { + return value; + } + + @Override + public void put(final int columnFamilyId, final byte[] key, + final byte[] value) { + if(Arrays.equals(this.key, key)) { + this.columnFamilyId = columnFamilyId; + this.value = value; + } + } + + @Override + public void put(final byte[] key, final byte[] value) { + if(Arrays.equals(this.key, key)) { + this.value = value; + } + } + + @Override + public void merge(final int columnFamilyId, final byte[] key, + final byte[] value) { + if(Arrays.equals(this.key, key)) { + this.columnFamilyId = columnFamilyId; + this.value = value; + } + } + + @Override + public void merge(final byte[] key, final byte[] value) { + if(Arrays.equals(this.key, key)) { + this.value = value; + } + } + + @Override + public void delete(final int columnFamilyId, final byte[] key) { + if(Arrays.equals(this.key, key)) { + this.columnFamilyId = columnFamilyId; + this.value = null; + } + } + + @Override + public void delete(final byte[] key) { + if(Arrays.equals(this.key, key)) { + this.value = null; + } + } + + @Override + public void singleDelete(final int columnFamilyId, final byte[] key) { + if(Arrays.equals(this.key, key)) { + this.columnFamilyId = columnFamilyId; + this.value = null; + } + } + + @Override + public void singleDelete(final byte[] key) { + if(Arrays.equals(this.key, key)) { + this.value = null; + } + } + + @Override + public void deleteRange(final int columnFamilyId, final byte[] beginKey, + final byte[] endKey) { + throw new UnsupportedOperationException(); + } + + @Override + public void deleteRange(final byte[] beginKey, final byte[] endKey) { + throw new UnsupportedOperationException(); + } + + @Override + public void logData(final byte[] blob) { + throw new UnsupportedOperationException(); + } + + @Override + public void putBlobIndex(final int columnFamilyId, final byte[] key, + final byte[] value) { + if(Arrays.equals(this.key, key)) { + this.columnFamilyId = columnFamilyId; + this.value = value; + } + } + + @Override + public void markBeginPrepare() 
throws RocksDBException { + throw new UnsupportedOperationException(); + } + + @Override + public void markEndPrepare(final byte[] xid) throws RocksDBException { + throw new UnsupportedOperationException(); + } + + @Override + public void markNoop(final boolean emptyBatch) throws RocksDBException { + throw new UnsupportedOperationException(); + } + + @Override + public void markRollback(final byte[] xid) throws RocksDBException { + throw new UnsupportedOperationException(); + } + + @Override + public void markCommit(final byte[] xid) throws RocksDBException { + throw new UnsupportedOperationException(); + } +} diff --git a/memtable/hash_cuckoo_rep.cc b/memtable/hash_cuckoo_rep.cc index 034bf5858b6..39078633f67 100644 --- a/memtable/hash_cuckoo_rep.cc +++ b/memtable/hash_cuckoo_rep.cc @@ -597,8 +597,8 @@ void HashCuckooRep::Iterator::Seek(const Slice& user_key, } // Retreat to the last entry with a key <= target -void HashCuckooRep::Iterator::SeekForPrev(const Slice& user_key, - const char* memtable_key) { +void HashCuckooRep::Iterator::SeekForPrev(const Slice& /*user_key*/, + const char* /*memtable_key*/) { assert(false); } @@ -623,7 +623,7 @@ void HashCuckooRep::Iterator::SeekToLast() { MemTableRep* HashCuckooRepFactory::CreateMemTableRep( const MemTableRep::KeyComparator& compare, Allocator* allocator, - const SliceTransform* transform, Logger* logger) { + const SliceTransform* /*transform*/, Logger* /*logger*/) { // The estimated average fullness. The write performance of any close hash // degrades as the fullness of the mem-table increases. Setting kFullness // to a value around 0.7 can better avoid write performance degradation while diff --git a/memtable/hash_linklist_rep.cc b/memtable/hash_linklist_rep.cc index 932b62a3460..b23a9f5e51d 100644 --- a/memtable/hash_linklist_rep.cc +++ b/memtable/hash_linklist_rep.cc @@ -362,14 +362,14 @@ class HashLinkListRep : public MemTableRep { // Advance to the first entry with a key >= target virtual void Seek(const Slice& internal_key, - const char* memtable_key) override { + const char* /*memtable_key*/) override { node_ = hash_link_list_rep_->FindGreaterOrEqualInBucket(head_, internal_key); } // Retreat to the last entry with a key <= target - virtual void SeekForPrev(const Slice& internal_key, - const char* memtable_key) override { + virtual void SeekForPrev(const Slice& /*internal_key*/, + const char* /*memtable_key*/) override { // Since we do not support Prev() // We simply do not support SeekForPrev Reset(nullptr); @@ -483,10 +483,10 @@ class HashLinkListRep : public MemTableRep { } virtual void Next() override {} virtual void Prev() override {} - virtual void Seek(const Slice& user_key, - const char* memtable_key) override {} - virtual void SeekForPrev(const Slice& user_key, - const char* memtable_key) override {} + virtual void Seek(const Slice& /*user_key*/, + const char* /*memtable_key*/) override {} + virtual void SeekForPrev(const Slice& /*user_key*/, + const char* /*memtable_key*/) override {} virtual void SeekToFirst() override {} virtual void SeekToLast() override {} diff --git a/memtable/hash_skiplist_rep.cc b/memtable/hash_skiplist_rep.cc index e34743eb2c7..93082b1ec28 100644 --- a/memtable/hash_skiplist_rep.cc +++ b/memtable/hash_skiplist_rep.cc @@ -131,8 +131,8 @@ class HashSkipListRep : public MemTableRep { } // Retreat to the last entry with a key <= target - virtual void SeekForPrev(const Slice& internal_key, - const char* memtable_key) override { + virtual void SeekForPrev(const Slice& /*internal_key*/, + const char* 
/*memtable_key*/) override { // not supported assert(false); } @@ -219,10 +219,10 @@ class HashSkipListRep : public MemTableRep { } virtual void Next() override {} virtual void Prev() override {} - virtual void Seek(const Slice& internal_key, - const char* memtable_key) override {} - virtual void SeekForPrev(const Slice& internal_key, - const char* memtable_key) override {} + virtual void Seek(const Slice& /*internal_key*/, + const char* /*memtable_key*/) override {} + virtual void SeekForPrev(const Slice& /*internal_key*/, + const char* /*memtable_key*/) override {} virtual void SeekToFirst() override {} virtual void SeekToLast() override {} @@ -335,7 +335,7 @@ MemTableRep::Iterator* HashSkipListRep::GetDynamicPrefixIterator(Arena* arena) { MemTableRep* HashSkipListRepFactory::CreateMemTableRep( const MemTableRep::KeyComparator& compare, Allocator* allocator, - const SliceTransform* transform, Logger* logger) { + const SliceTransform* transform, Logger* /*logger*/) { return new HashSkipListRep(compare, allocator, transform, bucket_count_, skiplist_height_, skiplist_branching_factor_); } diff --git a/memtable/inlineskiplist.h b/memtable/inlineskiplist.h index 702a7336dbc..efcb93c85b0 100644 --- a/memtable/inlineskiplist.h +++ b/memtable/inlineskiplist.h @@ -45,9 +45,12 @@ #include #include #include +#include #include "port/likely.h" #include "port/port.h" +#include "rocksdb/slice.h" #include "util/allocator.h" +#include "util/coding.h" #include "util/random.h" namespace rocksdb { @@ -59,6 +62,9 @@ class InlineSkipList { struct Splice; public: + using DecodedKey = \ + typename std::remove_reference::type::DecodedType; + static const uint16_t kMaxPossibleHeight = 32; // Create a new InlineSkipList object that will use "cmp" for comparing @@ -212,6 +218,7 @@ class InlineSkipList { // Return true if key is greater than the data stored in "n". Null n // is considered infinite. n should not be head_. bool KeyIsAfterNode(const char* key, Node* n) const; + bool KeyIsAfterNode(const DecodedKey& key, Node* n) const; // Returns the earliest node with a key >= key. // Return nullptr if there is no such node. @@ -241,12 +248,12 @@ class InlineSkipList { // a node that is after the key. after should be nullptr if a good after // node isn't conveniently available. template - void FindSpliceForLevel(const char* key, Node* before, Node* after, int level, + void FindSpliceForLevel(const DecodedKey& key, Node* before, Node* after, int level, Node** out_prev, Node** out_next); // Recomputes Splice levels from highest_level (inclusive) down to // lowest_level (inclusive). 
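// Note on the signature change below: callers now pass the key pre-decoded.
// compare_.decode_key() parses the key encoding (typically a varint32 length
// prefix) once per top-level operation such as Insert or FindGreaterOrEqual,
// instead of once per node comparison inside the search loops.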
- void RecomputeSpliceLevels(const char* key, Splice* splice, + void RecomputeSpliceLevels(const DecodedKey& key, Splice* splice, int recompute_level); // No copying allowed @@ -435,6 +442,14 @@ bool InlineSkipList::KeyIsAfterNode(const char* key, return (n != nullptr) && (compare_(n->Key(), key) < 0); } +template +bool InlineSkipList::KeyIsAfterNode(const DecodedKey& key, + Node* n) const { + // nullptr n is considered infinite + assert(n != head_); + return (n != nullptr) && (compare_(n->Key(), key) < 0); +} + template typename InlineSkipList::Node* InlineSkipList::FindGreaterOrEqual(const char* key) const { @@ -446,6 +461,7 @@ InlineSkipList::FindGreaterOrEqual(const char* key) const { Node* x = head_; int level = GetMaxHeight() - 1; Node* last_bigger = nullptr; + const DecodedKey key_decoded = compare_.decode_key(key); while (true) { Node* next = x->Next(level); if (next != nullptr) { @@ -454,10 +470,10 @@ InlineSkipList::FindGreaterOrEqual(const char* key) const { // Make sure the lists are sorted assert(x == head_ || next == nullptr || KeyIsAfterNode(next->Key(), x)); // Make sure we haven't overshot during our search - assert(x == head_ || KeyIsAfterNode(key, x)); + assert(x == head_ || KeyIsAfterNode(key_decoded, x)); int cmp = (next == nullptr || next == last_bigger) ? 1 - : compare_(next->Key(), key); + : compare_(next->Key(), key_decoded); if (cmp == 0 || (cmp > 0 && level == 0)) { return next; } else if (cmp < 0) { @@ -487,6 +503,7 @@ InlineSkipList::FindLessThan(const char* key, Node** prev, Node* x = root; // KeyIsAfter(key, last_not_after) is definitely false Node* last_not_after = nullptr; + const DecodedKey key_decoded = compare_.decode_key(key); while (true) { assert(x != nullptr); Node* next = x->Next(level); @@ -494,8 +511,8 @@ InlineSkipList::FindLessThan(const char* key, Node** prev, PREFETCH(next->Next(level), 0, 1); } assert(x == head_ || next == nullptr || KeyIsAfterNode(next->Key(), x)); - assert(x == head_ || KeyIsAfterNode(key, x)); - if (next != last_not_after && KeyIsAfterNode(key, next)) { + assert(x == head_ || KeyIsAfterNode(key_decoded, x)); + if (next != last_not_after && KeyIsAfterNode(key_decoded, next)) { // Keep searching in this list assert(next != nullptr); x = next; @@ -540,13 +557,14 @@ uint64_t InlineSkipList::EstimateCount(const char* key) const { Node* x = head_; int level = GetMaxHeight() - 1; + const DecodedKey key_decoded = compare_.decode_key(key); while (true) { - assert(x == head_ || compare_(x->Key(), key) < 0); + assert(x == head_ || compare_(x->Key(), key_decoded) < 0); Node* next = x->Next(level); if (next != nullptr) { PREFETCH(next->Next(level), 0, 1); } - if (next == nullptr || compare_(next->Key(), key) >= 0) { + if (next == nullptr || compare_(next->Key(), key_decoded) >= 0) { if (level == 0) { return count; } else { @@ -654,7 +672,7 @@ bool InlineSkipList::InsertWithHint(const char* key, void** hint) { template template -void InlineSkipList::FindSpliceForLevel(const char* key, +void InlineSkipList::FindSpliceForLevel(const DecodedKey& key, Node* before, Node* after, int level, Node** out_prev, Node** out_next) { @@ -682,7 +700,7 @@ void InlineSkipList::FindSpliceForLevel(const char* key, } template -void InlineSkipList::RecomputeSpliceLevels(const char* key, +void InlineSkipList::RecomputeSpliceLevels(const DecodedKey& key, Splice* splice, int recompute_level) { assert(recompute_level > 0); @@ -698,6 +716,7 @@ template bool InlineSkipList::Insert(const char* key, Splice* splice, bool allow_partial_splice_fix) { Node* x = 
reinterpret_cast(const_cast(key)) - 1; + const DecodedKey key_decoded = compare_.decode_key(key); int height = x->UnstashHeight(); assert(height >= 1 && height <= kMaxHeight_); @@ -765,7 +784,8 @@ bool InlineSkipList::Insert(const char* key, Splice* splice, // our chances of success. ++recompute_height; } else if (splice->prev_[recompute_height] != head_ && - !KeyIsAfterNode(key, splice->prev_[recompute_height])) { + !KeyIsAfterNode(key_decoded, + splice->prev_[recompute_height])) { // key is from before splice if (allow_partial_splice_fix) { // skip all levels with the same node without more comparisons @@ -777,7 +797,8 @@ bool InlineSkipList::Insert(const char* key, Splice* splice, // we're pessimistic, recompute everything recompute_height = max_height; } - } else if (KeyIsAfterNode(key, splice->next_[recompute_height])) { + } else if (KeyIsAfterNode(key_decoded, + splice->next_[recompute_height])) { // key is from after splice if (allow_partial_splice_fix) { Node* bad = splice->next_[recompute_height]; @@ -795,7 +816,7 @@ bool InlineSkipList::Insert(const char* key, Splice* splice, } assert(recompute_height <= max_height); if (recompute_height > 0) { - RecomputeSpliceLevels(key, splice, recompute_height); + RecomputeSpliceLevels(key_decoded, splice, recompute_height); } bool splice_is_valid = true; @@ -827,8 +848,8 @@ bool InlineSkipList::Insert(const char* key, Splice* splice, // search, because it should be unlikely that lots of nodes have // been inserted between prev[i] and next[i]. No point in using // next[i] as the after hint, because we know it is stale. - FindSpliceForLevel(key, splice->prev_[i], nullptr, i, &splice->prev_[i], - &splice->next_[i]); + FindSpliceForLevel(key_decoded, splice->prev_[i], nullptr, i, + &splice->prev_[i], &splice->next_[i]); // Since we've narrowed the bracket for level i, we might have // violated the Splice constraint between i and i-1. 
Make sure @@ -842,8 +863,8 @@ bool InlineSkipList::Insert(const char* key, Splice* splice, for (int i = 0; i < height; ++i) { if (i >= recompute_height && splice->prev_[i]->Next(i) != splice->next_[i]) { - FindSpliceForLevel(key, splice->prev_[i], nullptr, i, &splice->prev_[i], - &splice->next_[i]); + FindSpliceForLevel(key_decoded, splice->prev_[i], nullptr, i, + &splice->prev_[i], &splice->next_[i]); } // Checking for duplicate keys on the level 0 is sufficient if (UNLIKELY(i == 0 && splice->next_[i] != nullptr && diff --git a/memtable/inlineskiplist_test.cc b/memtable/inlineskiplist_test.cc index 70fd11a7692..10667396cc1 100644 --- a/memtable/inlineskiplist_test.cc +++ b/memtable/inlineskiplist_test.cc @@ -32,6 +32,12 @@ static Key Decode(const char* key) { } struct TestComparator { + typedef Key DecodedType; + + static DecodedType decode_key(const char* b) { + return Decode(b); + } + int operator()(const char* a, const char* b) const { if (Decode(a) < Decode(b)) { return -1; @@ -41,6 +47,16 @@ struct TestComparator { return 0; } } + + int operator()(const char* a, const DecodedType b) const { + if (Decode(a) < b) { + return -1; + } else if (Decode(a) > b) { + return +1; + } else { + return 0; + } + } }; typedef InlineSkipList TestInlineSkipList; diff --git a/memtable/skiplistrep.cc b/memtable/skiplistrep.cc index 63f7a4246bc..1e56e1a9840 100644 --- a/memtable/skiplistrep.cc +++ b/memtable/skiplistrep.cc @@ -282,7 +282,7 @@ class SkipListRep : public MemTableRep { MemTableRep* SkipListFactory::CreateMemTableRep( const MemTableRep::KeyComparator& compare, Allocator* allocator, - const SliceTransform* transform, Logger* logger) { + const SliceTransform* transform, Logger* /*logger*/) { return new SkipListRep(compare, allocator, transform, lookahead_); } diff --git a/memtable/vectorrep.cc b/memtable/vectorrep.cc index e54025c2d3d..378b29624af 100644 --- a/memtable/vectorrep.cc +++ b/memtable/vectorrep.cc @@ -227,8 +227,8 @@ void VectorRep::Iterator::Seek(const Slice& user_key, } // Advance to the first entry with a key <= target -void VectorRep::Iterator::SeekForPrev(const Slice& user_key, - const char* memtable_key) { +void VectorRep::Iterator::SeekForPrev(const Slice& /*user_key*/, + const char* /*memtable_key*/) { assert(false); } @@ -296,7 +296,7 @@ MemTableRep::Iterator* VectorRep::GetIterator(Arena* arena) { MemTableRep* VectorRepFactory::CreateMemTableRep( const MemTableRep::KeyComparator& compare, Allocator* allocator, - const SliceTransform*, Logger* logger) { + const SliceTransform*, Logger* /*logger*/) { return new VectorRep(compare, allocator, count_); } } // namespace rocksdb diff --git a/monitoring/histogram_windowing.cc b/monitoring/histogram_windowing.cc index 28d8265f263..5c49fcd16b0 100644 --- a/monitoring/histogram_windowing.cc +++ b/monitoring/histogram_windowing.cc @@ -60,7 +60,7 @@ void HistogramWindowingImpl::Add(uint64_t value){ stats_.Add(value); // Current window update - window_stats_[current_window()].Add(value); + window_stats_[static_cast(current_window())].Add(value); } void HistogramWindowingImpl::Merge(const Histogram& other) { @@ -89,8 +89,11 @@ void HistogramWindowingImpl::Merge(const HistogramWindowingImpl& other) { (cur_window + num_windows_ - i) % num_windows_; uint64_t other_window_index = (other_cur_window + other.num_windows_ - i) % other.num_windows_; + size_t windex = static_cast(window_index); + size_t other_windex = static_cast(other_window_index); - window_stats_[window_index].Merge(other.window_stats_[other_window_index]); + 
window_stats_[windex].Merge( + other.window_stats_[other_windex]); } } @@ -129,8 +132,9 @@ void HistogramWindowingImpl::Data(HistogramData * const data) const { void HistogramWindowingImpl::TimerTick() { uint64_t curr_time = env_->NowMicros(); + size_t curr_window_ = static_cast(current_window()); if (curr_time - last_swap_time() > micros_per_window_ && - window_stats_[current_window()].num() >= min_num_per_window_) { + window_stats_[curr_window_].num() >= min_num_per_window_) { SwapHistoryBucket(); } } @@ -149,7 +153,8 @@ void HistogramWindowingImpl::SwapHistoryBucket() { 0 : curr_window + 1; // subtract next buckets from totals and swap to next buckets - HistogramStat& stats_to_drop = window_stats_[next_window]; + HistogramStat& stats_to_drop = + window_stats_[static_cast(next_window)]; if (!stats_to_drop.Empty()) { for (size_t b = 0; b < stats_.num_buckets_; b++){ diff --git a/options/options.cc b/options/options.cc index 03591b61eb9..0ebf1547356 100644 --- a/options/options.cc +++ b/options/options.cc @@ -168,6 +168,10 @@ void ColumnFamilyOptions::Dump(Logger* log) const { log, " Options.compression_opts.max_dict_bytes: %" ROCKSDB_PRIszt, compression_opts.max_dict_bytes); + ROCKS_LOG_HEADER(log, + " Options.compression_opts.zstd_max_train_bytes: " + "%" ROCKSDB_PRIszt, + compression_opts.zstd_max_train_bytes); ROCKS_LOG_HEADER(log, " Options.level0_file_num_compaction_trigger: %d", level0_file_num_compaction_trigger); ROCKS_LOG_HEADER(log, " Options.level0_slowdown_writes_trigger: %d", diff --git a/options/options_helper.cc b/options/options_helper.cc index 7e4feff4a91..afcad5bf45a 100644 --- a/options/options_helper.cc +++ b/options/options_helper.cc @@ -96,6 +96,7 @@ DBOptions BuildDBOptions(const ImmutableDBOptions& immutable_db_options, options.listeners = immutable_db_options.listeners; options.enable_thread_tracking = immutable_db_options.enable_thread_tracking; options.delayed_write_rate = mutable_db_options.delayed_write_rate; + options.enable_pipelined_write = immutable_db_options.enable_pipelined_write; options.allow_concurrent_memtable_write = immutable_db_options.allow_concurrent_memtable_write; options.enable_write_thread_adaptive_yield = @@ -918,6 +919,17 @@ Status ParseColumnFamilyOption(const std::string& name, } new_options->compression_opts.max_dict_bytes = ParseInt(value.substr(start, value.size() - start)); + end = value.find(':', start); + } + // zstd_max_train_bytes is optional for backwards compatibility + if (end != std::string::npos) { + start = end + 1; + if (start >= value.size()) { + return Status::InvalidArgument( + "unable to parse the specified CF option " + name); + } + new_options->compression_opts.zstd_max_train_bytes = + ParseInt(value.substr(start, value.size() - start)); } } else { auto iter = cf_options_type_info.find(name); diff --git a/options/options_parser.cc b/options/options_parser.cc index 0095c3ab70b..c7a7c6d377d 100644 --- a/options/options_parser.cc +++ b/options/options_parser.cc @@ -735,7 +735,7 @@ Status RocksDBOptionsParser::VerifyRocksDBOptionsFromFile( Status RocksDBOptionsParser::VerifyDBOptions( const DBOptions& base_opt, const DBOptions& persisted_opt, - const std::unordered_map* opt_map, + const std::unordered_map* /*opt_map*/, OptionsSanityCheckLevel sanity_check_level) { for (auto pair : db_options_type_info) { if (pair.second.verification == OptionVerificationType::kDeprecated) { diff --git a/options/options_settable_test.cc b/options/options_settable_test.cc index d64473c3a38..11f708fe7d4 100644 --- 
a/options/options_settable_test.cc +++ b/options/options_settable_test.cc @@ -151,7 +151,8 @@ TEST_F(OptionsSettableTest, BlockBasedTableOptionsAllFieldsSettable) { "format_version=1;" "hash_index_allow_collision=false;" "verify_compression=true;read_amp_bytes_per_bit=0;" - "enable_index_compression=false", + "enable_index_compression=false;" + "block_align=true", new_bbto)); ASSERT_EQ(unset_bytes_base, diff --git a/port/port_posix.cc b/port/port_posix.cc index 129933bb1f9..2408beeb030 100644 --- a/port/port_posix.cc +++ b/port/port_posix.cc @@ -36,6 +36,7 @@ static int PthreadCall(const char* label, int result) { } Mutex::Mutex(bool adaptive) { + (void) adaptive; #ifdef ROCKSDB_PTHREAD_ADAPTIVE_MUTEX if (!adaptive) { PthreadCall("init mutex", pthread_mutex_init(&mu_, nullptr)); @@ -192,7 +193,7 @@ void *cacheline_aligned_alloc(size_t size) { #elif ( _POSIX_C_SOURCE >= 200112L || _XOPEN_SOURCE >= 600 || defined(__APPLE__)) void *m; errno = posix_memalign(&m, CACHE_LINE_SIZE, size); - return errno ? NULL : m; + return errno ? nullptr : m; #else return malloc(size); #endif diff --git a/port/stack_trace.cc b/port/stack_trace.cc index 2ed0016dbe2..8f8135a446c 100644 --- a/port/stack_trace.cc +++ b/port/stack_trace.cc @@ -13,7 +13,7 @@ namespace rocksdb { namespace port { void InstallStackTraceHandler() {} -void PrintStack(int first_frames_to_skip) {} +void PrintStack(int /*first_frames_to_skip*/) {} } // namespace port } // namespace rocksdb diff --git a/port/win/env_win.cc b/port/win/env_win.cc index 8cf61e52d25..1fb9b183af0 100644 --- a/port/win/env_win.cc +++ b/port/win/env_win.cc @@ -35,6 +35,10 @@ #include // for uuid generation #include +#include +#include "strsafe.h" + +#include namespace rocksdb { @@ -44,10 +48,15 @@ ThreadStatusUpdater* CreateThreadStatusUpdater() { namespace { +static const size_t kSectorSize = 512; // Sector size used when physical sector size could not be obtained from device. 
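
Stepping back to the options_helper.cc hunk above: compression_opts is parsed as a colon-separated tuple, and zstd_max_train_bytes is accepted as an optional fifth field so that four-field strings from older option files keep parsing unchanged. A hypothetical round trip through the public string-based API (field values illustrative):

    #include "rocksdb/convenience.h"

    // window_bits:level:strategy:max_dict_bytes[:zstd_max_train_bytes]
    rocksdb::ColumnFamilyOptions base, parsed;
    rocksdb::Status s = rocksdb::GetColumnFamilyOptionsFromString(
        base, "compression_opts=4:5:6:65536:65536", &parsed);
    // on success, parsed.compression_opts.zstd_max_train_bytes == 65536
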
+ // RAII helpers for HANDLEs const auto CloseHandleFunc = [](HANDLE h) { ::CloseHandle(h); }; typedef std::unique_ptr UniqueCloseHandlePtr; +const auto FindCloseFunc = [](HANDLE h) { ::FindClose(h); }; +typedef std::unique_ptr UniqueFindClosePtr; + void WinthreadCall(const char* label, std::error_code result) { if (0 != result.value()) { fprintf(stderr, "pthread %s: %s\n", label, strerror(result.value())); @@ -61,7 +70,7 @@ namespace port { WinEnvIO::WinEnvIO(Env* hosted_env) : hosted_env_(hosted_env), - page_size_(4 * 1012), + page_size_(4 * 1024), allocation_granularity_(page_size_), perf_counter_frequency_(0), GetSystemTimePreciseAsFileTime_(NULL) { @@ -93,8 +102,11 @@ WinEnvIO::~WinEnvIO() { Status WinEnvIO::DeleteFile(const std::string& fname) { Status result; - if (_unlink(fname.c_str())) { - result = IOError("Failed to delete: " + fname, errno); + BOOL ret = DeleteFileA(fname.c_str()); + if(!ret) { + auto lastError = GetLastError(); + result = IOErrorFromWindowsError("Failed to delete: " + fname, + lastError); } return result; @@ -231,7 +243,8 @@ Status WinEnvIO::NewRandomAccessFile(const std::string& fname, fileGuard.release(); } } else { - result->reset(new WinRandomAccessFile(fname, hFile, page_size_, options)); + result->reset(new WinRandomAccessFile(fname, hFile, + std::max(GetSectorSize(fname), page_size_), options)); fileGuard.release(); } return s; @@ -265,8 +278,7 @@ Status WinEnvIO::OpenWritableFile(const std::string& fname, if (local_options.use_mmap_writes) { desired_access |= GENERIC_READ; - } - else { + } else { // Adding this solely for tests to pass (fault_injection_test, // wal_manager_test). shared_mode |= (FILE_SHARE_WRITE | FILE_SHARE_DELETE); @@ -317,7 +329,7 @@ Status WinEnvIO::OpenWritableFile(const std::string& fname, } else { // Here we want the buffer allocation to be aligned by the SSD page size // and to be a multiple of it - result->reset(new WinWritableFile(fname, hFile, page_size_, + result->reset(new WinWritableFile(fname, hFile, std::max(GetSectorSize(fname), GetPageSize()), c_BufferCapacity, local_options)); } return s; @@ -361,7 +373,8 @@ Status WinEnvIO::NewRandomRWFile(const std::string & fname, } UniqueCloseHandlePtr fileGuard(hFile, CloseHandleFunc); - result->reset(new WinRandomRWFile(fname, hFile, page_size_, options)); + result->reset(new WinRandomRWFile(fname, hFile, std::max(GetSectorSize(fname), GetPageSize()), + options)); fileGuard.release(); return s; @@ -372,67 +385,128 @@ Status WinEnvIO::NewDirectory(const std::string& name, Status s; // Must be nullptr on failure result->reset(); - // Must fail if directory does not exist + if (!DirExists(name)) { - s = IOError("Directory does not exist: " + name, EEXIST); - } else { + s = IOErrorFromWindowsError( + "open folder: " + name, ERROR_DIRECTORY); + return s; + } + + HANDLE handle = INVALID_HANDLE_VALUE; + // 0 - for access means read metadata + { IOSTATS_TIMER_GUARD(open_nanos); - result->reset(new WinDirectory); + handle = ::CreateFileA(name.c_str(), 0, + FILE_SHARE_DELETE | FILE_SHARE_READ | FILE_SHARE_WRITE, + NULL, + OPEN_EXISTING, + FILE_FLAG_BACKUP_SEMANTICS, // make opening folders possible + NULL); } + + if (INVALID_HANDLE_VALUE == handle) { + auto lastError = GetLastError(); + s = IOErrorFromWindowsError( + "open folder: " + name, lastError); + return s; + } + + result->reset(new WinDirectory(handle)); + return s; } Status WinEnvIO::FileExists(const std::string& fname) { - // F_OK == 0 - const int F_OK_ = 0; - return _access(fname.c_str(), F_OK_) == 0 ? 
Status::OK() - : Status::NotFound(); + Status s; + // TODO: This does not follow symbolic links at this point + // which is consistent with _access() impl on windows + // but can be added + WIN32_FILE_ATTRIBUTE_DATA attrs; + if (FALSE == GetFileAttributesExA(fname.c_str(), GetFileExInfoStandard, + &attrs)) { + auto lastError = GetLastError(); + switch (lastError) { + case ERROR_ACCESS_DENIED: + case ERROR_NOT_FOUND: + case ERROR_FILE_NOT_FOUND: + case ERROR_PATH_NOT_FOUND: + s = Status::NotFound(); + break; + default: + s = IOErrorFromWindowsError("Unexpected error for: " + fname, + lastError); + break; + } + } + return s; } Status WinEnvIO::GetChildren(const std::string& dir, std::vector<std::string>* result) { + Status status; result->clear(); std::vector<std::string> output; - Status status; + WIN32_FIND_DATA data; + std::string pattern(dir); + pattern.append("\\").append("*"); - auto CloseDir = [](DIR* p) { closedir(p); }; - std::unique_ptr<DIR, decltype(CloseDir)> dirp(opendir(dir.c_str()), - CloseDir); - - if (!dirp) { - switch (errno) { - case EACCES: - case ENOENT: - case ENOTDIR: - return Status::NotFound(); - default: - return IOError(dir, errno); - } - } else { - if (result->capacity() > 0) { - output.reserve(result->capacity()); - } + HANDLE handle = ::FindFirstFileExA(pattern.c_str(), + FindExInfoBasic, // Do not want alternative name + &data, + FindExSearchNameMatch, + NULL, // lpSearchFilter + 0); - struct dirent* ent = readdir(dirp.get()); - while (ent) { - output.push_back(ent->d_name); - ent = readdir(dirp.get()); + if (handle == INVALID_HANDLE_VALUE) { + auto lastError = GetLastError(); + switch (lastError) { + case ERROR_NOT_FOUND: + case ERROR_ACCESS_DENIED: + case ERROR_FILE_NOT_FOUND: + case ERROR_PATH_NOT_FOUND: + status = Status::NotFound(); + break; + default: + status = IOErrorFromWindowsError( + "Failed to GetChildren for: " + dir, lastError); } + return status; } - output.swap(*result); + UniqueFindClosePtr fc(handle, FindCloseFunc); + + if (result->capacity() > 0) { + output.reserve(result->capacity()); + } + // For safety + data.cFileName[MAX_PATH - 1] = 0; + + while (true) { + output.emplace_back(data.cFileName); + BOOL ret = ::FindNextFileA(handle, &data); + // If the function fails the return value is zero + // and non-zero otherwise. Not TRUE or FALSE.
+ if (ret == FALSE) { + // Posix does not care why we stopped + break; + } + data.cFileName[MAX_PATH - 1] = 0; + } + output.swap(*result); return status; } Status WinEnvIO::CreateDir(const std::string& name) { Status result; - if (_mkdir(name.c_str()) != 0) { - auto code = errno; - result = IOError("Failed to create dir: " + name, code); + BOOL ret = CreateDirectoryA(name.c_str(), NULL); + if (!ret) { + auto lastError = GetLastError(); + result = IOErrorFromWindowsError( + "Failed to create a directory: " + name, lastError); } return result; @@ -441,28 +515,26 @@ Status WinEnvIO::CreateDir(const std::string& name) { Status WinEnvIO::CreateDirIfMissing(const std::string& name) { Status result; - if (DirExists(name)) { - return result; - } - - if (_mkdir(name.c_str()) != 0) { - if (errno == EEXIST) { + BOOL ret = CreateDirectoryA(name.c_str(), NULL); + if (!ret) { + auto lastError = GetLastError(); + if (lastError != ERROR_ALREADY_EXISTS) { + result = IOErrorFromWindowsError( + "Failed to create a directory: " + name, lastError); + } else if (!DirExists(name)) { result = Status::IOError("`" + name + "' exists but is not a directory"); - } else { - auto code = errno; - result = IOError("Failed to create dir: " + name, code); } } - return result; } Status WinEnvIO::DeleteDir(const std::string& name) { Status result; - if (_rmdir(name.c_str()) != 0) { - auto code = errno; - result = IOError("Failed to remove dir: " + name, code); + BOOL ret = RemoveDirectoryA(name.c_str()); + if (!ret) { + auto lastError = GetLastError(); + result = IOErrorFromWindowsError("Failed to remove dir: " + name, lastError); } return result; } @@ -553,6 +625,81 @@ Status WinEnvIO::LinkFile(const std::string& src, return result; } +Status WinEnvIO::AreFilesSame(const std::string& first, + const std::string& second, bool* res) { +// For MinGW builds +#if (_WIN32_WINNT == _WIN32_WINNT_VISTA) + Status s = Status::NotSupported(); +#else + assert(res != nullptr); + Status s; + if (res == nullptr) { + s = Status::InvalidArgument("res"); + return s; + } + + // 0 - for access means read metadata + HANDLE file_1 = ::CreateFileA(first.c_str(), 0, + FILE_SHARE_DELETE | FILE_SHARE_READ | FILE_SHARE_WRITE, + NULL, + OPEN_EXISTING, + FILE_FLAG_BACKUP_SEMANTICS, // make opening folders possible + NULL); + + if (INVALID_HANDLE_VALUE == file_1) { + auto lastError = GetLastError(); + s = IOErrorFromWindowsError( + "open file: " + first, lastError); + return s; + } + UniqueCloseHandlePtr g_1(file_1, CloseHandleFunc); + + HANDLE file_2 = ::CreateFileA(second.c_str(), 0, + FILE_SHARE_DELETE | FILE_SHARE_READ | FILE_SHARE_WRITE, + NULL, OPEN_EXISTING, + FILE_FLAG_BACKUP_SEMANTICS, // make opening folders possible + NULL); + + if (INVALID_HANDLE_VALUE == file_2) { + auto lastError = GetLastError(); + s = IOErrorFromWindowsError( + "open file: " + second, lastError); + return s; + } + UniqueCloseHandlePtr g_2(file_2, CloseHandleFunc); + + FILE_ID_INFO FileInfo_1; + BOOL result = GetFileInformationByHandleEx(file_1, FileIdInfo, &FileInfo_1, + sizeof(FileInfo_1)); + + if (!result) { + auto lastError = GetLastError(); + s = IOErrorFromWindowsError( + "stat file: " + first, lastError); + return s; + } + + FILE_ID_INFO FileInfo_2; + result = GetFileInformationByHandleEx(file_2, FileIdInfo, &FileInfo_2, + sizeof(FileInfo_2)); + + if (!result) { + auto lastError = GetLastError(); + s = IOErrorFromWindowsError( + "stat file: " + second, lastError); + return s; + } + + if (FileInfo_1.VolumeSerialNumber == FileInfo_2.VolumeSerialNumber) { + *res = (0 == 
memcmp(FileInfo_1.FileId.Identifier, FileInfo_2.FileId.Identifier, + sizeof(FileInfo_1.FileId.Identifier))); + } else { + *res = false; + } +#endif + return s; +} + Status WinEnvIO::LockFile(const std::string& lockFname, FileLock** lock) { assert(lock != nullptr); @@ -596,12 +743,12 @@ Status WinEnvIO::UnlockFile(FileLock* lock) { } Status WinEnvIO::GetTestDirectory(std::string* result) { + std::string output; const char* env = getenv("TEST_TMPDIR"); if (env && env[0] != '\0') { output = env; - CreateDir(output); } else { env = getenv("TMP"); @@ -610,9 +757,8 @@ Status WinEnvIO::GetTestDirectory(std::string* result) { } else { output = "c:\\tmp"; } - - CreateDir(output); } + CreateDir(output); output.append("\\testrocksdb-"); output.append(std::to_string(_getpid())); @@ -722,26 +868,29 @@ Status WinEnvIO::GetHostName(char* name, uint64_t len) { Status WinEnvIO::GetAbsolutePath(const std::string& db_path, std::string* output_path) { + // Check if we already have an absolute path - // that starts with non dot and has a semicolon in it - if ((!db_path.empty() && (db_path[0] == '/' || db_path[0] == '\\')) || - (db_path.size() > 2 && db_path[0] != '.' && - ((db_path[1] == ':' && db_path[2] == '\\') || - (db_path[1] == ':' && db_path[2] == '/')))) { + // For test compatibility we will consider starting slash as an + // absolute path + if ((!db_path.empty() && (db_path[0] == '\\' || db_path[0] == '/')) || + !PathIsRelativeA(db_path.c_str())) { *output_path = db_path; return Status::OK(); } std::string result; - result.resize(_MAX_PATH); + result.resize(MAX_PATH); - char* ret = _getcwd(&result[0], _MAX_PATH); - if (ret == nullptr) { - return Status::IOError("Failed to get current working directory", - strerror(errno)); + // Hopefully no changes the current directory while we do this + // however _getcwd also suffers from the same limitation + DWORD len = GetCurrentDirectoryA(MAX_PATH, &result[0]); + if (len == 0) { + auto lastError = GetLastError(); + return IOErrorFromWindowsError("Failed to get current working directory", + lastError); } - result.resize(strlen(result.data())); + result.resize(len); result.swap(*output_path); return Status::OK(); @@ -808,6 +957,62 @@ bool WinEnvIO::DirExists(const std::string& dname) { return false; } +size_t WinEnvIO::GetSectorSize(const std::string& fname) { + size_t sector_size = kSectorSize; + + if (PathIsRelativeA(fname.c_str())) { + return sector_size; + } + + // obtain device handle + char devicename[7] = "\\\\.\\"; + int erresult = strncat_s(devicename, sizeof(devicename), fname.c_str(), 2); + + if (erresult) { + assert(false); + return sector_size; + } + + HANDLE hDevice = CreateFile(devicename, 0, 0, + nullptr, OPEN_EXISTING, + FILE_ATTRIBUTE_NORMAL, nullptr); + + if (hDevice == INVALID_HANDLE_VALUE) { + return sector_size; + } + + STORAGE_PROPERTY_QUERY spropertyquery; + spropertyquery.PropertyId = StorageAccessAlignmentProperty; + spropertyquery.QueryType = PropertyStandardQuery; + + BYTE output_buffer[sizeof(STORAGE_ACCESS_ALIGNMENT_DESCRIPTOR)]; + DWORD output_bytes = 0; + + BOOL ret = DeviceIoControl(hDevice, IOCTL_STORAGE_QUERY_PROPERTY, + &spropertyquery, sizeof(spropertyquery), output_buffer, + sizeof(STORAGE_ACCESS_ALIGNMENT_DESCRIPTOR), &output_bytes, nullptr); + + if (ret) { + sector_size = ((STORAGE_ACCESS_ALIGNMENT_DESCRIPTOR *)output_buffer)->BytesPerLogicalSector; + } else { + // many devices do not support StorageProcessAlignmentProperty. 
Any failure here and we + // fall back to logical alignment + + DISK_GEOMETRY_EX geometry = { 0 }; + ret = DeviceIoControl(hDevice, IOCTL_DISK_GET_DRIVE_GEOMETRY, + nullptr, 0, &geometry, sizeof(geometry), nullptr, nullptr); + if (ret) { + sector_size = geometry.Geometry.BytesPerSector; + } + } + + if (hDevice != INVALID_HANDLE_VALUE) { + CloseHandle(hDevice); + } + + return sector_size; +} + //////////////////////////////////////////////////////////////////////// // WinEnvThreads @@ -1014,6 +1219,11 @@ Status WinEnv::LinkFile(const std::string& src, return winenv_io_.LinkFile(src, target); } +Status WinEnv::AreFilesSame(const std::string& first, + const std::string& second, bool* res) { + return winenv_io_.AreFilesSame(first, second, res); +} + Status WinEnv::LockFile(const std::string& lockFname, FileLock** lock) { return winenv_io_.LockFile(lockFname, lock); diff --git a/port/win/env_win.h b/port/win/env_win.h index ce1a61d4161..ef35fab3fa8 100644 --- a/port/win/env_win.h +++ b/port/win/env_win.h @@ -138,6 +138,9 @@ class WinEnvIO { virtual Status LinkFile(const std::string& src, const std::string& target); + virtual Status AreFilesSame(const std::string& first, + const std::string& second, bool* res); + virtual Status LockFile(const std::string& lockFname, FileLock** lock); @@ -171,6 +174,8 @@ class WinEnvIO { uint64_t GetPerfCounterFrequency() const { return perf_counter_frequency_; } + static size_t GetSectorSize(const std::string& fname); + private: // Returns true iff the named directory exists and is a directory. virtual bool DirExists(const std::string& dname); @@ -248,6 +253,9 @@ class WinEnv : public Env { Status LinkFile(const std::string& src, const std::string& target) override; + Status AreFilesSame(const std::string& first, + const std::string& second, bool* res) override; + Status LockFile(const std::string& lockFname, FileLock** lock) override; diff --git a/port/win/io_win.cc b/port/win/io_win.cc index 7d83896c62d..549cc3a86cc 100644 --- a/port/win/io_win.cc +++ b/port/win/io_win.cc @@ -157,9 +157,11 @@ size_t GetUniqueIdFromFile(HANDLE hFile, char* id, size_t max_size) { if (max_size < kMaxVarint64Length * 3) { return 0; } - - // This function has to be re-worked for cases when - // ReFS file system introduced on Windows Server 2012 is used +#if (_WIN32_WINNT == _WIN32_WINNT_VISTA) + // MINGGW as defined by CMake file. + // yuslepukhin: I hate the guts of the above macros. 
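
GetSectorSize() above queries StorageAccessAlignmentProperty, falls back to the drive geometry's BytesPerSector, and ultimately to the 512-byte kSectorSize default; callers then size I/O buffers by std::max(sector size, page size). Since both quantities are powers of two, rounding a request up to an aligned length reduces to the usual mask trick; a small sketch (helper name hypothetical):

    // Round n up to the next multiple of a power-of-two alignment.
    inline size_t RoundUpToAlignment(size_t n, size_t alignment) {
      return (n + alignment - 1) & ~(alignment - 1);
    }
    // RoundUpToAlignment(5000, 4096) == 8192
    // RoundUpToAlignment(4096, 4096) == 4096
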
+ // This impl does not guarantee uniqueness everywhere + // is reasonably good BY_HANDLE_FILE_INFORMATION FileInfo; BOOL result = GetFileInformationByHandle(hFile, &FileInfo); @@ -177,6 +179,33 @@ size_t GetUniqueIdFromFile(HANDLE hFile, char* id, size_t max_size) { assert(rid >= id); return static_cast(rid - id); +#else + FILE_ID_INFO FileInfo; + BOOL result = GetFileInformationByHandleEx(hFile, FileIdInfo, &FileInfo, + sizeof(FileInfo)); + + TEST_SYNC_POINT_CALLBACK("GetUniqueIdFromFile:FS_IOC_GETVERSION", &result); + + if (!result) { + return 0; + } + + static_assert(sizeof(uint64_t) == sizeof(FileInfo.VolumeSerialNumber), + "Wrong sizeof expectations"); + // FileId.Identifier is an array of 16 BYTEs, we encode them as two uint64_t + static_assert(sizeof(uint64_t) * 2 == sizeof(FileInfo.FileId.Identifier), + "Wrong sizeof expectations"); + + char* rid = id; + rid = EncodeVarint64(rid, uint64_t(FileInfo.VolumeSerialNumber)); + uint64_t* file_id = reinterpret_cast(&FileInfo.FileId.Identifier[0]); + rid = EncodeVarint64(rid, *file_id); + ++file_id; + rid = EncodeVarint64(rid, *file_id); + + assert(rid >= id); + return static_cast(rid - id); +#endif } //////////////////////////////////////////////////////////////////////////////////////////////////// @@ -782,8 +811,7 @@ Status WinWritableImpl::AppendImpl(const Slice& data) { auto lastError = GetLastError(); s = IOErrorFromWindowsError( "Failed to pwrite for: " + file_data_->GetName(), lastError); - } - else { + } else { written = ret; } @@ -828,8 +856,7 @@ Status WinWritableImpl::PositionedAppendImpl(const Slice& data, uint64_t offset) auto lastError = GetLastError(); s = IOErrorFromWindowsError( "Failed to pwrite for: " + file_data_->GetName(), lastError); - } - else { + } else { assert(size_t(ret) == data.size()); // For sequential write this would be simple // size extension by data.size() @@ -1019,6 +1046,9 @@ Status WinRandomRWFile::Close() { Status WinDirectory::Fsync() { return Status::OK(); } +size_t WinDirectory::GetUniqueId(char* id, size_t max_size) const { + return GetUniqueIdFromFile(handle_, id, max_size); +} ////////////////////////////////////////////////////////////////////////// /// WinFileLock diff --git a/port/win/io_win.h b/port/win/io_win.h index 74a99ec93b4..2b9a7564222 100644 --- a/port/win/io_win.h +++ b/port/win/io_win.h @@ -421,10 +421,19 @@ class WinRandomRWFile : private WinFileData, }; class WinDirectory : public Directory { + HANDLE handle_; public: - WinDirectory() {} - + explicit + WinDirectory(HANDLE h) noexcept : + handle_(h) { + assert(handle_ != INVALID_HANDLE_VALUE); + } + ~WinDirectory() { + ::CloseHandle(handle_); + } virtual Status Fsync() override; + + size_t GetUniqueId(char* id, size_t max_size) const override; }; class WinFileLock : public FileLock { diff --git a/port/win/port_win.cc b/port/win/port_win.cc index b3fccbd9308..75b4ec6de90 100644 --- a/port/win/port_win.cc +++ b/port/win/port_win.cc @@ -108,19 +108,20 @@ void InitOnce(OnceType* once, void (*initializer)()) { // Private structure, exposed only by pointer struct DIR { - intptr_t handle_; - bool firstread_; - struct __finddata64_t data_; + HANDLE handle_; + bool firstread_; + WIN32_FIND_DATA data_; dirent entry_; - DIR() : handle_(-1), firstread_(true) {} + DIR() : handle_(INVALID_HANDLE_VALUE), + firstread_(true) {} DIR(const DIR&) = delete; DIR& operator=(const DIR&) = delete; ~DIR() { - if (-1 != handle_) { - _findclose(handle_); + if (INVALID_HANDLE_VALUE != handle_) { + ::FindClose(handle_); } } }; @@ -136,19 +137,25 @@ DIR* 
opendir(const char* name) { std::unique_ptr<DIR>
dir(new DIR); - dir->handle_ = _findfirst64(pattern.c_str(), &dir->data_); + dir->handle_ = ::FindFirstFileExA(pattern.c_str(), + FindExInfoBasic, // Do not want alternative name + &dir->data_, + FindExSearchNameMatch, + NULL, // lpSearchFilter + 0); - if (dir->handle_ == -1) { + if (dir->handle_ == INVALID_HANDLE_VALUE) { return nullptr; } - strcpy_s(dir->entry_.d_name, sizeof(dir->entry_.d_name), dir->data_.name); + strcpy_s(dir->entry_.d_name, sizeof(dir->entry_.d_name), + dir->data_.cFileName); return dir.release(); } struct dirent* readdir(DIR* dirp) { - if (!dirp || dirp->handle_ == -1) { + if (!dirp || dirp->handle_ == INVALID_HANDLE_VALUE) { errno = EBADF; return nullptr; } @@ -158,13 +165,14 @@ struct dirent* readdir(DIR* dirp) { return &dirp->entry_; } - auto ret = _findnext64(dirp->handle_, &dirp->data_); + auto ret = ::FindNextFileA(dirp->handle_, &dirp->data_); - if (ret != 0) { + if (ret == 0) { return nullptr; } - strcpy_s(dirp->entry_.d_name, sizeof(dirp->entry_.d_name), dirp->data_.name); + strcpy_s(dirp->entry_.d_name, sizeof(dirp->entry_.d_name), + dirp->data_.cFileName); return &dirp->entry_; } diff --git a/port/win/win_logger.cc b/port/win/win_logger.cc index 0bace9f31f8..af722d9054e 100644 --- a/port/win/win_logger.cc +++ b/port/win/win_logger.cc @@ -36,9 +36,13 @@ WinLogger::WinLogger(uint64_t (*gettid)(), Env* env, HANDLE file, log_size_(0), last_flush_micros_(0), env_(env), - flush_pending_(false) {} + flush_pending_(false) { + assert(file_ != NULL); + assert(file_ != INVALID_HANDLE_VALUE); +} void WinLogger::DebugWriter(const char* str, int len) { + assert(file_ != INVALID_HANDLE_VALUE); DWORD bytesWritten = 0; BOOL ret = WriteFile(file_, str, len, &bytesWritten, NULL); if (ret == FALSE) { @@ -47,11 +51,38 @@ void WinLogger::DebugWriter(const char* str, int len) { } } -WinLogger::~WinLogger() { close(); } +WinLogger::~WinLogger() { + CloseInternal(); +} + +Status WinLogger::CloseImpl() { + return CloseInternal(); +} -void WinLogger::close() { CloseHandle(file_); } +Status WinLogger::CloseInternal() { + Status s; + if (INVALID_HANDLE_VALUE != file_) { + BOOL ret = FlushFileBuffers(file_); + if (ret == 0) { + auto lastError = GetLastError(); + s = IOErrorFromWindowsError("Failed to flush LOG on Close() ", + lastError); + } + ret = CloseHandle(file_); + // On error the return value is zero + if (ret == 0 && s.ok()) { + auto lastError = GetLastError(); + s = IOErrorFromWindowsError("Failed to flush LOG on Close() ", + lastError); + } + file_ = INVALID_HANDLE_VALUE; + closed_ = true; + } + return s; +} void WinLogger::Flush() { + assert(file_ != INVALID_HANDLE_VALUE); if (flush_pending_) { flush_pending_ = false; // With Windows API writes go to OS buffers directly so no fflush needed @@ -64,6 +95,7 @@ void WinLogger::Flush() { void WinLogger::Logv(const char* format, va_list ap) { IOSTATS_TIMER_GUARD(logger_nanos); + assert(file_ != INVALID_HANDLE_VALUE); const uint64_t thread_id = (*gettid_)(); diff --git a/port/win/win_logger.h b/port/win/win_logger.h index 2d44f506d1a..0982f142f66 100644 --- a/port/win/win_logger.h +++ b/port/win/win_logger.h @@ -36,8 +36,6 @@ class WinLogger : public rocksdb::Logger { WinLogger& operator=(const WinLogger&) = delete; - void close(); - void Flush() override; using rocksdb::Logger::Logv; @@ -47,6 +45,10 @@ class WinLogger : public rocksdb::Logger { void DebugWriter(const char* str, int len); +protected: + + Status CloseImpl() override; + private: HANDLE file_; uint64_t (*gettid_)(); // Return the thread id for the current thread @@ 
-55,6 +57,8 @@ class WinLogger : public rocksdb::Logger { Env* env_; bool flush_pending_; + Status CloseInternal(); + const static uint64_t flush_every_seconds_ = 5; }; diff --git a/port/win/win_thread.cc b/port/win/win_thread.cc index 74933203265..b48af2370fc 100644 --- a/port/win/win_thread.cc +++ b/port/win/win_thread.cc @@ -138,7 +138,9 @@ void WindowsThread::join() { "WaitForSingleObjectFailed: thread join"); } - CloseHandle(reinterpret_cast(data_->handle_)); + BOOL rc; + rc = CloseHandle(reinterpret_cast(data_->handle_)); + assert(rc != 0); data_->handle_ = 0; } @@ -154,7 +156,7 @@ bool WindowsThread::detach() { BOOL ret = CloseHandle(reinterpret_cast(data_->handle_)); data_->handle_ = 0; - return (ret == TRUE); + return (ret != 0); } void WindowsThread::swap(WindowsThread& o) { @@ -166,7 +168,6 @@ unsigned int __stdcall WindowsThread::Data::ThreadProc(void* arg) { auto ptr = reinterpret_cast*>(arg); std::unique_ptr> data(ptr); (*data)->func_(); - _endthreadex(0); return 0; } } // namespace port diff --git a/src.mk b/src.mk index 1182af889b3..4089bf0f33a 100644 --- a/src.mk +++ b/src.mk @@ -148,11 +148,13 @@ LIB_SOURCES = \ util/status_message.cc \ util/string_util.cc \ util/sync_point.cc \ + util/sync_point_impl.cc \ util/thread_local.cc \ util/threadpool_imp.cc \ util/transaction_test_util.cc \ util/xxhash.cc \ utilities/backupable/backupable_db.cc \ + utilities/blob_db/blob_compaction_filter.cc \ utilities/blob_db/blob_db.cc \ utilities/blob_db/blob_db_impl.cc \ utilities/blob_db/blob_file.cc \ @@ -182,6 +184,7 @@ LIB_SOURCES = \ utilities/merge_operators/string_append/stringappend.cc \ utilities/merge_operators/string_append/stringappend2.cc \ utilities/merge_operators/uint64add.cc \ + utilities/merge_operators/bytesxor.cc \ utilities/option_change_migration/option_change_migration.cc \ utilities/options/options_util.cc \ utilities/persistent_cache/block_cache_tier.cc \ @@ -396,6 +399,9 @@ JNI_NATIVE_SOURCES = \ java/rocksjni/lru_cache.cc \ java/rocksjni/memtablejni.cc \ java/rocksjni/merge_operator.cc \ + java/rocksjni/native_comparator_wrapper_test.cc \ + java/rocksjni/optimistic_transaction_db.cc \ + java/rocksjni/optimistic_transaction_options.cc \ java/rocksjni/options.cc \ java/rocksjni/options_util.cc \ java/rocksjni/ratelimiterjni.cc \ @@ -412,7 +418,13 @@ JNI_NATIVE_SOURCES = \ java/rocksjni/statistics.cc \ java/rocksjni/statisticsjni.cc \ java/rocksjni/table.cc \ + java/rocksjni/transaction.cc \ + java/rocksjni/transaction_db.cc \ + java/rocksjni/transaction_options.cc \ + java/rocksjni/transaction_db_options.cc \ java/rocksjni/transaction_log.cc \ + java/rocksjni/transaction_notifier.cc \ + java/rocksjni/transaction_notifier_jnicallback.cc \ java/rocksjni/ttl.cc \ java/rocksjni/write_batch.cc \ java/rocksjni/writebatchhandlerjnicallback.cc \ diff --git a/table/adaptive_table_factory.cc b/table/adaptive_table_factory.cc index 47069f86695..0a3e9415ad7 100644 --- a/table/adaptive_table_factory.cc +++ b/table/adaptive_table_factory.cc @@ -44,7 +44,7 @@ Status AdaptiveTableFactory::NewTableReader( const TableReaderOptions& table_reader_options, unique_ptr&& file, uint64_t file_size, unique_ptr* table, - bool prefetch_index_and_filter_in_cache) const { + bool /*prefetch_index_and_filter_in_cache*/) const { Footer footer; auto s = ReadFooterFromFile(file.get(), nullptr /* prefetch_buffer */, file_size, &footer); diff --git a/table/adaptive_table_factory.h b/table/adaptive_table_factory.h index b7b52ba96fc..00af6a76e95 100644 --- a/table/adaptive_table_factory.h +++ 
b/table/adaptive_table_factory.h @@ -44,8 +44,9 @@ class AdaptiveTableFactory : public TableFactory { uint32_t column_family_id, WritableFileWriter* file) const override; // Sanitizes the specified DB Options. - Status SanitizeOptions(const DBOptions& db_opts, - const ColumnFamilyOptions& cf_opts) const override { + Status SanitizeOptions( + const DBOptions& /*db_opts*/, + const ColumnFamilyOptions& /*cf_opts*/) const override { return Status::OK(); } diff --git a/table/block_based_filter_block.cc b/table/block_based_filter_block.cc index 6e300e8105d..fc24f41a6be 100644 --- a/table/block_based_filter_block.cc +++ b/table/block_based_filter_block.cc @@ -67,7 +67,8 @@ BlockBasedFilterBlockBuilder::BlockBasedFilterBlockBuilder( prefix_extractor_(prefix_extractor), whole_key_filtering_(table_opt.whole_key_filtering), prev_prefix_start_(0), - prev_prefix_size_(0) { + prev_prefix_size_(0), + num_added_(0) { assert(policy_); } @@ -91,6 +92,7 @@ void BlockBasedFilterBlockBuilder::Add(const Slice& key) { // Add key to filter if needed inline void BlockBasedFilterBlockBuilder::AddKey(const Slice& key) { + num_added_++; start_.push_back(entries_.size()); entries_.append(key.data(), key.size()); } @@ -106,14 +108,13 @@ inline void BlockBasedFilterBlockBuilder::AddPrefix(const Slice& key) { Slice prefix = prefix_extractor_->Transform(key); // insert prefix only when it's different from the previous prefix. if (prev.size() == 0 || prefix != prev) { - start_.push_back(entries_.size()); prev_prefix_start_ = entries_.size(); prev_prefix_size_ = prefix.size(); - entries_.append(prefix.data(), prefix.size()); + AddKey(prefix); } } -Slice BlockBasedFilterBlockBuilder::Finish(const BlockHandle& tmp, +Slice BlockBasedFilterBlockBuilder::Finish(const BlockHandle& /*tmp*/, Status* status) { // In this impl we ignore BlockHandle *status = Status::OK(); @@ -185,8 +186,8 @@ BlockBasedFilterBlockReader::BlockBasedFilterBlockReader( } bool BlockBasedFilterBlockReader::KeyMayMatch( - const Slice& key, uint64_t block_offset, const bool no_io, - const Slice* const const_ikey_ptr) { + const Slice& key, uint64_t block_offset, const bool /*no_io*/, + const Slice* const /*const_ikey_ptr*/) { assert(block_offset != kNotValid); if (!whole_key_filtering_) { return true; @@ -195,8 +196,8 @@ bool BlockBasedFilterBlockReader::KeyMayMatch( } bool BlockBasedFilterBlockReader::PrefixMayMatch( - const Slice& prefix, uint64_t block_offset, const bool no_io, - const Slice* const const_ikey_ptr) { + const Slice& prefix, uint64_t block_offset, const bool /*no_io*/, + const Slice* const /*const_ikey_ptr*/) { assert(block_offset != kNotValid); if (!prefix_extractor_) { return true; diff --git a/table/block_based_filter_block.h b/table/block_based_filter_block.h index 52b79fea501..3bfb3b24ae7 100644 --- a/table/block_based_filter_block.h +++ b/table/block_based_filter_block.h @@ -41,6 +41,7 @@ class BlockBasedFilterBlockBuilder : public FilterBlockBuilder { virtual bool IsBlockBased() override { return true; } virtual void StartBlock(uint64_t block_offset) override; virtual void Add(const Slice& key) override; + virtual size_t NumAdded() const override { return num_added_; } virtual Slice Finish(const BlockHandle& tmp, Status* status) override; using FilterBlockBuilder::Finish; @@ -65,6 +66,7 @@ class BlockBasedFilterBlockBuilder : public FilterBlockBuilder { std::string result_; // Filter data computed so far std::vector tmp_entries_; // policy_->CreateFilter() argument std::vector filter_offsets_; + size_t num_added_; // Number of keys 
added // No copying allowed BlockBasedFilterBlockBuilder(const BlockBasedFilterBlockBuilder&); diff --git a/table/block_based_filter_block_test.cc b/table/block_based_filter_block_test.cc index f666ba25242..dece461e33d 100644 --- a/table/block_based_filter_block_test.cc +++ b/table/block_based_filter_block_test.cc @@ -65,6 +65,7 @@ TEST_F(FilterBlockTest, EmptyBuilder) { TEST_F(FilterBlockTest, SingleChunk) { BlockBasedFilterBlockBuilder builder(nullptr, table_options_); + ASSERT_EQ(0, builder.NumAdded()); builder.StartBlock(100); builder.Add("foo"); builder.Add("bar"); @@ -73,6 +74,7 @@ TEST_F(FilterBlockTest, SingleChunk) { builder.Add("box"); builder.StartBlock(300); builder.Add("hello"); + ASSERT_EQ(5, builder.NumAdded()); BlockContents block(builder.Finish(), false, kNoCompression); BlockBasedFilterBlockReader reader(nullptr, table_options_, true, std::move(block), nullptr); diff --git a/table/block_based_table_builder.cc b/table/block_based_table_builder.cc index 71f01b9b0fa..607d2a5340f 100644 --- a/table/block_based_table_builder.cc +++ b/table/block_based_table_builder.cc @@ -208,8 +208,8 @@ class BlockBasedTableBuilder::BlockBasedTablePropertiesCollector whole_key_filtering_(whole_key_filtering), prefix_filtering_(prefix_filtering) {} - virtual Status InternalAdd(const Slice& key, const Slice& value, - uint64_t file_size) override { + virtual Status InternalAdd(const Slice& /*key*/, const Slice& /*value*/, + uint64_t /*file_size*/) override { // Intentionally left blank. Have no interest in collecting stats for // individual key/value pairs. return Status::OK(); @@ -249,6 +249,7 @@ struct BlockBasedTableBuilder::Rep { WritableFileWriter* file; uint64_t offset = 0; Status status; + size_t alignment; BlockBuilder data_block; BlockBuilder range_del_block; @@ -294,6 +295,9 @@ struct BlockBasedTableBuilder::Rep { table_options(table_opt), internal_comparator(icomparator), file(f), + alignment(table_options.block_align + ? 
std::min(table_options.block_size, kDefaultPageSize) + : 0), data_block(table_options.block_restart_interval, table_options.use_delta_encoding), range_del_block(1), // TODO(andrewkr): restart_interval unnecessary @@ -537,13 +541,14 @@ void BlockBasedTableBuilder::WriteBlock(const Slice& raw_block_contents, RecordTick(r->ioptions.statistics, NUMBER_BLOCK_COMPRESSED); } - WriteRawBlock(block_contents, type, handle); + WriteRawBlock(block_contents, type, handle, is_data_block); r->compressed_output.clear(); } void BlockBasedTableBuilder::WriteRawBlock(const Slice& block_contents, CompressionType type, - BlockHandle* handle) { + BlockHandle* handle, + bool is_data_block) { Rep* r = rep_; StopWatch sw(r->ioptions.env, r->ioptions.statistics, WRITE_RAW_BLOCK_MICROS); handle->set_offset(r->offset); @@ -581,6 +586,16 @@ void BlockBasedTableBuilder::WriteRawBlock(const Slice& block_contents, } if (r->status.ok()) { r->offset += block_contents.size() + kBlockTrailerSize; + if (r->table_options.block_align && is_data_block) { + size_t pad_bytes = + (r->alignment - ((block_contents.size() + kBlockTrailerSize) & + (r->alignment - 1))) & + (r->alignment - 1); + r->status = r->file->Pad(pad_bytes); + if (r->status.ok()) { + r->offset += pad_bytes; + } + } } } } @@ -589,7 +604,7 @@ Status BlockBasedTableBuilder::status() const { return rep_->status; } -static void DeleteCachedBlock(const Slice& key, void* value) { +static void DeleteCachedBlock(const Slice& /*key*/, void* value) { Block* block = reinterpret_cast(value); delete block; } @@ -650,8 +665,11 @@ Status BlockBasedTableBuilder::Finish() { BlockHandle filter_block_handle, metaindex_block_handle, index_block_handle, compression_dict_block_handle, range_del_block_handle; + // Write filter block - if (ok() && r->filter_builder != nullptr) { + bool empty_filter_block = (r->filter_builder == nullptr || + r->filter_builder->NumAdded() == 0); + if (ok() && !empty_filter_block) { Status s = Status::Incomplete(); while (s.IsIncomplete()) { Slice filter_content = r->filter_builder->Finish(filter_block_handle, &s); @@ -687,7 +705,7 @@ Status BlockBasedTableBuilder::Finish() { } if (ok()) { - if (r->filter_builder != nullptr) { + if (!empty_filter_block) { // Add mapping from ".Name" to location // of filter data. std::string key; diff --git a/table/block_based_table_builder.h b/table/block_based_table_builder.h index 36dfce1f0fb..392dedc1ff2 100644 --- a/table/block_based_table_builder.h +++ b/table/block_based_table_builder.h @@ -96,7 +96,8 @@ class BlockBasedTableBuilder : public TableBuilder { void WriteBlock(const Slice& block_contents, BlockHandle* handle, bool is_data_block); // Directly write data to the file. 
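
The pad_bytes expression in WriteRawBlock above is the complementary power-of-two trick: it measures how far the block plus its 5-byte trailer overhangs the last alignment boundary and pads up to the next one, and the outer mask makes the pad zero when the block already ends exactly on a boundary. A worked sketch assuming alignment = 4096:

    const size_t alignment = 4096;
    const size_t kBlockTrailerSize = 5;  // 1 type byte + 4 checksum bytes

    size_t contents = 1000;  // block_contents.size()
    size_t pad = (alignment - ((contents + kBlockTrailerSize) & (alignment - 1))) &
                 (alignment - 1);  // == 3091, so the next block starts at 4096

    contents = 4091;  // 4091 + 5 == 4096, already aligned
    pad = (alignment - ((contents + kBlockTrailerSize) & (alignment - 1))) &
          (alignment - 1);  // == 0 thanks to the outer mask
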
- void WriteRawBlock(const Slice& data, CompressionType, BlockHandle* handle); + void WriteRawBlock(const Slice& data, CompressionType, BlockHandle* handle, + bool is_data_block = false); Status InsertBlockInCache(const Slice& block_contents, const CompressionType type, const BlockHandle* handle); diff --git a/table/block_based_table_factory.cc b/table/block_based_table_factory.cc index 5c83b3d13aa..d468eaa2458 100644 --- a/table/block_based_table_factory.cc +++ b/table/block_based_table_factory.cc @@ -92,8 +92,7 @@ TableBuilder* BlockBasedTableFactory::NewTableBuilder( } Status BlockBasedTableFactory::SanitizeOptions( - const DBOptions& db_opts, - const ColumnFamilyOptions& cf_opts) const { + const DBOptions& /*db_opts*/, const ColumnFamilyOptions& cf_opts) const { if (table_options_.index_type == BlockBasedTableOptions::kHashSearch && cf_opts.prefix_extractor == nullptr) { return Status::InvalidArgument("Hash index is specified for block-based " @@ -115,6 +114,15 @@ Status BlockBasedTableFactory::SanitizeOptions( "Unsupported BlockBasedTable format_version. Please check " "include/rocksdb/table.h for more info"); } + if (table_options_.block_align && (cf_opts.compression != kNoCompression)) { + return Status::InvalidArgument("Enable block_align, but compression " + "enabled"); + } + if (table_options_.block_align && + (table_options_.block_size & (table_options_.block_size - 1))) { + return Status::InvalidArgument( + "Block alignment requested but block size is not a power of 2"); + } return Status::OK(); } @@ -226,6 +234,9 @@ std::string BlockBasedTableFactory::GetPrintableTableOptions() const { snprintf(buffer, kBufferSize, " enable_index_compression: %d\n", table_options_.enable_index_compression); ret.append(buffer); + snprintf(buffer, kBufferSize, " block_align: %d\n", + table_options_.block_align); + ret.append(buffer); return ret; } diff --git a/table/block_based_table_factory.h b/table/block_based_table_factory.h index a5eba7eff61..b9d3a97d6ef 100644 --- a/table/block_based_table_factory.h +++ b/table/block_based_table_factory.h @@ -155,6 +155,9 @@ static std::unordered_map OptionType::kSizeT, OptionVerificationType::kNormal, false, 0}}, {"enable_index_compression", {offsetof(struct BlockBasedTableOptions, enable_index_compression), + OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}}, + {"block_align", + {offsetof(struct BlockBasedTableOptions, block_align), OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}}}; #endif // !ROCKSDB_LITE } // namespace rocksdb diff --git a/table/block_based_table_reader.cc b/table/block_based_table_reader.cc index da9b856e450..d861f1a5573 100644 --- a/table/block_based_table_reader.cc +++ b/table/block_based_table_reader.cc @@ -95,13 +95,13 @@ Status ReadBlockFromFile( // Delete the resource that is held by the iterator. template -void DeleteHeldResource(void* arg, void* ignored) { +void DeleteHeldResource(void* arg, void* /*ignored*/) { delete reinterpret_cast(arg); } // Delete the entry resided in the cache. 
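
Given the new SanitizeOptions checks in block_based_table_factory.cc above, block_align is only usable with compression disabled and a power-of-two block_size. A hedged configuration sketch using the public factory API:

    rocksdb::BlockBasedTableOptions bbto;
    bbto.block_align = true;     // pad data blocks to block-size/page boundaries
    bbto.block_size = 4 * 1024;  // must be a power of two when block_align is set

    rocksdb::Options options;
    options.compression = rocksdb::kNoCompression;  // block_align rejects compression
    options.table_factory.reset(rocksdb::NewBlockBasedTableFactory(bbto));
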
template -void DeleteCachedEntry(const Slice& key, void* value) { +void DeleteCachedEntry(const Slice& /*key*/, void* value) { auto entry = reinterpret_cast(value); delete entry; } @@ -207,8 +207,8 @@ class PartitionIndexReader : public IndexReader, public Cleanable { } // return a two-level iterator: first level is on the partition index - virtual InternalIterator* NewIterator(BlockIter* iter = nullptr, - bool dont_care = true) override { + virtual InternalIterator* NewIterator(BlockIter* /*iter*/ = nullptr, + bool /*dont_care*/ = true) override { // Filters are already checked before seeking the index if (!partition_map_.empty()) { return NewTwoLevelIterator( @@ -260,7 +260,8 @@ class PartitionIndexReader : public IndexReader, public Cleanable { std::unique_ptr prefetch_buffer; auto& file = table_->rep_->file; prefetch_buffer.reset(new FilePrefetchBuffer()); - s = prefetch_buffer->Prefetch(file.get(), prefetch_off, prefetch_len); + s = prefetch_buffer->Prefetch(file.get(), prefetch_off, + static_cast(prefetch_len)); // After prefetch, read the partitions one by one biter.SeekToFirst(); @@ -319,7 +320,7 @@ class PartitionIndexReader : public IndexReader, public Cleanable { PartitionIndexReader(BlockBasedTable* table, const InternalKeyComparator* icomparator, std::unique_ptr&& index_block, Statistics* stats, - const int level) + const int /*level*/) : IndexReader(icomparator, stats), table_(table), index_block_(std::move(index_block)) { @@ -363,7 +364,7 @@ class BinarySearchIndexReader : public IndexReader { } virtual InternalIterator* NewIterator(BlockIter* iter = nullptr, - bool dont_care = true) override { + bool /*dont_care*/ = true) override { return index_block_->NewIterator(icomparator_, iter, true); } @@ -399,7 +400,7 @@ class HashIndexReader : public IndexReader { const BlockHandle& index_handle, InternalIterator* meta_index_iter, IndexReader** index_reader, - bool hash_index_allow_collision, + bool /*hash_index_allow_collision*/, const PersistentCacheOptions& cache_options) { std::unique_ptr index_block; auto s = ReadBlockFromFile( @@ -654,9 +655,9 @@ Status BlockBasedTable::Open(const ImmutableCFOptions& ioptions, size_t prefetch_len; if (file_size < kTailPrefetchSize) { prefetch_off = 0; - prefetch_len = file_size; + prefetch_len = static_cast(file_size); } else { - prefetch_off = file_size - kTailPrefetchSize; + prefetch_off = static_cast(file_size - kTailPrefetchSize); prefetch_len = kTailPrefetchSize; } Status s; @@ -1109,7 +1110,7 @@ Status BlockBasedTable::GetDataBlockFromCache( Status BlockBasedTable::PutDataBlockToCache( const Slice& block_cache_key, const Slice& compressed_block_cache_key, Cache* block_cache, Cache* block_cache_compressed, - const ReadOptions& read_options, const ImmutableCFOptions& ioptions, + const ReadOptions& /*read_options*/, const ImmutableCFOptions& ioptions, CachableEntry* block, Block* raw_block, uint32_t format_version, const Slice& compression_dict, size_t read_amp_bytes_per_bit, bool is_index, Cache::Priority priority, GetContext* get_context) { @@ -1896,7 +1897,8 @@ void BlockBasedTableIterator::InitDataBlock() { readahead_size_ = std::min(kMaxReadaheadSize, readahead_size_); table_->get_rep()->file->Prefetch(data_block_handle.offset(), readahead_size_); - readahead_limit_ = data_block_handle.offset() + readahead_size_; + readahead_limit_ = static_cast(data_block_handle.offset() + + readahead_size_); // Keep exponentially increasing readahead size until kMaxReadaheadSize. 
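
The readahead logic in InitDataBlock above doubles the prefetch size after each use, capped at kMaxReadaheadSize, so long sequential scans quickly ramp from small reads to large ones. A toy trace of the growth (constants illustrative, not the reader's actual values):

    size_t readahead = 8 * 1024;     // hypothetical starting size
    const size_t kMax = 256 * 1024;  // hypothetical cap
    while (readahead < kMax) {
      // file->Prefetch(next_offset, readahead); ...
      readahead = std::min(kMax, readahead * 2);  // 16K, 32K, ..., capped at 256K
    }
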
readahead_size_ *= 2; } @@ -2774,7 +2776,7 @@ void BlockBasedTable::DumpKeyValue(const Slice& key, const Slice& value, namespace { -void DeleteCachedFilterEntry(const Slice& key, void* value) { +void DeleteCachedFilterEntry(const Slice& /*key*/, void* value) { FilterBlockReader* filter = reinterpret_cast(value); if (filter->statistics() != nullptr) { RecordTick(filter->statistics(), BLOCK_CACHE_FILTER_BYTES_EVICT, @@ -2783,7 +2785,7 @@ void DeleteCachedFilterEntry(const Slice& key, void* value) { delete filter; } -void DeleteCachedIndexEntry(const Slice& key, void* value) { +void DeleteCachedIndexEntry(const Slice& /*key*/, void* value) { IndexReader* index_reader = reinterpret_cast(value); if (index_reader->statistics() != nullptr) { RecordTick(index_reader->statistics(), BLOCK_CACHE_INDEX_BYTES_EVICT, diff --git a/table/block_based_table_reader.h b/table/block_based_table_reader.h index 08b0bec7e95..f2c5082afd3 100644 --- a/table/block_based_table_reader.h +++ b/table/block_based_table_reader.h @@ -545,9 +545,10 @@ class BlockBasedTableIterator : public InternalIterator { } bool IsKeyPinned() const override { return pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled() && - block_iter_points_to_real_block_; + block_iter_points_to_real_block_ && data_block_iter_.IsKeyPinned(); } bool IsValuePinned() const override { + // BlockIter::IsValuePinned() is always true. No need to check return pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled() && block_iter_points_to_real_block_; } @@ -566,7 +567,7 @@ class BlockBasedTableIterator : public InternalIterator { void ResetDataIter() { if (block_iter_points_to_real_block_) { - if (pinned_iters_mgr_ != nullptr) { + if (pinned_iters_mgr_ != nullptr && pinned_iters_mgr_->PinningEnabled()) { data_block_iter_.DelegateCleanupsTo(pinned_iters_mgr_); } data_block_iter_.~BlockIter(); diff --git a/table/block_fetcher.cc b/table/block_fetcher.cc index 09d930fdc73..fd01ac1c6da 100644 --- a/table/block_fetcher.cc +++ b/table/block_fetcher.cc @@ -30,6 +30,7 @@ namespace rocksdb { +inline void BlockFetcher::CheckBlockChecksum() { // Check the crc of the type and the block contents if (read_options_.verify_checksums) { @@ -62,6 +63,7 @@ void BlockFetcher::CheckBlockChecksum() { } } +inline bool BlockFetcher::TryGetUncompressBlockFromPersistentCache() { if (cache_options_.persistent_cache && !cache_options_.persistent_cache->IsCompressed()) { @@ -83,6 +85,7 @@ bool BlockFetcher::TryGetUncompressBlockFromPersistentCache() { return false; } +inline bool BlockFetcher::TryGetFromPrefetchBuffer() { if (prefetch_buffer_ != nullptr && prefetch_buffer_->TryReadFromCache( @@ -99,6 +102,7 @@ bool BlockFetcher::TryGetFromPrefetchBuffer() { return got_from_prefetch_buffer_; } +inline bool BlockFetcher::TryGetCompressedBlockFromPersistentCache() { if (cache_options_.persistent_cache && cache_options_.persistent_cache->IsCompressed()) { @@ -119,6 +123,7 @@ bool BlockFetcher::TryGetCompressedBlockFromPersistentCache() { return false; } +inline void BlockFetcher::PrepareBufferForBlockFromFile() { // cache miss read from device if (do_uncompress_ && @@ -127,12 +132,12 @@ void BlockFetcher::PrepareBufferForBlockFromFile() { // trivially allocated stack buffer instead of needing a full malloc() used_buf_ = &stack_buf_[0]; } else { - heap_buf_ = - std::unique_ptr(new char[block_size_ + kBlockTrailerSize]); + heap_buf_.reset(new char[block_size_ + kBlockTrailerSize]); used_buf_ = heap_buf_.get(); } } +inline void BlockFetcher::InsertCompressedBlockToPersistentCacheIfNeeded() { 
if (status_.ok() && read_options_.fill_cache && cache_options_.persistent_cache && @@ -143,6 +148,7 @@ void BlockFetcher::InsertCompressedBlockToPersistentCacheIfNeeded() { } } +inline void BlockFetcher::InsertUncompressedBlockToPersistentCacheIfNeeded() { if (status_.ok() && !got_from_prefetch_buffer_ && read_options_.fill_cache && cache_options_.persistent_cache && @@ -153,6 +159,7 @@ void BlockFetcher::InsertUncompressedBlockToPersistentCacheIfNeeded() { } } +inline void BlockFetcher::GetBlockContents() { if (slice_.data() != used_buf_) { // the slice content is not the buffer provided @@ -161,7 +168,7 @@ void BlockFetcher::GetBlockContents() { } else { // page is uncompressed, the buffer either stack or heap provided if (got_from_prefetch_buffer_ || used_buf_ == &stack_buf_[0]) { - heap_buf_ = std::unique_ptr(new char[block_size_]); + heap_buf_.reset(new char[block_size_]); memcpy(heap_buf_.get(), used_buf_, block_size_); } *contents_ = BlockContents(std::move(heap_buf_), block_size_, true, diff --git a/table/block_test.cc b/table/block_test.cc index 45e454c7496..968951e6c3f 100644 --- a/table/block_test.cc +++ b/table/block_test.cc @@ -133,7 +133,7 @@ TEST_F(BlockTest, SimpleTest) { BlockContents GetBlockContents(std::unique_ptr *builder, const std::vector &keys, const std::vector &values, - const int prefix_group_size = 1) { + const int /*prefix_group_size*/ = 1) { builder->reset(new BlockBuilder(1 /* restart interval */)); // Add only half of the keys diff --git a/table/cuckoo_table_builder_test.cc b/table/cuckoo_table_builder_test.cc index d896ed4b1bc..8f77b515c45 100644 --- a/table/cuckoo_table_builder_test.cc +++ b/table/cuckoo_table_builder_test.cc @@ -23,7 +23,7 @@ namespace { std::unordered_map> hash_map; uint64_t GetSliceHash(const Slice& s, uint32_t index, - uint64_t max_num_buckets) { + uint64_t /*max_num_buckets*/) { return hash_map[s.ToString()][index]; } } // namespace diff --git a/table/cuckoo_table_factory.cc b/table/cuckoo_table_factory.cc index 2325bcf77c4..84d22468eb9 100644 --- a/table/cuckoo_table_factory.cc +++ b/table/cuckoo_table_factory.cc @@ -16,7 +16,7 @@ Status CuckooTableFactory::NewTableReader( const TableReaderOptions& table_reader_options, unique_ptr&& file, uint64_t file_size, std::unique_ptr* table, - bool prefetch_index_and_filter_in_cache) const { + bool /*prefetch_index_and_filter_in_cache*/) const { std::unique_ptr new_reader(new CuckooTableReader( table_reader_options.ioptions, std::move(file), file_size, table_reader_options.internal_comparator.user_comparator(), nullptr)); diff --git a/table/cuckoo_table_factory.h b/table/cuckoo_table_factory.h index db860c3d002..fba0903d750 100644 --- a/table/cuckoo_table_factory.h +++ b/table/cuckoo_table_factory.h @@ -67,8 +67,9 @@ class CuckooTableFactory : public TableFactory { uint32_t column_family_id, WritableFileWriter* file) const override; // Sanitizes the specified DB Options. 
- Status SanitizeOptions(const DBOptions& db_opts, - const ColumnFamilyOptions& cf_opts) const override { + Status SanitizeOptions( + const DBOptions& /*db_opts*/, + const ColumnFamilyOptions& /*cf_opts*/) const override { return Status::OK(); } @@ -76,8 +77,8 @@ class CuckooTableFactory : public TableFactory { void* GetOptions() override { return &table_options_; } - Status GetOptionString(std::string* opt_string, - const std::string& delimiter) const override { + Status GetOptionString(std::string* /*opt_string*/, + const std::string& /*delimiter*/) const override { return Status::OK(); } diff --git a/table/cuckoo_table_reader.cc b/table/cuckoo_table_reader.cc index 937f598d103..d48290900f5 100644 --- a/table/cuckoo_table_reader.cc +++ b/table/cuckoo_table_reader.cc @@ -139,8 +139,9 @@ CuckooTableReader::CuckooTableReader( status_ = file_->Read(0, file_size, &file_data_, nullptr); } -Status CuckooTableReader::Get(const ReadOptions& readOptions, const Slice& key, - GetContext* get_context, bool skip_filters) { +Status CuckooTableReader::Get(const ReadOptions& /*readOptions*/, + const Slice& key, GetContext* get_context, + bool /*skip_filters*/) { assert(key.size() == key_length_ + (is_last_level_ ? 8 : 0)); Slice user_key = ExtractUserKey(key); for (uint32_t hash_cnt = 0; hash_cnt < num_hash_func_; ++hash_cnt) { @@ -311,7 +312,7 @@ void CuckooTableIterator::Seek(const Slice& target) { PrepareKVAtCurrIdx(); } -void CuckooTableIterator::SeekForPrev(const Slice& target) { +void CuckooTableIterator::SeekForPrev(const Slice& /*target*/) { // Not supported assert(false); } @@ -376,7 +377,7 @@ extern InternalIterator* NewErrorInternalIterator(const Status& status, Arena* arena); InternalIterator* CuckooTableReader::NewIterator( - const ReadOptions& read_options, Arena* arena, bool skip_filters) { + const ReadOptions& /*read_options*/, Arena* arena, bool /*skip_filters*/) { if (!status().ok()) { return NewErrorInternalIterator( Status::Corruption("CuckooTableReader status is not okay."), arena); diff --git a/table/cuckoo_table_reader.h b/table/cuckoo_table_reader.h index 4beac8f9d07..2988c88e04b 100644 --- a/table/cuckoo_table_reader.h +++ b/table/cuckoo_table_reader.h @@ -54,7 +54,7 @@ class CuckooTableReader: public TableReader { size_t ApproximateMemoryUsage() const override; // Following methods are not implemented for Cuckoo Table Reader - uint64_t ApproximateOffsetOf(const Slice& key) override { return 0; } + uint64_t ApproximateOffsetOf(const Slice& /*key*/) override { return 0; } void SetupForCompaction() override {} // End of methods not implemented. 
diff --git a/table/cuckoo_table_reader_test.cc b/table/cuckoo_table_reader_test.cc index 981b9804c49..aeb08a3087c 100644 --- a/table/cuckoo_table_reader_test.cc +++ b/table/cuckoo_table_reader_test.cc @@ -61,7 +61,7 @@ void AddHashLookups(const std::string& s, uint64_t bucket_id, } uint64_t GetSliceHash(const Slice& s, uint32_t index, - uint64_t max_num_buckets) { + uint64_t /*max_num_buckets*/) { return hash_map[s.ToString()][index]; } } // namespace diff --git a/table/filter_block.h b/table/filter_block.h index 7bf3b31324d..97f493fb708 100644 --- a/table/filter_block.h +++ b/table/filter_block.h @@ -51,6 +51,7 @@ class FilterBlockBuilder { virtual bool IsBlockBased() = 0; // If is blockbased filter virtual void StartBlock(uint64_t block_offset) = 0; // Start new block filter virtual void Add(const Slice& key) = 0; // Add a key to current filter + virtual size_t NumAdded() const = 0; // Number of keys added Slice Finish() { // Generate Filter const BlockHandle empty_handle; Status dont_care_status; @@ -114,7 +115,7 @@ class FilterBlockReader { return error_msg; } - virtual void CacheDependencies(bool pin) {} + virtual void CacheDependencies(bool /*pin*/) {} protected: bool whole_key_filtering_; diff --git a/table/flush_block_policy.cc b/table/flush_block_policy.cc index 9a8dea4cb0c..d2a4b962718 100644 --- a/table/flush_block_policy.cc +++ b/table/flush_block_policy.cc @@ -3,10 +3,11 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). -#include "rocksdb/options.h" #include "rocksdb/flush_block_policy.h" +#include "rocksdb/options.h" #include "rocksdb/slice.h" #include "table/block_builder.h" +#include "table/format.h" #include @@ -21,10 +22,12 @@ class FlushBlockBySizePolicy : public FlushBlockPolicy { // reaches the configured FlushBlockBySizePolicy(const uint64_t block_size, const uint64_t block_size_deviation, + const bool align, const BlockBuilder& data_block_builder) : block_size_(block_size), block_size_deviation_limit_( ((block_size * (100 - block_size_deviation)) + 99) / 100), + align_(align), data_block_builder_(data_block_builder) {} virtual bool Update(const Slice& key, @@ -51,8 +54,13 @@ class FlushBlockBySizePolicy : public FlushBlockPolicy { } const auto curr_size = data_block_builder_.CurrentSizeEstimate(); - const auto estimated_size_after = - data_block_builder_.EstimateSizeAfterKV(key, value); + auto estimated_size_after = + data_block_builder_.EstimateSizeAfterKV(key, value); + + if (align_) { + estimated_size_after += kBlockTrailerSize; + return estimated_size_after > block_size_; + } return estimated_size_after > block_size_ && curr_size > block_size_deviation_limit_; @@ -60,6 +68,7 @@ class FlushBlockBySizePolicy : public FlushBlockPolicy { const uint64_t block_size_; const uint64_t block_size_deviation_limit_; + const bool align_; const BlockBuilder& data_block_builder_; }; @@ -68,13 +77,13 @@ FlushBlockPolicy* FlushBlockBySizePolicyFactory::NewFlushBlockPolicy( const BlockBuilder& data_block_builder) const { return new FlushBlockBySizePolicy( table_options.block_size, table_options.block_size_deviation, - data_block_builder); + table_options.block_align, data_block_builder); } FlushBlockPolicy* FlushBlockBySizePolicyFactory::NewFlushBlockPolicy( const uint64_t size, const int deviation, const BlockBuilder& data_block_builder) { - return new FlushBlockBySizePolicy(size, deviation, data_block_builder); + return new FlushBlockBySizePolicy(size, deviation, false, data_block_builder); } } // 
namespace rocksdb diff --git a/table/full_filter_block.cc b/table/full_filter_block.cc index 5739494e8dd..448b827847b 100644 --- a/table/full_filter_block.cc +++ b/table/full_filter_block.cc @@ -43,7 +43,8 @@ inline void FullFilterBlockBuilder::AddPrefix(const Slice& key) { AddKey(prefix); } -Slice FullFilterBlockBuilder::Finish(const BlockHandle& tmp, Status* status) { +Slice FullFilterBlockBuilder::Finish(const BlockHandle& /*tmp*/, + Status* status) { // In this impl we ignore BlockHandle *status = Status::OK(); if (num_added_ != 0) { @@ -74,8 +75,8 @@ FullFilterBlockReader::FullFilterBlockReader( } bool FullFilterBlockReader::KeyMayMatch(const Slice& key, uint64_t block_offset, - const bool no_io, - const Slice* const const_ikey_ptr) { + const bool /*no_io*/, + const Slice* const /*const_ikey_ptr*/) { assert(block_offset == kNotValid); if (!whole_key_filtering_) { return true; @@ -83,10 +84,9 @@ bool FullFilterBlockReader::KeyMayMatch(const Slice& key, uint64_t block_offset, return MayMatch(key); } -bool FullFilterBlockReader::PrefixMayMatch(const Slice& prefix, - uint64_t block_offset, - const bool no_io, - const Slice* const const_ikey_ptr) { +bool FullFilterBlockReader::PrefixMayMatch( + const Slice& prefix, uint64_t block_offset, const bool /*no_io*/, + const Slice* const /*const_ikey_ptr*/) { assert(block_offset == kNotValid); if (!prefix_extractor_) { return true; diff --git a/table/full_filter_block.h b/table/full_filter_block.h index be27c58b61d..6aec5530a7d 100644 --- a/table/full_filter_block.h +++ b/table/full_filter_block.h @@ -43,8 +43,9 @@ class FullFilterBlockBuilder : public FilterBlockBuilder { ~FullFilterBlockBuilder() {} virtual bool IsBlockBased() override { return false; } - virtual void StartBlock(uint64_t block_offset) override {} + virtual void StartBlock(uint64_t /*block_offset*/) override {} virtual void Add(const Slice& key) override; + virtual size_t NumAdded() const override { return num_added_; } virtual Slice Finish(const BlockHandle& tmp, Status* status) override; using FilterBlockBuilder::Finish; diff --git a/table/full_filter_block_test.cc b/table/full_filter_block_test.cc index 5fbda4c6f03..2e73612d13e 100644 --- a/table/full_filter_block_test.cc +++ b/table/full_filter_block_test.cc @@ -163,11 +163,13 @@ TEST_F(FullFilterBlockTest, EmptyBuilder) { TEST_F(FullFilterBlockTest, SingleChunk) { FullFilterBlockBuilder builder( nullptr, true, table_options_.filter_policy->GetFilterBitsBuilder()); + ASSERT_EQ(0, builder.NumAdded()); builder.Add("foo"); builder.Add("bar"); builder.Add("box"); builder.Add("box"); builder.Add("hello"); + ASSERT_EQ(5, builder.NumAdded()); Slice block = builder.Finish(); FullFilterBlockReader reader( nullptr, true, block, diff --git a/table/get_context.cc b/table/get_context.cc index 692857da54f..0003385a94d 100644 --- a/table/get_context.cc +++ b/table/get_context.cc @@ -77,7 +77,7 @@ void GetContext::MarkKeyMayExist() { } } -void GetContext::SaveValue(const Slice& value, SequenceNumber seq) { +void GetContext::SaveValue(const Slice& value, SequenceNumber /*seq*/) { assert(state_ == kNotFound); appendToReplayLog(replay_log_, kTypeValue, value); diff --git a/table/index_builder.h b/table/index_builder.h index d591e0e533c..3793cebc258 100644 --- a/table/index_builder.h +++ b/table/index_builder.h @@ -69,7 +69,7 @@ class IndexBuilder { // This method will be called whenever a key is added. The subclasses may // override OnKeyAdded() if they need to collect additional information. 
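
Returning to the flush_block_policy.cc hunk above: with `block_align` enabled, the policy reserves room for the block trailer and skips the deviation heuristic, so a block plus its trailer never spills past the alignment unit. A compilable distillation of that decision (standalone sketch; the predicate and `kBlockTrailerSize` mirror the diff, the rest is scaffolding):

    #include <cstdint>

    // 1-byte compression type + 4-byte checksum, as in table/format.h
    constexpr uint64_t kBlockTrailerSize = 5;

    // Should the current data block be cut before appending the next KV?
    bool ShouldFlush(bool align, uint64_t block_size, uint64_t deviation_limit,
                     uint64_t curr_size, uint64_t estimated_size_after) {
      if (align) {
        // Leave room for the trailer so block + trailer fit the aligned unit.
        return estimated_size_after + kBlockTrailerSize > block_size;
      }
      return estimated_size_after > block_size &&
             curr_size > deviation_limit;
    }
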
- virtual void OnKeyAdded(const Slice& key) {} + virtual void OnKeyAdded(const Slice& /*key*/) {} // Inform the index builder that all entries has been written. Block builder // may therefore perform any operation required for block finalization. @@ -137,7 +137,7 @@ class ShortenedIndexBuilder : public IndexBuilder { using IndexBuilder::Finish; virtual Status Finish( IndexBlocks* index_blocks, - const BlockHandle& last_partition_block_handle) override { + const BlockHandle& /*last_partition_block_handle*/) override { index_blocks->index_block_contents = index_block_builder_.Finish(); return Status::OK(); } diff --git a/table/internal_iterator.h b/table/internal_iterator.h index 705044a3a28..ff7b4d2cbcf 100644 --- a/table/internal_iterator.h +++ b/table/internal_iterator.h @@ -78,7 +78,8 @@ class InternalIterator : public Cleanable { // but for Iterators that need to communicate with PinnedIteratorsManager // they will implement this function and use the passed pointer to communicate // with PinnedIteratorsManager. - virtual void SetPinnedItersMgr(PinnedIteratorsManager* pinned_iters_mgr) {} + virtual void SetPinnedItersMgr(PinnedIteratorsManager* /*pinned_iters_mgr*/) { + } // If true, this means that the Slice returned by key() is valid as long as // PinnedIteratorsManager::ReleasePinnedData is not called and the @@ -95,7 +96,7 @@ class InternalIterator : public Cleanable { // Iterator is not deleted. virtual bool IsValuePinned() const { return false; } - virtual Status GetProperty(std::string prop_name, std::string* prop) { + virtual Status GetProperty(std::string /*prop_name*/, std::string* /*prop*/) { return Status::NotSupported(""); } diff --git a/table/iterator.cc b/table/iterator.cc index ed6a2cdea44..0411b374a44 100644 --- a/table/iterator.cc +++ b/table/iterator.cc @@ -111,8 +111,8 @@ class EmptyIterator : public Iterator { public: explicit EmptyIterator(const Status& s) : status_(s) { } virtual bool Valid() const override { return false; } - virtual void Seek(const Slice& target) override {} - virtual void SeekForPrev(const Slice& target) override {} + virtual void Seek(const Slice& /*target*/) override {} + virtual void SeekForPrev(const Slice& /*target*/) override {} virtual void SeekToFirst() override {} virtual void SeekToLast() override {} virtual void Next() override { assert(false); } @@ -135,8 +135,8 @@ class EmptyInternalIterator : public InternalIterator { public: explicit EmptyInternalIterator(const Status& s) : status_(s) {} virtual bool Valid() const override { return false; } - virtual void Seek(const Slice& target) override {} - virtual void SeekForPrev(const Slice& target) override {} + virtual void Seek(const Slice& /*target*/) override {} + virtual void SeekForPrev(const Slice& /*target*/) override {} virtual void SeekToFirst() override {} virtual void SeekToLast() override {} virtual void Next() override { assert(false); } diff --git a/table/mock_table.cc b/table/mock_table.cc index 86c380865c6..f03ab352196 100644 --- a/table/mock_table.cc +++ b/table/mock_table.cc @@ -27,13 +27,13 @@ stl_wrappers::KVMap MakeMockFile( } InternalIterator* MockTableReader::NewIterator(const ReadOptions&, - Arena* arena, - bool skip_filters) { + Arena* /*arena*/, + bool /*skip_filters*/) { return new MockTableIterator(table_); } Status MockTableReader::Get(const ReadOptions&, const Slice& key, - GetContext* get_context, bool skip_filters) { + GetContext* get_context, bool /*skip_filters*/) { std::unique_ptr iter(new MockTableIterator(table_)); for (iter->Seek(key); iter->Valid(); 
iter->Next()) { ParsedInternalKey parsed_key; @@ -56,10 +56,10 @@ std::shared_ptr MockTableReader::GetTableProperties() MockTableFactory::MockTableFactory() : next_id_(1) {} Status MockTableFactory::NewTableReader( - const TableReaderOptions& table_reader_options, - unique_ptr&& file, uint64_t file_size, + const TableReaderOptions& /*table_reader_options*/, + unique_ptr&& file, uint64_t /*file_size*/, unique_ptr* table_reader, - bool prefetch_index_and_filter_in_cache) const { + bool /*prefetch_index_and_filter_in_cache*/) const { uint32_t id = GetIDFromFile(file.get()); MutexLock lock_guard(&file_system_.mutex); @@ -75,8 +75,8 @@ Status MockTableFactory::NewTableReader( } TableBuilder* MockTableFactory::NewTableBuilder( - const TableBuilderOptions& table_builder_options, uint32_t column_family_id, - WritableFileWriter* file) const { + const TableBuilderOptions& /*table_builder_options*/, + uint32_t /*column_family_id*/, WritableFileWriter* file) const { uint32_t id = GetAndWriteNextID(file); return new MockTableBuilder(id, &file_system_); diff --git a/table/mock_table.h b/table/mock_table.h index 71609a173fb..f5fb4009ef8 100644 --- a/table/mock_table.h +++ b/table/mock_table.h @@ -45,7 +45,7 @@ class MockTableReader : public TableReader { Status Get(const ReadOptions&, const Slice& key, GetContext* get_context, bool skip_filters = false) override; - uint64_t ApproximateOffsetOf(const Slice& key) override { return 0; } + uint64_t ApproximateOffsetOf(const Slice& /*key*/) override { return 0; } virtual size_t ApproximateMemoryUsage() const override { return 0; } @@ -168,8 +168,8 @@ class MockTableFactory : public TableFactory { stl_wrappers::KVMap file_contents); virtual Status SanitizeOptions( - const DBOptions& db_opts, - const ColumnFamilyOptions& cf_opts) const override { + const DBOptions& /*db_opts*/, + const ColumnFamilyOptions& /*cf_opts*/) const override { return Status::OK(); } diff --git a/table/partitioned_filter_block.cc b/table/partitioned_filter_block.cc index 8d6df35e144..146f3b3e415 100644 --- a/table/partitioned_filter_block.cc +++ b/table/partitioned_filter_block.cc @@ -25,7 +25,8 @@ PartitionedFilterBlockBuilder::PartitionedFilterBlockBuilder( filter_bits_builder), index_on_filter_block_builder_(index_block_restart_interval), p_index_builder_(p_index_builder), - filters_in_partition_(0) { + filters_in_partition_(0), + num_added_(0) { filters_per_partition_ = filter_bits_builder_->CalculateNumEntry(partition_size); } @@ -53,6 +54,7 @@ void PartitionedFilterBlockBuilder::AddKey(const Slice& key) { MaybeCutAFilterBlock(); filter_bits_builder_->AddKey(key); filters_in_partition_++; + num_added_++; } Slice PartitionedFilterBlockBuilder::Finish( @@ -88,7 +90,7 @@ Slice PartitionedFilterBlockBuilder::Finish( PartitionedFilterBlockReader::PartitionedFilterBlockReader( const SliceTransform* prefix_extractor, bool _whole_key_filtering, - BlockContents&& contents, FilterBitsReader* filter_bits_reader, + BlockContents&& contents, FilterBitsReader* /*filter_bits_reader*/, Statistics* stats, const Comparator& comparator, const BlockBasedTable* table) : FilterBlockReader(contents.data.size(), stats, _whole_key_filtering), @@ -279,7 +281,8 @@ void PartitionedFilterBlockReader::CacheDependencies(bool pin) { std::unique_ptr prefetch_buffer; auto& file = table_->rep_->file; prefetch_buffer.reset(new FilePrefetchBuffer()); - s = prefetch_buffer->Prefetch(file.get(), prefetch_off, prefetch_len); + s = prefetch_buffer->Prefetch(file.get(), prefetch_off, + static_cast(prefetch_len)); // After 
prefetch, read the partitions one by one biter.SeekToFirst(); diff --git a/table/partitioned_filter_block.h b/table/partitioned_filter_block.h index 1a00a86e6ce..fb7d7cd1050 100644 --- a/table/partitioned_filter_block.h +++ b/table/partitioned_filter_block.h @@ -33,6 +33,8 @@ class PartitionedFilterBlockBuilder : public FullFilterBlockBuilder { void AddKey(const Slice& key) override; + size_t NumAdded() const override { return num_added_; } + virtual Slice Finish(const BlockHandle& last_partition_block_handle, Status* status) override; @@ -59,6 +61,8 @@ class PartitionedFilterBlockBuilder : public FullFilterBlockBuilder { uint32_t filters_per_partition_; // The current number of filters in the last partition uint32_t filters_in_partition_; + // Number of keys added + size_t num_added_; }; class PartitionedFilterBlockReader : public FilterBlockReader { diff --git a/table/plain_table_factory.cc b/table/plain_table_factory.cc index 7cf71b0e599..ac0c4c80f61 100644 --- a/table/plain_table_factory.cc +++ b/table/plain_table_factory.cc @@ -21,7 +21,7 @@ Status PlainTableFactory::NewTableReader( const TableReaderOptions& table_reader_options, unique_ptr&& file, uint64_t file_size, unique_ptr* table, - bool prefetch_index_and_filter_in_cache) const { + bool /*prefetch_index_and_filter_in_cache*/) const { return PlainTableReader::Open( table_reader_options.ioptions, table_reader_options.env_options, table_reader_options.internal_comparator, std::move(file), file_size, @@ -195,7 +195,7 @@ Status GetPlainTableOptionsFromMap( const PlainTableOptions& table_options, const std::unordered_map& opts_map, PlainTableOptions* new_table_options, bool input_strings_escaped, - bool ignore_unknown_options) { + bool /*ignore_unknown_options*/) { assert(new_table_options); *new_table_options = table_options; for (const auto& o : opts_map) { diff --git a/table/plain_table_factory.h b/table/plain_table_factory.h index e86f6dc8e0b..f540a92b89d 100644 --- a/table/plain_table_factory.h +++ b/table/plain_table_factory.h @@ -164,15 +164,16 @@ class PlainTableFactory : public TableFactory { static const char kValueTypeSeqId0 = char(~0); // Sanitizes the specified DB Options. 
- Status SanitizeOptions(const DBOptions& db_opts, - const ColumnFamilyOptions& cf_opts) const override { + Status SanitizeOptions( + const DBOptions& /*db_opts*/, + const ColumnFamilyOptions& /*cf_opts*/) const override { return Status::OK(); } void* GetOptions() override { return &table_options_; } - Status GetOptionString(std::string* opt_string, - const std::string& delimiter) const override { + Status GetOptionString(std::string* /*opt_string*/, + const std::string& /*delimiter*/) const override { return Status::OK(); } diff --git a/table/plain_table_key_coding.cc b/table/plain_table_key_coding.cc index 3e87c03d13f..6f5ee9b4ad2 100644 --- a/table/plain_table_key_coding.cc +++ b/table/plain_table_key_coding.cc @@ -288,7 +288,7 @@ Status PlainTableKeyDecoder::NextPlainEncodingKey(uint32_t start_offset, ParsedInternalKey* parsed_key, Slice* internal_key, uint32_t* bytes_read, - bool* seekable) { + bool* /*seekable*/) { uint32_t user_key_size = 0; Status s; if (fixed_user_key_len_ != kPlainTableVariableLength) { diff --git a/table/plain_table_reader.cc b/table/plain_table_reader.cc index d4d9edb7412..9c9f82ee497 100644 --- a/table/plain_table_reader.cc +++ b/table/plain_table_reader.cc @@ -191,7 +191,7 @@ void PlainTableReader::SetupForCompaction() { InternalIterator* PlainTableReader::NewIterator(const ReadOptions& options, Arena* arena, - bool skip_filters) { + bool /*skip_filters*/) { bool use_prefix_seek = !IsTotalOrderMode() && !options.total_order_seek; if (arena == nullptr) { return new PlainTableIterator(this, use_prefix_seek); @@ -537,8 +537,8 @@ void PlainTableReader::Prepare(const Slice& target) { } } -Status PlainTableReader::Get(const ReadOptions& ro, const Slice& target, - GetContext* get_context, bool skip_filters) { +Status PlainTableReader::Get(const ReadOptions& /*ro*/, const Slice& target, + GetContext* get_context, bool /*skip_filters*/) { // Check bloom filter first. Slice prefix_slice; uint32_t prefix_hash; @@ -602,7 +602,7 @@ Status PlainTableReader::Get(const ReadOptions& ro, const Slice& target, return Status::OK(); } -uint64_t PlainTableReader::ApproximateOffsetOf(const Slice& key) { +uint64_t PlainTableReader::ApproximateOffsetOf(const Slice& /*key*/) { return 0; } @@ -706,7 +706,7 @@ void PlainTableIterator::Seek(const Slice& target) { } } -void PlainTableIterator::SeekForPrev(const Slice& target) { +void PlainTableIterator::SeekForPrev(const Slice& /*target*/) { assert(false); status_ = Status::NotSupported("SeekForPrev() is not supported in PlainTable"); diff --git a/table/sst_file_writer.cc b/table/sst_file_writer.cc index 31b408892ab..2ed387446f1 100644 --- a/table/sst_file_writer.cc +++ b/table/sst_file_writer.cc @@ -50,7 +50,7 @@ struct SstFileWriter::Rep { std::string column_family_name; ColumnFamilyHandle* cfh; // If true, We will give the OS a hint that this file pages is not needed - // everytime we write 1MB to the file. + // every time we write 1MB to the file. bool invalidate_page_cache; // The size of the file during the last time we called Fadvise to remove // cached pages from page cache. 
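
The `invalidate_page_cache` comment above describes dropping written pages from the OS page cache every 1MB. A hypothetical POSIX sketch of that hint (the helper name and structure are illustrative, not the writer's actual code):

    #include <fcntl.h>
    #include <cstdint>

    // After each 1MB written, hint that the flushed range is no longer needed.
    void MaybeDropWrittenPages(int fd, uint64_t* last_hinted,
                               uint64_t bytes_written) {
      const uint64_t kHintInterval = 1 << 20;  // 1MB, matching the comment
      if (bytes_written - *last_hinted >= kHintInterval) {
        posix_fadvise(fd, static_cast<off_t>(*last_hinted),
                      static_cast<off_t>(bytes_written - *last_hinted),
                      POSIX_FADV_DONTNEED);
        *last_hinted = bytes_written;
      }
    }
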
diff --git a/table/sst_file_writer_collectors.h b/table/sst_file_writer_collectors.h index ce3a45f5a74..89e0970d816 100644 --- a/table/sst_file_writer_collectors.h +++ b/table/sst_file_writer_collectors.h @@ -26,8 +26,8 @@ class SstFileWriterPropertiesCollector : public IntTblPropCollector { SequenceNumber global_seqno) : version_(version), global_seqno_(global_seqno) {} - virtual Status InternalAdd(const Slice& key, const Slice& value, - uint64_t file_size) override { + virtual Status InternalAdd(const Slice& /*key*/, const Slice& /*value*/, + uint64_t /*file_size*/) override { // Intentionally left blank. Have no interest in collecting stats for // individual key/value pairs. return Status::OK(); @@ -68,7 +68,7 @@ class SstFileWriterPropertiesCollectorFactory : version_(version), global_seqno_(global_seqno) {} virtual IntTblPropCollector* CreateIntTblPropCollector( - uint32_t column_family_id) override { + uint32_t /*column_family_id*/) override { return new SstFileWriterPropertiesCollector(version_, global_seqno_); } diff --git a/table/table_reader.h b/table/table_reader.h index 18fcda27370..37a282b103f 100644 --- a/table/table_reader.h +++ b/table/table_reader.h @@ -43,7 +43,7 @@ class TableReader { bool skip_filters = false) = 0; virtual InternalIterator* NewRangeTombstoneIterator( - const ReadOptions& read_options) { + const ReadOptions& /*read_options*/) { return nullptr; } @@ -62,7 +62,7 @@ class TableReader { virtual std::shared_ptr GetTableProperties() const = 0; // Prepare work that can be done before the real Get() - virtual void Prepare(const Slice& target) {} + virtual void Prepare(const Slice& /*target*/) {} // Report an approximation of how much memory has been used. virtual size_t ApproximateMemoryUsage() const = 0; @@ -94,7 +94,7 @@ class TableReader { } // convert db file to a human readable form - virtual Status DumpTable(WritableFile* out_file) { + virtual Status DumpTable(WritableFile* /*out_file*/) { return Status::NotSupported("DumpTable() not supported"); } diff --git a/table/table_test.cc b/table/table_test.cc index 4bdf6ba1950..bb4e7d85020 100644 --- a/table/table_test.cc +++ b/table/table_test.cc @@ -66,9 +66,13 @@ class DummyPropertiesCollector : public TablePropertiesCollector { public: const char* Name() const { return ""; } - Status Finish(UserCollectedProperties* properties) { return Status::OK(); } + Status Finish(UserCollectedProperties* /*properties*/) { + return Status::OK(); + } - Status Add(const Slice& user_key, const Slice& value) { return Status::OK(); } + Status Add(const Slice& /*user_key*/, const Slice& /*value*/) { + return Status::OK(); + } virtual UserCollectedProperties GetReadableProperties() const { return UserCollectedProperties{}; @@ -79,7 +83,7 @@ class DummyPropertiesCollectorFactory1 : public TablePropertiesCollectorFactory { public: virtual TablePropertiesCollector* CreateTablePropertiesCollector( - TablePropertiesCollectorFactory::Context context) { + TablePropertiesCollectorFactory::Context /*context*/) { return new DummyPropertiesCollector(); } const char* Name() const { return "DummyPropertiesCollector1"; } @@ -89,7 +93,7 @@ class DummyPropertiesCollectorFactory2 : public TablePropertiesCollectorFactory { public: virtual TablePropertiesCollector* CreateTablePropertiesCollector( - TablePropertiesCollectorFactory::Context context) { + TablePropertiesCollectorFactory::Context /*context*/) { return new DummyPropertiesCollector(); } const char* Name() const { return "DummyPropertiesCollector2"; } @@ -207,11 +211,11 @@ class 
BlockConstructor: public Constructor { ~BlockConstructor() { delete block_; } - virtual Status FinishImpl(const Options& options, - const ImmutableCFOptions& ioptions, - const BlockBasedTableOptions& table_options, - const InternalKeyComparator& internal_comparator, - const stl_wrappers::KVMap& kv_map) override { + virtual Status FinishImpl( + const Options& /*options*/, const ImmutableCFOptions& /*ioptions*/, + const BlockBasedTableOptions& table_options, + const InternalKeyComparator& /*internal_comparator*/, + const stl_wrappers::KVMap& kv_map) override { delete block_; block_ = nullptr; BlockBuilder builder(table_options.block_restart_interval); @@ -305,7 +309,7 @@ class TableConstructor: public Constructor { virtual Status FinishImpl(const Options& options, const ImmutableCFOptions& ioptions, - const BlockBasedTableOptions& table_options, + const BlockBasedTableOptions& /*table_options*/, const InternalKeyComparator& internal_comparator, const stl_wrappers::KVMap& kv_map) override { Reset(); @@ -433,10 +437,11 @@ class MemTableConstructor: public Constructor { ~MemTableConstructor() { delete memtable_->Unref(); } - virtual Status FinishImpl(const Options&, const ImmutableCFOptions& ioptions, - const BlockBasedTableOptions& table_options, - const InternalKeyComparator& internal_comparator, - const stl_wrappers::KVMap& kv_map) override { + virtual Status FinishImpl( + const Options&, const ImmutableCFOptions& ioptions, + const BlockBasedTableOptions& /*table_options*/, + const InternalKeyComparator& /*internal_comparator*/, + const stl_wrappers::KVMap& kv_map) override { delete memtable_->Unref(); ImmutableCFOptions mem_ioptions(ioptions); memtable_ = new MemTable(internal_comparator_, mem_ioptions, @@ -499,11 +504,11 @@ class DBConstructor: public Constructor { ~DBConstructor() { delete db_; } - virtual Status FinishImpl(const Options& options, - const ImmutableCFOptions& ioptions, - const BlockBasedTableOptions& table_options, - const InternalKeyComparator& internal_comparator, - const stl_wrappers::KVMap& kv_map) override { + virtual Status FinishImpl( + const Options& /*options*/, const ImmutableCFOptions& /*ioptions*/, + const BlockBasedTableOptions& /*table_options*/, + const InternalKeyComparator& /*internal_comparator*/, + const stl_wrappers::KVMap& kv_map) override { delete db_; db_ = nullptr; NewDB(); @@ -665,7 +670,7 @@ class FixedOrLessPrefixTransform : public SliceTransform { return Slice(src.data(), prefix_len_); } - virtual bool InDomain(const Slice& src) const override { return true; } + virtual bool InDomain(const Slice& /*src*/) const override { return true; } virtual bool InRange(const Slice& dst) const override { return (dst.size() <= prefix_len_); @@ -795,7 +800,7 @@ class HarnessTest : public testing::Test { TestRandomAccess(rnd, keys, data); } - void TestForwardScan(const std::vector& keys, + void TestForwardScan(const std::vector& /*keys*/, const stl_wrappers::KVMap& data) { InternalIterator* iter = constructor_->NewIterator(); ASSERT_TRUE(!iter->Valid()); @@ -813,7 +818,7 @@ class HarnessTest : public testing::Test { } } - void TestBackwardScan(const std::vector& keys, + void TestBackwardScan(const std::vector& /*keys*/, const stl_wrappers::KVMap& data) { InternalIterator* iter = constructor_->NewIterator(); ASSERT_TRUE(!iter->Valid()); @@ -1595,7 +1600,7 @@ static std::string RandomString(Random* rnd, int len) { } void AddInternalKey(TableConstructor* c, const std::string& prefix, - int suffix_len = 800) { + int /*suffix_len*/ = 800) { static Random rnd(1023); 
InternalKey k(prefix + RandomString(&rnd, 800), 0, kTypeValue); c->Add(k.Encode().ToString(), "v"); @@ -2957,7 +2962,7 @@ class TestPrefixExtractor : public rocksdb::SliceTransform { return true; } - bool InRange(const rocksdb::Slice& dst) const override { return true; } + bool InRange(const rocksdb::Slice& /*dst*/) const override { return true; } bool IsValid(const rocksdb::Slice& src) const { if (src.size() != 4) { @@ -3192,6 +3197,110 @@ TEST_F(BlockBasedTableTest, TableWithGlobalSeqno) { delete iter; } +TEST_F(BlockBasedTableTest, BlockAlignTest) { + BlockBasedTableOptions bbto; + bbto.block_align = true; + test::StringSink* sink = new test::StringSink(); + unique_ptr file_writer(test::GetWritableFileWriter(sink)); + Options options; + options.compression = kNoCompression; + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + const ImmutableCFOptions ioptions(options); + InternalKeyComparator ikc(options.comparator); + std::vector> + int_tbl_prop_collector_factories; + std::string column_family_name; + std::unique_ptr builder(options.table_factory->NewTableBuilder( + TableBuilderOptions(ioptions, ikc, &int_tbl_prop_collector_factories, + kNoCompression, CompressionOptions(), + nullptr /* compression_dict */, + false /* skip_filters */, column_family_name, -1), + TablePropertiesCollectorFactory::Context::kUnknownColumnFamily, + file_writer.get())); + + for (int i = 1; i <= 10000; ++i) { + std::ostringstream ostr; + ostr << std::setfill('0') << std::setw(5) << i; + std::string key = ostr.str(); + std::string value = "val"; + InternalKey ik(key, 0, kTypeValue); + + builder->Add(ik.Encode(), value); + } + ASSERT_OK(builder->Finish()); + file_writer->Flush(); + + test::RandomRWStringSink ss_rw(sink); + unique_ptr file_reader( + test::GetRandomAccessFileReader( + new test::StringSource(ss_rw.contents(), 73342, true))); + + // Helper function to get version, global_seqno, global_seqno_offset + std::function VerifyBlockAlignment = [&]() { + TableProperties* props = nullptr; + ASSERT_OK(ReadTableProperties(file_reader.get(), ss_rw.contents().size(), + kBlockBasedTableMagicNumber, ioptions, + &props)); + + uint64_t data_block_size = props->data_size / props->num_data_blocks; + ASSERT_EQ(data_block_size, 4096); + ASSERT_EQ(props->data_size, data_block_size * props->num_data_blocks); + delete props; + }; + + VerifyBlockAlignment(); + + // The below block of code verifies that we can read back the keys. 
Set + // block_align to false when creating the reader to ensure we can flip between + // the two modes without any issues + std::unique_ptr table_reader; + bbto.block_align = false; + Options options2; + options2.table_factory.reset(NewBlockBasedTableFactory(bbto)); + ImmutableCFOptions ioptions2(options2); + ASSERT_OK(ioptions.table_factory->NewTableReader( + TableReaderOptions(ioptions2, EnvOptions(), + GetPlainInternalComparator(options2.comparator)), + std::move(file_reader), ss_rw.contents().size(), &table_reader)); + + std::unique_ptr db_iter( + table_reader->NewIterator(ReadOptions())); + + int expected_key = 1; + for (db_iter->SeekToFirst(); db_iter->Valid(); db_iter->Next()) { + std::ostringstream ostr; + ostr << std::setfill('0') << std::setw(5) << expected_key++; + std::string key = ostr.str(); + std::string value = "val"; + + ASSERT_OK(db_iter->status()); + ASSERT_EQ(ExtractUserKey(db_iter->key()).ToString(), key); + ASSERT_EQ(db_iter->value().ToString(), value); + } + expected_key--; + ASSERT_EQ(expected_key, 10000); + table_reader.reset(); +} + +TEST_F(BlockBasedTableTest, BadOptions) { + rocksdb::Options options; + options.compression = kNoCompression; + rocksdb::BlockBasedTableOptions bbto; + bbto.block_size = 4000; + bbto.block_align = true; + + const std::string kDBPath = test::TmpDir() + "/table_prefix_test"; + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + DestroyDB(kDBPath, options); + rocksdb::DB* db; + ASSERT_NOK(rocksdb::DB::Open(options, kDBPath, &db)); + + bbto.block_size = 4096; + options.compression = kSnappyCompression; + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + ASSERT_NOK(rocksdb::DB::Open(options, kDBPath, &db)); +} + } // namespace rocksdb int main(int argc, char** argv) { diff --git a/table/two_level_iterator.cc b/table/two_level_iterator.cc index dbdf4a9fdb5..0cb2855517b 100644 --- a/table/two_level_iterator.cc +++ b/table/two_level_iterator.cc @@ -54,8 +54,7 @@ class TwoLevelIterator : public InternalIterator { } } virtual void SetPinnedItersMgr( - PinnedIteratorsManager* pinned_iters_mgr) override { - } + PinnedIteratorsManager* /*pinned_iters_mgr*/) override {} virtual bool IsKeyPinned() const override { return false; } virtual bool IsValuePinned() const override { return false; } diff --git a/third-party/fbson/FbsonDocument.h b/third-party/fbson/FbsonDocument.h index 6fb8a93f171..fc7ca76ff38 100644 --- a/third-party/fbson/FbsonDocument.h +++ b/third-party/fbson/FbsonDocument.h @@ -355,7 +355,7 @@ class NumberValT : public FbsonValue { unsigned int numPackedBytes() const { return sizeof(FbsonValue) + sizeof(T); } // catch all unknow specialization of the template class - bool setVal(T value) { return false; } + bool setVal(T /*value*/) { return false; } private: T num_; diff --git a/tools/benchmark.sh b/tools/benchmark.sh index b997b2c1115..b7c2eefdb28 100755 --- a/tools/benchmark.sh +++ b/tools/benchmark.sh @@ -108,14 +108,12 @@ fi params_w="$const_params \ $l0_config \ - --max_background_compactions=16 \ - --max_write_buffer_number=8 \ - --max_background_flushes=7" + --max_background_jobs=20 \ + --max_write_buffer_number=8" params_bulkload="$const_params \ - --max_background_compactions=16 \ + --max_background_jobs=20 \ --max_write_buffer_number=8 \ - --max_background_flushes=7 \ --level0_file_num_compaction_trigger=$((10 * M)) \ --level0_slowdown_writes_trigger=$((10 * M)) \ --level0_stop_writes_trigger=$((10 * M))" @@ -126,14 +124,14 @@ params_bulkload="$const_params \ # LSM. 
In level-based compaction, it means number of L0 files. # params_level_compact="$const_params \ - --max_background_flushes=4 \ + --max_background_jobs=16 \ --max_write_buffer_number=4 \ --level0_file_num_compaction_trigger=4 \ --level0_slowdown_writes_trigger=16 \ --level0_stop_writes_trigger=20" params_univ_compact="$const_params \ - --max_background_flushes=4 \ + --max_background_jobs=20 \ --max_write_buffer_number=4 \ --level0_file_num_compaction_trigger=8 \ --level0_slowdown_writes_trigger=16 \ @@ -232,7 +230,7 @@ function run_manual_compaction_worker { --subcompactions=$3 \ --memtablerep=vector \ --disable_wal=1 \ - --max_background_compactions=$4 \ + --max_background_jobs=$4 \ --seed=$( date +%s ) \ 2>&1 | tee -a $fillrandom_output_file" @@ -276,7 +274,7 @@ function run_univ_compaction { # Define a set of benchmarks. subcompactions=(1 2 4 8 16) - max_background_compactions=(16 16 8 4 2) + max_background_jobs=(20 20 10 5 4) i=0 total=${#subcompactions[@]} @@ -285,7 +283,7 @@ function run_univ_compaction { while [ "$i" -lt "$total" ] do run_manual_compaction_worker $io_stats $compaction_style ${subcompactions[$i]} \ - ${max_background_compactions[$i]} + ${max_background_jobs[$i]} ((i++)) done } diff --git a/tools/check_format_compatible.sh b/tools/check_format_compatible.sh index 0b379cc3dd3..fc1de36ebd2 100755 --- a/tools/check_format_compatible.sh +++ b/tools/check_format_compatible.sh @@ -42,7 +42,7 @@ EOF done declare -a backward_compatible_checkout_objs=("2.2.fb.branch" "2.3.fb.branch" "2.4.fb.branch" "2.5.fb.branch" "2.6.fb.branch" "2.7.fb.branch" "2.8.1.fb" "3.0.fb.branch" "3.1.fb" "3.2.fb" "3.3.fb" "3.4.fb" "3.5.fb" "3.6.fb" "3.7.fb" "3.8.fb" "3.9.fb") -declare -a forward_compatible_checkout_objs=("3.10.fb" "3.11.fb" "3.12.fb" "3.13.fb" "4.0.fb" "4.1.fb" "4.2.fb" "4.3.fb" "4.4.fb" "4.5.fb" "4.6.fb" "4.7.fb" "4.8.fb" "4.9.fb" "4.10.fb" "4.11.fb" "4.12.fb" "4.13.fb" "5.0.fb" "5.1.fb" "5.2.fb" "5.3.fb" "5.4.fb" "5.5.fb" "5.6.fb" "5.7.fb" "5.8.fb" "5.9.fb" "5.10.fb") +declare -a forward_compatible_checkout_objs=("3.10.fb" "3.11.fb" "3.12.fb" "3.13.fb" "4.0.fb" "4.1.fb" "4.2.fb" "4.3.fb" "4.4.fb" "4.5.fb" "4.6.fb" "4.7.fb" "4.8.fb" "4.9.fb" "4.10.fb" "4.11.fb" "4.12.fb" "4.13.fb" "5.0.fb" "5.1.fb" "5.2.fb" "5.3.fb" "5.4.fb" "5.5.fb" "5.6.fb" "5.7.fb" "5.8.fb" "5.9.fb" "5.10.fb" "5.11.fb" "5.12.fb") declare -a checkout_objs=(${backward_compatible_checkout_objs[@]} ${forward_compatible_checkout_objs[@]}) generate_db() diff --git a/tools/db_bench_tool.cc b/tools/db_bench_tool.cc index 748ce57a9a7..10b3c364b5c 100644 --- a/tools/db_bench_tool.cc +++ b/tools/db_bench_tool.cc @@ -70,6 +70,7 @@ #include "util/xxhash.h" #include "utilities/blob_db/blob_db.h" #include "utilities/merge_operators.h" +#include "utilities/merge_operators/bytesxor.h" #include "utilities/persistent_cache/block_cache_tier.h" #ifdef OS_WIN @@ -107,6 +108,7 @@ DEFINE_string( "readwhilemerging," "readrandomwriterandom," "updaterandom," + "xorupdaterandom," "randomwithverify," "fill100K," "crc32c," @@ -151,6 +153,8 @@ DEFINE_string( "\tprefixscanrandom -- prefix scan N times in random order\n" "\tupdaterandom -- N threads doing read-modify-write for random " "keys\n" + "\txorupdaterandom -- N threads doing read-XOR-write for " + "random keys\n" "\tappendrandom -- N threads doing read-modify-write with " "growing values\n" "\tmergerandom -- same as updaterandom/appendrandom using merge" @@ -246,7 +250,7 @@ DEFINE_bool(use_uint64_comparator, false, "use Uint64 user comparator"); DEFINE_int64(batch_size, 1, "Batch size"); 
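
The benchmark.sh changes above retire the separate `max_background_compactions` and `max_background_flushes` knobs in favor of the unified `max_background_jobs`. A sketch of the application-level equivalent:

    #include "rocksdb/options.h"

    // One shared budget that RocksDB divides between flushes and compactions.
    rocksdb::Options MakeBenchmarkOptions() {
      rocksdb::Options options;
      options.max_background_jobs = 20;  // supersedes the two older options
      return options;
    }
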
-static bool ValidateKeySize(const char* flagname, int32_t value) { +static bool ValidateKeySize(const char* /*flagname*/, int32_t /*value*/) { return true; } @@ -448,6 +452,9 @@ DEFINE_bool(enable_index_compression, rocksdb::BlockBasedTableOptions().enable_index_compression, "Compress the index block"); +DEFINE_bool(block_align, rocksdb::BlockBasedTableOptions().block_align, + "Align data blocks on page size"); + DEFINE_int64(compressed_cache_size, -1, "Number of bytes to use as a cache of compressed data."); @@ -682,7 +689,7 @@ DEFINE_bool(blob_db_enable_gc, false, "Enable BlobDB garbage collection."); DEFINE_bool(blob_db_is_fifo, false, "Enable FIFO eviction strategy in BlobDB."); -DEFINE_uint64(blob_db_dir_size, 0, +DEFINE_uint64(blob_db_max_db_size, 0, "Max size limit of the directory where blob files are stored."); DEFINE_uint64(blob_db_max_ttl_range, 86400, @@ -2129,8 +2136,9 @@ class Benchmark { explicit ExpiredTimeFilter( const std::shared_ptr& timestamp_emulator) : timestamp_emulator_(timestamp_emulator) {} - bool Filter(int level, const Slice& key, const Slice& existing_value, - std::string* new_value, bool* value_changed) const override { + bool Filter(int /*level*/, const Slice& key, + const Slice& /*existing_value*/, std::string* /*new_value*/, + bool* /*value_changed*/) const override { return KeyExpired(timestamp_emulator_.get(), key); } const char* Name() const override { return "ExpiredTimeFilter"; } @@ -2525,6 +2533,8 @@ void VerifyDBFromDB(std::string& truth_db_name) { method = &Benchmark::ReadRandomMergeRandom; } else if (name == "updaterandom") { method = &Benchmark::UpdateRandom; + } else if (name == "xorupdaterandom") { + method = &Benchmark::XORUpdateRandom; } else if (name == "appendrandom") { method = &Benchmark::AppendRandom; } else if (name == "mergerandom") { @@ -3133,6 +3143,7 @@ void VerifyDBFromDB(std::string& truth_db_name) { block_based_options.read_amp_bytes_per_bit = FLAGS_read_amp_bytes_per_bit; block_based_options.enable_index_compression = FLAGS_enable_index_compression; + block_based_options.block_align = FLAGS_block_align; if (FLAGS_read_cache_path != "") { #ifndef ROCKSDB_LITE Status rc_status; @@ -3439,7 +3450,7 @@ void VerifyDBFromDB(std::string& truth_db_name) { blob_db::BlobDBOptions blob_db_options; blob_db_options.enable_garbage_collection = FLAGS_blob_db_enable_gc; blob_db_options.is_fifo = FLAGS_blob_db_is_fifo; - blob_db_options.blob_dir_size = FLAGS_blob_db_dir_size; + blob_db_options.max_db_size = FLAGS_blob_db_max_db_size; blob_db_options.ttl_range_secs = FLAGS_blob_db_ttl_range_secs; blob_db_options.min_blob_size = FLAGS_blob_db_min_blob_size; blob_db_options.bytes_per_sync = FLAGS_blob_db_bytes_per_sync; @@ -3486,12 +3497,9 @@ void VerifyDBFromDB(std::string& truth_db_name) { class KeyGenerator { public: - KeyGenerator(Random64* rand, WriteMode mode, - uint64_t num, uint64_t num_per_set = 64 * 1024) - : rand_(rand), - mode_(mode), - num_(num), - next_(0) { + KeyGenerator(Random64* rand, WriteMode mode, uint64_t num, + uint64_t /*num_per_set*/ = 64 * 1024) + : rand_(rand), mode_(mode), num_(num), next_(0) { if (mode_ == UNIQUE_RANDOM) { // NOTE: if memory consumption of this approach becomes a concern, // we can either break it into pieces and only random shuffle a section @@ -4745,6 +4753,58 @@ void VerifyDBFromDB(std::string& truth_db_name) { thread->stats.AddMessage(msg); } + // Read-XOR-write for random keys. Xors the existing value with a randomly + // generated value, and stores the result. 
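
A standalone sketch of the byte-wise XOR this comment goes on to define (C[i] = A[i]^B[i]); `XorBytes` is a hypothetical helper which, unlike the real BytesXOROperator, assumes equal-length operands:

    #include <cstddef>
    #include <string>

    std::string XorBytes(const std::string& a, const std::string& b) {
      // Requires b.size() >= a.size(); length mismatches are not handled here.
      std::string c(a.size(), '\0');
      for (size_t i = 0; i < a.size(); ++i) {
        c[i] = a[i] ^ b[i];  // C[i] = A[i] ^ B[i]
      }
      return c;
    }
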
Assuming A is the array of bytes + // representing the existing value, we generate an array B of the same size, + // then compute C = A^B as C[i]=A[i]^B[i], and store C + void XORUpdateRandom(ThreadState* thread) { + ReadOptions options(FLAGS_verify_checksum, true); + RandomGenerator gen; + std::string existing_value; + int64_t found = 0; + Duration duration(FLAGS_duration, readwrites_); + + BytesXOROperator xor_operator; + + std::unique_ptr<const char[]> key_guard; + Slice key = AllocateKey(&key_guard); + // the number of iterations is the larger of read_ or write_ + while (!duration.Done(1)) { + DB* db = SelectDB(thread); + GenerateKeyFromInt(thread->rand.Next() % FLAGS_num, FLAGS_num, &key); + + auto status = db->Get(options, key, &existing_value); + if (status.ok()) { + ++found; + } else if (!status.IsNotFound()) { + fprintf(stderr, "Get returned an error: %s\n", + status.ToString().c_str()); + exit(1); + } + + Slice value = gen.Generate(value_size_); + std::string new_value; + + if (status.ok()) { + Slice existing_value_slice = Slice(existing_value); + xor_operator.XOR(&existing_value_slice, value, &new_value); + } else { + xor_operator.XOR(nullptr, value, &new_value); + } + + Status s = db->Put(write_options_, key, Slice(new_value)); + if (!s.ok()) { + fprintf(stderr, "put error: %s\n", s.ToString().c_str()); + exit(1); + } + thread->stats.FinishedOps(nullptr, db, 1); + } + char msg[100]; + snprintf(msg, sizeof(msg), + "( updates:%" PRIu64 " found:%" PRIu64 ")", readwrites_, found); + thread->stats.AddMessage(msg); + } + // Read-modify-write for random keys. // Each operation causes the key grow by value_size (simulating an append). // Generally used for benchmarking against merges of similar type diff --git a/tools/db_stress.cc b/tools/db_stress.cc index d7d3405ea83..1105b2a6670 100644 --- a/tools/db_stress.cc +++ b/tools/db_stress.cc @@ -40,6 +40,7 @@ int main() { #include #include #include +#include #include #include "db/db_impl.h" @@ -996,11 +997,11 @@ struct ThreadState { Stats stats; struct SnapshotState { const Snapshot* snapshot; - // The cf from which we did a Get at this stapshot + // The cf from which we did a Get at this snapshot int cf_at; - // The name of the cf at the the time that we did a read + // The name of the cf at the time that we did a read std::string cf_at_name; - // The key with which we did a Get at this stapshot + // The key with which we did a Get at this snapshot std::string key; // The status of the Get Status status; diff --git a/tools/ldb_cmd.cc b/tools/ldb_cmd.cc index 96138ffa6ed..a22e6135908 100644 --- a/tools/ldb_cmd.cc +++ b/tools/ldb_cmd.cc @@ -112,7 +112,7 @@ LDBCommand* LDBCommand::InitFromCmdLineArgs( LDBCommand* LDBCommand::InitFromCmdLineArgs( const std::vector<std::string>& args, const Options& options, const LDBOptions& ldb_options, - const std::vector<ColumnFamilyDescriptor>* column_families, + const std::vector<ColumnFamilyDescriptor>* /*column_families*/, const std::function<LDBCommand*(const ParsedParams&)>& selector) { // --x=y command line arguments are added as x->y map entries in // parsed_params.option_map. @@ -451,7 +451,7 @@ std::vector<std::string> LDBCommand::BuildCmdLineOptions( * updated. */ bool LDBCommand::ParseIntOption( - const std::map<std::string, std::string>& options, + const std::map<std::string, std::string>& /*options*/, const std::string& option, int& value, LDBCommandExecuteResult& exec_state) { std::map<std::string, std::string>::const_iterator itr = @@ -481,7 +481,7 @@ bool LDBCommand::ParseIntOption( * Returns false otherwise.
*/ bool LDBCommand::ParseStringOption( - const std::map& options, + const std::map& /*options*/, const std::string& option, std::string* value) { auto itr = option_map_.find(option); if (itr != option_map_.end()) { @@ -772,7 +772,7 @@ bool LDBCommand::StringToBool(std::string val) { } CompactorCommand::CompactorCommand( - const std::vector& params, + const std::vector& /*params*/, const std::map& options, const std::vector& flags) : LDBCommand(options, flags, false, @@ -842,7 +842,7 @@ const std::string DBLoaderCommand::ARG_BULK_LOAD = "bulk_load"; const std::string DBLoaderCommand::ARG_COMPACT = "compact"; DBLoaderCommand::DBLoaderCommand( - const std::vector& params, + const std::vector& /*params*/, const std::map& options, const std::vector& flags) : LDBCommand( @@ -958,7 +958,7 @@ void ManifestDumpCommand::Help(std::string& ret) { } ManifestDumpCommand::ManifestDumpCommand( - const std::vector& params, + const std::vector& /*params*/, const std::map& options, const std::vector& flags) : LDBCommand( @@ -1153,7 +1153,7 @@ const std::string InternalDumpCommand::ARG_STATS = "stats"; const std::string InternalDumpCommand::ARG_INPUT_KEY_HEX = "input_key_hex"; InternalDumpCommand::InternalDumpCommand( - const std::vector& params, + const std::vector& /*params*/, const std::map& options, const std::vector& flags) : LDBCommand( @@ -1292,7 +1292,7 @@ const std::string DBDumperCommand::ARG_STATS = "stats"; const std::string DBDumperCommand::ARG_TTL_BUCKET = "bucket"; DBDumperCommand::DBDumperCommand( - const std::vector& params, + const std::vector& /*params*/, const std::map& options, const std::vector& flags) : LDBCommand(options, flags, true, @@ -1579,7 +1579,7 @@ const std::string ReduceDBLevelsCommand::ARG_PRINT_OLD_LEVELS = "print_old_levels"; ReduceDBLevelsCommand::ReduceDBLevelsCommand( - const std::vector& params, + const std::vector& /*params*/, const std::map& options, const std::vector& flags) : LDBCommand(options, flags, false, @@ -1709,7 +1709,7 @@ const std::string ChangeCompactionStyleCommand::ARG_NEW_COMPACTION_STYLE = "new_compaction_style"; ChangeCompactionStyleCommand::ChangeCompactionStyleCommand( - const std::vector& params, + const std::vector& /*params*/, const std::map& options, const std::vector& flags) : LDBCommand(options, flags, false, @@ -1846,7 +1846,7 @@ void ChangeCompactionStyleCommand::DoCommand() { namespace { struct StdErrReporter : public log::Reader::Reporter { - virtual void Corruption(size_t bytes, const Status& s) override { + virtual void Corruption(size_t /*bytes*/, const Status& s) override { std::cerr << "Corruption detected in log file " << s.ToString() << "\n"; } }; @@ -2016,7 +2016,7 @@ const std::string WALDumperCommand::ARG_PRINT_VALUE = "print_value"; const std::string WALDumperCommand::ARG_PRINT_HEADER = "header"; WALDumperCommand::WALDumperCommand( - const std::vector& params, + const std::vector& /*params*/, const std::map& options, const std::vector& flags) : LDBCommand(options, flags, true, @@ -2100,7 +2100,7 @@ void GetCommand::DoCommand() { // ---------------------------------------------------------------------------- ApproxSizeCommand::ApproxSizeCommand( - const std::vector& params, + const std::vector& /*params*/, const std::map& options, const std::vector& flags) : LDBCommand(options, flags, true, @@ -2216,7 +2216,7 @@ Options BatchPutCommand::PrepareOptionsForOpenDB() { // ---------------------------------------------------------------------------- -ScanCommand::ScanCommand(const std::vector& params, +ScanCommand::ScanCommand(const 
std::vector& /*params*/, const std::map& options, const std::vector& flags) : LDBCommand( @@ -2499,7 +2499,7 @@ const char* DBQuerierCommand::PUT_CMD = "put"; const char* DBQuerierCommand::DELETE_CMD = "delete"; DBQuerierCommand::DBQuerierCommand( - const std::vector& params, + const std::vector& /*params*/, const std::map& options, const std::vector& flags) : LDBCommand( @@ -2578,7 +2578,7 @@ void DBQuerierCommand::DoCommand() { // ---------------------------------------------------------------------------- CheckConsistencyCommand::CheckConsistencyCommand( - const std::vector& params, + const std::vector& /*params*/, const std::map& options, const std::vector& flags) : LDBCommand(options, flags, false, BuildCmdLineOptions({})) {} @@ -2610,7 +2610,7 @@ void CheckConsistencyCommand::DoCommand() { const std::string CheckPointCommand::ARG_CHECKPOINT_DIR = "checkpoint_dir"; CheckPointCommand::CheckPointCommand( - const std::vector& params, + const std::vector& /*params*/, const std::map& options, const std::vector& flags) : LDBCommand(options, flags, false /* is_read_only */, @@ -2645,7 +2645,7 @@ void CheckPointCommand::DoCommand() { // ---------------------------------------------------------------------------- -RepairCommand::RepairCommand(const std::vector& params, +RepairCommand::RepairCommand(const std::vector& /*params*/, const std::map& options, const std::vector& flags) : LDBCommand(options, flags, false, BuildCmdLineOptions({})) {} @@ -2675,7 +2675,7 @@ const std::string BackupableCommand::ARG_BACKUP_DIR = "backup_dir"; const std::string BackupableCommand::ARG_STDERR_LOG_LEVEL = "stderr_log_level"; BackupableCommand::BackupableCommand( - const std::vector& params, + const std::vector& /*params*/, const std::map& options, const std::vector& flags) : LDBCommand(options, flags, false /* is_read_only */, @@ -2853,7 +2853,7 @@ void DumpSstFile(std::string filename, bool output_hex, bool show_properties) { } // namespace DBFileDumperCommand::DBFileDumperCommand( - const std::vector& params, + const std::vector& /*params*/, const std::map& options, const std::vector& flags) : LDBCommand(options, flags, true, BuildCmdLineOptions({})) {} diff --git a/tools/ldb_tool.cc b/tools/ldb_tool.cc index e8229ef7b91..b09076ecc61 100644 --- a/tools/ldb_tool.cc +++ b/tools/ldb_tool.cc @@ -13,7 +13,7 @@ namespace rocksdb { LDBOptions::LDBOptions() {} void LDBCommandRunner::PrintHelp(const LDBOptions& ldb_options, - const char* exec_name) { + const char* /*exec_name*/) { std::string ret; ret.append(ldb_options.print_help_header); diff --git a/tools/sst_dump_tool.cc b/tools/sst_dump_tool.cc index 29d8e429261..4480bb09efa 100644 --- a/tools/sst_dump_tool.cc +++ b/tools/sst_dump_tool.cc @@ -121,9 +121,9 @@ Status SstFileReader::GetTableReader(const std::string& file_path) { } Status SstFileReader::NewTableReader( - const ImmutableCFOptions& ioptions, const EnvOptions& soptions, - const InternalKeyComparator& internal_comparator, uint64_t file_size, - unique_ptr* table_reader) { + const ImmutableCFOptions& /*ioptions*/, const EnvOptions& /*soptions*/, + const InternalKeyComparator& /*internal_comparator*/, uint64_t file_size, + unique_ptr* /*table_reader*/) { // We need to turn off pre-fetching of index and filter nodes for // BlockBasedTable if (BlockBasedTableFactory::kName == options_.table_factory->Name()) { diff --git a/util/aligned_buffer.h b/util/aligned_buffer.h index 8d4a0be5845..0c36eca9d75 100644 --- a/util/aligned_buffer.h +++ b/util/aligned_buffer.h @@ -161,6 +161,12 @@ class AlignedBuffer { } } + 
void PadWith(size_t pad_size, int padding) { + assert((pad_size + cursize_) <= capacity_); + memset(bufstart_ + cursize_, padding, pad_size); + cursize_ += pad_size; + } + // After a partial flush move the tail to the beginning of the buffer void RefitTail(size_t tail_offset, size_t tail_size) { if (tail_size > 0) { diff --git a/util/compression.h b/util/compression.h index f12036d21fb..cfc4521a4a7 100644 --- a/util/compression.h +++ b/util/compression.h @@ -162,8 +162,9 @@ inline std::string CompressionTypeToString(CompressionType compression_type) { // 2 -- Zlib, BZip2 and LZ4 encode decompressed size as Varint32 just before the // start of compressed block. Snappy format is the same as version 1. -inline bool Snappy_Compress(const CompressionOptions& opts, const char* input, - size_t length, ::std::string* output) { +inline bool Snappy_Compress(const CompressionOptions& /*opts*/, + const char* input, size_t length, + ::std::string* output) { #ifdef SNAPPY output->resize(snappy::MaxCompressedLength(length)); size_t outlen; @@ -393,10 +394,9 @@ inline char* Zlib_Uncompress(const char* input_data, size_t input_length, // block header // compress_format_version == 2 -- decompressed size is included in the block // header in varint32 format -inline bool BZip2_Compress(const CompressionOptions& opts, - uint32_t compress_format_version, - const char* input, size_t length, - ::std::string* output) { +inline bool BZip2_Compress(const CompressionOptions& /*opts*/, + uint32_t compress_format_version, const char* input, + size_t length, ::std::string* output) { #ifdef BZIP2 if (length > std::numeric_limits::max()) { // Can't compress more than 4GB @@ -534,7 +534,7 @@ inline char* BZip2_Uncompress(const char* input_data, size_t input_length, // header in varint32 format // @param compression_dict Data for presetting the compression library's // dictionary. 
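
The `PadWith` helper added to AlignedBuffer above supplies the filler bytes that `block_align` needs: once a block and its trailer are written, the writer can pad to the next aligned boundary. A sketch of the pad-size computation (the function name is hypothetical):

    #include <cassert>
    #include <cstddef>

    // Bytes of padding needed to reach the next aligned boundary.
    size_t PadSizeToAlign(size_t offset, size_t alignment) {
      assert(alignment != 0 && (alignment & (alignment - 1)) == 0);
      size_t remainder = offset & (alignment - 1);
      return remainder == 0 ? 0 : alignment - remainder;
    }
    // e.g. PadSizeToAlign(4093, 4096) == 3: three filler bytes put the next
    // block on a 4K boundary.
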
-inline bool LZ4_Compress(const CompressionOptions& opts, +inline bool LZ4_Compress(const CompressionOptions& /*opts*/, uint32_t compress_format_version, const char* input, size_t length, ::std::string* output, const Slice compression_dict = Slice()) { @@ -722,22 +722,31 @@ inline bool LZ4HC_Compress(const CompressionOptions& opts, #endif } -inline bool XPRESS_Compress(const char* input, size_t length, std::string* output) { #ifdef XPRESS +inline bool XPRESS_Compress(const char* input, size_t length, + std::string* output) { return port::xpress::Compress(input, length, output); +} #else +inline bool XPRESS_Compress(const char* /*input*/, size_t /*length*/, + std::string* /*output*/) { return false; -#endif } +#endif -inline char* XPRESS_Uncompress(const char* input_data, size_t input_length, - int* decompress_size) { #ifdef XPRESS +inline char* XPRESS_Uncompress(const char* input_data, + size_t input_length, + int* decompress_size) { return port::xpress::Decompress(input_data, input_length, decompress_size); +} #else +inline char* XPRESS_Uncompress(const char* /*input_data*/, + size_t /*input_length*/, + int* /*decompress_size*/) { return nullptr; -#endif } +#endif // @param compression_dict Data for presetting the compression library's diff --git a/util/delete_scheduler.cc b/util/delete_scheduler.cc index ec7e2f4d271..8b05a5c90b8 100644 --- a/util/delete_scheduler.cc +++ b/util/delete_scheduler.cc @@ -22,11 +22,13 @@ namespace rocksdb { DeleteScheduler::DeleteScheduler(Env* env, int64_t rate_bytes_per_sec, Logger* info_log, SstFileManagerImpl* sst_file_manager, - double max_trash_db_ratio) + double max_trash_db_ratio, + uint64_t bytes_max_delete_chunk) : env_(env), total_trash_size_(0), rate_bytes_per_sec_(rate_bytes_per_sec), pending_files_(0), + bytes_max_delete_chunk_(bytes_max_delete_chunk), closing_(false), cv_(&mu_), info_log_(info_log), @@ -208,15 +210,18 @@ void DeleteScheduler::BackgroundEmptyTrash() { // Get new file to delete std::string path_in_trash = queue_.front(); - queue_.pop(); // We don't need to hold the lock while deleting the file mu_.Unlock(); uint64_t deleted_bytes = 0; + bool is_complete = true; // Delete file from trash and update total_penlty value - Status s = DeleteTrashFile(path_in_trash, &deleted_bytes); + Status s = DeleteTrashFile(path_in_trash, &deleted_bytes, &is_complete); total_deleted_bytes += deleted_bytes; mu_.Lock(); + if (is_complete) { + queue_.pop(); + } if (!s.ok()) { bg_errors_[path_in_trash] = s; @@ -236,7 +241,9 @@ void DeleteScheduler::BackgroundEmptyTrash() { TEST_SYNC_POINT_CALLBACK("DeleteScheduler::BackgroundEmptyTrash:Wait", &total_penlty); - pending_files_--; + if (is_complete) { + pending_files_--; + } if (pending_files_ == 0) { // Unblock WaitForEmptyTrash since there are no more files waiting // to be deleted @@ -247,23 +254,49 @@ void DeleteScheduler::BackgroundEmptyTrash() { } Status DeleteScheduler::DeleteTrashFile(const std::string& path_in_trash, - uint64_t* deleted_bytes) { + uint64_t* deleted_bytes, + bool* is_complete) { uint64_t file_size; Status s = env_->GetFileSize(path_in_trash, &file_size); + *is_complete = true; + TEST_SYNC_POINT("DeleteScheduler::DeleteTrashFile:DeleteFile"); if (s.ok()) { - TEST_SYNC_POINT("DeleteScheduler::DeleteTrashFile:DeleteFile"); - s = env_->DeleteFile(path_in_trash); - } + bool need_full_delete = true; + if (bytes_max_delete_chunk_ != 0 && file_size > bytes_max_delete_chunk_) { + unique_ptr<WritableFile> wf; + Status my_status = + env_->ReopenWritableFile(path_in_trash, &wf, EnvOptions()); + if 
(my_status.ok()) { + my_status = wf->Truncate(file_size - bytes_max_delete_chunk_); + if (my_status.ok()) { + TEST_SYNC_POINT("DeleteScheduler::DeleteTrashFile:Fsync"); + my_status = wf->Fsync(); + } + } + if (my_status.ok()) { + *deleted_bytes = bytes_max_delete_chunk_; + need_full_delete = false; + *is_complete = false; + } else { + ROCKS_LOG_WARN(info_log_, + "Failed to partially delete %s from trash -- %s", + path_in_trash.c_str(), my_status.ToString().c_str()); + } + } + if (need_full_delete) { + s = env_->DeleteFile(path_in_trash); + *deleted_bytes = file_size; + sst_file_manager_->OnDeleteFile(path_in_trash); + } + } if (!s.ok()) { // Error while getting file size or while deleting ROCKS_LOG_ERROR(info_log_, "Failed to delete %s from trash -- %s", path_in_trash.c_str(), s.ToString().c_str()); *deleted_bytes = 0; } else { - *deleted_bytes = file_size; - total_trash_size_.fetch_sub(file_size); - sst_file_manager_->OnDeleteFile(path_in_trash); + total_trash_size_.fetch_sub(*deleted_bytes); } return s; diff --git a/util/delete_scheduler.h b/util/delete_scheduler.h index c142d07a4dd..cc456dcb9bd 100644 --- a/util/delete_scheduler.h +++ b/util/delete_scheduler.h @@ -34,7 +34,7 @@ class DeleteScheduler { public: DeleteScheduler(Env* env, int64_t rate_bytes_per_sec, Logger* info_log, SstFileManagerImpl* sst_file_manager, - double max_trash_db_ratio); + double max_trash_db_ratio, uint64_t bytes_max_delete_chunk); ~DeleteScheduler(); @@ -82,7 +82,7 @@ class DeleteScheduler { Status MarkAsTrash(const std::string& file_path, std::string* path_in_trash); Status DeleteTrashFile(const std::string& path_in_trash, - uint64_t* deleted_bytes); + uint64_t* deleted_bytes, bool* is_complete); void BackgroundEmptyTrash(); @@ -97,6 +97,7 @@ class DeleteScheduler { std::queue queue_; // Number of trash files that are waiting to be deleted int32_t pending_files_; + uint64_t bytes_max_delete_chunk_; // Errors that happened in BackgroundEmptyTrash (file_path => error) std::map bg_errors_; // Set to true in ~DeleteScheduler() to force BackgroundEmptyTrash to stop diff --git a/util/delete_scheduler_test.cc b/util/delete_scheduler_test.cc index 0ac7972e400..bb8e81f7dc8 100644 --- a/util/delete_scheduler_test.cc +++ b/util/delete_scheduler_test.cc @@ -28,15 +28,23 @@ namespace rocksdb { class DeleteSchedulerTest : public testing::Test { public: DeleteSchedulerTest() : env_(Env::Default()) { - dummy_files_dir_ = test::TmpDir(env_) + "/delete_scheduler_dummy_data_dir"; - DestroyAndCreateDir(dummy_files_dir_); + const int kNumDataDirs = 3; + dummy_files_dirs_.reserve(kNumDataDirs); + for (size_t i = 0; i < kNumDataDirs; ++i) { + dummy_files_dirs_.emplace_back(test::TmpDir(env_) + + "/delete_scheduler_dummy_data_dir" + + ToString(i)); + DestroyAndCreateDir(dummy_files_dirs_.back()); + } } ~DeleteSchedulerTest() { rocksdb::SyncPoint::GetInstance()->DisableProcessing(); rocksdb::SyncPoint::GetInstance()->LoadDependency({}); rocksdb::SyncPoint::GetInstance()->ClearAllCallBacks(); - test::DestroyDir(env_, dummy_files_dir_); + for (const auto& dummy_files_dir : dummy_files_dirs_) { + test::DestroyDir(env_, dummy_files_dir); + } } void DestroyAndCreateDir(const std::string& dir) { @@ -44,23 +52,24 @@ class DeleteSchedulerTest : public testing::Test { EXPECT_OK(env_->CreateDir(dir)); } - int CountNormalFiles() { + int CountNormalFiles(size_t dummy_files_dirs_idx = 0) { std::vector files_in_dir; - EXPECT_OK(env_->GetChildren(dummy_files_dir_, &files_in_dir)); + 
EXPECT_OK(env_->GetChildren(dummy_files_dirs_[dummy_files_dirs_idx], + &files_in_dir)); int normal_cnt = 0; for (auto& f : files_in_dir) { if (!DeleteScheduler::IsTrashFile(f) && f != "." && f != "..") { - printf("%s\n", f.c_str()); normal_cnt++; } } return normal_cnt; } - int CountTrashFiles() { + int CountTrashFiles(size_t dummy_files_dirs_idx = 0) { std::vector files_in_dir; - EXPECT_OK(env_->GetChildren(dummy_files_dir_, &files_in_dir)); + EXPECT_OK(env_->GetChildren(dummy_files_dirs_[dummy_files_dirs_idx], + &files_in_dir)); int trash_cnt = 0; for (auto& f : files_in_dir) { @@ -71,8 +80,10 @@ class DeleteSchedulerTest : public testing::Test { return trash_cnt; } - std::string NewDummyFile(const std::string& file_name, uint64_t size = 1024) { - std::string file_path = dummy_files_dir_ + "/" + file_name; + std::string NewDummyFile(const std::string& file_name, uint64_t size = 1024, + size_t dummy_files_dirs_idx = 0) { + std::string file_path = + dummy_files_dirs_[dummy_files_dirs_idx] + "/" + file_name; std::unique_ptr f; env_->NewWritableFile(file_path, &f, EnvOptions()); std::string data(size, 'A'); @@ -88,12 +99,12 @@ class DeleteSchedulerTest : public testing::Test { // 25%) sst_file_mgr_.reset( new SstFileManagerImpl(env_, nullptr, rate_bytes_per_sec_, - /* max_trash_db_ratio= */ 1.1)); + /* max_trash_db_ratio= */ 1.1, 128 * 1024)); delete_scheduler_ = sst_file_mgr_->delete_scheduler(); } Env* env_; - std::string dummy_files_dir_; + std::vector dummy_files_dirs_; int64_t rate_bytes_per_sec_; DeleteScheduler* delete_scheduler_; std::unique_ptr sst_file_mgr_; @@ -126,7 +137,7 @@ TEST_F(DeleteSchedulerTest, BasicRateLimiting) { rocksdb::SyncPoint::GetInstance()->ClearTrace(); rocksdb::SyncPoint::GetInstance()->EnableProcessing(); - DestroyAndCreateDir(dummy_files_dir_); + DestroyAndCreateDir(dummy_files_dirs_[0]); rate_bytes_per_sec_ = delete_kbs_per_sec[t] * 1024; NewDeleteScheduler(); @@ -166,6 +177,42 @@ TEST_F(DeleteSchedulerTest, BasicRateLimiting) { } } +TEST_F(DeleteSchedulerTest, MultiDirectoryDeletionsScheduled) { + rocksdb::SyncPoint::GetInstance()->LoadDependency({ + {"DeleteSchedulerTest::MultiDbPathDeletionsScheduled:1", + "DeleteScheduler::BackgroundEmptyTrash"}, + }); + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + rate_bytes_per_sec_ = 1 << 20; // 1MB + NewDeleteScheduler(); + + // Generate dummy files in multiple directories + const size_t kNumFiles = dummy_files_dirs_.size(); + const size_t kFileSize = 1 << 10; // 1KB + std::vector generated_files; + for (size_t i = 0; i < kNumFiles; i++) { + generated_files.push_back(NewDummyFile("file", kFileSize, i)); + ASSERT_EQ(1, CountNormalFiles(i)); + } + + // Mark dummy files as trash + for (size_t i = 0; i < kNumFiles; i++) { + ASSERT_OK(delete_scheduler_->DeleteFile(generated_files[i])); + ASSERT_EQ(0, CountNormalFiles(i)); + ASSERT_EQ(1, CountTrashFiles(i)); + } + TEST_SYNC_POINT("DeleteSchedulerTest::MultiDbPathDeletionsScheduled:1"); + delete_scheduler_->WaitForEmptyTrash(); + + // Verify dummy files eventually got deleted + for (size_t i = 0; i < kNumFiles; i++) { + ASSERT_EQ(0, CountNormalFiles(i)); + ASSERT_EQ(0, CountTrashFiles(i)); + } + + rocksdb::SyncPoint::GetInstance()->DisableProcessing(); +} + // Same as the BasicRateLimiting test but delete files in multiple threads. 
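The chunked deletion above is easiest to follow with concrete numbers. A minimal sketch of the pass arithmetic, assuming the 128 KB chunk these tests configure (the helper below is illustrative, not part of the patch):

```cpp
#include <cstdint>
#include <cstdio>

// Mirrors the truncate-then-fsync loop in DeleteTrashFile: every pass except
// the last truncates bytes_max_delete_chunk bytes and fsyncs; the final pass
// (remaining size <= chunk) unlinks the file, so it is popped from the queue.
int DeletionPasses(uint64_t file_size, uint64_t bytes_max_delete_chunk) {
  if (bytes_max_delete_chunk == 0) return 1;  // chunking disabled
  int passes = 1;  // the final pass calls DeleteFile()
  while (file_size > bytes_max_delete_chunk) {
    file_size -= bytes_max_delete_chunk;  // one Truncate() + Fsync()
    passes++;
  }
  return passes;
}

int main() {
  // 500 KB -> 4 passes (3 fsyncs), 100 KB -> 1 pass, 200 KB -> 2 passes:
  // 7 passes and 4 fsyncs in total, which are the counters the
  // DeletePartialFile test below asserts via its two sync-point callbacks.
  std::printf("%d\n", DeletionPasses(500 * 1024, 128 * 1024));  // 4
  std::printf("%d\n", DeletionPasses(100 * 1024, 128 * 1024));  // 1
  std::printf("%d\n", DeletionPasses(200 * 1024, 128 * 1024));  // 2
}
```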
// 1- Create 100 dummy files // 2- Delete the 100 dummy files using DeleteScheduler using 10 threads @@ -194,7 +241,7 @@ TEST_F(DeleteSchedulerTest, RateLimitingMultiThreaded) { rocksdb::SyncPoint::GetInstance()->ClearTrace(); rocksdb::SyncPoint::GetInstance()->EnableProcessing(); - DestroyAndCreateDir(dummy_files_dir_); + DestroyAndCreateDir(dummy_files_dirs_[0]); rate_bytes_per_sec_ = delete_kbs_per_sec[t] * 1024; NewDeleteScheduler(); @@ -342,7 +389,7 @@ TEST_F(DeleteSchedulerTest, BackgroundError) { // going to delete for (int i = 0; i < 10; i++) { std::string file_name = "data_" + ToString(i) + ".data.trash"; - ASSERT_OK(env_->DeleteFile(dummy_files_dir_ + "/" + file_name)); + ASSERT_OK(env_->DeleteFile(dummy_files_dirs_[0] + "/" + file_name)); } // Hold BackgroundEmptyTrash @@ -389,6 +436,34 @@ TEST_F(DeleteSchedulerTest, StartBGEmptyTrashMultipleTimes) { rocksdb::SyncPoint::GetInstance()->EnableProcessing(); } +TEST_F(DeleteSchedulerTest, DeletePartialFile) { + int bg_delete_file = 0; + int bg_fsync = 0; + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "DeleteScheduler::DeleteTrashFile:DeleteFile", + [&](void*) { bg_delete_file++; }); + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "DeleteScheduler::DeleteTrashFile:Fsync", [&](void*) { bg_fsync++; }); + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + + rate_bytes_per_sec_ = 1024 * 1024; // 1 MB / sec + NewDeleteScheduler(); + + // Should delete in 4 batches + ASSERT_OK(delete_scheduler_->DeleteFile(NewDummyFile("data_1", 500 * 1024))); + ASSERT_OK(delete_scheduler_->DeleteFile(NewDummyFile("data_2", 100 * 1024))); + // Should delete in 2 batches + ASSERT_OK(delete_scheduler_->DeleteFile(NewDummyFile("data_2", 200 * 1024))); + + delete_scheduler_->WaitForEmptyTrash(); + + auto bg_errors = delete_scheduler_->GetBackgroundErrors(); + ASSERT_EQ(bg_errors.size(), 0); + ASSERT_EQ(7, bg_delete_file); + ASSERT_EQ(4, bg_fsync); + rocksdb::SyncPoint::GetInstance()->DisableProcessing(); +} + // 1- Create a DeleteScheduler with very slow rate limit (1 Byte / sec) // 2- Delete 100 files using DeleteScheduler // 3- Delete the DeleteScheduler (call the destructor while queue is not empty) @@ -454,7 +529,7 @@ TEST_F(DeleteSchedulerTest, DISABLED_DynamicRateLimiting1) { rocksdb::SyncPoint::GetInstance()->ClearTrace(); rocksdb::SyncPoint::GetInstance()->EnableProcessing(); - DestroyAndCreateDir(dummy_files_dir_); + DestroyAndCreateDir(dummy_files_dirs_[0]); rate_bytes_per_sec_ = delete_kbs_per_sec[t] * 1024; delete_scheduler_->SetRateBytesPerSecond(rate_bytes_per_sec_); diff --git a/util/duplicate_detector.h b/util/duplicate_detector.h new file mode 100644 index 00000000000..46549a98d08 --- /dev/null +++ b/util/duplicate_detector.h @@ -0,0 +1,48 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include "util/set_comparator.h" + +namespace rocksdb { +// During recovery, if the memtable is flushed we cannot rely on it for +// duplicate key detection, since key inserts will not be attempted. This class +// is used as an emulator of the memtable to tell whether inserting a key/seq +// would have resulted in a duplicate.
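The class defined below has subtle reset semantics, so here is a self-contained toy with the same core behavior, using std::string keys and default ordering in place of Slice and the per-column-family comparators (one simplification: the real class also clears and re-seeds its key sets after reporting a duplicate):

```cpp
#include <cassert>
#include <cstdint>
#include <map>
#include <set>
#include <string>

class ToyDuplicateDetector {
 public:
  bool IsDuplicateKeySeq(uint32_t cf, const std::string& key, uint64_t seq) {
    if (batch_seq_ != seq) keys_.clear();  // new batch: forget previous keys
    batch_seq_ = seq;
    return !keys_[cf].insert(key).second;  // failed insert == duplicate
  }

 private:
  uint64_t batch_seq_ = 0;                          // one seq per write batch
  std::map<uint32_t, std::set<std::string>> keys_;  // cf -> keys seen so far
};

int main() {
  ToyDuplicateDetector d;
  assert(!d.IsDuplicateKeySeq(0, "a", 10));  // first occurrence
  assert(d.IsDuplicateKeySeq(0, "a", 10));   // same batch, same cf: duplicate
  assert(!d.IsDuplicateKeySeq(1, "a", 10));  // another cf has its own key set
  assert(!d.IsDuplicateKeySeq(0, "a", 11));  // larger seq: new batch, reset
}
```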
+class DuplicateDetector { + public: + explicit DuplicateDetector(DBImpl* db) : db_(db) {} + bool IsDuplicateKeySeq(uint32_t cf, const Slice& key, SequenceNumber seq) { + assert(seq >= batch_seq_); + if (batch_seq_ != seq) { // it is a new batch + keys_.clear(); + } + batch_seq_ = seq; + CFKeys& cf_keys = keys_[cf]; + if (cf_keys.size() == 0) { // just inserted + InitWithComp(cf); + } + auto it = cf_keys.insert(key); + if (it.second == false) { // second is false if an element already existed. + keys_.clear(); + InitWithComp(cf); + keys_[cf].insert(key); + return true; + } + return false; + } + + private: + SequenceNumber batch_seq_ = 0; + DBImpl* db_; + using CFKeys = std::set<Slice, SetComparator>; + std::map<uint32_t, CFKeys> keys_; + void InitWithComp(const uint32_t cf) { + auto cmp = db_->GetColumnFamilyHandle(cf)->GetComparator(); + keys_[cf] = CFKeys(SetComparator(cmp)); + } +}; +} // namespace rocksdb diff --git a/util/dynamic_bloom.cc b/util/dynamic_bloom.cc index 1dabf296814..635dd98afbb 100644 --- a/util/dynamic_bloom.cc +++ b/util/dynamic_bloom.cc @@ -45,7 +45,7 @@ DynamicBloom::DynamicBloom(uint32_t num_probes, kNumBlocks(0), kNumProbes(num_probes), hash_func_(hash_func == nullptr ? &BloomHash : hash_func), - data_(0) {} + data_(nullptr) {} void DynamicBloom::SetRawData(unsigned char* raw_data, uint32_t total_bits, uint32_t num_blocks) { diff --git a/util/fault_injection_test_env.h b/util/fault_injection_test_env.h index 5d0ae634456..1992ab52eac 100644 --- a/util/fault_injection_test_env.h +++ b/util/fault_injection_test_env.h @@ -68,6 +68,13 @@ class TestWritableFile : public WritableFile { virtual Status Flush() override; virtual Status Sync() override; virtual bool IsSyncThreadSafe() const override { return true; } + virtual Status PositionedAppend(const Slice& data, + uint64_t offset) override { + return target_->PositionedAppend(data, offset); + } + virtual bool use_direct_io() const override { + return target_->use_direct_io(); + }; private: FileState state_; diff --git a/util/file_reader_writer.cc b/util/file_reader_writer.cc index 677cf2076c5..9d4298b1e93 100644 --- a/util/file_reader_writer.cc +++ b/util/file_reader_writer.cc @@ -219,6 +219,31 @@ Status WritableFileWriter::Append(const Slice& data) { return s; } +Status WritableFileWriter::Pad(const size_t pad_bytes) { + assert(pad_bytes < kDefaultPageSize); + size_t left = pad_bytes; + size_t cap = buf_.Capacity() - buf_.CurrentSize(); + + // Assume pad_bytes is small compared to buf_ capacity. So we always + // use buf_ rather than write directly to file in certain cases like + // Append() does.
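+ // Each pass below zero-fills whatever capacity is left in buf_; if padding + // remains once the buffer is full, the buffer is flushed and reused, so a + // pad larger than the remaining buffer capacity still completes correctly.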
+ while (left) { + size_t append_bytes = std::min(cap, left); + buf_.PadWith(append_bytes, 0); + left -= append_bytes; + if (left > 0) { + Status s = Flush(); + if (!s.ok()) { + return s; + } + } + cap = buf_.Capacity() - buf_.CurrentSize(); + } + pending_sync_ = true; + filesize_ += pad_bytes; + return Status::OK(); +} + Status WritableFileWriter::Close() { // Do not quit immediately on failure the file MUST be closed @@ -516,7 +541,7 @@ class ReadaheadRandomAccessFile : public RandomAccessFile { *result = Slice(scratch, cached_len); return Status::OK(); } - size_t advanced_offset = offset + cached_len; + size_t advanced_offset = static_cast(offset + cached_len); // In the case of cache hit advanced_offset is already aligned, means that // chunk_offset equals to advanced_offset size_t chunk_offset = TruncateToPageBoundary(alignment_, advanced_offset); @@ -549,12 +574,13 @@ class ReadaheadRandomAccessFile : public RandomAccessFile { // `Read()` assumes a smaller prefetch buffer indicates EOF was reached. return Status::OK(); } - size_t prefetch_offset = TruncateToPageBoundary(alignment_, offset); + size_t offset_ = static_cast(offset); + size_t prefetch_offset = TruncateToPageBoundary(alignment_, offset_); if (prefetch_offset == buffer_offset_) { return Status::OK(); } return ReadIntoBuffer(prefetch_offset, - Roundup(offset + n, alignment_) - prefetch_offset); + Roundup(offset_ + n, alignment_) - prefetch_offset); } virtual size_t GetUniqueId(char* id, size_t max_size) const override { @@ -614,17 +640,18 @@ class ReadaheadRandomAccessFile : public RandomAccessFile { Status FilePrefetchBuffer::Prefetch(RandomAccessFileReader* reader, uint64_t offset, size_t n) { size_t alignment = reader->file()->GetRequiredBufferAlignment(); - uint64_t rounddown_offset = Rounddown(offset, alignment); - uint64_t roundup_end = Roundup(offset + n, alignment); + size_t offset_ = static_cast(offset); + uint64_t rounddown_offset = Rounddown(offset_, alignment); + uint64_t roundup_end = Roundup(offset_ + n, alignment); uint64_t roundup_len = roundup_end - rounddown_offset; assert(roundup_len >= alignment); assert(roundup_len % alignment == 0); buffer_.Alignment(alignment); - buffer_.AllocateNewBuffer(roundup_len); + buffer_.AllocateNewBuffer(static_cast(roundup_len)); Slice result; - Status s = reader->Read(rounddown_offset, roundup_len, &result, - buffer_.BufferStart()); + Status s = reader->Read(rounddown_offset, static_cast(roundup_len), + &result, buffer_.BufferStart()); if (s.ok()) { buffer_offset_ = rounddown_offset; buffer_len_ = result.size(); diff --git a/util/file_reader_writer.h b/util/file_reader_writer.h index 9bc3b9437c3..9db12ba061d 100644 --- a/util/file_reader_writer.h +++ b/util/file_reader_writer.h @@ -166,6 +166,8 @@ class WritableFileWriter { Status Append(const Slice& data); + Status Pad(const size_t pad_bytes); + Status Flush(); Status Close(); diff --git a/util/file_reader_writer_test.cc b/util/file_reader_writer_test.cc index 45675e9dd76..4425f87a0a1 100644 --- a/util/file_reader_writer_test.cc +++ b/util/file_reader_writer_test.cc @@ -26,9 +26,7 @@ TEST_F(WritableFileWriterTest, RangeSync) { size_ += data.size(); return Status::OK(); } - virtual Status Truncate(uint64_t size) override { - return Status::OK(); - } + virtual Status Truncate(uint64_t /*size*/) override { return Status::OK(); } Status Close() override { EXPECT_GE(size_, last_synced_ + kMb); EXPECT_LT(size_, last_synced_ + 2 * kMb); @@ -39,17 +37,21 @@ TEST_F(WritableFileWriterTest, RangeSync) { Status Flush() override { 
return Status::OK(); } Status Sync() override { return Status::OK(); } Status Fsync() override { return Status::OK(); } - void SetIOPriority(Env::IOPriority pri) override {} + void SetIOPriority(Env::IOPriority /*pri*/) override {} uint64_t GetFileSize() override { return size_; } - void GetPreallocationStatus(size_t* block_size, - size_t* last_allocated_block) override {} - size_t GetUniqueId(char* id, size_t max_size) const override { return 0; } - Status InvalidateCache(size_t offset, size_t length) override { + void GetPreallocationStatus(size_t* /*block_size*/, + size_t* /*last_allocated_block*/) override {} + size_t GetUniqueId(char* /*id*/, size_t /*max_size*/) const override { + return 0; + } + Status InvalidateCache(size_t /*offset*/, size_t /*length*/) override { return Status::OK(); } protected: - Status Allocate(uint64_t offset, uint64_t len) override { return Status::OK(); } + Status Allocate(uint64_t /*offset*/, uint64_t /*len*/) override { + return Status::OK(); + } Status RangeSync(uint64_t offset, uint64_t nbytes) override { EXPECT_EQ(offset % 4096, 0u); EXPECT_EQ(nbytes % 4096, 0u); @@ -119,12 +121,14 @@ TEST_F(WritableFileWriterTest, IncrementalBuffer) { Status Flush() override { return Status::OK(); } Status Sync() override { return Status::OK(); } Status Fsync() override { return Status::OK(); } - void SetIOPriority(Env::IOPriority pri) override {} + void SetIOPriority(Env::IOPriority /*pri*/) override {} uint64_t GetFileSize() override { return size_; } - void GetPreallocationStatus(size_t* block_size, - size_t* last_allocated_block) override {} - size_t GetUniqueId(char* id, size_t max_size) const override { return 0; } - Status InvalidateCache(size_t offset, size_t length) override { + void GetPreallocationStatus(size_t* /*block_size*/, + size_t* /*last_allocated_block*/) override {} + size_t GetUniqueId(char* /*id*/, size_t /*max_size*/) const override { + return 0; + } + Status InvalidateCache(size_t /*offset*/, size_t /*length*/) override { return Status::OK(); } bool use_direct_io() const override { return use_direct_io_; } @@ -180,13 +184,13 @@ TEST_F(WritableFileWriterTest, AppendStatusReturn) { explicit FakeWF() : use_direct_io_(false), io_error_(false) {} virtual bool use_direct_io() const override { return use_direct_io_; } - Status Append(const Slice& data) override { + Status Append(const Slice& /*data*/) override { if (io_error_) { return Status::IOError("Fake IO error"); } return Status::OK(); } - Status PositionedAppend(const Slice& data, uint64_t) override { + Status PositionedAppend(const Slice& /*data*/, uint64_t) override { if (io_error_) { return Status::IOError("Fake IO error"); } diff --git a/util/file_util.cc b/util/file_util.cc index 80376b6dfb6..8a1adf2bd78 100644 --- a/util/file_util.cc +++ b/util/file_util.cc @@ -84,11 +84,10 @@ Status CreateFile(Env* env, const std::string& destination, Status DeleteSSTFile(const ImmutableDBOptions* db_options, const std::string& fname, uint32_t path_id) { - // TODO(tec): support sst_file_manager for multiple path_ids #ifndef ROCKSDB_LITE auto sfm = static_cast(db_options->sst_file_manager.get()); - if (sfm && path_id == 0) { + if (sfm) { return sfm->ScheduleFileDeletion(fname); } else { return db_options->env->DeleteFile(fname); diff --git a/util/mpsc.h b/util/mpsc.h deleted file mode 100644 index 7449fd35058..00000000000 --- a/util/mpsc.h +++ /dev/null @@ -1,158 +0,0 @@ -// Portions Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
-// This source code is licensed under both the GPLv2 (found in the -// COPYING file in the root directory) and Apache 2.0 License -// (found in the LICENSE.Apache file in the root directory). -// -// Large parts of this file is borrowed from the public domain code below. -// from https://github.com/mstump/queues - -// C++ implementation of Dmitry Vyukov's non-intrusive -// lock free unbound MPSC queue -// http://www.1024cores.net/home/ -// lock-free-algorithms/queues/non-intrusive-mpsc-node-based-queue - -// License from mstump/queues -// This is free and unencumbered software released into the public domain. -// -// Anyone is free to copy, modify, publish, use, compile, sell, or -// distribute this software, either in source code form or as a compiled -// binary, for any purpose, commercial or non-commercial, and by any -// means. -// -// In jurisdictions that recognize copyright laws, the author or authors -// of this software dedicate any and all copyright interest in the -// software to the public domain. We make this dedication for the benefit -// of the public at large and to the detriment of our heirs and -// successors. We intend this dedication to be an overt act of -// relinquishment in perpetuity of all present and future rights to this -// software under copyright law. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF -// MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. -// IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR -// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, -// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -// OTHER DEALINGS IN THE SOFTWARE. -// -// For more information, please refer to - -// License from http://www.1024cores.net/home/ -// lock-free-algorithms/queues/non-intrusive-mpsc-node-based-queue -// Copyright (c) 2010-2011 Dmitry Vyukov. All rights reserved. -// Redistribution and use in source and binary forms, with or -// without modification, are permitted provided that the following -// conditions are met: -// 1. Redistributions of source code must retain the above copyright notice, -// this list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. -// -// THIS SOFTWARE IS PROVIDED BY DMITRY VYUKOV "AS IS" AND ANY EXPRESS OR -// IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES -// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO -// EVENT SHALL DMITRY VYUKOV OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, -// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, -// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF -// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// The views and conclusions contained in the software and documentation -// are those of the authors and should not be interpreted as representing -// official policies, either expressed or implied, of Dmitry Vyukov. 
-// - -#ifndef UTIL_MPSC_H_ -#define UTIL_MPSC_H_ - -#include -#include -#include - -/** - * Multiple Producer Single Consumer Lockless Q - */ -template -class mpsc_queue_t { - public: - struct buffer_node_t { - T data; - std::atomic next; - }; - - mpsc_queue_t() { - buffer_node_aligned_t* al_st = new buffer_node_aligned_t; - buffer_node_t* node = new (al_st) buffer_node_t(); - _head.store(node); - _tail.store(node); - - node->next.store(nullptr, std::memory_order_relaxed); - } - - ~mpsc_queue_t() { - T output; - while (this->dequeue(&output)) { - } - buffer_node_t* front = _head.load(std::memory_order_relaxed); - front->~buffer_node_t(); - - ::operator delete(front); - } - - void enqueue(const T& input) { - buffer_node_aligned_t* al_st = new buffer_node_aligned_t; - buffer_node_t* node = new (al_st) buffer_node_t(); - - node->data = input; - node->next.store(nullptr, std::memory_order_relaxed); - - buffer_node_t* prev_head = _head.exchange(node, std::memory_order_acq_rel); - prev_head->next.store(node, std::memory_order_release); - } - - bool dequeue(T* output) { - buffer_node_t* tail = _tail.load(std::memory_order_relaxed); - buffer_node_t* next = tail->next.load(std::memory_order_acquire); - - if (next == nullptr) { - return false; - } - - *output = next->data; - _tail.store(next, std::memory_order_release); - - tail->~buffer_node_t(); - - ::operator delete(tail); - return true; - } - - // you can only use pop_all if the queue is SPSC - buffer_node_t* pop_all() { - // nobody else can move the tail pointer. - buffer_node_t* tptr = _tail.load(std::memory_order_relaxed); - buffer_node_t* next = - tptr->next.exchange(nullptr, std::memory_order_acquire); - _head.exchange(tptr, std::memory_order_acquire); - - // there is a race condition here - return next; - } - - private: - typedef typename std::aligned_storage< - sizeof(buffer_node_t), std::alignment_of::value>::type - buffer_node_aligned_t; - - std::atomic _head; - std::atomic _tail; - - mpsc_queue_t(const mpsc_queue_t&) = delete; - mpsc_queue_t& operator=(const mpsc_queue_t&) = delete; -}; - -#endif // UTIL_MPSC_H_ diff --git a/util/set_comparator.h b/util/set_comparator.h new file mode 100644 index 00000000000..4ecd0040366 --- /dev/null +++ b/util/set_comparator.h @@ -0,0 +1,22 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +namespace rocksdb { +// A comparator to be used in std::set +struct SetComparator { + explicit SetComparator() : user_comparator_(BytewiseComparator()) {} + explicit SetComparator(const Comparator* user_comparator) + : user_comparator_(user_comparator ? 
user_comparator + : BytewiseComparator()) {} + bool operator()(const Slice& lhs, const Slice& rhs) const { + return user_comparator_->Compare(lhs, rhs) < 0; + } + + private: + const Comparator* user_comparator_; +}; +} // namespace rocksdb diff --git a/util/slice.cc b/util/slice.cc index 10b19080b25..d344fbacff0 100644 --- a/util/slice.cc +++ b/util/slice.cc @@ -74,7 +74,7 @@ class CappedPrefixTransform : public SliceTransform { return Slice(src.data(), std::min(cap_len_, src.size())); } - virtual bool InDomain(const Slice& src) const override { return true; } + virtual bool InDomain(const Slice& /*src*/) const override { return true; } virtual bool InRange(const Slice& dst) const override { return (dst.size() <= cap_len_); @@ -93,11 +93,11 @@ class NoopTransform : public SliceTransform { virtual Slice Transform(const Slice& src) const override { return src; } - virtual bool InDomain(const Slice& src) const override { return true; } + virtual bool InDomain(const Slice& /*src*/) const override { return true; } - virtual bool InRange(const Slice& dst) const override { return true; } + virtual bool InRange(const Slice& /*dst*/) const override { return true; } - virtual bool SameResultWhenAppended(const Slice& prefix) const override { + virtual bool SameResultWhenAppended(const Slice& /*prefix*/) const override { return false; } }; diff --git a/util/sst_file_manager_impl.cc b/util/sst_file_manager_impl.cc index 61b08f23add..07d27695ca0 100644 --- a/util/sst_file_manager_impl.cc +++ b/util/sst_file_manager_impl.cc @@ -18,13 +18,16 @@ namespace rocksdb { #ifndef ROCKSDB_LITE SstFileManagerImpl::SstFileManagerImpl(Env* env, std::shared_ptr logger, int64_t rate_bytes_per_sec, - double max_trash_db_ratio) + double max_trash_db_ratio, + uint64_t bytes_max_delete_chunk) : env_(env), logger_(logger), total_files_size_(0), + compaction_buffer_size_(0), + cur_compactions_reserved_size_(0), max_allowed_space_(0), delete_scheduler_(env, rate_bytes_per_sec, logger.get(), this, - max_trash_db_ratio) {} + max_trash_db_ratio, bytes_max_delete_chunk) {} SstFileManagerImpl::~SstFileManagerImpl() {} @@ -48,6 +51,18 @@ Status SstFileManagerImpl::OnDeleteFile(const std::string& file_path) { return Status::OK(); } +void SstFileManagerImpl::OnCompactionCompletion(Compaction* c) { + MutexLock l(&mu_); + uint64_t size_added_by_compaction = 0; + for (size_t i = 0; i < c->num_input_levels(); i++) { + for (size_t j = 0; j < c->num_input_files(i); j++) { + FileMetaData* filemeta = c->input(i, j); + size_added_by_compaction += filemeta->fd.GetFileSize(); + } + } + cur_compactions_reserved_size_ -= size_added_by_compaction; +} + Status SstFileManagerImpl::OnMoveFile(const std::string& old_path, const std::string& new_path, uint64_t* file_size) { @@ -68,6 +83,12 @@ void SstFileManagerImpl::SetMaxAllowedSpaceUsage(uint64_t max_allowed_space) { max_allowed_space_ = max_allowed_space; } +void SstFileManagerImpl::SetCompactionBufferSize( + uint64_t compaction_buffer_size) { + MutexLock l(&mu_); + compaction_buffer_size_ = compaction_buffer_size; +} + bool SstFileManagerImpl::IsMaxAllowedSpaceReached() { MutexLock l(&mu_); if (max_allowed_space_ <= 0) { @@ -76,6 +97,43 @@ bool SstFileManagerImpl::IsMaxAllowedSpaceReached() { return total_files_size_ >= max_allowed_space_; } +bool SstFileManagerImpl::IsMaxAllowedSpaceReachedIncludingCompactions() { + MutexLock l(&mu_); + if (max_allowed_space_ <= 0) { + return false; + } + return total_files_size_ + cur_compactions_reserved_size_ >= + max_allowed_space_; +} + +bool 
SstFileManagerImpl::EnoughRoomForCompaction(Compaction* c) { + MutexLock l(&mu_); + uint64_t size_added_by_compaction = 0; + // First check if we even have the space to do the compaction + for (size_t i = 0; i < c->num_input_levels(); i++) { + for (size_t j = 0; j < c->num_input_files(i); j++) { + FileMetaData* filemeta = c->input(i, j); + size_added_by_compaction += filemeta->fd.GetFileSize(); + } + } + + if (max_allowed_space_ != 0 && + (size_added_by_compaction + cur_compactions_reserved_size_ + + total_files_size_ + compaction_buffer_size_ > + max_allowed_space_)) { + return false; + } + // Update cur_compactions_reserved_size_ so concurrent compactions + // don't max out space + cur_compactions_reserved_size_ += size_added_by_compaction; + return true; +} + +uint64_t SstFileManagerImpl::GetCompactionsReservedSize() { + MutexLock l(&mu_); + return cur_compactions_reserved_size_; +} + uint64_t SstFileManagerImpl::GetTotalSize() { MutexLock l(&mu_); return total_files_size_; @@ -139,10 +197,11 @@ SstFileManager* NewSstFileManager(Env* env, std::shared_ptr<Logger> info_log, std::string trash_dir, int64_t rate_bytes_per_sec, bool delete_existing_trash, Status* status, - double max_trash_db_ratio) { + double max_trash_db_ratio, + uint64_t bytes_max_delete_chunk) { SstFileManagerImpl* res = new SstFileManagerImpl(env, info_log, rate_bytes_per_sec, - max_trash_db_ratio); + max_trash_db_ratio, bytes_max_delete_chunk); // trash_dir is deprecated and not needed anymore, but if user passed it // we will still remove files in it. @@ -179,7 +238,8 @@ SstFileManager* NewSstFileManager(Env* env, std::shared_ptr<Logger> info_log, std::string trash_dir, int64_t rate_bytes_per_sec, bool delete_existing_trash, Status* status, - double max_trash_db_ratio) { + double max_trash_db_ratio, + uint64_t bytes_max_delete_chunk) { if (status) { *status = Status::NotSupported("SstFileManager is not supported in ROCKSDB_LITE"); diff --git a/util/sst_file_manager_impl.h b/util/sst_file_manager_impl.h index 1cb1d4fb16b..b7a557d9982 100644 --- a/util/sst_file_manager_impl.h +++ b/util/sst_file_manager_impl.h @@ -11,6 +11,7 @@ #include "port/port.h" +#include "db/compaction.h" #include "rocksdb/sst_file_manager.h" #include "util/delete_scheduler.h" @@ -26,7 +27,8 @@ class SstFileManagerImpl : public SstFileManager { public: explicit SstFileManagerImpl(Env* env, std::shared_ptr<Logger> logger, int64_t rate_bytes_per_sec, - double max_trash_db_ratio); + double max_trash_db_ratio, + uint64_t bytes_max_delete_chunk); ~SstFileManagerImpl(); @@ -50,12 +52,29 @@ class SstFileManagerImpl : public SstFileManager { // thread-safe. void SetMaxAllowedSpaceUsage(uint64_t max_allowed_space) override; + void SetCompactionBufferSize(uint64_t compaction_buffer_size) override; + // Return true if the total size of SST files exceeded the maximum allowed // space usage. // // thread-safe. bool IsMaxAllowedSpaceReached() override; + bool IsMaxAllowedSpaceReachedIncludingCompactions() override; + + // Returns true if there is enough (approximate) space for the specified + // compaction. Space is approximate because this function conservatively + // estimates how much space is currently being used by compactions (i.e. + // if a compaction has started, this function bumps the used space by + // the full compaction size).
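+ // A successful call reserves the compaction's input size in + // cur_compactions_reserved_size_; the reservation is released again by + // OnCompactionCompletion() once the compaction finishes.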
+ bool EnoughRoomForCompaction(Compaction* c); + + // Bookkeeping so cur_compactions_reserved_size_ goes back to normal after + // the compaction finishes + void OnCompactionCompletion(Compaction* c); + + uint64_t GetCompactionsReservedSize(); + // Return the total size of all tracked files. uint64_t GetTotalSize() override; @@ -95,6 +114,11 @@ class SstFileManagerImpl : public SstFileManager { port::Mutex mu_; // The summation of the sizes of all files in tracked_files_ map uint64_t total_files_size_; + // Compactions should only execute if they can leave at least + // this amount of buffer space for logs and flushes + uint64_t compaction_buffer_size_; + // Estimated size of the current ongoing compactions + uint64_t cur_compactions_reserved_size_; // A map containing all tracked files and their sizes // file_path => file_size std::unordered_map<std::string, uint64_t> tracked_files_; diff --git a/util/sync_point.cc b/util/sync_point.cc index c8c9fbc26a7..ce0fa0a9727 100644 --- a/util/sync_point.cc +++ b/util/sync_point.cc @@ -4,10 +4,7 @@ // (found in the LICENSE.Apache file in the root directory). #include "util/sync_point.h" -#include -#include -#include "port/port.h" -#include "util/random.h" +#include "util/sync_point_impl.h" int rocksdb_kill_odds = 0; std::vector<std::string> rocksdb_kill_prefix_blacklist; @@ -15,156 +12,57 @@ std::vector<std::string> rocksdb_kill_prefix_blacklist; #ifndef NDEBUG namespace rocksdb { -void TestKillRandom(std::string kill_point, int odds, - const std::string& srcfile, int srcline) { - for (auto& p : rocksdb_kill_prefix_blacklist) { - if (kill_point.substr(0, p.length()) == p) { - return; - } - } - - assert(odds > 0); - if (odds % 7 == 0) { - // class Random uses multiplier 16807, which is 7^5. If odds are - // multiplier of 7, there might be limited values generated. - odds++; - } - auto* r = Random::GetTLSInstance(); - bool crash = r->OneIn(odds); - if (crash) { - port::Crash(srcfile, srcline); - } -} - SyncPoint* SyncPoint::GetInstance() { static SyncPoint sync_point; return &sync_point; } -void SyncPoint::LoadDependency(const std::vector<SyncPointPair>& dependencies) { - std::unique_lock<std::mutex> lock(mutex_); - successors_.clear(); - predecessors_.clear(); - cleared_points_.clear(); - for (const auto& dependency : dependencies) { - successors_[dependency.predecessor].push_back(dependency.successor); - predecessors_[dependency.successor].push_back(dependency.predecessor); - } - cv_.notify_all(); +SyncPoint::SyncPoint() : + impl_(new Data) { } -void SyncPoint::LoadDependencyAndMarkers( - const std::vector<SyncPointPair>& dependencies, - const std::vector<SyncPointPair>& markers) { - std::unique_lock<std::mutex> lock(mutex_); - successors_.clear(); - predecessors_.clear(); - cleared_points_.clear(); - markers_.clear(); - marked_thread_id_.clear(); - for (const auto& dependency : dependencies) { - successors_[dependency.predecessor].push_back(dependency.successor); - predecessors_[dependency.successor].push_back(dependency.predecessor); - } - for (const auto& marker : markers) { - successors_[marker.predecessor].push_back(marker.successor); - predecessors_[marker.successor].push_back(marker.predecessor); - markers_[marker.predecessor].push_back(marker.successor); - } - cv_.notify_all(); +SyncPoint:: ~SyncPoint() { + delete impl_; } -bool SyncPoint::PredecessorsAllCleared(const std::string& point) { - for (const auto& pred : predecessors_[point]) { - if (cleared_points_.count(pred) == 0) { - return false; - } - } - return true; +void SyncPoint::LoadDependency(const std::vector<SyncPointPair>& dependencies) { + impl_->LoadDependency(dependencies); } -void SyncPoint::SetCallBack(const std::string 
point, - std::function callback) { - std::unique_lock lock(mutex_); - callbacks_[point] = callback; +void SyncPoint::LoadDependencyAndMarkers( + const std::vector& dependencies, + const std::vector& markers) { + impl_->LoadDependencyAndMarkers(dependencies, markers); } -void SyncPoint::ClearCallBack(const std::string point) { - std::unique_lock lock(mutex_); - while (num_callbacks_running_ > 0) { - cv_.wait(lock); - } - callbacks_.erase(point); +void SyncPoint::SetCallBack(const std::string& point, + const std::function& callback) { + impl_->SetCallBack(point, callback); +} + +void SyncPoint::ClearCallBack(const std::string& point) { + impl_->ClearCallBack(point); } void SyncPoint::ClearAllCallBacks() { - std::unique_lock lock(mutex_); - while (num_callbacks_running_ > 0) { - cv_.wait(lock); - } - callbacks_.clear(); + impl_->ClearAllCallBacks(); } void SyncPoint::EnableProcessing() { - std::unique_lock lock(mutex_); - enabled_ = true; + impl_->EnableProcessing(); } void SyncPoint::DisableProcessing() { - std::unique_lock lock(mutex_); - enabled_ = false; + impl_->DisableProcessing(); } void SyncPoint::ClearTrace() { - std::unique_lock lock(mutex_); - cleared_points_.clear(); -} - -bool SyncPoint::DisabledByMarker(const std::string& point, - std::thread::id thread_id) { - auto marked_point_iter = marked_thread_id_.find(point); - return marked_point_iter != marked_thread_id_.end() && - thread_id != marked_point_iter->second; + impl_->ClearTrace(); } void SyncPoint::Process(const std::string& point, void* cb_arg) { - std::unique_lock lock(mutex_); - if (!enabled_) { - return; - } - - auto thread_id = std::this_thread::get_id(); - - auto marker_iter = markers_.find(point); - if (marker_iter != markers_.end()) { - for (auto marked_point : marker_iter->second) { - marked_thread_id_.insert(std::make_pair(marked_point, thread_id)); - } - } - - if (DisabledByMarker(point, thread_id)) { - return; - } - - while (!PredecessorsAllCleared(point)) { - cv_.wait(lock); - if (DisabledByMarker(point, thread_id)) { - return; - } - } - - auto callback_pair = callbacks_.find(point); - if (callback_pair != callbacks_.end()) { - num_callbacks_running_++; - mutex_.unlock(); - callback_pair->second(cb_arg); - mutex_.lock(); - num_callbacks_running_--; - cv_.notify_all(); - } - - cleared_points_.insert(point); - cv_.notify_all(); + impl_->Process(point, cb_arg); } + } // namespace rocksdb #endif // NDEBUG diff --git a/util/sync_point.h b/util/sync_point.h index ab546805940..c85be9a4883 100644 --- a/util/sync_point.h +++ b/util/sync_point.h @@ -5,13 +5,10 @@ #pragma once #include -#include #include #include #include #include -#include -#include #include // This is only set from db_stress.cc and for testing only. @@ -26,7 +23,7 @@ extern std::vector rocksdb_kill_prefix_blacklist; #else namespace rocksdb { -// Kill the process with probablity 1/odds for testing. +// Kill the process with probability 1/odds for testing. extern void TestKillRandom(std::string kill_point, int odds, const std::string& srcfile, int srcline); @@ -65,6 +62,10 @@ class SyncPoint { public: static SyncPoint* GetInstance(); + SyncPoint(const SyncPoint&) = delete; + SyncPoint& operator=(const SyncPoint&) = delete; + ~SyncPoint(); + struct SyncPointPair { std::string predecessor; std::string successor; @@ -81,15 +82,14 @@ class SyncPoint { void LoadDependencyAndMarkers(const std::vector& dependencies, const std::vector& markers); - // Set up a call back function in sync point. 
// The argument to the callback is passed through from // TEST_SYNC_POINT_CALLBACK(); nullptr if TEST_SYNC_POINT or // TEST_IDX_SYNC_POINT was used. - void SetCallBack(const std::string point, - std::function callback); + void SetCallBack(const std::string& point, + const std::function& callback); // Clear callback function by point - void ClearCallBack(const std::string point); + void ClearCallBack(const std::string& point); // Clear all call back functions. void ClearAllCallBacks(); @@ -105,29 +105,20 @@ class SyncPoint { // triggered by TEST_SYNC_POINT, blocking execution until all predecessors // are executed. - // And/or call registered callback functionn, with argument `cb_arg` + // And/or call registered callback function, with argument `cb_arg` void Process(const std::string& point, void* cb_arg = nullptr); // TODO: it might be useful to provide a function that blocks until all // sync points are cleared. + // We want this to be public so we can + // subclass the implementation + struct Data; + private: - bool PredecessorsAllCleared(const std::string& point); - bool DisabledByMarker(const std::string& point, std::thread::id thread_id); - - // successor/predecessor map loaded from LoadDependency - std::unordered_map> successors_; - std::unordered_map> predecessors_; - std::unordered_map > callbacks_; - std::unordered_map > markers_; - std::unordered_map marked_thread_id_; - - std::mutex mutex_; - std::condition_variable cv_; - // sync points that have been passed through - std::unordered_set cleared_points_; - bool enabled_ = false; - int num_callbacks_running_ = 0; + // Singleton + SyncPoint(); + Data* impl_; }; } // namespace rocksdb diff --git a/util/sync_point_impl.cc b/util/sync_point_impl.cc new file mode 100644 index 00000000000..ab4cc5ae557 --- /dev/null +++ b/util/sync_point_impl.cc @@ -0,0 +1,129 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "util/sync_point_impl.h" + +#ifndef NDEBUG +namespace rocksdb { + +void TestKillRandom(std::string kill_point, int odds, + const std::string& srcfile, int srcline) { + for (auto& p : rocksdb_kill_prefix_blacklist) { + if (kill_point.substr(0, p.length()) == p) { + return; + } + } + + assert(odds > 0); + if (odds % 7 == 0) { + // class Random uses multiplier 16807, which is 7^5. If odds are + // multiplier of 7, there might be limited values generated. 
+ odds++; + } + auto* r = Random::GetTLSInstance(); + bool crash = r->OneIn(odds); + if (crash) { + port::Crash(srcfile, srcline); + } +} + + +void SyncPoint::Data::LoadDependency(const std::vector& dependencies) { + std::lock_guard lock(mutex_); + successors_.clear(); + predecessors_.clear(); + cleared_points_.clear(); + for (const auto& dependency : dependencies) { + successors_[dependency.predecessor].push_back(dependency.successor); + predecessors_[dependency.successor].push_back(dependency.predecessor); + } + cv_.notify_all(); +} + +void SyncPoint::Data::LoadDependencyAndMarkers( + const std::vector& dependencies, + const std::vector& markers) { + std::lock_guard lock(mutex_); + successors_.clear(); + predecessors_.clear(); + cleared_points_.clear(); + markers_.clear(); + marked_thread_id_.clear(); + for (const auto& dependency : dependencies) { + successors_[dependency.predecessor].push_back(dependency.successor); + predecessors_[dependency.successor].push_back(dependency.predecessor); + } + for (const auto& marker : markers) { + successors_[marker.predecessor].push_back(marker.successor); + predecessors_[marker.successor].push_back(marker.predecessor); + markers_[marker.predecessor].push_back(marker.successor); + } + cv_.notify_all(); +} + +bool SyncPoint::Data::PredecessorsAllCleared(const std::string& point) { + for (const auto& pred : predecessors_[point]) { + if (cleared_points_.count(pred) == 0) { + return false; + } + } + return true; +} + +void SyncPoint::Data::ClearCallBack(const std::string& point) { + std::unique_lock lock(mutex_); + while (num_callbacks_running_ > 0) { + cv_.wait(lock); + } + callbacks_.erase(point); +} + +void SyncPoint::Data::ClearAllCallBacks() { + std::unique_lock lock(mutex_); + while (num_callbacks_running_ > 0) { + cv_.wait(lock); + } + callbacks_.clear(); +} + +void SyncPoint::Data::Process(const std::string& point, void* cb_arg) { + std::unique_lock lock(mutex_); + if (!enabled_) { + return; + } + + auto thread_id = std::this_thread::get_id(); + + auto marker_iter = markers_.find(point); + if (marker_iter != markers_.end()) { + for (auto& marked_point : marker_iter->second) { + marked_thread_id_.emplace(marked_point, thread_id); + } + } + + if (DisabledByMarker(point, thread_id)) { + return; + } + + while (!PredecessorsAllCleared(point)) { + cv_.wait(lock); + if (DisabledByMarker(point, thread_id)) { + return; + } + } + + auto callback_pair = callbacks_.find(point); + if (callback_pair != callbacks_.end()) { + num_callbacks_running_++; + mutex_.unlock(); + callback_pair->second(cb_arg); + mutex_.lock(); + num_callbacks_running_--; + } + cleared_points_.insert(point); + cv_.notify_all(); +} +} // rocksdb +#endif diff --git a/util/sync_point_impl.h b/util/sync_point_impl.h new file mode 100644 index 00000000000..8c7bd7a2d0d --- /dev/null +++ b/util/sync_point_impl.h @@ -0,0 +1,74 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
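The ordering contract that this implementation carries over is easiest to see from the test side; a minimal sketch with hypothetical point names (debug builds only, since sync points compile out under NDEBUG):

```cpp
#include <thread>

#include "util/sync_point.h"

// LoadDependency({{"A:After", "B:Before"}}) makes Process("B:Before") block
// until "A:After" has been processed, whichever thread gets scheduled first.
int main() {
  rocksdb::SyncPoint::GetInstance()->LoadDependency({{"A:After", "B:Before"}});
  rocksdb::SyncPoint::GetInstance()->EnableProcessing();

  std::thread a([] { TEST_SYNC_POINT("A:After"); });
  std::thread b([] { TEST_SYNC_POINT("B:Before"); });  // waits for A:After
  a.join();
  b.join();

  rocksdb::SyncPoint::GetInstance()->DisableProcessing();
}
```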
+ +#include "util/sync_point.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "port/port.h" +#include "util/random.h" + +#pragma once + +#ifndef NDEBUG +namespace rocksdb { +struct SyncPoint::Data { + // Enable proper deletion by subclasses + virtual ~Data() {} + // successor/predecessor map loaded from LoadDependency + std::unordered_map> successors_; + std::unordered_map> predecessors_; + std::unordered_map > callbacks_; + std::unordered_map > markers_; + std::unordered_map marked_thread_id_; + + std::mutex mutex_; + std::condition_variable cv_; + // sync points that have been passed through + std::unordered_set cleared_points_; + bool enabled_ = false; + int num_callbacks_running_ = 0; + + void LoadDependency(const std::vector& dependencies); + void LoadDependencyAndMarkers(const std::vector& dependencies, + const std::vector& markers); + bool PredecessorsAllCleared(const std::string& point); + void SetCallBack(const std::string& point, + const std::function& callback) { + std::lock_guard lock(mutex_); + callbacks_[point] = callback; +} + + void ClearCallBack(const std::string& point); + void ClearAllCallBacks(); + void EnableProcessing() { + std::lock_guard lock(mutex_); + enabled_ = true; + } + void DisableProcessing() { + std::lock_guard lock(mutex_); + enabled_ = false; + } + void ClearTrace() { + std::lock_guard lock(mutex_); + cleared_points_.clear(); + } + bool DisabledByMarker(const std::string& point, + std::thread::id thread_id) { + auto marked_point_iter = marked_thread_id_.find(point); + return marked_point_iter != marked_thread_id_.end() && + thread_id != marked_point_iter->second; + } + void Process(const std::string& point, void* cb_arg); +}; +} +#endif // NDEBUG diff --git a/util/testutil.cc b/util/testutil.cc index ecc4cbe145a..1aa4bce759b 100644 --- a/util/testutil.cc +++ b/util/testutil.cc @@ -107,12 +107,12 @@ class Uint64ComparatorImpl : public Comparator { } } - virtual void FindShortestSeparator(std::string* start, - const Slice& limit) const override { + virtual void FindShortestSeparator(std::string* /*start*/, + const Slice& /*limit*/) const override { return; } - virtual void FindShortSuccessor(std::string* key) const override { + virtual void FindShortSuccessor(std::string* /*key*/) const override { return; } }; diff --git a/util/testutil.h b/util/testutil.h index 6683963af22..62a4de1ab75 100644 --- a/util/testutil.h +++ b/util/testutil.h @@ -123,10 +123,10 @@ class SimpleSuffixReverseComparator : public Comparator { return -(suffix_a.compare(suffix_b)); } } - virtual void FindShortestSeparator(std::string* start, - const Slice& limit) const override {} + virtual void FindShortestSeparator(std::string* /*start*/, + const Slice& /*limit*/) const override {} - virtual void FindShortSuccessor(std::string* key) const override {} + virtual void FindShortSuccessor(std::string* /*key*/) const override {} }; // Returns a user key comparator that can be used for comparing two uint64_t @@ -257,7 +257,8 @@ class RandomRWStringSink : public RandomRWFile { return Status::OK(); } - Status Read(uint64_t offset, size_t n, Slice* result, char* scratch) const { + Status Read(uint64_t offset, size_t n, Slice* result, + char* /*scratch*/) const { *result = Slice(nullptr, 0); if (offset < ss_->contents_.size()) { size_t str_res_sz = @@ -378,7 +379,7 @@ class StringSource: public RandomAccessFile { class NullLogger : public Logger { public: using Logger::Logv; - virtual void Logv(const char* format, va_list ap) override {} + virtual 
void Logv(const char* /*format*/, va_list /*ap*/) override {} virtual size_t GetLogFileSize() const override { return 0; } }; @@ -459,15 +460,16 @@ class FilterNumber : public CompactionFilter { std::string last_merge_operand_key() { return last_merge_operand_key_; } - bool Filter(int level, const rocksdb::Slice& key, const rocksdb::Slice& value, - std::string* new_value, bool* value_changed) const override { + bool Filter(int /*level*/, const rocksdb::Slice& /*key*/, + const rocksdb::Slice& value, std::string* /*new_value*/, + bool* /*value_changed*/) const override { if (value.size() == sizeof(uint64_t)) { return num_ == DecodeFixed64(value.data()); } return true; } - bool FilterMergeOperand(int level, const rocksdb::Slice& key, + bool FilterMergeOperand(int /*level*/, const rocksdb::Slice& key, const rocksdb::Slice& value) const override { last_merge_operand_key_ = key.ToString(); if (value.size() == sizeof(uint64_t)) { @@ -565,7 +567,7 @@ class StringEnv : public EnvWrapper { // The following text is boilerplate that forwards all methods to target() Status NewSequentialFile(const std::string& f, unique_ptr* r, - const EnvOptions& options) override { + const EnvOptions& /*options*/) override { auto iter = files_.find(f); if (iter == files_.end()) { return Status::NotFound("The specified file does not exist", f); @@ -573,13 +575,13 @@ class StringEnv : public EnvWrapper { r->reset(new SeqStringSource(iter->second)); return Status::OK(); } - Status NewRandomAccessFile(const std::string& f, - unique_ptr* r, - const EnvOptions& options) override { + Status NewRandomAccessFile(const std::string& /*f*/, + unique_ptr* /*r*/, + const EnvOptions& /*options*/) override { return Status::NotSupported(); } Status NewWritableFile(const std::string& f, unique_ptr* r, - const EnvOptions& options) override { + const EnvOptions& /*options*/) override { auto iter = files_.find(f); if (iter != files_.end()) { return Status::IOError("The specified file already exists", f); @@ -587,8 +589,8 @@ class StringEnv : public EnvWrapper { r->reset(new StringSink(&files_[f])); return Status::OK(); } - virtual Status NewDirectory(const std::string& name, - unique_ptr* result) override { + virtual Status NewDirectory(const std::string& /*name*/, + unique_ptr* /*result*/) override { return Status::NotSupported(); } Status FileExists(const std::string& f) override { @@ -597,21 +599,21 @@ class StringEnv : public EnvWrapper { } return Status::OK(); } - Status GetChildren(const std::string& dir, - std::vector* r) override { + Status GetChildren(const std::string& /*dir*/, + std::vector* /*r*/) override { return Status::NotSupported(); } Status DeleteFile(const std::string& f) override { files_.erase(f); return Status::OK(); } - Status CreateDir(const std::string& d) override { + Status CreateDir(const std::string& /*d*/) override { return Status::NotSupported(); } - Status CreateDirIfMissing(const std::string& d) override { + Status CreateDirIfMissing(const std::string& /*d*/) override { return Status::NotSupported(); } - Status DeleteDir(const std::string& d) override { + Status DeleteDir(const std::string& /*d*/) override { return Status::NotSupported(); } Status GetFileSize(const std::string& f, uint64_t* s) override { @@ -623,24 +625,25 @@ class StringEnv : public EnvWrapper { return Status::OK(); } - Status GetFileModificationTime(const std::string& fname, - uint64_t* file_mtime) override { + Status GetFileModificationTime(const std::string& /*fname*/, + uint64_t* /*file_mtime*/) override { return 
Status::NotSupported(); } - Status RenameFile(const std::string& s, const std::string& t) override { + Status RenameFile(const std::string& /*s*/, + const std::string& /*t*/) override { return Status::NotSupported(); } - Status LinkFile(const std::string& s, const std::string& t) override { + Status LinkFile(const std::string& /*s*/, const std::string& /*t*/) override { return Status::NotSupported(); } - Status LockFile(const std::string& f, FileLock** l) override { + Status LockFile(const std::string& /*f*/, FileLock** /*l*/) override { return Status::NotSupported(); } - Status UnlockFile(FileLock* l) override { return Status::NotSupported(); } + Status UnlockFile(FileLock* /*l*/) override { return Status::NotSupported(); } protected: std::unordered_map files_; @@ -663,14 +666,14 @@ class ChanglingMergeOperator : public MergeOperator { void SetName(const std::string& name) { name_ = name; } - virtual bool FullMergeV2(const MergeOperationInput& merge_in, - MergeOperationOutput* merge_out) const override { + virtual bool FullMergeV2(const MergeOperationInput& /*merge_in*/, + MergeOperationOutput* /*merge_out*/) const override { return false; } - virtual bool PartialMergeMulti(const Slice& key, - const std::deque& operand_list, - std::string* new_value, - Logger* logger) const override { + virtual bool PartialMergeMulti(const Slice& /*key*/, + const std::deque& /*operand_list*/, + std::string* /*new_value*/, + Logger* /*logger*/) const override { return false; } virtual const char* Name() const override { return name_.c_str(); } @@ -691,8 +694,9 @@ class ChanglingCompactionFilter : public CompactionFilter { void SetName(const std::string& name) { name_ = name; } - bool Filter(int level, const Slice& key, const Slice& existing_value, - std::string* new_value, bool* value_changed) const override { + bool Filter(int /*level*/, const Slice& /*key*/, + const Slice& /*existing_value*/, std::string* /*new_value*/, + bool* /*value_changed*/) const override { return false; } @@ -715,7 +719,7 @@ class ChanglingCompactionFilterFactory : public CompactionFilterFactory { void SetName(const std::string& name) { name_ = name; } std::unique_ptr CreateCompactionFilter( - const CompactionFilter::Context& context) override { + const CompactionFilter::Context& /*context*/) override { return std::unique_ptr(); } diff --git a/util/thread_list_test.cc b/util/thread_list_test.cc index 36a221bf2d2..a4a343a9cf4 100644 --- a/util/thread_list_test.cc +++ b/util/thread_list_test.cc @@ -47,7 +47,7 @@ class SimulatedBackgroundTask { } Env::Default()->GetThreadStatusUpdater()->ClearThreadState(); Env::Default()->GetThreadStatusUpdater()->ClearThreadOperation(); - Env::Default()->GetThreadStatusUpdater()->SetColumnFamilyInfoKey(0); + Env::Default()->GetThreadStatusUpdater()->SetColumnFamilyInfoKey(nullptr); running_count_--; bg_cv_.notify_all(); } diff --git a/util/thread_local_test.cc b/util/thread_local_test.cc index 6fee5eaa574..789be83d8fd 100644 --- a/util/thread_local_test.cc +++ b/util/thread_local_test.cc @@ -535,7 +535,7 @@ TEST_F(ThreadLocalTest, CompareAndSwap) { namespace { -void* AccessThreadLocal(void* arg) { +void* AccessThreadLocal(void* /*arg*/) { TEST_SYNC_POINT("AccessThreadLocal:Start"); ThreadLocalPtr tlp; tlp.Reset(new std::string("hello RocksDB")); diff --git a/util/transaction_test_util.cc b/util/transaction_test_util.cc index e2883297226..bc6f512a69a 100644 --- a/util/transaction_test_util.cc +++ b/util/transaction_test_util.cc @@ -175,7 +175,7 @@ bool RandomTransactionInserter::DoInsert(DB* db, 
Transaction* txn, if (txn != nullptr) { std::hash<std::thread::id> hasher; char name[64]; - snprintf(name, 64, "txn%zu-%d", hasher(std::this_thread::get_id()), + snprintf(name, 64, "txn%" ROCKSDB_PRIszt "-%d", hasher(std::this_thread::get_id()), txn_id_++); assert(strlen(name) < 64 - 1); if (!is_optimistic && !rand_->OneIn(10)) { diff --git a/utilities/backupable/backupable_db.cc b/utilities/backupable/backupable_db.cc index bccb7efd225..15a7bfb53ad 100644 --- a/utilities/backupable/backupable_db.cc +++ b/utilities/backupable/backupable_db.cc @@ -1733,25 +1733,51 @@ Status BackupEngineImpl::BackupMeta::StoreToFile(bool sync) { if (!app_metadata_.empty()) { std::string hex_encoded_metadata = Slice(app_metadata_).ToString(/* hex */ true); + + // +1 to accommodate newline character + size_t hex_meta_strlen = kMetaDataPrefix.ToString().length() + hex_encoded_metadata.length() + 1; + if (hex_meta_strlen >= buf_size) { + return Status::Corruption("Buffer too small to fit backup metadata"); + } + else if (len + hex_meta_strlen >= buf_size) { + backup_meta_file->Append(Slice(buf.get(), len)); + buf.reset(); + unique_ptr<char[]> new_reset_buf(new char[max_backup_meta_file_size_]); + buf.swap(new_reset_buf); + len = 0; + } len += snprintf(buf.get() + len, buf_size - len, "%s%s\n", kMetaDataPrefix.ToString().c_str(), hex_encoded_metadata.c_str()); - if (len >= buf_size) { - return Status::Corruption("Buffer too small to fit backup metadata"); - } } - len += snprintf(buf.get() + len, buf_size - len, "%" ROCKSDB_PRIszt "\n", - files_.size()); - if (len >= buf_size) { - return Status::Corruption("Buffer too small to fit backup metadata"); + + char writelen_temp[19]; + if (len + sprintf(writelen_temp, "%" ROCKSDB_PRIszt "\n", files_.size()) >= buf_size) { + backup_meta_file->Append(Slice(buf.get(), len)); + buf.reset(); + unique_ptr<char[]> new_reset_buf(new char[max_backup_meta_file_size_]); + buf.swap(new_reset_buf); + len = 0; } + { + const char *const_write = writelen_temp; + len += snprintf(buf.get() + len, buf_size - len, "%s", const_write); + } + for (const auto& file : files_) { // use crc32 for now, switch to something else if needed - len += snprintf(buf.get() + len, buf_size - len, "%s crc32 %u\n", - file->filename.c_str(), file->checksum_value); - if (len >= buf_size) { - return Status::Corruption("Buffer too small to fit backup metadata"); - } + + size_t newlen = len + file->filename.length() + sprintf(writelen_temp, " crc32 %u\n", file->checksum_value); + const char *const_write = writelen_temp; + if (newlen >= buf_size) { + backup_meta_file->Append(Slice(buf.get(), len)); + buf.reset(); + unique_ptr<char[]> new_reset_buf(new char[max_backup_meta_file_size_]); + buf.swap(new_reset_buf); + len = 0; + } + len += snprintf(buf.get() + len, buf_size - len, "%s%s", + file->filename.c_str(), const_write); } s = backup_meta_file->Append(Slice(buf.get(), len)); diff --git a/utilities/backupable/backupable_db_test.cc b/utilities/backupable/backupable_db_test.cc index b31d273d19e..3983a53f243 100644 --- a/utilities/backupable/backupable_db_test.cc +++ b/utilities/backupable/backupable_db_test.cc @@ -57,7 +57,8 @@ class DummyDB : public StackableDB { } using DB::GetOptions; - virtual Options GetOptions(ColumnFamilyHandle* column_family) const override { + virtual Options GetOptions( + ColumnFamilyHandle* /*column_family*/) const override { return options_; } @@ -65,7 +66,7 @@ class DummyDB : public StackableDB { return DBOptions(options_); } - virtual Status EnableFileDeletions(bool force) override { + virtual Status EnableFileDeletions(bool
/*force*/) override { EXPECT_TRUE(!deletions_enabled_); deletions_enabled_ = true; return Status::OK(); @@ -78,7 +79,7 @@ class DummyDB : public StackableDB { } virtual Status GetLiveFiles(std::vector& vec, uint64_t* mfs, - bool flush_memtable = true) override { + bool /*flush_memtable*/ = true) override { EXPECT_TRUE(!deletions_enabled_); vec = live_files_; *mfs = 100; @@ -135,7 +136,7 @@ class DummyDB : public StackableDB { } // To avoid FlushWAL called on stacked db which is nullptr - virtual Status FlushWAL(bool sync) override { return Status::OK(); } + virtual Status FlushWAL(bool /*sync*/) override { return Status::OK(); } std::vector live_files_; // pair @@ -521,7 +522,7 @@ class BackupableDBTest : public testing::Test { void OpenDBAndBackupEngineShareWithChecksum( bool destroy_old_data = false, bool dummy = false, - bool share_table_files = true, bool share_with_checksums = false) { + bool /*share_table_files*/ = true, bool share_with_checksums = false) { backupable_options_->share_files_with_checksum = share_with_checksums; OpenDBAndBackupEngine(destroy_old_data, dummy, share_with_checksums); } @@ -810,7 +811,7 @@ TEST_F(BackupableDBTest, NoDoubleCopy) { test_db_env_->SetFilenamesForMockedAttrs(dummy_db_->live_files_); ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), false)); std::vector should_have_written = { - "/shared/.00010.sst.tmp", "/shared/.00011.sst.tmp", + "/shared/.00010.sst.tmp", "/shared/.00011.sst.tmp", "/private/1.tmp/CURRENT", "/private/1.tmp/MANIFEST-01", "/private/1.tmp/00011.log", "/meta/.1.tmp"}; AppendPath(backupdir_, should_have_written); diff --git a/utilities/blob_db/blob_compaction_filter.cc b/utilities/blob_db/blob_compaction_filter.cc new file mode 100644 index 00000000000..cbc76a98dd0 --- /dev/null +++ b/utilities/blob_db/blob_compaction_filter.cc @@ -0,0 +1,117 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#ifndef ROCKSDB_LITE + +#include "utilities/blob_db/blob_compaction_filter.h" +#include "db/dbformat.h" + +namespace rocksdb { +namespace blob_db { + +namespace { + +// CompactionFilter to delete expired blob index from base DB. +class BlobIndexCompactionFilter : public CompactionFilter { + public: + BlobIndexCompactionFilter(BlobCompactionContext context, + uint64_t current_time, Statistics* statistics) + : context_(context), + current_time_(current_time), + statistics_(statistics) {} + + virtual ~BlobIndexCompactionFilter() { + RecordTick(statistics_, BLOB_DB_BLOB_INDEX_EXPIRED_COUNT, expired_count_); + RecordTick(statistics_, BLOB_DB_BLOB_INDEX_EXPIRED_SIZE, expired_size_); + RecordTick(statistics_, BLOB_DB_BLOB_INDEX_EVICTED_COUNT, evicted_count_); + RecordTick(statistics_, BLOB_DB_BLOB_INDEX_EVICTED_SIZE, evicted_size_); + } + + virtual const char* Name() const override { + return "BlobIndexCompactionFilter"; + } + + // Filter expired blob indexes regardless of snapshots. + virtual bool IgnoreSnapshots() const override { return true; } + + virtual Decision FilterV2(int /*level*/, const Slice& key, + ValueType value_type, const Slice& value, + std::string* /*new_value*/, + std::string* /*skip_until*/) const override { + if (value_type != kBlobIndex) { + return Decision::kKeep; + } + BlobIndex blob_index; + Status s = blob_index.DecodeFrom(value); + if (!s.ok()) { + // Unable to decode blob index. Keeping the value. 
+ return Decision::kKeep; + } + if (blob_index.HasTTL() && blob_index.expiration() <= current_time_) { + // Expired + expired_count_++; + expired_size_ += key.size() + value.size(); + return Decision::kRemove; + } + if (!blob_index.IsInlined() && + blob_index.file_number() < context_.next_file_number && + context_.current_blob_files.count(blob_index.file_number()) == 0) { + // Corresponding blob file gone. Could have been garbage collected or + // evicted by FIFO eviction. + evicted_count_++; + evicted_size_ += key.size() + value.size(); + return Decision::kRemove; + } + if (context_.fifo_eviction_seq > 0 && blob_index.HasTTL() && + blob_index.expiration() < context_.evict_expiration_up_to) { + // Hack: Internal key is passed to BlobIndexCompactionFilter for it to + // get the sequence number. + ParsedInternalKey ikey; + bool ok = ParseInternalKey(key, &ikey); + // Remove keys that could have been removed by the last FIFO eviction. + // If we get an error while parsing the key, ignore it and continue. + if (ok && ikey.sequence < context_.fifo_eviction_seq) { + evicted_count_++; + evicted_size_ += key.size() + value.size(); + return Decision::kRemove; + } + } + return Decision::kKeep; + } + + private: + BlobCompactionContext context_; + const uint64_t current_time_; + Statistics* statistics_; + // It is safe not to use std::atomic since the compaction filter, created + // from a compaction filter factory, will not be called from multiple threads. + mutable uint64_t expired_count_ = 0; + mutable uint64_t expired_size_ = 0; + mutable uint64_t evicted_count_ = 0; + mutable uint64_t evicted_size_ = 0; +}; + +} // anonymous namespace + +std::unique_ptr<CompactionFilter> +BlobIndexCompactionFilterFactory::CreateCompactionFilter( + const CompactionFilter::Context& /*context*/) { + int64_t current_time = 0; + Status s = env_->GetCurrentTime(&current_time); + if (!s.ok()) { + return nullptr; + } + assert(current_time >= 0); + + BlobCompactionContext context; + blob_db_impl_->GetCompactionContext(&context); + + return std::unique_ptr<CompactionFilter>(new BlobIndexCompactionFilter( + context, static_cast<uint64_t>(current_time), statistics_)); +} + +} // namespace blob_db +} // namespace rocksdb +#endif // ROCKSDB_LITE diff --git a/utilities/blob_db/blob_compaction_filter.h b/utilities/blob_db/blob_compaction_filter.h index 192a338ff30..7a8ea613573 100644 --- a/utilities/blob_db/blob_compaction_filter.h +++ b/utilities/blob_db/blob_compaction_filter.h @@ -5,82 +5,39 @@ #pragma once #ifndef ROCKSDB_LITE +#include <unordered_set> + +#include "monitoring/statistics.h" #include "rocksdb/compaction_filter.h" #include "rocksdb/env.h" +#include "utilities/blob_db/blob_db_impl.h" #include "utilities/blob_db/blob_index.h" namespace rocksdb { namespace blob_db { -// CompactionFilter to delete expired blob index from base DB. -class BlobIndexCompactionFilter : public CompactionFilter { - public: - BlobIndexCompactionFilter(uint64_t current_time, Statistics* statistics) - : current_time_(current_time), statistics_(statistics) {} - - virtual ~BlobIndexCompactionFilter() { - RecordTick(statistics_, BLOB_DB_BLOB_INDEX_EXPIRED, expired_count_); - } - - virtual const char* Name() const override { - return "BlobIndexCompactionFilter"; - } - - // Filter expired blob indexes regardless of snapshots.
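As an aside on the expanded FilterV2 logic above: each decision reduces to a TTL comparison against the compaction start time plus a liveness check against the set of current blob files. A minimal, self-contained sketch of the same keep/remove pattern against the public CompactionFilter API follows; the value layout (a little-endian expiration in the first 8 bytes) is an illustrative stand-in, not the real BlobIndex encoding.

// Sketch only: drop entries whose encoded expiration has passed, keep
// anything undecodable, mirroring the kKeep-on-decode-failure rule above.
#include <cstdint>
#include <cstring>
#include "rocksdb/compaction_filter.h"

class TtlSketchFilter : public rocksdb::CompactionFilter {
 public:
  explicit TtlSketchFilter(uint64_t now) : now_(now) {}

  const char* Name() const override { return "TtlSketchFilter"; }

  bool Filter(int /*level*/, const rocksdb::Slice& /*key*/,
              const rocksdb::Slice& value, std::string* /*new_value*/,
              bool* /*value_changed*/) const override {
    if (value.size() < sizeof(uint64_t)) {
      return false;  // undecodable: keep the value
    }
    uint64_t expiration = 0;
    std::memcpy(&expiration, value.data(), sizeof(expiration));
    return expiration <= now_;  // true == remove, like Decision::kRemove
  }

 private:
  const uint64_t now_;
};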
- virtual bool IgnoreSnapshots() const override { return true; } - - virtual Decision FilterV2(int /*level*/, const Slice& /*key*/, - ValueType value_type, const Slice& value, - std::string* /*new_value*/, - std::string* /*skip_until*/) const override { - if (value_type != kBlobIndex) { - return Decision::kKeep; - } - BlobIndex blob_index; - Status s = blob_index.DecodeFrom(value); - if (!s.ok()) { - // Unable to decode blob index. Keeping the value. - return Decision::kKeep; - } - if (blob_index.HasTTL() && blob_index.expiration() <= current_time_) { - // Expired - expired_count_++; - return Decision::kRemove; - } - return Decision::kKeep; - } - - private: - const uint64_t current_time_; - Statistics* statistics_; - // It is safe to not using std::atomic since the compaction filter, created - // from a compaction filter factroy, will not be called from multiple threads. - mutable uint64_t expired_count_ = 0; +struct BlobCompactionContext { + uint64_t next_file_number; + std::unordered_set<uint64_t> current_blob_files; + SequenceNumber fifo_eviction_seq; + uint64_t evict_expiration_up_to; }; class BlobIndexCompactionFilterFactory : public CompactionFilterFactory { public: - BlobIndexCompactionFilterFactory(Env* env, Statistics* statistics) - : env_(env), statistics_(statistics) {} + BlobIndexCompactionFilterFactory(BlobDBImpl* blob_db_impl, Env* env, + Statistics* statistics) + : blob_db_impl_(blob_db_impl), env_(env), statistics_(statistics) {} virtual const char* Name() const override { return "BlobIndexCompactionFilterFactory"; } virtual std::unique_ptr<CompactionFilter> CreateCompactionFilter( - const CompactionFilter::Context& /*context*/) override { - int64_t current_time = 0; - Status s = env_->GetCurrentTime(&current_time); - if (!s.ok()) { - return nullptr; - } - assert(current_time >= 0); - return std::unique_ptr<CompactionFilter>(new BlobIndexCompactionFilter( - static_cast<uint64_t>(current_time), statistics_)); - } + const CompactionFilter::Context& /*context*/) override; private: + BlobDBImpl* blob_db_impl_; Env* env_; Statistics* statistics_; }; diff --git a/utilities/blob_db/blob_db.cc b/utilities/blob_db/blob_db.cc index f042db76e86..523324a7636 100644 --- a/utilities/blob_db/blob_db.cc +++ b/utilities/blob_db/blob_db.cc @@ -63,30 +63,48 @@ Status BlobDB::Open(const DBOptions& db_options, BlobDB::BlobDB() : StackableDB(nullptr) {} void BlobDBOptions::Dump(Logger* log) const { - ROCKS_LOG_HEADER(log, " blob_db_options.blob_dir: %s", - blob_dir.c_str()); - ROCKS_LOG_HEADER(log, " blob_db_options.path_relative: %d", - path_relative); - ROCKS_LOG_HEADER(log, " blob_db_options.is_fifo: %d", - is_fifo); - ROCKS_LOG_HEADER(log, " blob_db_options.blob_dir_size: %" PRIu64, - blob_dir_size); - ROCKS_LOG_HEADER(log, " blob_db_options.ttl_range_secs: %" PRIu32, - ttl_range_secs); - ROCKS_LOG_HEADER(log, " blob_db_options.min_blob_size: %" PRIu64, - min_blob_size); - ROCKS_LOG_HEADER(log, " blob_db_options.bytes_per_sync: %" PRIu64, - bytes_per_sync); - ROCKS_LOG_HEADER(log, " blob_db_options.blob_file_size: %" PRIu64, - blob_file_size); - ROCKS_LOG_HEADER(log, " blob_db_options.ttl_extractor: %p", - ttl_extractor.get()); - ROCKS_LOG_HEADER(log, " blob_db_options.compression: %d", - static_cast<int>(compression)); - ROCKS_LOG_HEADER(log, "blob_db_options.enable_garbage_collection: %d", - enable_garbage_collection); - ROCKS_LOG_HEADER(log, " blob_db_options.disable_background_tasks: %d", - disable_background_tasks); + ROCKS_LOG_HEADER( + log, " BlobDBOptions.blob_dir: %s", + blob_dir.c_str()); + ROCKS_LOG_HEADER( + log, " BlobDBOptions.path_relative: 
%d", + path_relative); + ROCKS_LOG_HEADER( + log, " BlobDBOptions.is_fifo: %d", + is_fifo); + ROCKS_LOG_HEADER( + log, " BlobDBOptions.max_db_size: %" PRIu64, + max_db_size); + ROCKS_LOG_HEADER( + log, " BlobDBOptions.ttl_range_secs: %" PRIu32, + ttl_range_secs); + ROCKS_LOG_HEADER( + log, " BlobDBOptions.min_blob_size: %" PRIu64, + min_blob_size); + ROCKS_LOG_HEADER( + log, " BlobDBOptions.bytes_per_sync: %" PRIu64, + bytes_per_sync); + ROCKS_LOG_HEADER( + log, " BlobDBOptions.blob_file_size: %" PRIu64, + blob_file_size); + ROCKS_LOG_HEADER( + log, " BlobDBOptions.ttl_extractor: %p", + ttl_extractor.get()); + ROCKS_LOG_HEADER( + log, " BlobDBOptions.compression: %d", + static_cast(compression)); + ROCKS_LOG_HEADER( + log, " BlobDBOptions.enable_garbage_collection: %d", + enable_garbage_collection); + ROCKS_LOG_HEADER( + log, " BlobDBOptions.garbage_collection_interval_secs: %" PRIu64, + garbage_collection_interval_secs); + ROCKS_LOG_HEADER( + log, "BlobDBOptions.garbage_collection_deletion_size_threshold: %lf", + garbage_collection_deletion_size_threshold); + ROCKS_LOG_HEADER( + log, " BlobDBOptions.disable_background_tasks: %d", + disable_background_tasks); } } // namespace blob_db diff --git a/utilities/blob_db/blob_db.h b/utilities/blob_db/blob_db.h index 961f1728b0a..183d23a8cd8 100644 --- a/utilities/blob_db/blob_db.h +++ b/utilities/blob_db/blob_db.h @@ -36,13 +36,17 @@ struct BlobDBOptions { // whether the blob_dir path is relative or absolute. bool path_relative = true; - // is the eviction strategy fifo based + // When max_db_size is reached, evict blob files to free up space + // instead of returnning NoSpace error on write. Blob files will be + // evicted in this order until enough space is free up: + // * the TTL blob file cloeset to expire, + // * the oldest non-TTL blob file. bool is_fifo = false; - // maximum size of the blob dir. Once this gets used, up - // evict the blob file which is oldest (is_fifo ) - // 0 means no limits - uint64_t blob_dir_size = 0; + // Maximum size of the database (including SST files and blob files). + // + // Default: 0 (no limits) + uint64_t max_db_size = 0; // a new bucket is opened, for ttl_range. So if ttl_range is 600seconds // (10 minutes), and the first bucket starts at 1471542000 @@ -107,8 +111,6 @@ class BlobDB : public StackableDB { } using rocksdb::StackableDB::Delete; - virtual Status Delete(const WriteOptions& options, - const Slice& key) override = 0; virtual Status Delete(const WriteOptions& options, ColumnFamilyHandle* column_family, const Slice& key) override { @@ -116,7 +118,8 @@ class BlobDB : public StackableDB { return Status::NotSupported( "Blob DB doesn't support non-default column family."); } - return Delete(options, key); + assert(db_ != nullptr); + return db_->Delete(options, column_family, key); } virtual Status PutWithTTL(const WriteOptions& options, const Slice& key, @@ -199,6 +202,9 @@ class BlobDB : public StackableDB { return NewIterator(options); } + using rocksdb::StackableDB::Close; + virtual Status Close() override = 0; + // Opening blob db. 
static Status Open(const Options& options, const BlobDBOptions& bdb_options, const std::string& dbname, BlobDB** blob_db); diff --git a/utilities/blob_db/blob_db_impl.cc b/utilities/blob_db/blob_db_impl.cc index f22a792057b..b3bf44e0338 100644 --- a/utilities/blob_db/blob_db_impl.cc +++ b/utilities/blob_db/blob_db_impl.cc @@ -35,6 +35,7 @@ #include "util/timer_queue.h" #include "utilities/blob_db/blob_compaction_filter.h" #include "utilities/blob_db/blob_db_iterator.h" +#include "utilities/blob_db/blob_db_listener.h" #include "utilities/blob_db/blob_index.h" namespace { @@ -44,19 +45,16 @@ int kBlockBasedTableVersionFormat = 2; namespace rocksdb { namespace blob_db { -void BlobDBFlushBeginListener::OnFlushBegin(DB* db, const FlushJobInfo& info) { - assert(blob_db_impl_ != nullptr); - blob_db_impl_->SyncBlobFiles(); -} - WalFilter::WalProcessingOption BlobReconcileWalFilter::LogRecordFound( - unsigned long long log_number, const std::string& log_file_name, - const WriteBatch& batch, WriteBatch* new_batch, bool* batch_changed) { + unsigned long long /*log_number*/, const std::string& /*log_file_name*/, + const WriteBatch& /*batch*/, WriteBatch* /*new_batch*/, + bool* /*batch_changed*/) { return WalFilter::WalProcessingOption::kContinueProcessing; } bool blobf_compare_ttl::operator()(const std::shared_ptr& lhs, const std::shared_ptr& rhs) const { + assert(lhs->HasTTL() && rhs->HasTTL()); if (lhs->expiration_range_.first < rhs->expiration_range_.first) { return true; } @@ -66,37 +64,6 @@ bool blobf_compare_ttl::operator()(const std::shared_ptr& lhs, return lhs->BlobFileNumber() < rhs->BlobFileNumber(); } -void EvictAllVersionsCompactionListener::InternalListener::OnCompaction( - int level, const Slice& key, - CompactionEventListener::CompactionListenerValueType value_type, - const Slice& existing_value, const SequenceNumber& sn, bool is_new) { - assert(impl_->bdb_options_.enable_garbage_collection); - if (!is_new && - value_type == - CompactionEventListener::CompactionListenerValueType::kValue) { - BlobIndex blob_index; - Status s = blob_index.DecodeFrom(existing_value); - if (s.ok()) { - if (impl_->debug_level_ >= 3) - ROCKS_LOG_INFO( - impl_->db_options_.info_log, - "CALLBACK COMPACTED OUT KEY: %s SN: %d " - "NEW: %d FN: %" PRIu64 " OFFSET: %" PRIu64 " SIZE: %" PRIu64, - key.ToString().c_str(), sn, is_new, blob_index.file_number(), - blob_index.offset(), blob_index.size()); - - impl_->override_vals_q_.enqueue({blob_index.file_number(), key.size(), - blob_index.offset(), blob_index.size(), - sn}); - } - } else { - if (impl_->debug_level_ >= 3) - ROCKS_LOG_INFO(impl_->db_options_.info_log, - "CALLBACK NEW KEY: %s SN: %d NEW: %d", - key.ToString().c_str(), sn, is_new); - } -} - BlobDBImpl::BlobDBImpl(const std::string& dbname, const BlobDBOptions& blob_db_options, const DBOptions& db_options, @@ -113,13 +80,13 @@ BlobDBImpl::BlobDBImpl(const std::string& dbname, statistics_(db_options_.statistics.get()), next_file_number_(1), epoch_of_(0), - shutdown_(false), - current_epoch_(0), + closed_(true), open_file_count_(0), - total_blob_space_(0), - open_p1_done_(false), - debug_level_(0), - oldest_file_evicted_(false) { + total_blob_size_(0), + live_sst_size_(0), + fifo_eviction_seq_(0), + evict_expiration_up_to_(0), + debug_level_(0) { blob_dir_ = (bdb_options_.path_relative) ? 
dbname + "/" + bdb_options_.blob_dir : bdb_options_.blob_dir; @@ -128,8 +95,30 @@ BlobDBImpl::BlobDBImpl(const std::string& dbname, BlobDBImpl::~BlobDBImpl() { // CancelAllBackgroundWork(db_, true); + Status s __attribute__((__unused__)) = Close(); + assert(s.ok()); +} + +Status BlobDBImpl::Close() { + if (closed_) { + return Status::OK(); + } + closed_ = true; - Shutdown(); + // Close base DB before BlobDBImpl destructs to stop event listener and + // compaction filter call. + Status s = db_->Close(); + // delete db_ anyway even if close failed. + delete db_; + // Reset pointers to avoid StackableDB delete the pointer again. + db_ = nullptr; + db_impl_ = nullptr; + if (!s.ok()) { + return s; + } + + s = SyncBlobFiles(); + return s; } BlobDBOptions BlobDBImpl::GetBlobDBOptions() const { return bdb_options_; } @@ -179,14 +168,9 @@ Status BlobDBImpl::Open(std::vector* handles) { } // Update options - db_options_.listeners.push_back( - std::shared_ptr(new BlobDBFlushBeginListener(this))); - if (bdb_options_.enable_garbage_collection) { - db_options_.listeners.push_back(std::shared_ptr( - new EvictAllVersionsCompactionListener(this))); - } + db_options_.listeners.push_back(std::make_shared(this)); cf_options_.compaction_filter_factory.reset( - new BlobIndexCompactionFilterFactory(env_, statistics_)); + new BlobIndexCompactionFilterFactory(this, env_, statistics_)); // Open base db. ColumnFamilyDescriptor cf_descriptor(kDefaultColumnFamilyName, cf_options_); @@ -195,6 +179,7 @@ Status BlobDBImpl::Open(std::vector* handles) { return s; } db_impl_ = static_cast_with_check(db_->GetRootDB()); + UpdateLiveSSTSize(); // Start background jobs. if (!bdb_options_.disable_background_tasks) { @@ -203,6 +188,7 @@ Status BlobDBImpl::Open(std::vector* handles) { ROCKS_LOG_INFO(db_options_.info_log, "BlobDB pointer %p", this); bdb_options_.Dump(db_options_.info_log.get()); + closed_ = false; return s; } @@ -214,14 +200,6 @@ void BlobDBImpl::StartBackgroundTasks() { tqueue_.add(static_cast( bdb_options_.garbage_collection_interval_secs * 1000), std::bind(&BlobDBImpl::RunGC, this, std::placeholders::_1)); - if (bdb_options_.enable_garbage_collection) { - tqueue_.add( - kDeleteCheckPeriodMillisecs, - std::bind(&BlobDBImpl::EvictDeletions, this, std::placeholders::_1)); - tqueue_.add( - kDeleteCheckPeriodMillisecs, - std::bind(&BlobDBImpl::EvictCompacted, this, std::placeholders::_1)); - } tqueue_.add( kDeleteObsoleteFilesPeriodMillisecs, std::bind(&BlobDBImpl::DeleteObsoleteFiles, this, std::placeholders::_1)); @@ -232,8 +210,6 @@ void BlobDBImpl::StartBackgroundTasks() { std::bind(&BlobDBImpl::CheckSeqFiles, this, std::placeholders::_1)); } -void BlobDBImpl::Shutdown() { shutdown_.store(true); } - Status BlobDBImpl::GetAllBlobFiles(std::set* file_numbers) { assert(file_numbers != nullptr); std::vector all_files; @@ -283,8 +259,7 @@ Status BlobDBImpl::OpenAllBlobFiles() { Status read_metadata_status = blob_file->ReadMetadata(env_, env_options_); if (read_metadata_status.IsCorruption()) { // Remove incomplete file. 
- blob_file->MarkObsolete(0 /*sequence number*/); - obsolete_files_.push_back(blob_file); + ObsoleteBlobFile(blob_file, 0 /*obsolete_seq*/, false /*update_size*/); if (!obsolete_file_list.empty()) { obsolete_file_list.append(", "); } @@ -298,11 +273,7 @@ Status BlobDBImpl::OpenAllBlobFiles() { return read_metadata_status; } - // since this file already existed, we will try to reconcile - // deleted count with LSM - if (bdb_options_.enable_garbage_collection) { - blob_file->gc_once_after_open_ = true; - } + total_blob_size_ += blob_file->GetFileSize(); blob_files_[file_number] = blob_file; if (!blob_file_list.empty()) { @@ -391,25 +362,33 @@ Status BlobDBImpl::CreateWriterLocked(const std::shared_ptr& bfile) { std::shared_ptr BlobDBImpl::FindBlobFileLocked( uint64_t expiration) const { - if (open_ttl_files_.empty()) return nullptr; + if (open_ttl_files_.empty()) { + return nullptr; + } std::shared_ptr tmp = std::make_shared(); + tmp->SetHasTTL(true); tmp->expiration_range_ = std::make_pair(expiration, 0); + tmp->file_number_ = std::numeric_limits::max(); auto citr = open_ttl_files_.equal_range(tmp); if (citr.first == open_ttl_files_.end()) { assert(citr.second == open_ttl_files_.end()); std::shared_ptr check = *(open_ttl_files_.rbegin()); - return (check->expiration_range_.second < expiration) ? nullptr : check; + return (check->expiration_range_.second <= expiration) ? nullptr : check; } - if (citr.first != citr.second) return *(citr.first); + if (citr.first != citr.second) { + return *(citr.first); + } auto finditr = citr.second; - if (finditr != open_ttl_files_.begin()) --finditr; + if (finditr != open_ttl_files_.begin()) { + --finditr; + } - bool b2 = (*finditr)->expiration_range_.second < expiration; + bool b2 = (*finditr)->expiration_range_.second <= expiration; bool b1 = (*finditr)->expiration_range_.first > expiration; return (b1 || b2) ? 
nullptr : (*finditr); @@ -474,6 +453,7 @@ std::shared_ptr<BlobFile> BlobDBImpl::SelectBlobFile() { blob_files_.insert(std::make_pair(bfile->BlobFileNumber(), bfile)); open_non_ttl_file_ = bfile; + total_blob_size_ += BlobLogHeader::kSize; return bfile; } @@ -548,22 +528,12 @@ std::shared_ptr<BlobFile> BlobDBImpl::SelectBlobFileTTL(uint64_t expiration) { blob_files_.insert(std::make_pair(bfile->BlobFileNumber(), bfile)); open_ttl_files_.insert(bfile); + total_blob_size_ += BlobLogHeader::kSize; epoch_of_++; return bfile; } -Status BlobDBImpl::Delete(const WriteOptions& options, const Slice& key) { - SequenceNumber lsn = db_impl_->GetLatestSequenceNumber(); - Status s = db_->Delete(options, key); - - if (bdb_options_.enable_garbage_collection) { - // add deleted key to list of keys that have been deleted for book-keeping - delete_keys_q_.enqueue({DefaultColumnFamily(), key.ToString(), lsn}); - } - return s; -} - class BlobDBImpl::BlobInserter : public WriteBatch::Handler { private: const WriteOptions& options_; @@ -646,47 +616,7 @@ Status BlobDBImpl::Write(const WriteOptions& options, WriteBatch* updates) { if (!s.ok()) { return s; } - s = db_->Write(options, blob_inserter.batch()); - if (!s.ok()) { - return s; - } - - // add deleted key to list of keys that have been deleted for book-keeping - class DeleteBookkeeper : public WriteBatch::Handler { - public: - explicit DeleteBookkeeper(BlobDBImpl* impl, const SequenceNumber& seq) - : impl_(impl), sequence_(seq) {} - - virtual Status PutCF(uint32_t /*column_family_id*/, const Slice& /*key*/, - const Slice& /*value*/) override { - sequence_++; - return Status::OK(); - } - - virtual Status DeleteCF(uint32_t column_family_id, - const Slice& key) override { - ColumnFamilyHandle* cfh = - impl_->db_impl_->GetColumnFamilyHandleUnlocked(column_family_id); - - impl_->delete_keys_q_.enqueue({cfh, key.ToString(), sequence_}); - sequence_++; - return Status::OK(); - } - - private: - BlobDBImpl* impl_; - SequenceNumber sequence_; - }; - - if (bdb_options_.enable_garbage_collection) { - // add deleted key to list of keys that have been deleted for book-keeping - SequenceNumber current_seq = - WriteBatchInternal::Sequence(blob_inserter.batch()); - DeleteBookkeeper delete_bookkeeper(this, current_seq); - s = updates->Iterate(&delete_bookkeeper); - } - - return s; + return db_->Write(options, blob_inserter.batch()); } Status BlobDBImpl::GetLiveFiles(std::vector<std::string>& ret, @@ -759,9 +689,10 @@ Status BlobDBImpl::PutUntil(const WriteOptions& options, const Slice& key, return s; } -Status BlobDBImpl::PutBlobValue(const WriteOptions& options, const Slice& key, - const Slice& value, uint64_t expiration, - WriteBatch* batch) { +Status BlobDBImpl::PutBlobValue(const WriteOptions& /*options*/, + const Slice& key, const Slice& value, + uint64_t expiration, WriteBatch* batch) { + write_mutex_.AssertHeld(); Status s; std::string index_entry; uint32_t column_family_id = @@ -779,20 +710,27 @@ Status BlobDBImpl::PutBlobValue(const WriteOptions& options, const Slice& key, RecordTick(statistics_, BLOB_DB_WRITE_INLINED_TTL); } } else { - std::shared_ptr<BlobFile> bfile = (expiration != kNoExpiration) - ? SelectBlobFileTTL(expiration) - : SelectBlobFile(); - if (!bfile) { - return Status::NotFound("Blob file not found"); - } - - assert(bfile->compression() == bdb_options_.compression); std::string compression_output; Slice value_compressed = GetCompressedSlice(value, &compression_output); std::string headerbuf; Writer::ConstructBlobHeader(&headerbuf, key, value_compressed, expiration); + // Check DB size limit before selecting a blob file. + // Since CheckSizeAndEvictBlobFiles() can close blob files, it needs to be + // done before calling SelectBlobFile(). + s = CheckSizeAndEvictBlobFiles(headerbuf.size() + key.size() + + value_compressed.size()); + if (!s.ok()) { + return s; + } + + std::shared_ptr<BlobFile> bfile = (expiration != kNoExpiration) + ?
SelectBlobFileTTL(expiration) + : SelectBlobFile(); + assert(bfile != nullptr); + assert(bfile->compression() == bdb_options_.compression); + s = AppendBlob(bfile, headerbuf, key, value_compressed, expiration, &index_entry); if (expiration == kNoExpiration) { @@ -855,66 +793,118 @@ uint64_t BlobDBImpl::ExtractExpiration(const Slice& key, const Slice& value, return has_expiration ? expiration : kNoExpiration; } -std::shared_ptr<BlobFile> BlobDBImpl::GetOldestBlobFile() { - std::vector<std::shared_ptr<BlobFile>> blob_files; - CopyBlobFiles(&blob_files, [](const std::shared_ptr<BlobFile>& f) { - return !f->Obsolete() && f->Immutable(); - }); - if (blob_files.empty()) { - return nullptr; +void BlobDBImpl::GetCompactionContext(BlobCompactionContext* context) { + ReadLock l(&mutex_); + + context->next_file_number = next_file_number_.load(); + context->current_blob_files.clear(); + for (auto& p : blob_files_) { + context->current_blob_files.insert(p.first); } - blobf_compare_ttl compare; - return *std::min_element(blob_files.begin(), blob_files.end(), compare); + context->fifo_eviction_seq = fifo_eviction_seq_; + context->evict_expiration_up_to = evict_expiration_up_to_; } -bool BlobDBImpl::EvictOldestBlobFile() { - auto oldest_file = GetOldestBlobFile(); - if (oldest_file == nullptr) { - return false; +void BlobDBImpl::UpdateLiveSSTSize() { + uint64_t live_sst_size = 0; + bool ok = GetIntProperty(DB::Properties::kLiveSstFilesSize, &live_sst_size); + if (ok) { + live_sst_size_.store(live_sst_size); + ROCKS_LOG_INFO(db_options_.info_log, + "Updated total SST file size: %" PRIu64 " bytes.", + live_sst_size); + } else { + ROCKS_LOG_ERROR( + db_options_.info_log, + "Failed to update total SST file size after flush or compaction."); } + { + // Trigger FIFO eviction if needed. + MutexLock l(&write_mutex_); + Status s = CheckSizeAndEvictBlobFiles(0, true /*force*/); + if (s.IsNoSpace()) { + ROCKS_LOG_WARN(db_options_.info_log, + "DB is out of space after SST size update. 
Current live" + " SST size: %" PRIu64 + " , current blob files size: %" PRIu64 ".", + live_sst_size_.load(), total_blob_size_.load()); + } + } +} - WriteLock wl(&mutex_); - // Double check the file is not obsolete by others - if (oldest_file_evicted_ == false && !oldest_file->Obsolete()) { - auto expiration_range = oldest_file->GetExpirationRange(); +Status BlobDBImpl::CheckSizeAndEvictBlobFiles(uint64_t blob_size, + bool force_evict) { + write_mutex_.AssertHeld(); + + uint64_t live_sst_size = live_sst_size_.load(); + if (bdb_options_.max_db_size == 0 || + live_sst_size + total_blob_size_.load() + blob_size <= + bdb_options_.max_db_size) { + return Status::OK(); + } + + if (bdb_options_.is_fifo == false || + (!force_evict && live_sst_size + blob_size > bdb_options_.max_db_size)) { + // FIFO eviction is disabled, or no space to insert new blob even we evict + // all blob files. + return Status::NoSpace( + "Write failed, as writing it would exceed max_db_size limit."); + } + + std::vector> candidate_files; + CopyBlobFiles(&candidate_files, + [&](const std::shared_ptr& blob_file) { + // Only evict TTL files + return blob_file->HasTTL(); + }); + std::sort(candidate_files.begin(), candidate_files.end(), + blobf_compare_ttl()); + std::reverse(candidate_files.begin(), candidate_files.end()); + fifo_eviction_seq_ = GetLatestSequenceNumber(); + + WriteLock l(&mutex_); + + while (!candidate_files.empty() && + live_sst_size + total_blob_size_.load() + blob_size > + bdb_options_.max_db_size) { + std::shared_ptr blob_file = candidate_files.back(); + candidate_files.pop_back(); + WriteLock file_lock(&blob_file->mutex_); + if (blob_file->Obsolete()) { + // File already obsoleted by someone else. + continue; + } + // FIFO eviction can evict open blob files. + if (!blob_file->Immutable()) { + Status s = CloseBlobFile(blob_file, false /*need_lock*/); + if (!s.ok()) { + return s; + } + } + assert(blob_file->Immutable()); + auto expiration_range = blob_file->GetExpirationRange(); ROCKS_LOG_INFO(db_options_.info_log, "Evict oldest blob file since DB out of space. 
Current " - "space used: %" PRIu64 ", blob dir size: %" PRIu64 - ", evicted blob file #%" PRIu64 + "live SST file size: %" PRIu64 ", total blob size: %" PRIu64 + ", max db size: %" PRIu64 ", evicted blob file #%" PRIu64 " with expiration range (%" PRIu64 ", %" PRIu64 ").", - total_blob_space_.load(), bdb_options_.blob_dir_size, - oldest_file->BlobFileNumber(), expiration_range.first, - expiration_range.second); - oldest_file->MarkObsolete(GetLatestSequenceNumber()); - obsolete_files_.push_back(oldest_file); - oldest_file_evicted_.store(true); + live_sst_size, total_blob_size_.load(), + bdb_options_.max_db_size, blob_file->BlobFileNumber(), + expiration_range.first, expiration_range.second); + ObsoleteBlobFile(blob_file, fifo_eviction_seq_, true /*update_size*/); + evict_expiration_up_to_ = expiration_range.first; RecordTick(statistics_, BLOB_DB_FIFO_NUM_FILES_EVICTED); RecordTick(statistics_, BLOB_DB_FIFO_NUM_KEYS_EVICTED, - oldest_file->BlobCount()); + blob_file->BlobCount()); RecordTick(statistics_, BLOB_DB_FIFO_BYTES_EVICTED, - oldest_file->GetFileSize()); + blob_file->GetFileSize()); TEST_SYNC_POINT("BlobDBImpl::EvictOldestBlobFile:Evicted"); - return true; } - - return false; -} - -Status BlobDBImpl::CheckSize(size_t blob_size) { - uint64_t new_space_util = total_blob_space_.load() + blob_size; - if (bdb_options_.blob_dir_size > 0) { - if (!bdb_options_.is_fifo && - (new_space_util > bdb_options_.blob_dir_size)) { - return Status::NoSpace( - "Write failed, as writing it would exceed blob_dir_size limit."); - } - if (bdb_options_.is_fifo && !oldest_file_evicted_.load() && - (new_space_util > - kEvictOldestFileAtSize * bdb_options_.blob_dir_size)) { - EvictOldestBlobFile(); - } + if (live_sst_size + total_blob_size_.load() + blob_size > + bdb_options_.max_db_size) { + return Status::NoSpace( + "Write failed, as writing it would exceed max_db_size limit."); } - return Status::OK(); } @@ -922,18 +912,15 @@ Status BlobDBImpl::AppendBlob(const std::shared_ptr& bfile, const std::string& headerbuf, const Slice& key, const Slice& value, uint64_t expiration, std::string* index_entry) { - auto size_put = BlobLogRecord::kHeaderSize + key.size() + value.size(); - Status s = CheckSize(size_put); - if (!s.ok()) { - return s; - } - + Status s; uint64_t blob_offset = 0; uint64_t key_offset = 0; { WriteLock lockbfile_w(&bfile->mutex_); std::shared_ptr writer = CheckOrCreateWriterLocked(bfile); - if (!writer) return Status::IOError("Failed to create blob writer"); + if (!writer) { + return Status::IOError("Failed to create blob writer"); + } // write the blob to the blob log. 
s = writer->EmitPhysicalRecord(headerbuf, key, value, &key_offset, @@ -950,8 +937,9 @@ Status BlobDBImpl::AppendBlob(const std::shared_ptr& bfile, // increment blob count bfile->blob_count_++; + uint64_t size_put = headerbuf.size() + key.size() + value.size(); bfile->file_size_ += size_put; - total_blob_space_ += size_put; + total_blob_size_ += size_put; if (expiration == kNoExpiration) { BlobIndex::EncodeBlob(index_entry, bfile->BlobFileNumber(), blob_offset, @@ -1204,10 +1192,8 @@ std::pair BlobDBImpl::SanityCheck(bool aborted) { for (auto bfile_pair : blob_files_) { auto bfile = bfile_pair.second; ROCKS_LOG_INFO( - db_options_.info_log, - "Blob File %s %" PRIu64 " %" PRIu64 " %" PRIu64 " %" PRIu64 " %" PRIu64, + db_options_.info_log, "Blob File %s %" PRIu64 " %" PRIu64 " %" PRIu64, bfile->PathName().c_str(), bfile->GetFileSize(), bfile->BlobCount(), - bfile->deleted_count_, bfile->deleted_size_, (bfile->expiration_range_.second - epoch_now)); } @@ -1215,31 +1201,39 @@ std::pair BlobDBImpl::SanityCheck(bool aborted) { return std::make_pair(true, -1); } -Status BlobDBImpl::CloseBlobFile(std::shared_ptr bfile) { +Status BlobDBImpl::CloseBlobFile(std::shared_ptr bfile, + bool need_lock) { assert(bfile != nullptr); + write_mutex_.AssertHeld(); Status s; ROCKS_LOG_INFO(db_options_.info_log, "Closing blob file %" PRIu64 ". Path: %s", bfile->BlobFileNumber(), bfile->PathName().c_str()); { - WriteLock wl(&mutex_); + std::unique_ptr lock; + if (need_lock) { + lock.reset(new WriteLock(&mutex_)); + } if (bfile->HasTTL()) { size_t erased __attribute__((__unused__)); erased = open_ttl_files_.erase(bfile); - assert(erased == 1); - } else { - assert(bfile == open_non_ttl_file_); + } else if (bfile == open_non_ttl_file_) { open_non_ttl_file_ = nullptr; } } if (!bfile->closed_.load()) { - WriteLock lockbfile_w(&bfile->mutex_); + std::unique_ptr file_lock; + if (need_lock) { + file_lock.reset(new WriteLock(&bfile->mutex_)); + } s = bfile->WriteFooterAndCloseLocked(); } - if (!s.ok()) { + if (s.ok()) { + total_blob_size_ += BlobLogFooter::kSize; + } else { ROCKS_LOG_ERROR(db_options_.info_log, "Failed to close blob file %" PRIu64 "with error: %s", bfile->BlobFileNumber(), s.ToString().c_str()); @@ -1256,6 +1250,18 @@ Status BlobDBImpl::CloseBlobFileIfNeeded(std::shared_ptr& bfile) { return CloseBlobFile(bfile); } +void BlobDBImpl::ObsoleteBlobFile(std::shared_ptr blob_file, + SequenceNumber obsolete_seq, + bool update_size) { + // Should hold write lock of mutex_ or during DB open. 
+ blob_file->MarkObsolete(obsolete_seq); + obsolete_files_.push_back(blob_file); + assert(total_blob_size_.load() >= blob_file->GetFileSize()); + if (update_size) { + total_blob_size_ -= blob_file->GetFileSize(); + } +} + bool BlobDBImpl::VisibleToActiveSnapshot( const std::shared_ptr& bfile) { assert(bfile->Obsolete()); @@ -1281,136 +1287,6 @@ bool BlobDBImpl::VisibleToActiveSnapshot( return oldest_snapshot < obsolete_sequence; } -bool BlobDBImpl::FindFileAndEvictABlob(uint64_t file_number, uint64_t key_size, - uint64_t blob_offset, - uint64_t blob_size) { - assert(bdb_options_.enable_garbage_collection); - (void)blob_offset; - std::shared_ptr bfile; - { - ReadLock rl(&mutex_); - auto hitr = blob_files_.find(file_number); - - // file was deleted - if (hitr == blob_files_.end()) { - return false; - } - - bfile = hitr->second; - } - - WriteLock lockbfile_w(&bfile->mutex_); - - bfile->deleted_count_++; - bfile->deleted_size_ += key_size + blob_size + BlobLogRecord::kHeaderSize; - return true; -} - -bool BlobDBImpl::MarkBlobDeleted(const Slice& key, const Slice& index_entry) { - assert(bdb_options_.enable_garbage_collection); - BlobIndex blob_index; - Status s = blob_index.DecodeFrom(index_entry); - if (!s.ok()) { - ROCKS_LOG_INFO(db_options_.info_log, - "Could not parse lsm val in MarkBlobDeleted %s", - index_entry.ToString().c_str()); - return false; - } - bool succ = FindFileAndEvictABlob(blob_index.file_number(), key.size(), - blob_index.offset(), blob_index.size()); - return succ; -} - -std::pair BlobDBImpl::EvictCompacted(bool aborted) { - assert(bdb_options_.enable_garbage_collection); - if (aborted) return std::make_pair(false, -1); - - override_packet_t packet; - size_t total_vals = 0; - size_t mark_evicted = 0; - while (override_vals_q_.dequeue(&packet)) { - bool succeeded = - FindFileAndEvictABlob(packet.file_number_, packet.key_size_, - packet.blob_offset_, packet.blob_size_); - total_vals++; - if (succeeded) { - mark_evicted++; - } - } - ROCKS_LOG_INFO(db_options_.info_log, - "Mark %" ROCKSDB_PRIszt - " values to evict, out of %" ROCKSDB_PRIszt - " compacted values.", - mark_evicted, total_vals); - return std::make_pair(true, -1); -} - -std::pair BlobDBImpl::EvictDeletions(bool aborted) { - assert(bdb_options_.enable_garbage_collection); - if (aborted) return std::make_pair(false, -1); - - ColumnFamilyHandle* last_cfh = nullptr; - Options last_op; - - Arena arena; - ScopedArenaIterator iter; - - // we will use same RangeDelAggregator for all cf's. - // essentially we do not support Range Deletes now - std::unique_ptr range_del_agg; - delete_packet_t dpacket; - while (delete_keys_q_.dequeue(&dpacket)) { - if (last_cfh != dpacket.cfh_) { - if (!range_del_agg) { - auto cfhi = reinterpret_cast(dpacket.cfh_); - auto cfd = cfhi->cfd(); - range_del_agg.reset(new RangeDelAggregator(cfd->internal_comparator(), - kMaxSequenceNumber)); - } - - // this can be expensive - last_cfh = dpacket.cfh_; - last_op = db_impl_->GetOptions(last_cfh); - iter.set(db_impl_->NewInternalIterator(&arena, range_del_agg.get(), - dpacket.cfh_)); - // this will not work for multiple CF's. 
- } - - Slice user_key(dpacket.key_); - InternalKey target(user_key, dpacket.dsn_, kTypeValue); - - Slice eslice = target.Encode(); - iter->Seek(eslice); - - if (!iter->status().ok()) { - ROCKS_LOG_INFO(db_options_.info_log, "Invalid iterator seek %s", - dpacket.key_.c_str()); - continue; - } - - const Comparator* bwc = BytewiseComparator(); - while (iter->Valid()) { - if (!bwc->Equal(ExtractUserKey(iter->key()), ExtractUserKey(eslice))) - break; - - ParsedInternalKey ikey(Slice(), 0, kTypeValue); - if (!ParseInternalKey(iter->key(), &ikey)) { - continue; - } - - // once you hit a DELETE, assume the keys below have been - // processed previously - if (ikey.type == kTypeDeletion || ikey.type == kTypeSingleDeletion) break; - - Slice val = iter->value(); - MarkBlobDeleted(ikey.user_key, val); - - iter->Next(); - } - } - return std::make_pair(true, -1); -} - std::pair BlobDBImpl::CheckSeqFiles(bool aborted) { if (aborted) return std::make_pair(false, -1); @@ -1423,12 +1299,15 @@ std::pair BlobDBImpl::CheckSeqFiles(bool aborted) { { ReadLock lockbfile_r(&bfile->mutex_); - if (bfile->expiration_range_.second > epoch_now) continue; + if (bfile->expiration_range_.second > epoch_now) { + continue; + } process_files.push_back(bfile); } } } + MutexLock l(&write_mutex_); for (auto bfile : process_files) { CloseBlobFile(bfile); } @@ -1572,8 +1451,6 @@ Status BlobDBImpl::GCFileAndUpdateLSM(const std::shared_ptr& bfptr, return s; } - bool first_gc = bfptr->gc_once_after_open_; - auto* cfh = db_impl_->GetColumnFamilyHandleUnlocked(bfptr->column_family_id()); auto* cfd = reinterpret_cast(cfh)->cfd(); @@ -1583,19 +1460,9 @@ Status BlobDBImpl::GCFileAndUpdateLSM(const std::shared_ptr& bfptr, // this reads the key but skips the blob Reader::ReadLevel shallow = Reader::kReadHeaderKey; - bool no_relocation_ttl = - (has_ttl && now >= bfptr->GetExpirationRange().second); - - bool no_relocation_lsmdel = false; - { - ReadLock lockbfile_r(&bfptr->mutex_); - no_relocation_lsmdel = - (bfptr->GetFileSize() == - (BlobLogHeader::kSize + bfptr->deleted_size_ + BlobLogFooter::kSize)); - } + bool file_expired = has_ttl && now >= bfptr->GetExpirationRange().second; - bool no_relocation = no_relocation_ttl || no_relocation_lsmdel; - if (!no_relocation) { + if (!file_expired) { // read the blob because you have to write it back to new file shallow = Reader::kReadHeaderKeyBlob; } @@ -1671,7 +1538,7 @@ Status BlobDBImpl::GCFileAndUpdateLSM(const std::shared_ptr& bfptr, // If key has expired, remove it from base DB. // TODO(yiwu): Blob indexes will be remove by BlobIndexCompactionFilter. // We can just drop the blob record. - if (no_relocation_ttl || (has_ttl && now >= record.expiration)) { + if (file_expired || (has_ttl && now >= record.expiration)) { gc_stats->num_keys_expired++; gc_stats->bytes_expired += record.record_size(); TEST_SYNC_POINT("BlobDBImpl::GCFileAndUpdateLSM:BeforeDelete"); @@ -1693,11 +1560,6 @@ Status BlobDBImpl::GCFileAndUpdateLSM(const std::shared_ptr& bfptr, continue; } - if (first_gc) { - // Do not relocate blob record for initial GC. - continue; - } - // Relocate the blob record to new file. 
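Before the relocation branch below, the per-record GC decision can be summarized as: delete expired records from the base DB, skip records the LSM no longer points at, and relocate the rest into a new blob file. A simplified sketch of that classification (GcRecord and the field names are illustrative stand-ins for the BlobLogRecord handling in this loop):

// Sketch of the per-record GC decision; not the verbatim loop above.
#include <cstdint>

enum class GcAction { kDeleteExpired, kSkipOverwritten, kRelocate };

struct GcRecord {
  bool has_ttl;
  uint64_t expiration;
  bool lsm_still_points_here;  // base-DB index entry still names this file
};

GcAction ClassifyForGc(const GcRecord& rec, uint64_t now, bool file_expired) {
  if (file_expired || (rec.has_ttl && now >= rec.expiration)) {
    return GcAction::kDeleteExpired;    // drop the key from the base DB
  }
  if (!rec.lsm_still_points_here) {
    return GcAction::kSkipOverwritten;  // a newer version superseded it
  }
  return GcAction::kRelocate;           // copy into the new blob file
}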
if (!newfile) { // new file @@ -1706,12 +1568,16 @@ newfile = NewBlobFile(reason); new_writer = CheckOrCreateWriterLocked(newfile); - newfile->header_ = std::move(header); // Can't use header beyond this point + newfile->header_ = std::move(header); newfile->header_valid_ = true; newfile->file_size_ = BlobLogHeader::kSize; - s = new_writer->WriteHeader(newfile->header_); + newfile->SetColumnFamilyId(bfptr->column_family_id()); + newfile->SetHasTTL(bfptr->HasTTL()); + newfile->SetCompression(bfptr->compression()); + newfile->expiration_range_ = bfptr->expiration_range_; + s = new_writer->WriteHeader(newfile->header_); if (!s.ok()) { ROCKS_LOG_ERROR(db_options_.info_log, "File: %s - header writing failed", @@ -1719,8 +1585,10 @@ break; } + // We don't add the file to open_ttl_files_ or open_non_ttl_files_, to + // avoid user writes writing to the file, and to avoid CheckSeqFiles + // closing the file by mistake. WriteLock wl(&mutex_); - blob_files_.insert(std::make_pair(newfile->BlobFileNumber(), newfile)); } @@ -1763,12 +1631,9 @@ } } // end of ReadRecord loop - if (s.ok()) { - bfptr->MarkObsolete(GetLatestSequenceNumber()); - if (!first_gc) { - WriteLock wl(&mutex_); - obsolete_files_.push_back(bfptr); - } + { + WriteLock wl(&mutex_); + ObsoleteBlobFile(bfptr, GetLatestSequenceNumber(), true /*update_size*/); } ROCKS_LOG_INFO( @@ -1791,7 +1656,11 @@ gc_stats->bytes_overwritten); RecordTick(statistics_, BLOB_DB_GC_BYTES_EXPIRED, gc_stats->bytes_expired); if (newfile != nullptr) { - total_blob_space_ += newfile->file_size_; + { + MutexLock l(&write_mutex_); + CloseBlobFile(newfile); + } + total_blob_size_ += newfile->file_size_; ROCKS_LOG_INFO(db_options_.info_log, "New blob file %" PRIu64 ".", newfile->BlobFileNumber()); RecordTick(statistics_, BLOB_DB_GC_NUM_NEW_FILES); @@ -1806,73 +1675,6 @@ return s; } -// Ideally we should hold the lock during the entire function, -// but under the asusmption that this is only called when a -// file is Immutable, we can reduce the critical section -bool BlobDBImpl::ShouldGCFile(std::shared_ptr<BlobFile> bfile, uint64_t now, - bool is_oldest_non_ttl_file, - std::string* reason) { - if (bfile->HasTTL()) { - ExpirationRange expiration_range = bfile->GetExpirationRange(); - if (now > expiration_range.second) { - *reason = "entire file ttl expired"; - return true; - } - - if (!bfile->file_size_.load()) { - ROCKS_LOG_ERROR(db_options_.info_log, "Invalid file size = 0 %s", - bfile->PathName().c_str()); - *reason = "file is empty"; - return false; - } - - if (bfile->gc_once_after_open_.load()) { - return true; - } - - ReadLock lockbfile_r(&bfile->mutex_); - bool ret = ((bfile->deleted_size_ / bfile->file_size_.load()) > - bdb_options_.garbage_collection_deletion_size_threshold); - if (ret) { - *reason = "deleted blobs beyond threshold"; - } else { - *reason = "deleted blobs below threshold"; - } - return ret; - } - - // when crash happens, we lose the in-memory account of deleted blobs. 
- // we are therefore forced to do one GC to make sure delete accounting - // is OK - if (bfile->gc_once_after_open_.load()) { - return true; - } - - ReadLock lockbfile_r(&bfile->mutex_); - - if (bdb_options_.enable_garbage_collection) { - if ((bfile->deleted_size_ / bfile->file_size_.load()) > - bdb_options_.garbage_collection_deletion_size_threshold) { - *reason = "deleted simple blobs beyond threshold"; - return true; - } - } - - // if we haven't reached limits of disk space, don't DELETE - if (bdb_options_.blob_dir_size == 0 || - total_blob_space_.load() < bdb_options_.blob_dir_size) { - *reason = "disk space not exceeded"; - return false; - } - - if (is_oldest_non_ttl_file) { - *reason = "out of space and is the oldest simple blob file"; - return true; - } - *reason = "out of space but is not the oldest simple blob file"; - return false; -} - std::pair BlobDBImpl::DeleteObsoleteFiles(bool aborted) { if (aborted) return std::make_pair(false, -1); @@ -1915,7 +1717,6 @@ std::pair BlobDBImpl::DeleteObsoleteFiles(bool aborted) { } file_deleted = true; - total_blob_space_ -= bfile->file_size_; ROCKS_LOG_INFO(db_options_.info_log, "File deleted as obsolete from blob dir %s", bfile->PathName().c_str()); @@ -1926,9 +1727,6 @@ std::pair BlobDBImpl::DeleteObsoleteFiles(bool aborted) { // directory change. Fsync if (file_deleted) { dir_ent_->Fsync(); - - // reset oldest_file_evicted flag - oldest_file_evicted_.store(false); } // put files back into obsolete if for some reason, delete failed @@ -1958,98 +1756,13 @@ void BlobDBImpl::CopyBlobFiles( } } -void BlobDBImpl::FilterSubsetOfFiles( - const std::vector>& blob_files, - std::vector>* to_process, uint64_t epoch, - size_t files_to_collect) { - // 100.0 / 15.0 = 7 - uint64_t next_epoch_increment = static_cast( - std::ceil(100 / static_cast(kGCFilePercentage))); - uint64_t now = EpochNow(); - - size_t files_processed = 0; - bool non_ttl_file_found = false; - for (auto bfile : blob_files) { - if (files_processed >= files_to_collect) break; - // if this is the first time processing the file - // i.e. gc_epoch == -1, process it. - // else process the file if its processing epoch matches - // the current epoch. 
Typically the #of epochs should be - // around 5-10 - if (bfile->gc_epoch_ != -1 && (uint64_t)bfile->gc_epoch_ != epoch) { - continue; - } - - files_processed++; - // reset the epoch - bfile->gc_epoch_ = epoch + next_epoch_increment; - - // file has already been GC'd or is still open for append, - // then it should not be GC'd - if (bfile->Obsolete() || !bfile->Immutable()) continue; - - bool is_oldest_non_ttl_file = false; - if (!non_ttl_file_found && !bfile->HasTTL()) { - is_oldest_non_ttl_file = true; - non_ttl_file_found = true; - } - - std::string reason; - bool shouldgc = ShouldGCFile(bfile, now, is_oldest_non_ttl_file, &reason); - if (!shouldgc) { - ROCKS_LOG_DEBUG(db_options_.info_log, - "File has been skipped for GC ttl %s %" PRIu64 " %" PRIu64 - " reason='%s'", - bfile->PathName().c_str(), now, - bfile->GetExpirationRange().second, reason.c_str()); - continue; - } - - ROCKS_LOG_INFO(db_options_.info_log, - "File has been chosen for GC ttl %s %" PRIu64 " %" PRIu64 - " reason='%s'", - bfile->PathName().c_str(), now, - bfile->GetExpirationRange().second, reason.c_str()); - to_process->push_back(bfile); - } -} - std::pair BlobDBImpl::RunGC(bool aborted) { - if (aborted) return std::make_pair(false, -1); - - current_epoch_++; - - std::vector> blob_files; - CopyBlobFiles(&blob_files); - - if (!blob_files.size()) return std::make_pair(true, -1); - - // 15% of files are collected each call to space out the IO and CPU - // consumption. - size_t files_to_collect = (kGCFilePercentage * blob_files.size()) / 100; - - std::vector> to_process; - FilterSubsetOfFiles(blob_files, &to_process, current_epoch_, - files_to_collect); - - for (auto bfile : to_process) { - GCStats gc_stats; - Status s = GCFileAndUpdateLSM(bfile, &gc_stats); - if (!s.ok()) { - continue; - } - - if (bfile->gc_once_after_open_.load()) { - WriteLock lockbfile_w(&bfile->mutex_); - - bfile->deleted_size_ = - gc_stats.bytes_overwritten + gc_stats.bytes_expired; - bfile->deleted_count_ = - gc_stats.num_keys_overwritten + gc_stats.num_keys_expired; - bfile->gc_once_after_open_ = false; - } + if (aborted) { + return std::make_pair(false, -1); } + // TODO(yiwu): Garbage collection implementation. 
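Although RunGC is reduced to a stub here, it still honors the timer-queue contract used by StartBackgroundTasks earlier in this file: each task receives an aborted flag and returns a pair whose first element says whether to reschedule. A minimal sketch of a task under that contract; DoPeriodicWork is an illustrative placeholder, and the pair's second element type is assumed since the flattened diff does not show the template arguments.

// Sketch: a recurring background task following the (reschedule, unused)
// pair convention of RunGC, SanityCheck, and DeleteObsoleteFiles above.
#include <cstdint>
#include <utility>

std::pair<bool, int64_t> DoPeriodicWork(bool aborted) {
  if (aborted) {
    return std::make_pair(false, -1);  // shutting down: do not reschedule
  }
  // ... periodic work goes here ...
  return std::make_pair(true, -1);  // reschedule at the configured interval
}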
+ // reschedule return std::make_pair(true, -1); } @@ -2134,15 +1847,24 @@ void BlobDBImpl::TEST_DeleteObsoleteFiles() { } Status BlobDBImpl::TEST_CloseBlobFile(std::shared_ptr& bfile) { + MutexLock l(&write_mutex_); return CloseBlobFile(bfile); } +void BlobDBImpl::TEST_ObsoleteBlobFile(std::shared_ptr& blob_file, + SequenceNumber obsolete_seq, + bool update_size) { + return ObsoleteBlobFile(blob_file, obsolete_seq, update_size); +} + Status BlobDBImpl::TEST_GCFileAndUpdateLSM(std::shared_ptr& bfile, GCStats* gc_stats) { return GCFileAndUpdateLSM(bfile, gc_stats); } void BlobDBImpl::TEST_RunGC() { RunGC(false /*abort*/); } + +uint64_t BlobDBImpl::TEST_live_sst_size() { return live_sst_size_.load(); } #endif // !NDEBUG } // namespace blob_db diff --git a/utilities/blob_db/blob_db_impl.h b/utilities/blob_db/blob_db_impl.h index 328087c9c9b..d3e810deb0c 100644 --- a/utilities/blob_db/blob_db_impl.h +++ b/utilities/blob_db/blob_db_impl.h @@ -26,7 +26,6 @@ #include "rocksdb/options.h" #include "rocksdb/statistics.h" #include "rocksdb/wal_filter.h" -#include "util/mpsc.h" #include "util/mutexlock.h" #include "util/timer_queue.h" #include "utilities/blob_db/blob_db.h" @@ -44,19 +43,9 @@ struct FlushJobInfo; namespace blob_db { -class BlobFile; +struct BlobCompactionContext; class BlobDBImpl; - -class BlobDBFlushBeginListener : public EventListener { - public: - explicit BlobDBFlushBeginListener(BlobDBImpl* blob_db_impl) - : blob_db_impl_(blob_db_impl) {} - - void OnFlushBegin(DB* db, const FlushJobInfo& info) override; - - private: - BlobDBImpl* blob_db_impl_; -}; +class BlobFile; // this implements the callback from the WAL which ensures that the // blob record is present in the blob log. If fsync/fdatasync in not @@ -73,34 +62,6 @@ class BlobReconcileWalFilter : public WalFilter { virtual const char* Name() const override { return "BlobDBWalReconciler"; } }; -class EvictAllVersionsCompactionListener : public EventListener { - public: - class InternalListener : public CompactionEventListener { - friend class BlobDBImpl; - - public: - explicit InternalListener(BlobDBImpl* blob_db_impl) : impl_(blob_db_impl) {} - - virtual void OnCompaction(int level, const Slice& key, - CompactionListenerValueType value_type, - const Slice& existing_value, - const SequenceNumber& sn, bool is_new) override; - - private: - BlobDBImpl* impl_; - }; - - explicit EvictAllVersionsCompactionListener(BlobDBImpl* blob_db_impl) - : internal_listener_(new InternalListener(blob_db_impl)) {} - - virtual CompactionEventListener* GetCompactionEventListener() override { - return internal_listener_.get(); - } - - private: - std::unique_ptr internal_listener_; -}; - // Comparator to sort "TTL" aware Blob files based on the lower value of // TTL range. struct blobf_compare_ttl { @@ -124,7 +85,6 @@ struct GCStats { * Garbage Collected. 
*/ class BlobDBImpl : public BlobDB { - friend class EvictAllVersionsCompactionListener; friend class BlobFile; friend class BlobDBIterator; @@ -161,9 +121,6 @@ class BlobDBImpl : public BlobDB { Status Put(const WriteOptions& options, const Slice& key, const Slice& value) override; - using BlobDB::Delete; - Status Delete(const WriteOptions& options, const Slice& key) override; - using BlobDB::Get; Status Get(const ReadOptions& read_options, ColumnFamilyHandle* column_family, const Slice& key, PinnableSlice* value) override; @@ -173,9 +130,9 @@ class BlobDBImpl : public BlobDB { using BlobDB::NewIterators; virtual Status NewIterators( - const ReadOptions& read_options, - const std::vector& column_families, - std::vector* iterators) override { + const ReadOptions& /*read_options*/, + const std::vector& /*column_families*/, + std::vector* /*iterators*/) override { return Status::NotSupported("Not implemented"); } @@ -187,6 +144,8 @@ class BlobDBImpl : public BlobDB { virtual Status Write(const WriteOptions& opts, WriteBatch* updates) override; + virtual Status Close() override; + virtual Status GetLiveFiles(std::vector&, uint64_t* manifest_file_size, bool flush_memtable = true) override; @@ -213,6 +172,10 @@ class BlobDBImpl : public BlobDB { Status SyncBlobFiles() override; + void UpdateLiveSSTSize(); + + void GetCompactionContext(BlobCompactionContext* context); + #ifndef NDEBUG Status TEST_GetBlobValue(const Slice& key, const Slice& index_entry, PinnableSlice* value); @@ -223,12 +186,18 @@ class BlobDBImpl : public BlobDB { Status TEST_CloseBlobFile(std::shared_ptr& bfile); + void TEST_ObsoleteBlobFile(std::shared_ptr& blob_file, + SequenceNumber obsolete_seq = 0, + bool update_size = true); + Status TEST_GCFileAndUpdateLSM(std::shared_ptr& bfile, GCStats* gc_stats); void TEST_RunGC(); void TEST_DeleteObsoleteFiles(); + + uint64_t TEST_live_sst_size(); #endif // !NDEBUG private: @@ -249,19 +218,18 @@ class BlobDBImpl : public BlobDB { Slice GetCompressedSlice(const Slice& raw, std::string* compression_output) const; - // is this file ready for Garbage collection. if the TTL of the file - // has expired or if threshold of the file has been evicted - // tt - current time - // last_id - the id of the non-TTL file to evict - bool ShouldGCFile(std::shared_ptr bfile, uint64_t now, - bool is_oldest_non_ttl_file, std::string* reason); - // Close a file by appending a footer, and removes file from open files list. - Status CloseBlobFile(std::shared_ptr bfile); + Status CloseBlobFile(std::shared_ptr bfile, bool need_lock = true); // Close a file if its size exceeds blob_file_size Status CloseBlobFileIfNeeded(std::shared_ptr& bfile); + // Mark file as obsolete and move the file to obsolete file list. + // + // REQUIRED: hold write lock of mutex_ or during DB open. + void ObsoleteBlobFile(std::shared_ptr blob_file, + SequenceNumber obsolete_seq, bool update_size); + uint64_t ExtractExpiration(const Slice& key, const Slice& value, Slice* value_slice, std::string* new_value); @@ -283,8 +251,6 @@ class BlobDBImpl : public BlobDB { std::shared_ptr FindBlobFileLocked(uint64_t expiration) const; - void Shutdown(); - // periodic sanity check. 
Bunch of checks std::pair SanityCheck(bool aborted); @@ -305,11 +271,6 @@ class BlobDBImpl : public BlobDB { // efficiency std::pair ReclaimOpenFiles(bool aborted); - // background task to do book-keeping of deleted keys - std::pair EvictDeletions(bool aborted); - - std::pair EvictCompacted(bool aborted); - std::pair RemoveTimerQ(TimerQueue* tq, bool aborted); // Adds the background tasks to the timer queue @@ -354,27 +315,18 @@ class BlobDBImpl : public BlobDB { bool VisibleToActiveSnapshot(const std::shared_ptr& file); bool FileDeleteOk_SnapshotCheckLocked(const std::shared_ptr& bfile); - bool MarkBlobDeleted(const Slice& key, const Slice& lsmValue); - - bool FindFileAndEvictABlob(uint64_t file_number, uint64_t key_size, - uint64_t blob_offset, uint64_t blob_size); - void CopyBlobFiles( std::vector>* bfiles_copy, std::function&)> predicate = {}); - void FilterSubsetOfFiles( - const std::vector>& blob_files, - std::vector>* to_process, uint64_t epoch, - size_t files_to_collect); - uint64_t EpochNow() { return env_->NowMicros() / 1000000; } - Status CheckSize(size_t blob_size); - - std::shared_ptr GetOldestBlobFile(); - - bool EvictOldestBlobFile(); + // Check if inserting a new blob will make DB grow out of space. + // If is_fifo = true, FIFO eviction will be triggered to make room for the + // new blob. If force_evict = true, FIFO eviction will evict blob files + // even if eviction will not make enough room for the new blob. + Status CheckSizeAndEvictBlobFiles(uint64_t blob_size, + bool force_evict = false); // name of the database directory std::string dbname_; @@ -421,56 +373,37 @@ class BlobDBImpl : public BlobDB { // all the blob files which are currently being appended to based // on variety of incoming TTL's - std::multiset, blobf_compare_ttl> open_ttl_files_; - - // packet of information to put in lockess delete(s) queue - struct delete_packet_t { - ColumnFamilyHandle* cfh_; - std::string key_; - SequenceNumber dsn_; - }; - - struct override_packet_t { - uint64_t file_number_; - uint64_t key_size_; - uint64_t blob_offset_; - uint64_t blob_size_; - SequenceNumber dsn_; - }; - - // LOCKLESS multiple producer single consumer queue to quickly append - // deletes without taking lock. Can rapidly grow in size!! - // deletes happen in LSM, but minor book-keeping needs to happen on - // BLOB side (for triggering eviction) - mpsc_queue_t delete_keys_q_; - - // LOCKLESS multiple producer single consumer queue for values - // that are being compacted - mpsc_queue_t override_vals_q_; - - // atomic bool to represent shutdown - std::atomic shutdown_; + std::set, blobf_compare_ttl> open_ttl_files_; + + // Flag to check whether Close() has been called on this DB + bool closed_; // timer based queue to execute tasks TimerQueue tqueue_; - // only accessed in GC thread, hence not atomic. The epoch of the - // GC task. Each execution is one epoch. Helps us in allocating - // files to one execution - uint64_t current_epoch_; - // number of files opened for random access/GET // counter is used to monitor and close excess RA files. std::atomic open_file_count_; - // total size of all blob files at a given time - std::atomic total_blob_space_; + // Total size of all live blob files (i.e., excluding obsolete files). + std::atomic total_blob_size_; + + // total size of SST files. + std::atomic live_sst_size_; + + // Latest FIFO eviction sequence number + // + // REQUIRES: access with mutex_ lock held. + uint64_t fifo_eviction_seq_; + + // The expiration up to which the latest FIFO eviction evicts.
+ // + // REQUIRES: access with mutex_ lock held. + uint64_t evict_expiration_up_to_; + std::list> obsolete_files_; - bool open_p1_done_; uint32_t debug_level_; - - std::atomic oldest_file_evicted_; }; } // namespace blob_db diff --git a/utilities/blob_db/blob_db_iterator.h b/utilities/blob_db/blob_db_iterator.h index f901df366f5..5ead75dd726 100644 --- a/utilities/blob_db/blob_db_iterator.h +++ b/utilities/blob_db/blob_db_iterator.h @@ -46,28 +46,36 @@ class BlobDBIterator : public Iterator { StopWatch seek_sw(env_, statistics_, BLOB_DB_SEEK_MICROS); RecordTick(statistics_, BLOB_DB_NUM_SEEK); iter_->SeekToFirst(); - UpdateBlobValue(); + while (UpdateBlobValue()) { + iter_->Next(); + } } void SeekToLast() override { StopWatch seek_sw(env_, statistics_, BLOB_DB_SEEK_MICROS); RecordTick(statistics_, BLOB_DB_NUM_SEEK); iter_->SeekToLast(); - UpdateBlobValue(); + while (UpdateBlobValue()) { + iter_->Prev(); + } } void Seek(const Slice& target) override { StopWatch seek_sw(env_, statistics_, BLOB_DB_SEEK_MICROS); RecordTick(statistics_, BLOB_DB_NUM_SEEK); iter_->Seek(target); - UpdateBlobValue(); + while (UpdateBlobValue()) { + iter_->Next(); + } } void SeekForPrev(const Slice& target) override { StopWatch seek_sw(env_, statistics_, BLOB_DB_SEEK_MICROS); RecordTick(statistics_, BLOB_DB_NUM_SEEK); iter_->SeekForPrev(target); - UpdateBlobValue(); + while (UpdateBlobValue()) { + iter_->Prev(); + } } void Next() override { @@ -75,7 +83,9 @@ class BlobDBIterator : public Iterator { StopWatch next_sw(env_, statistics_, BLOB_DB_NEXT_MICROS); RecordTick(statistics_, BLOB_DB_NUM_NEXT); iter_->Next(); - UpdateBlobValue(); + while (UpdateBlobValue()) { + iter_->Next(); + } } void Prev() override { @@ -83,7 +93,9 @@ class BlobDBIterator : public Iterator { StopWatch prev_sw(env_, statistics_, BLOB_DB_PREV_MICROS); RecordTick(statistics_, BLOB_DB_NUM_PREV); iter_->Prev(); - UpdateBlobValue(); + while (UpdateBlobValue()) { + iter_->Prev(); + } } Slice key() const override { @@ -102,12 +114,23 @@ class BlobDBIterator : public Iterator { // Iterator::Refresh() not supported. private: - void UpdateBlobValue() { + // Return true if the caller should continue to the next value. + bool UpdateBlobValue() { TEST_SYNC_POINT("BlobDBIterator::UpdateBlobValue:Start:1"); TEST_SYNC_POINT("BlobDBIterator::UpdateBlobValue:Start:2"); value_.Reset(); - if (iter_->Valid() && iter_->IsBlob()) { - status_ = blob_db_->GetBlobValue(iter_->key(), iter_->value(), &value_); + if (iter_->Valid() && iter_->status().ok() && iter_->IsBlob()) { + Status s = blob_db_->GetBlobValue(iter_->key(), iter_->value(), &value_); + if (s.IsNotFound()) { + return true; + } else { + if (!s.ok()) { + status_ = s; + } + return false; + } + } else { + return false; } } diff --git a/utilities/blob_db/blob_db_listener.h b/utilities/blob_db/blob_db_listener.h new file mode 100644 index 00000000000..f096d238ba3 --- /dev/null +++ b/utilities/blob_db/blob_db_listener.h @@ -0,0 +1,46 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory).
+ +#pragma once + +#ifndef ROCKSDB_LITE + +#include + +#include "rocksdb/listener.h" +#include "util/mutexlock.h" +#include "utilities/blob_db/blob_db_impl.h" + +namespace rocksdb { +namespace blob_db { + +class BlobDBListener : public EventListener { + public: + explicit BlobDBListener(BlobDBImpl* blob_db_impl) + : blob_db_impl_(blob_db_impl) {} + + void OnFlushBegin(DB* /*db*/, const FlushJobInfo& /*info*/) override { + assert(blob_db_impl_ != nullptr); + blob_db_impl_->SyncBlobFiles(); + } + + void OnFlushCompleted(DB* /*db*/, const FlushJobInfo& /*info*/) override { + assert(blob_db_impl_ != nullptr); + blob_db_impl_->UpdateLiveSSTSize(); + } + + void OnCompactionCompleted(DB* /*db*/, + const CompactionJobInfo& /*info*/) override { + assert(blob_db_impl_ != nullptr); + blob_db_impl_->UpdateLiveSSTSize(); + } + + private: + BlobDBImpl* blob_db_impl_; +}; + +} // namespace blob_db +} // namespace rocksdb +#endif // !ROCKSDB_LITE diff --git a/utilities/blob_db/blob_db_test.cc b/utilities/blob_db/blob_db_test.cc index 2fa8fe12ae5..420757a4667 100644 --- a/utilities/blob_db/blob_db_test.cc +++ b/utilities/blob_db/blob_db_test.cc @@ -45,7 +45,10 @@ class BlobDBTest : public testing::Test { assert(s.ok()); } - ~BlobDBTest() { Destroy(); } + ~BlobDBTest() { + SyncPoint::GetInstance()->ClearAllCallBacks(); + Destroy(); + } Status TryOpen(BlobDBOptions bdb_options = BlobDBOptions(), Options options = Options()) { @@ -80,8 +83,13 @@ class BlobDBTest : public testing::Test { return reinterpret_cast(blob_db_); } - Status Put(const Slice &key, const Slice &value) { - return blob_db_->Put(WriteOptions(), key, value); + Status Put(const Slice &key, const Slice &value, + std::map *data = nullptr) { + Status s = blob_db_->Put(WriteOptions(), key, value); + if (data != nullptr) { + (*data)[key.ToString()] = value.ToString(); + } + return s; } void Delete(const std::string &key, @@ -92,6 +100,15 @@ class BlobDBTest : public testing::Test { } } + Status PutWithTTL(const Slice &key, const Slice &value, uint64_t ttl, + std::map *data = nullptr) { + Status s = blob_db_->PutWithTTL(WriteOptions(), key, value, ttl); + if (data != nullptr) { + (*data)[key.ToString()] = value.ToString(); + } + return s; + } + Status PutUntil(const Slice &key, const Slice &value, uint64_t expiration) { return blob_db_->PutUntil(WriteOptions(), key, value, expiration); } @@ -746,8 +763,28 @@ TEST_F(BlobDBTest, GCExpiredKeyWhileOverwriting) { VerifyDB({{"foo", "v2"}}); } +TEST_F(BlobDBTest, NewFileGeneratedFromGCShouldMarkAsImmutable) { + BlobDBOptions bdb_options; + bdb_options.min_blob_size = 0; + bdb_options.disable_background_tasks = true; + Open(bdb_options); + ASSERT_OK(Put("foo", "bar")); + auto blob_files = blob_db_impl()->TEST_GetBlobFiles(); + auto blob_file1 = blob_files[0]; + ASSERT_EQ(1, blob_files.size()); + ASSERT_OK(blob_db_impl()->TEST_CloseBlobFile(blob_file1)); + GCStats gc_stats; + ASSERT_OK(blob_db_impl()->TEST_GCFileAndUpdateLSM(blob_file1, &gc_stats)); + ASSERT_EQ(1, gc_stats.blob_count); + ASSERT_EQ(1, gc_stats.num_keys_relocated); + blob_files = blob_db_impl()->TEST_GetBlobFiles(); + ASSERT_EQ(2, blob_files.size()); + ASSERT_EQ(blob_file1, blob_files[0]); + ASSERT_TRUE(blob_files[1]->Immutable()); +} + // This test is no longer valid since we now return an error when we go -// over the configured blob_dir_size. +// over the configured max_db_size. // The test needs to be re-written later in such a way that writes continue // after a GC happens. 
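As an aside before the remaining test hunks: the new BlobDBListener above is what keeps live_sst_size_ current, and the tests below exercise the max_db_size/is_fifo pair that replaced blob_dir_size. A minimal sketch of how a caller might drive those options through the public BlobDB API; the path and sizes are illustrative assumptions, not code from this PR:

```cpp
#include <cassert>
#include <string>

#include "utilities/blob_db/blob_db.h"

int main() {
  rocksdb::blob_db::BlobDBOptions bdb_options;
  bdb_options.max_db_size = 1 << 30;  // combined budget for SST and blob files
  bdb_options.is_fifo = true;         // evict oldest blob files when over budget
  bdb_options.min_blob_size = 100;    // smaller values stay inline in the LSM

  rocksdb::Options options;
  options.create_if_missing = true;

  rocksdb::blob_db::BlobDB* db = nullptr;
  rocksdb::Status s = rocksdb::blob_db::BlobDB::Open(
      options, bdb_options, "/tmp/blobdb_demo", &db);
  assert(s.ok());

  // With is_fifo = true an over-budget write triggers FIFO eviction; with
  // is_fifo = false it would fail with Status::NoSpace() instead.
  s = db->PutWithTTL(rocksdb::WriteOptions(), "key", std::string(1000, 'v'),
                     /*ttl=*/60);
  assert(s.ok() || s.IsNoSpace());

  delete db;
  return 0;
}
```

The NoSpace branch is exactly what the OutOfSpace test below asserts when is_fifo is false.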
TEST_F(BlobDBTest, DISABLED_GCOldestSimpleBlobFileWhenOutOfSpace) { @@ -755,7 +792,7 @@ TEST_F(BlobDBTest, DISABLED_GCOldestSimpleBlobFileWhenOutOfSpace) { Options options; options.env = mock_env_.get(); BlobDBOptions bdb_options; - bdb_options.blob_dir_size = 100; + bdb_options.max_db_size = 100; bdb_options.blob_file_size = 100; bdb_options.min_blob_size = 0; bdb_options.disable_background_tasks = true; @@ -1038,13 +1075,14 @@ TEST_F(BlobDBTest, MigrateFromPlainRocksDB) { } // Test to verify that a NoSpace IOError Status is returned on reaching -// blob_dir_size limit. +// max_db_size limit. TEST_F(BlobDBTest, OutOfSpace) { // Use mock env to stop wall clock. Options options; options.env = mock_env_.get(); BlobDBOptions bdb_options; - bdb_options.blob_dir_size = 150; + bdb_options.max_db_size = 200; + bdb_options.is_fifo = false; bdb_options.disable_background_tasks = true; Open(bdb_options); @@ -1053,16 +1091,16 @@ TEST_F(BlobDBTest, OutOfSpace) { std::string value(100, 'v'); ASSERT_OK(blob_db_->PutWithTTL(WriteOptions(), "key1", value, 60)); - // Putting another blob should fail as ading it would exceed the blob_dir_size + // Putting another blob should fail as adding it would exceed the max_db_size // limit. Status s = blob_db_->PutWithTTL(WriteOptions(), "key2", value, 60); ASSERT_TRUE(s.IsIOError()); ASSERT_TRUE(s.IsNoSpace()); } -TEST_F(BlobDBTest, EvictOldestFileWhenCloseToSpaceLimit) { +TEST_F(BlobDBTest, FIFOEviction) { BlobDBOptions bdb_options; - bdb_options.blob_dir_size = 270; + bdb_options.max_db_size = 200; bdb_options.blob_file_size = 100; bdb_options.is_fifo = true; bdb_options.disable_background_tasks = true; @@ -1078,32 +1116,36 @@ TEST_F(BlobDBTest, EvictOldestFileWhenCloseToSpaceLimit) { // So a 100 byte blob should take up 132 bytes. std::string value(100, 'v'); ASSERT_OK(blob_db_->PutWithTTL(WriteOptions(), "key1", value, 10)); + VerifyDB({{"key1", value}}); - auto *bdb_impl = static_cast(blob_db_); - auto blob_files = bdb_impl->TEST_GetBlobFiles(); - ASSERT_EQ(1, blob_files.size()); + ASSERT_EQ(1, blob_db_impl()->TEST_GetBlobFiles().size()); // Adding another 100 byte blob would take the total size to 264 bytes - // (2*132), which is more than 90% of blob_dir_size. So, the oldest file - // should be evicted and put in obsolete files list. + // (2*132), which is more + // than max_db_size and will trigger FIFO eviction. ASSERT_OK(blob_db_->PutWithTTL(WriteOptions(), "key2", value, 60)); + ASSERT_EQ(1, evict_count); + // key1 will exist until the corresponding file is deleted.
+ VerifyDB({{"key1", value}, {"key2", value}}); - auto obsolete_files = bdb_impl->TEST_GetObsoleteFiles(); + auto blob_files = blob_db_impl()->TEST_GetBlobFiles(); + ASSERT_EQ(2, blob_files.size()); + ASSERT_TRUE(blob_files[0]->Obsolete()); + ASSERT_FALSE(blob_files[1]->Obsolete()); + auto obsolete_files = blob_db_impl()->TEST_GetObsoleteFiles(); ASSERT_EQ(1, obsolete_files.size()); - ASSERT_TRUE(obsolete_files[0]->Immutable()); - ASSERT_EQ(blob_files[0]->BlobFileNumber(), - obsolete_files[0]->BlobFileNumber()); + ASSERT_EQ(blob_files[0], obsolete_files[0]); - bdb_impl->TEST_DeleteObsoleteFiles(); - obsolete_files = bdb_impl->TEST_GetObsoleteFiles(); + blob_db_impl()->TEST_DeleteObsoleteFiles(); + obsolete_files = blob_db_impl()->TEST_GetObsoleteFiles(); ASSERT_TRUE(obsolete_files.empty()); - ASSERT_EQ(1, evict_count); + VerifyDB({{"key2", value}}); } -TEST_F(BlobDBTest, NoOldestFileToEvict) { +TEST_F(BlobDBTest, FIFOEviction_NoOldestFileToEvict) { Options options; BlobDBOptions bdb_options; - bdb_options.blob_dir_size = 1000; + bdb_options.max_db_size = 1000; bdb_options.blob_file_size = 5000; bdb_options.is_fifo = true; bdb_options.disable_background_tasks = true; @@ -1116,11 +1158,97 @@ TEST_F(BlobDBTest, NoOldestFileToEvict) { SyncPoint::GetInstance()->EnableProcessing(); std::string value(2000, 'v'); - ASSERT_OK(Put("foo", std::string(2000, 'v'))); - ASSERT_OK(Put("bar", std::string(2000, 'v'))); + ASSERT_TRUE(Put("foo", std::string(2000, 'v')).IsNoSpace()); ASSERT_EQ(0, evict_count); } +TEST_F(BlobDBTest, FIFOEviction_NoEnoughBlobFilesToEvict) { + BlobDBOptions bdb_options; + bdb_options.is_fifo = true; + bdb_options.min_blob_size = 100; + bdb_options.disable_background_tasks = true; + Options options; + // Use mock env to stop wall clock. + options.env = mock_env_.get(); + options.disable_auto_compactions = true; + auto statistics = CreateDBStatistics(); + options.statistics = statistics; + Open(bdb_options, options); + + ASSERT_EQ(0, blob_db_impl()->TEST_live_sst_size()); + std::string small_value(50, 'v'); + std::map data; + // Insert some data into LSM tree to make sure FIFO eviction takes SST + // file size into account. + for (int i = 0; i < 1000; i++) { + ASSERT_OK(Put("key" + ToString(i), small_value, &data)); + } + ASSERT_OK(blob_db_->Flush(FlushOptions())); + uint64_t live_sst_size = 0; + ASSERT_TRUE(blob_db_->GetIntProperty(DB::Properties::kTotalSstFilesSize, + &live_sst_size)); + ASSERT_TRUE(live_sst_size > 0); + ASSERT_EQ(live_sst_size, blob_db_impl()->TEST_live_sst_size()); + + bdb_options.max_db_size = live_sst_size + 2000; + Reopen(bdb_options, options); + ASSERT_EQ(live_sst_size, blob_db_impl()->TEST_live_sst_size()); + + std::string value_1k(1000, 'v'); + ASSERT_OK(PutWithTTL("large_key1", value_1k, 60, &data)); + ASSERT_EQ(0, statistics->getTickerCount(BLOB_DB_FIFO_NUM_FILES_EVICTED)); + VerifyDB(data); + // large_key2 evicts large_key1 + ASSERT_OK(PutWithTTL("large_key2", value_1k, 60, &data)); + ASSERT_EQ(1, statistics->getTickerCount(BLOB_DB_FIFO_NUM_FILES_EVICTED)); + blob_db_impl()->TEST_DeleteObsoleteFiles(); + data.erase("large_key1"); + VerifyDB(data); + // large_key3 cannot get enough space even after evicting large_key2, so it + // instead returns a no-space error. + std::string value_2k(2000, 'v'); + ASSERT_TRUE(PutWithTTL("large_key3", value_2k, 60).IsNoSpace()); + ASSERT_EQ(1, statistics->getTickerCount(BLOB_DB_FIFO_NUM_FILES_EVICTED)); + // Verify large_key2 still exists.
+ VerifyDB(data); +} + +// Test that flush or compaction will trigger FIFO eviction since they update +// total SST file size. +TEST_F(BlobDBTest, FIFOEviction_TriggerOnSSTSizeChange) { + BlobDBOptions bdb_options; + bdb_options.max_db_size = 1000; + bdb_options.is_fifo = true; + bdb_options.min_blob_size = 100; + bdb_options.disable_background_tasks = true; + Options options; + // Use mock env to stop wall clock. + options.env = mock_env_.get(); + auto statistics = CreateDBStatistics(); + options.statistics = statistics; + options.compression = kNoCompression; + Open(bdb_options, options); + + std::string value(800, 'v'); + ASSERT_OK(PutWithTTL("large_key", value, 60)); + ASSERT_EQ(1, blob_db_impl()->TEST_GetBlobFiles().size()); + ASSERT_EQ(0, statistics->getTickerCount(BLOB_DB_FIFO_NUM_FILES_EVICTED)); + VerifyDB({{"large_key", value}}); + + // Insert some small keys and flush to bring the DB out of space. + std::map data; + for (int i = 0; i < 10; i++) { + ASSERT_OK(Put("key" + ToString(i), "v", &data)); + } + ASSERT_OK(blob_db_->Flush(FlushOptions())); + + // Verify large_key is deleted by FIFO eviction. + blob_db_impl()->TEST_DeleteObsoleteFiles(); + ASSERT_EQ(0, blob_db_impl()->TEST_GetBlobFiles().size()); + ASSERT_EQ(1, statistics->getTickerCount(BLOB_DB_FIFO_NUM_FILES_EVICTED)); + VerifyDB(data); +} + TEST_F(BlobDBTest, InlineSmallValues) { constexpr uint64_t kMaxExpiration = 1000; Random rnd(301); @@ -1197,6 +1325,7 @@ TEST_F(BlobDBTest, CompactionFilterNotSupported) { } } +// Test compaction filter should remove any expired blob index. TEST_F(BlobDBTest, FilterExpiredBlobIndex) { constexpr size_t kNumKeys = 100; constexpr size_t kNumPuts = 1000; @@ -1262,6 +1391,147 @@ TEST_F(BlobDBTest, FilterExpiredBlobIndex) { VerifyDB(data_after_compact); } +// Test compaction filter should remove any blob index where corresponding +// blob file has been removed (either by FIFO or garbage collection). +TEST_F(BlobDBTest, FilterFileNotAvailable) { + BlobDBOptions bdb_options; + bdb_options.min_blob_size = 0; + bdb_options.disable_background_tasks = true; + Options options; + options.disable_auto_compactions = true; + Open(bdb_options, options); + + ASSERT_OK(Put("foo", "v1")); + auto blob_files = blob_db_impl()->TEST_GetBlobFiles(); + ASSERT_EQ(1, blob_files.size()); + ASSERT_EQ(1, blob_files[0]->BlobFileNumber()); + ASSERT_OK(blob_db_impl()->TEST_CloseBlobFile(blob_files[0])); + + ASSERT_OK(Put("bar", "v2")); + blob_files = blob_db_impl()->TEST_GetBlobFiles(); + ASSERT_EQ(2, blob_files.size()); + ASSERT_EQ(2, blob_files[1]->BlobFileNumber()); + ASSERT_OK(blob_db_impl()->TEST_CloseBlobFile(blob_files[1])); + + DB *base_db = blob_db_->GetRootDB(); + std::vector versions; + ASSERT_OK(GetAllKeyVersions(base_db, "", "", &versions)); + ASSERT_EQ(2, versions.size()); + ASSERT_EQ("bar", versions[0].user_key); + ASSERT_EQ("foo", versions[1].user_key); + VerifyDB({{"bar", "v2"}, {"foo", "v1"}}); + + ASSERT_OK(blob_db_->Flush(FlushOptions())); + ASSERT_OK(blob_db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + ASSERT_OK(GetAllKeyVersions(base_db, "", "", &versions)); + ASSERT_EQ(2, versions.size()); + ASSERT_EQ("bar", versions[0].user_key); + ASSERT_EQ("foo", versions[1].user_key); + VerifyDB({{"bar", "v2"}, {"foo", "v1"}}); + + // Remove the first blob file and compact. foo should be removed from the base db.
+ blob_db_impl()->TEST_ObsoleteBlobFile(blob_files[0]); + blob_db_impl()->TEST_DeleteObsoleteFiles(); + ASSERT_OK(blob_db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + ASSERT_OK(GetAllKeyVersions(base_db, "", "", &versions)); + ASSERT_EQ(1, versions.size()); + ASSERT_EQ("bar", versions[0].user_key); + VerifyDB({{"bar", "v2"}}); + + // Remove the second blob file and compact. bar should be removed from the base db. + blob_db_impl()->TEST_ObsoleteBlobFile(blob_files[1]); + blob_db_impl()->TEST_DeleteObsoleteFiles(); + ASSERT_OK(blob_db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + ASSERT_OK(GetAllKeyVersions(base_db, "", "", &versions)); + ASSERT_EQ(0, versions.size()); + VerifyDB({}); +} + +// Test compaction filter should filter any inlined TTL keys that would have +// been dropped by the last FIFO eviction if they were stored out-of-line. +TEST_F(BlobDBTest, FilterForFIFOEviction) { + Random rnd(215); + BlobDBOptions bdb_options; + bdb_options.min_blob_size = 100; + bdb_options.ttl_range_secs = 60; + bdb_options.max_db_size = 0; + bdb_options.disable_background_tasks = true; + Options options; + // Use mock env to stop wall clock. + mock_env_->set_current_time(0); + options.env = mock_env_.get(); + auto statistics = CreateDBStatistics(); + options.statistics = statistics; + options.disable_auto_compactions = true; + Open(bdb_options, options); + + std::map data; + std::map data_after_compact; + // Insert some small values that will be inlined. + for (int i = 0; i < 1000; i++) { + std::string key = "key" + ToString(i); + std::string value = test::RandomHumanReadableString(&rnd, 50); + uint64_t ttl = rnd.Next() % 120 + 1; + ASSERT_OK(PutWithTTL(key, value, ttl, &data)); + if (ttl >= 60) { + data_after_compact[key] = value; + } + } + uint64_t num_keys_to_evict = data.size() - data_after_compact.size(); + ASSERT_OK(blob_db_->Flush(FlushOptions())); + uint64_t live_sst_size = blob_db_impl()->TEST_live_sst_size(); + ASSERT_GT(live_sst_size, 0); + VerifyDB(data); + + bdb_options.max_db_size = live_sst_size + 30000; + bdb_options.is_fifo = true; + Reopen(bdb_options, options); + VerifyDB(data); + + // Put two large values, each on a different blob file. + std::string large_value(10000, 'v'); + ASSERT_OK(PutWithTTL("large_key1", large_value, 90)); + ASSERT_OK(PutWithTTL("large_key2", large_value, 150)); + ASSERT_EQ(2, blob_db_impl()->TEST_GetBlobFiles().size()); + ASSERT_EQ(0, statistics->getTickerCount(BLOB_DB_FIFO_NUM_FILES_EVICTED)); + data["large_key1"] = large_value; + data["large_key2"] = large_value; + VerifyDB(data); + + // Put a third large value which will bring the DB out of space. + // FIFO eviction will evict the file of large_key1. + ASSERT_OK(PutWithTTL("large_key3", large_value, 150)); + ASSERT_EQ(1, statistics->getTickerCount(BLOB_DB_FIFO_NUM_FILES_EVICTED)); + ASSERT_EQ(2, blob_db_impl()->TEST_GetBlobFiles().size()); + blob_db_impl()->TEST_DeleteObsoleteFiles(); + ASSERT_EQ(1, blob_db_impl()->TEST_GetBlobFiles().size()); + data.erase("large_key1"); + data["large_key3"] = large_value; + VerifyDB(data); + + // Putting some more small values. These values shouldn't be evicted by + // the compaction filter since they are inserted after FIFO eviction. + ASSERT_OK(PutWithTTL("foo", "v", 30, &data_after_compact)); + ASSERT_OK(PutWithTTL("bar", "v", 30, &data_after_compact)); + + // FIFO eviction doesn't trigger again since there is enough room for the flush.
+ ASSERT_OK(blob_db_->Flush(FlushOptions())); + ASSERT_EQ(1, statistics->getTickerCount(BLOB_DB_FIFO_NUM_FILES_EVICTED)); + + // Manually compact and check that the compaction filter evicts those keys with + // expiration < 60. + ASSERT_OK(blob_db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + // All keys with expiration < 60, plus large_key1, are filtered by + // the compaction filter. + ASSERT_EQ(num_keys_to_evict + 1, + statistics->getTickerCount(BLOB_DB_BLOB_INDEX_EVICTED_COUNT)); + ASSERT_EQ(1, statistics->getTickerCount(BLOB_DB_FIFO_NUM_FILES_EVICTED)); + ASSERT_EQ(1, blob_db_impl()->TEST_GetBlobFiles().size()); + data_after_compact["large_key2"] = large_value; + data_after_compact["large_key3"] = large_value; + VerifyDB(data_after_compact); +} + } // namespace blob_db } // namespace rocksdb diff --git a/utilities/blob_db/blob_file.cc b/utilities/blob_db/blob_file.cc index 324a9521d81..287e62a404b 100644 --- a/utilities/blob_db/blob_file.cc +++ b/utilities/blob_db/blob_file.cc @@ -35,13 +35,9 @@ BlobFile::BlobFile() compression_(kNoCompression), has_ttl_(false), blob_count_(0), - gc_epoch_(-1), file_size_(0), - deleted_count_(0), - deleted_size_(0), closed_(false), obsolete_(false), - gc_once_after_open_(false), expiration_range_({0, 0}), last_access_(-1), last_fsync_(0), @@ -58,13 +54,9 @@ BlobFile::BlobFile(const BlobDBImpl* p, const std::string& bdir, uint64_t fn, compression_(kNoCompression), has_ttl_(false), blob_count_(0), - gc_epoch_(-1), file_size_(0), - deleted_count_(0), - deleted_size_(0), closed_(false), obsolete_(false), - gc_once_after_open_(false), expiration_range_({0, 0}), last_access_(-1), last_fsync_(0), @@ -109,16 +101,14 @@ std::shared_ptr BlobFile::OpenSequentialReader( std::string BlobFile::DumpState() const { char str[1000]; - snprintf(str, sizeof(str), - "path: %s fn: %" PRIu64 " blob_count: %" PRIu64 " gc_epoch: %" PRIu64 " file_size: %" PRIu64 " deleted_count: %" PRIu64 - " deleted_size: %" PRIu64 - " closed: %d obsolete: %d expiration_range: (%" PRIu64 ", %" PRIu64 "), writer: %d reader: %d", - path_to_dir_.c_str(), file_number_, blob_count_.load(), - gc_epoch_.load(), file_size_.load(), deleted_count_, deleted_size_, - closed_.load(), obsolete_.load(), expiration_range_.first, - expiration_range_.second, (!!log_writer_), (!!ra_file_reader_)); + snprintf( + str, sizeof(str), + "path: %s fn: %" PRIu64 " blob_count: %" PRIu64 " file_size: %" PRIu64 + " closed: %d obsolete: %d expiration_range: (%" PRIu64 ", %" PRIu64 + "), writer: %d reader: %d", + path_to_dir_.c_str(), file_number_, blob_count_.load(), file_size_.load(), + closed_.load(), obsolete_.load(), expiration_range_.first, + expiration_range_.second, (!!log_writer_), (!!ra_file_reader_)); return str; } diff --git a/utilities/blob_db/blob_file.h b/utilities/blob_db/blob_file.h index 0dac911c0d9..b64f29ad005 100644 --- a/utilities/blob_db/blob_file.h +++ b/utilities/blob_db/blob_file.h @@ -53,18 +53,9 @@ class BlobFile { // number of blobs in the file std::atomic blob_count_; - // the file will be selected for GC in this future epoch - std::atomic gc_epoch_; - // size of the file std::atomic file_size_; - // number of blobs in this particular file which have been evicted - uint64_t deleted_count_; - - // size of deleted blobs (used by heuristic to select file for GC) - uint64_t deleted_size_; - BlobLogHeader header_; // closed_ = true implies the file is no longer mutable @@ -79,9 +70,6 @@ class BlobFile { // Data in this file is visible to a snapshot taken before the sequence.
SequenceNumber obsolete_sequence_; - // should this file been gc'd once to reconcile lost deletes/compactions - std::atomic gc_once_after_open_; - ExpirationRange expiration_range_; // Sequential/Append writer for blobs diff --git a/utilities/cassandra/cassandra_compaction_filter.cc b/utilities/cassandra/cassandra_compaction_filter.cc index af3e9a7799f..1b99d3a8b7d 100644 --- a/utilities/cassandra/cassandra_compaction_filter.cc +++ b/utilities/cassandra/cassandra_compaction_filter.cc @@ -17,13 +17,9 @@ const char* CassandraCompactionFilter::Name() const { } CompactionFilter::Decision CassandraCompactionFilter::FilterV2( - int level, - const Slice& key, - ValueType value_type, - const Slice& existing_value, - std::string* new_value, - std::string* skip_until) const { - + int /*level*/, const Slice& /*key*/, ValueType value_type, + const Slice& existing_value, std::string* new_value, + std::string* /*skip_until*/) const { bool value_changed = false; RowValue row_value = RowValue::Deserialize( existing_value.data(), existing_value.size()); diff --git a/utilities/cassandra/cassandra_functional_test.cc b/utilities/cassandra/cassandra_functional_test.cc index 5bd19cadbb4..895c2517769 100644 --- a/utilities/cassandra/cassandra_functional_test.cc +++ b/utilities/cassandra/cassandra_functional_test.cc @@ -100,7 +100,7 @@ class TestCompactionFilterFactory : public CompactionFilterFactory { gc_grace_period_in_seconds_(gc_grace_period_in_seconds) {} virtual std::unique_ptr CreateCompactionFilter( - const CompactionFilter::Context& context) override { + const CompactionFilter::Context& /*context*/) override { return unique_ptr(new CassandraCompactionFilter( purge_ttl_on_expiration_, gc_grace_period_in_seconds_)); } diff --git a/utilities/cassandra/merge_operator.cc b/utilities/cassandra/merge_operator.cc index ffd31340300..4e529a6f2a8 100644 --- a/utilities/cassandra/merge_operator.cc +++ b/utilities/cassandra/merge_operator.cc @@ -42,10 +42,8 @@ bool CassandraValueMergeOperator::FullMergeV2( } bool CassandraValueMergeOperator::PartialMergeMulti( - const Slice& key, - const std::deque& operand_list, - std::string* new_value, - Logger* logger) const { + const Slice& /*key*/, const std::deque& operand_list, + std::string* new_value, Logger* /*logger*/) const { // Clear the *new_value for writing. 
assert(new_value); new_value->clear(); diff --git a/utilities/checkpoint/checkpoint_impl.cc b/utilities/checkpoint/checkpoint_impl.cc index d93c7095f19..9135dbabfc2 100644 --- a/utilities/checkpoint/checkpoint_impl.cc +++ b/utilities/checkpoint/checkpoint_impl.cc @@ -37,8 +37,8 @@ Status Checkpoint::Create(DB* db, Checkpoint** checkpoint_ptr) { return Status::OK(); } -Status Checkpoint::CreateCheckpoint(const std::string& checkpoint_dir, - uint64_t log_size_for_flush) { +Status Checkpoint::CreateCheckpoint(const std::string& /*checkpoint_dir*/, + uint64_t /*log_size_for_flush*/) { return Status::NotSupported(""); } diff --git a/utilities/checkpoint/checkpoint_test.cc b/utilities/checkpoint/checkpoint_test.cc index 794097f2da7..f45db0e6b40 100644 --- a/utilities/checkpoint/checkpoint_test.cc +++ b/utilities/checkpoint/checkpoint_test.cc @@ -573,6 +573,18 @@ TEST_F(CheckpointTest, CheckpointInvalidDirectoryName) { } } +TEST_F(CheckpointTest, CheckpointWithParallelWrites) { + // When run with TSAN, this exposes the data race fixed in + // https://github.com/facebook/rocksdb/pull/3603 + ASSERT_OK(Put("key1", "val1")); + port::Thread thread([this]() { ASSERT_OK(Put("key2", "val2")); }); + Checkpoint* checkpoint; + ASSERT_OK(Checkpoint::Create(db_, &checkpoint)); + ASSERT_OK(checkpoint->CreateCheckpoint(snapshot_name_)); + delete checkpoint; + thread.join(); +} + } // namespace rocksdb int main(int argc, char** argv) { diff --git a/utilities/col_buf_decoder.h b/utilities/col_buf_decoder.h index 0311a410012..cea95263733 100644 --- a/utilities/col_buf_decoder.h +++ b/utilities/col_buf_decoder.h @@ -23,7 +23,7 @@ struct ColDeclaration; class ColBufDecoder { public: virtual ~ColBufDecoder() = 0; - virtual size_t Init(const char* src) { return 0; } + virtual size_t Init(const char* /*src*/) { return 0; } virtual size_t Decode(const char* src, char** dest) = 0; static ColBufDecoder* NewColBufDecoder(const ColDeclaration& col_declaration); diff --git a/utilities/compaction_filters/remove_emptyvalue_compactionfilter.cc b/utilities/compaction_filters/remove_emptyvalue_compactionfilter.cc index 43a25293456..49760ba5a97 100644 --- a/utilities/compaction_filters/remove_emptyvalue_compactionfilter.cc +++ b/utilities/compaction_filters/remove_emptyvalue_compactionfilter.cc @@ -16,12 +16,11 @@ const char* RemoveEmptyValueCompactionFilter::Name() const { return "RemoveEmptyValueCompactionFilter"; } -bool RemoveEmptyValueCompactionFilter::Filter(int level, - const Slice& key, - const Slice& existing_value, - std::string* new_value, - bool* value_changed) const { - +bool RemoveEmptyValueCompactionFilter::Filter(int /*level*/, + const Slice& /*key*/, + const Slice& existing_value, + std::string* /*new_value*/, + bool* /*value_changed*/) const { // remove kv pairs that have empty values return existing_value.empty(); } diff --git a/utilities/document/document_db.cc b/utilities/document/document_db.cc index 90b21bbe7cb..939327ed1bc 100644 --- a/utilities/document/document_db.cc +++ b/utilities/document/document_db.cc @@ -1044,24 +1044,25 @@ class DocumentDBImpl : public DocumentDB { // RocksDB functions using DB::Get; - virtual Status Get(const ReadOptions& options, - ColumnFamilyHandle* column_family, const Slice& key, - PinnableSlice* value) override { + virtual Status Get(const ReadOptions& /*options*/, + ColumnFamilyHandle* /*column_family*/, + const Slice& /*key*/, PinnableSlice* /*value*/) override { return Status::NotSupported(""); } - virtual Status Get(const ReadOptions& options, const Slice& key, - 
std::string* value) override { + virtual Status Get(const ReadOptions& /*options*/, const Slice& /*key*/, + std::string* /*value*/) override { return Status::NotSupported(""); } - virtual Status Write(const WriteOptions& options, - WriteBatch* updates) override { + virtual Status Write(const WriteOptions& /*options*/, + WriteBatch* /*updates*/) override { return Status::NotSupported(""); } - virtual Iterator* NewIterator(const ReadOptions& options, - ColumnFamilyHandle* column_family) override { + virtual Iterator* NewIterator( + const ReadOptions& /*options*/, + ColumnFamilyHandle* /*column_family*/) override { return nullptr; } - virtual Iterator* NewIterator(const ReadOptions& options) override { + virtual Iterator* NewIterator(const ReadOptions& /*options*/) override { return nullptr; } diff --git a/utilities/merge_operators.h b/utilities/merge_operators.h index 602a4d01aa3..4c720b822fe 100644 --- a/utilities/merge_operators.h +++ b/utilities/merge_operators.h @@ -3,13 +3,13 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). // -#ifndef MERGE_OPERATORS_H -#define MERGE_OPERATORS_H +#pragma once +#include "rocksdb/merge_operator.h" -#include #include -#include "rocksdb/merge_operator.h" +#include +#include namespace rocksdb { @@ -19,8 +19,10 @@ class MergeOperators { static std::shared_ptr CreateDeprecatedPutOperator(); static std::shared_ptr CreateUInt64AddOperator(); static std::shared_ptr CreateStringAppendOperator(); + static std::shared_ptr CreateStringAppendOperator(char delim_char); static std::shared_ptr CreateStringAppendTESTOperator(); static std::shared_ptr CreateMaxOperator(); + static std::shared_ptr CreateBytesXOROperator(); // Will return a different merge operator depending on the string. // TODO: Hook the "name" up to the actual Name() of the MergeOperators? @@ -38,14 +40,13 @@ class MergeOperators { return CreateStringAppendTESTOperator(); } else if (name == "max") { return CreateMaxOperator(); + } else if (name == "bytesxor") { + return CreateBytesXOROperator(); } else { // Empty or unknown, just return nullptr return nullptr; } } - }; -} // namespace rocksdb - -#endif +} // namespace rocksdb diff --git a/utilities/merge_operators/bytesxor.cc b/utilities/merge_operators/bytesxor.cc new file mode 100644 index 00000000000..cf9d9766455 --- /dev/null +++ b/utilities/merge_operators/bytesxor.cc @@ -0,0 +1,59 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#include +#include + +#include "utilities/merge_operators/bytesxor.h" + +namespace rocksdb { + +std::shared_ptr MergeOperators::CreateBytesXOROperator() { + return std::make_shared(); +} + +bool BytesXOROperator::Merge(const Slice& key, + const Slice* existing_value, + const Slice& value, + std::string* new_value, + Logger* logger) const { + XOR(existing_value, value, new_value); + return true; +} + +void BytesXOROperator::XOR(const Slice* existing_value, + const Slice& value, std::string* new_value) const { + if (!existing_value) { + new_value->clear(); + new_value->assign(value.data(), value.size()); + return; + } + + size_t min_size = std::min(existing_value->size(), value.size()); + size_t max_size = std::max(existing_value->size(), value.size()); + + new_value->clear(); + new_value->reserve(max_size); + + const char* existing_value_data = existing_value->data(); + const char* value_data = value.data(); + + for (size_t i = 0; i < min_size; i++) { + new_value->push_back(existing_value_data[i] ^ value_data[i]); + } + + if (existing_value->size() == max_size) { + for (size_t i = min_size; i < max_size; i++) { + new_value->push_back(existing_value_data[i]); + } + } else { + assert(value.size() == max_size); + for (size_t i = min_size; i < max_size; i++) { + new_value->push_back(value_data[i]); + } + } +} + +} // namespace rocksdb diff --git a/utilities/merge_operators/bytesxor.h b/utilities/merge_operators/bytesxor.h new file mode 100644 index 00000000000..1562ca852a4 --- /dev/null +++ b/utilities/merge_operators/bytesxor.h @@ -0,0 +1,42 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#ifndef UTILITIES_MERGE_OPERATORS_BYTESXOR_H_ +#define UTILITIES_MERGE_OPERATORS_BYTESXOR_H_ + +#include +#include +#include +#include "rocksdb/env.h" +#include "rocksdb/merge_operator.h" +#include "rocksdb/slice.h" +#include "util/coding.h" +#include "utilities/merge_operators.h" + +namespace rocksdb { + +// A 'model' merge operator that XORs two (same sized) arrays of bytes. +// Implemented as an AssociativeMergeOperator for simplicity and example. +class BytesXOROperator : public AssociativeMergeOperator { + public: + // XORs the two arrays of bytes one byte at a time and stores the result + // in new_value.
len is the number of xored bytes, and the length of new_value + virtual bool Merge(const Slice& key, + const Slice* existing_value, + const Slice& value, + std::string* new_value, + Logger* logger) const override; + + virtual const char* Name() const override { + return "BytesXOR"; + } + + void XOR(const Slice* existing_value, const Slice& value, + std::string* new_value) const; +}; + +} // namespace rocksdb + +#endif // UTILITIES_MERGE_OPERATORS_BYTESXOR_H_ diff --git a/utilities/merge_operators/max.cc b/utilities/merge_operators/max.cc index 5f42e816ef7..732f203e3f3 100644 --- a/utilities/merge_operators/max.cc +++ b/utilities/merge_operators/max.cc @@ -38,9 +38,9 @@ class MaxOperator : public MergeOperator { return true; } - virtual bool PartialMerge(const Slice& key, const Slice& left_operand, + virtual bool PartialMerge(const Slice& /*key*/, const Slice& left_operand, const Slice& right_operand, std::string* new_value, - Logger* logger) const override { + Logger* /*logger*/) const override { if (left_operand.compare(right_operand) >= 0) { new_value->assign(left_operand.data(), left_operand.size()); } else { @@ -49,10 +49,10 @@ class MaxOperator : public MergeOperator { return true; } - virtual bool PartialMergeMulti(const Slice& key, + virtual bool PartialMergeMulti(const Slice& /*key*/, const std::deque& operand_list, std::string* new_value, - Logger* logger) const override { + Logger* /*logger*/) const override { Slice max; for (const auto& operand : operand_list) { if (max.compare(operand) < 0) { diff --git a/utilities/merge_operators/put.cc b/utilities/merge_operators/put.cc index 7f206ad3b09..fcbf67d9b03 100644 --- a/utilities/merge_operators/put.cc +++ b/utilities/merge_operators/put.cc @@ -22,11 +22,10 @@ namespace { // anonymous namespace // From the client-perspective, semantics are the same. 
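Before the put-operator hunk below, a quick usage sketch for the new bytesxor operator added above. This is a hypothetical snippet assuming the standard DB::Open/Merge/Get calls; the path is invented for illustration:

```cpp
#include <cassert>
#include <string>

#include "rocksdb/db.h"
#include "rocksdb/options.h"
#include "utilities/merge_operators.h"

int main() {
  rocksdb::Options options;
  options.create_if_missing = true;
  // Wire in the new XOR operator added by this change.
  options.merge_operator = rocksdb::MergeOperators::CreateBytesXOROperator();

  rocksdb::DB* db = nullptr;
  rocksdb::Status s = rocksdb::DB::Open(options, "/tmp/bytesxor_demo", &db);
  assert(s.ok());

  // 0x0F XOR 0xFF == 0xF0 for the overlapping byte; trailing bytes of the
  // longer operand are carried through unchanged.
  assert(db->Merge(rocksdb::WriteOptions(), "k", std::string(1, '\x0f')).ok());
  assert(db->Merge(rocksdb::WriteOptions(), "k", std::string(1, '\xff')).ok());

  std::string value;
  s = db->Get(rocksdb::ReadOptions(), "k", &value);
  assert(s.ok() && value == std::string(1, '\xf0'));

  delete db;
  return 0;
}
```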
class PutOperator : public MergeOperator { public: - virtual bool FullMerge(const Slice& key, - const Slice* existing_value, + virtual bool FullMerge(const Slice& /*key*/, const Slice* /*existing_value*/, const std::deque& operand_sequence, std::string* new_value, - Logger* logger) const override { + Logger* /*logger*/) const override { // Put basically only looks at the current/latest value assert(!operand_sequence.empty()); assert(new_value != nullptr); @@ -34,20 +33,18 @@ class PutOperator : public MergeOperator { return true; } - virtual bool PartialMerge(const Slice& key, - const Slice& left_operand, - const Slice& right_operand, - std::string* new_value, - Logger* logger) const override { + virtual bool PartialMerge(const Slice& /*key*/, const Slice& /*left_operand*/, + const Slice& right_operand, std::string* new_value, + Logger* /*logger*/) const override { new_value->assign(right_operand.data(), right_operand.size()); return true; } using MergeOperator::PartialMergeMulti; - virtual bool PartialMergeMulti(const Slice& key, + virtual bool PartialMergeMulti(const Slice& /*key*/, const std::deque& operand_list, - std::string* new_value, Logger* logger) const - override { + std::string* new_value, + Logger* /*logger*/) const override { new_value->assign(operand_list.back().data(), operand_list.back().size()); return true; } @@ -58,10 +55,10 @@ class PutOperator : public MergeOperator { }; class PutOperatorV2 : public PutOperator { - virtual bool FullMerge(const Slice& key, const Slice* existing_value, - const std::deque& operand_sequence, - std::string* new_value, - Logger* logger) const override { + virtual bool FullMerge(const Slice& /*key*/, const Slice* /*existing_value*/, + const std::deque& /*operand_sequence*/, + std::string* /*new_value*/, + Logger* /*logger*/) const override { assert(false); return false; } diff --git a/utilities/merge_operators/string_append/stringappend.cc b/utilities/merge_operators/string_append/stringappend.cc index ff19348f07b..d9c135fd371 100644 --- a/utilities/merge_operators/string_append/stringappend.cc +++ b/utilities/merge_operators/string_append/stringappend.cc @@ -21,12 +21,10 @@ StringAppendOperator::StringAppendOperator(char delim_char) } // Implementation for the merge operation (concatenates two strings) -bool StringAppendOperator::Merge(const Slice& key, +bool StringAppendOperator::Merge(const Slice& /*key*/, const Slice* existing_value, - const Slice& value, - std::string* new_value, - Logger* logger) const { - + const Slice& value, std::string* new_value, + Logger* /*logger*/) const { // Clear the *new_value for writing. 
assert(new_value); new_value->clear(); @@ -54,4 +52,8 @@ std::shared_ptr MergeOperators::CreateStringAppendOperator() { return std::make_shared(','); } +std::shared_ptr MergeOperators::CreateStringAppendOperator(char delim_char) { + return std::make_shared(delim_char); +} + } // namespace rocksdb diff --git a/utilities/merge_operators/string_append/stringappend2.cc b/utilities/merge_operators/string_append/stringappend2.cc index 2d7b7423ce8..6e46d80a139 100644 --- a/utilities/merge_operators/string_append/stringappend2.cc +++ b/utilities/merge_operators/string_append/stringappend2.cc @@ -68,16 +68,16 @@ bool StringAppendTESTOperator::FullMergeV2( } bool StringAppendTESTOperator::PartialMergeMulti( - const Slice& key, const std::deque& operand_list, - std::string* new_value, Logger* logger) const { + const Slice& /*key*/, const std::deque& /*operand_list*/, + std::string* /*new_value*/, Logger* /*logger*/) const { return false; } // A version of PartialMerge that actually performs "partial merging". // Use this to simulate the exact behaviour of the StringAppendOperator. bool StringAppendTESTOperator::_AssocPartialMergeMulti( - const Slice& key, const std::deque& operand_list, - std::string* new_value, Logger* logger) const { + const Slice& /*key*/, const std::deque& operand_list, + std::string* new_value, Logger* /*logger*/) const { // Clear the *new_value for writing assert(new_value); new_value->clear(); diff --git a/utilities/merge_operators/uint64add.cc b/utilities/merge_operators/uint64add.cc index d7821737517..dc761e74b20 100644 --- a/utilities/merge_operators/uint64add.cc +++ b/utilities/merge_operators/uint64add.cc @@ -20,10 +20,8 @@ namespace { // anonymous namespace // Implemented as an AssociativeMergeOperator for simplicity and example. 
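One more aside before the uint64add hunk: the new CreateStringAppendOperator(char) overload registered above lets callers choose the delimiter instead of the default ','. A hedged sketch of what that enables; the database path and keys are invented for illustration:

```cpp
#include <cassert>
#include <string>

#include "rocksdb/db.h"
#include "rocksdb/options.h"
#include "utilities/merge_operators.h"

int main() {
  rocksdb::Options options;
  options.create_if_missing = true;
  // New overload from this change: append with '|' instead of the default ','.
  options.merge_operator =
      rocksdb::MergeOperators::CreateStringAppendOperator('|');

  rocksdb::DB* db = nullptr;
  rocksdb::Status s = rocksdb::DB::Open(options, "/tmp/stringappend_demo", &db);
  assert(s.ok());

  assert(db->Merge(rocksdb::WriteOptions(), "events", "login").ok());
  assert(db->Merge(rocksdb::WriteOptions(), "events", "click").ok());

  std::string value;
  s = db->Get(rocksdb::ReadOptions(), "events", &value);
  assert(s.ok() && value == "login|click");

  delete db;
  return 0;
}
```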
class UInt64AddOperator : public AssociativeMergeOperator { public: - virtual bool Merge(const Slice& key, - const Slice* existing_value, - const Slice& value, - std::string* new_value, + virtual bool Merge(const Slice& /*key*/, const Slice* existing_value, + const Slice& value, std::string* new_value, Logger* logger) const override { uint64_t orig_value = 0; if (existing_value){ diff --git a/utilities/object_registry_test.cc b/utilities/object_registry_test.cc index 40fb387bc93..fe69d9a3959 100644 --- a/utilities/object_registry_test.cc +++ b/utilities/object_registry_test.cc @@ -18,13 +18,14 @@ class EnvRegistryTest : public testing::Test { int EnvRegistryTest::num_a = 0; int EnvRegistryTest::num_b = 0; -static Registrar test_reg_a("a://.*", [](const std::string& uri, - std::unique_ptr* env_guard) { - ++EnvRegistryTest::num_a; - return Env::Default(); -}); +static Registrar test_reg_a("a://.*", + [](const std::string& /*uri*/, + std::unique_ptr* /*env_guard*/) { + ++EnvRegistryTest::num_a; + return Env::Default(); + }); -static Registrar test_reg_b("b://.*", [](const std::string& uri, +static Registrar test_reg_b("b://.*", [](const std::string& /*uri*/, std::unique_ptr* env_guard) { ++EnvRegistryTest::num_b; // Env::Default() is a singleton so we can't grant ownership directly to the diff --git a/utilities/options/options_util_test.cc b/utilities/options/options_util_test.cc index 6a4efd77881..ef5bbdfe606 100644 --- a/utilities/options/options_util_test.cc +++ b/utilities/options/options_util_test.cc @@ -103,29 +103,30 @@ class DummyTableFactory : public TableFactory { virtual const char* Name() const override { return "DummyTableFactory"; } virtual Status NewTableReader( - const TableReaderOptions& table_reader_options, - unique_ptr&& file, uint64_t file_size, - unique_ptr* table_reader, - bool prefetch_index_and_filter_in_cache) const override { + const TableReaderOptions& /*table_reader_options*/, + unique_ptr&& /*file*/, uint64_t /*file_size*/, + unique_ptr* /*table_reader*/, + bool /*prefetch_index_and_filter_in_cache*/) const override { return Status::NotSupported(); } virtual TableBuilder* NewTableBuilder( - const TableBuilderOptions& table_builder_options, - uint32_t column_family_id, WritableFileWriter* file) const override { + const TableBuilderOptions& /*table_builder_options*/, + uint32_t /*column_family_id*/, + WritableFileWriter* /*file*/) const override { return nullptr; } virtual Status SanitizeOptions( - const DBOptions& db_opts, - const ColumnFamilyOptions& cf_opts) const override { + const DBOptions& /*db_opts*/, + const ColumnFamilyOptions& /*cf_opts*/) const override { return Status::NotSupported(); } virtual std::string GetPrintableTableOptions() const override { return ""; } - Status GetOptionString(std::string* opt_string, - const std::string& delimiter) const override { + Status GetOptionString(std::string* /*opt_string*/, + const std::string& /*delimiter*/) const override { return Status::OK(); } }; @@ -135,15 +136,15 @@ class DummyMergeOperator : public MergeOperator { DummyMergeOperator() {} virtual ~DummyMergeOperator() {} - virtual bool FullMergeV2(const MergeOperationInput& merge_in, - MergeOperationOutput* merge_out) const override { + virtual bool FullMergeV2(const MergeOperationInput& /*merge_in*/, + MergeOperationOutput* /*merge_out*/) const override { return false; } - virtual bool PartialMergeMulti(const Slice& key, - const std::deque& operand_list, - std::string* new_value, - Logger* logger) const override { + virtual bool PartialMergeMulti(const 
Slice& /*key*/, + const std::deque& /*operand_list*/, + std::string* /*new_value*/, + Logger* /*logger*/) const override { return false; } @@ -162,10 +163,10 @@ class DummySliceTransform : public SliceTransform { virtual Slice Transform(const Slice& src) const { return src; } // determine whether this is a valid src upon the function applies - virtual bool InDomain(const Slice& src) const { return false; } + virtual bool InDomain(const Slice& /*src*/) const { return false; } // determine whether dst=Transform(src) for some src - virtual bool InRange(const Slice& dst) const { return false; } + virtual bool InRange(const Slice& /*dst*/) const { return false; } }; } // namespace diff --git a/utilities/persistent_cache/block_cache_tier_file.cc b/utilities/persistent_cache/block_cache_tier_file.cc index 9e419563a78..ce6335fb586 100644 --- a/utilities/persistent_cache/block_cache_tier_file.cc +++ b/utilities/persistent_cache/block_cache_tier_file.cc @@ -278,7 +278,7 @@ WriteableCacheFile::~WriteableCacheFile() { ClearBuffers(); } -bool WriteableCacheFile::Create(const bool enable_direct_writes, +bool WriteableCacheFile::Create(const bool /*enable_direct_writes*/, const bool enable_direct_reads) { WriteLock _(&rwlock_); diff --git a/utilities/persistent_cache/block_cache_tier_file.h b/utilities/persistent_cache/block_cache_tier_file.h index 3922136d67e..ef5dbab0408 100644 --- a/utilities/persistent_cache/block_cache_tier_file.h +++ b/utilities/persistent_cache/block_cache_tier_file.h @@ -103,13 +103,15 @@ class BlockCacheFile : public LRUElement { virtual ~BlockCacheFile() {} // append key/value to file and return LBA locator to user - virtual bool Append(const Slice& key, const Slice& val, LBA* const lba) { + virtual bool Append(const Slice& /*key*/, const Slice& /*val*/, + LBA* const /*lba*/) { assert(!"not implemented"); return false; } // read from the record locator (LBA) and return key, value and status - virtual bool Read(const LBA& lba, Slice* key, Slice* block, char* scratch) { + virtual bool Read(const LBA& /*lba*/, Slice* /*key*/, Slice* /*block*/, + char* /*scratch*/) { assert(!"not implemented"); return false; } diff --git a/utilities/persistent_cache/hash_table_test.cc b/utilities/persistent_cache/hash_table_test.cc index 1a6df4e6144..6fe5a596545 100644 --- a/utilities/persistent_cache/hash_table_test.cc +++ b/utilities/persistent_cache/hash_table_test.cc @@ -43,7 +43,7 @@ struct HashTableTest : public testing::Test { } }; - static void ClearNode(Node node) {} + static void ClearNode(Node /*node*/) {} HashTable map_; }; @@ -73,7 +73,7 @@ struct EvictableHashTableTest : public testing::Test { } }; - static void ClearNode(Node* node) {} + static void ClearNode(Node* /*node*/) {} EvictableHashTable map_; }; diff --git a/utilities/persistent_cache/persistent_cache_test.h b/utilities/persistent_cache/persistent_cache_test.h index 77fd172ba08..37e842f2e2a 100644 --- a/utilities/persistent_cache/persistent_cache_test.h +++ b/utilities/persistent_cache/persistent_cache_test.h @@ -233,8 +233,8 @@ class PersistentCacheDBTest : public DBTestBase { // insert data to table void Insert(const Options& options, - const BlockBasedTableOptions& table_options, const int num_iter, - std::vector* values) { + const BlockBasedTableOptions& /*table_options*/, + const int num_iter, std::vector* values) { CreateAndReopenWithCF({"pikachu"}, options); // default column family doesn't have block cache Options no_block_cache_opts; diff --git a/utilities/persistent_cache/persistent_cache_tier.cc 
b/utilities/persistent_cache/persistent_cache_tier.cc index 0f500e87127..732762a1652 100644 --- a/utilities/persistent_cache/persistent_cache_tier.cc +++ b/utilities/persistent_cache/persistent_cache_tier.cc @@ -75,12 +75,12 @@ Status PersistentCacheTier::Close() { return Status::OK(); } -bool PersistentCacheTier::Reserve(const size_t size) { +bool PersistentCacheTier::Reserve(const size_t /*size*/) { // default implementation is a pass through return true; } -bool PersistentCacheTier::Erase(const Slice& key) { +bool PersistentCacheTier::Erase(const Slice& /*key*/) { // default implementation is a pass through since not all cache tiers might // support erase return true; diff --git a/utilities/persistent_cache/volatile_tier_impl.cc b/utilities/persistent_cache/volatile_tier_impl.cc index d190a210282..177fc916904 100644 --- a/utilities/persistent_cache/volatile_tier_impl.cc +++ b/utilities/persistent_cache/volatile_tier_impl.cc @@ -106,7 +106,7 @@ Status VolatileCacheTier::Lookup(const Slice& page_key, return Status::NotFound("key not found in volatile cache"); } -bool VolatileCacheTier::Erase(const Slice& key) { +bool VolatileCacheTier::Erase(const Slice& /*key*/) { assert(!"not supported"); return true; } diff --git a/utilities/redis/redis_list_iterator.h b/utilities/redis/redis_list_iterator.h index 73907ddf8c4..7bfe206900c 100644 --- a/utilities/redis/redis_list_iterator.h +++ b/utilities/redis/redis_list_iterator.h @@ -288,7 +288,7 @@ class RedisListIterator { /// Will throw an exception based on the passed-in message. /// This function is guaranteed to STOP THE CONTROL-FLOW. /// (i.e.: you do not have to call "return" after calling ThrowError) - void ThrowError(const char* const msg = NULL) { + void ThrowError(const char* const /*msg*/ = nullptr) { // TODO: For now we ignore the msg parameter. This can be expanded later. 
throw RedisListException(); } diff --git a/utilities/redis/redis_lists.cc b/utilities/redis/redis_lists.cc index 2b38a2da4b3..3ba7470ec5d 100644 --- a/utilities/redis/redis_lists.cc +++ b/utilities/redis/redis_lists.cc @@ -101,7 +101,7 @@ bool RedisLists::Index(const std::string& key, int32_t index, if (curIndex == index && !it.Done()) { Slice elem; it.GetCurrent(&elem); - if (result != NULL) { + if (result != nullptr) { *result = elem.ToString(); } @@ -345,7 +345,7 @@ bool RedisLists::PopLeft(const std::string& key, std::string* result) { db_->Put(put_option_, key, it.WriteResult()); // Return the value - if (result != NULL) { + if (result != nullptr) { *result = elem.ToString(); } return true; @@ -385,7 +385,7 @@ bool RedisLists::PopRight(const std::string& key, std::string* result) { db_->Put(put_option_, key, it.WriteResult()); // Return the value - if (result != NULL) { + if (result != nullptr) { *result = elem.ToString(); } return true; diff --git a/utilities/simulator_cache/sim_cache_test.cc b/utilities/simulator_cache/sim_cache_test.cc index 4e979381729..0e504d44371 100644 --- a/utilities/simulator_cache/sim_cache_test.cc +++ b/utilities/simulator_cache/sim_cache_test.cc @@ -39,7 +39,7 @@ class SimCacheTest : public DBTestBase { return options; } - void InitTable(const Options& options) { + void InitTable(const Options& /*options*/) { std::string value(kValueSize, 'a'); for (size_t i = 0; i < kNumBlocks * 2; i++) { ASSERT_OK(Put(ToString(i), value.c_str())); diff --git a/utilities/spatialdb/spatial_db.cc b/utilities/spatialdb/spatial_db.cc index 539ddd06ee0..a9b990ee20f 100644 --- a/utilities/spatialdb/spatial_db.cc +++ b/utilities/spatialdb/spatial_db.cc @@ -704,7 +704,7 @@ DBOptions GetDBOptionsFromSpatialDBOptions(const SpatialDBOptions& options) { return db_options; } -ColumnFamilyOptions GetColumnFamilyOptions(const SpatialDBOptions& options, +ColumnFamilyOptions GetColumnFamilyOptions(const SpatialDBOptions& /*options*/, std::shared_ptr block_cache) { ColumnFamilyOptions column_family_options; column_family_options.write_buffer_size = 128 * 1024 * 1024; // 128MB diff --git a/utilities/table_properties_collectors/compact_on_deletion_collector.cc b/utilities/table_properties_collectors/compact_on_deletion_collector.cc index 1d6f969527b..fdb7389a7ab 100644 --- a/utilities/table_properties_collectors/compact_on_deletion_collector.cc +++ b/utilities/table_properties_collectors/compact_on_deletion_collector.cc @@ -29,10 +29,11 @@ CompactOnDeletionCollector::CompactOnDeletionCollector( // @params key the user key that is inserted into the table. // @params value the value that is inserted into the table. // @params file_size file size up to now -Status CompactOnDeletionCollector::AddUserKey( - const Slice& key, const Slice& value, - EntryType type, SequenceNumber seq, - uint64_t file_size) { +Status CompactOnDeletionCollector::AddUserKey(const Slice& /*key*/, + const Slice& /*value*/, + EntryType type, + SequenceNumber /*seq*/, + uint64_t /*file_size*/) { assert(!finished_); if (need_compaction_) { // If the output file already needs to be compacted, skip the check. 
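For context on the collector being modified in these hunks: it flags an SST file as needing compaction once enough deletions appear within a sliding window of consecutive entries. A small sketch of wiring it up through the public factory, assuming NewCompactOnDeletionCollectorFactory from rocksdb/utilities/table_properties_collectors.h; the window and trigger values below are arbitrary:

```cpp
#include <cassert>

#include "rocksdb/db.h"
#include "rocksdb/options.h"
#include "rocksdb/utilities/table_properties_collectors.h"

int main() {
  rocksdb::Options options;
  options.create_if_missing = true;
  // Mark a file as needing compaction when, within any window of 128
  // consecutive entries, at least 32 are deletions.
  options.table_properties_collector_factories.emplace_back(
      rocksdb::NewCompactOnDeletionCollectorFactory(
          /*sliding_window_size=*/128, /*deletion_trigger=*/32));

  rocksdb::DB* db = nullptr;
  rocksdb::Status s = rocksdb::DB::Open(options, "/tmp/deletion_demo", &db);
  assert(s.ok());
  delete db;
  return 0;
}
```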
@@ -68,7 +69,7 @@ Status CompactOnDeletionCollector::AddUserKey( TablePropertiesCollector* CompactOnDeletionCollectorFactory::CreateTablePropertiesCollector( - TablePropertiesCollectorFactory::Context context) { + TablePropertiesCollectorFactory::Context /*context*/) { return new CompactOnDeletionCollector( sliding_window_size_, deletion_trigger_); } diff --git a/utilities/table_properties_collectors/compact_on_deletion_collector.h b/utilities/table_properties_collectors/compact_on_deletion_collector.h index 9b8205108f3..5406ecea28a 100644 --- a/utilities/table_properties_collectors/compact_on_deletion_collector.h +++ b/utilities/table_properties_collectors/compact_on_deletion_collector.h @@ -60,7 +60,7 @@ class CompactOnDeletionCollector : public TablePropertiesCollector { // for writing the properties block. // @params properties User will add their collected statistics to // `properties`. - virtual Status Finish(UserCollectedProperties* properties) override { + virtual Status Finish(UserCollectedProperties* /*properties*/) override { finished_ = true; return Status::OK(); } diff --git a/utilities/table_properties_collectors/compact_on_deletion_collector_test.cc b/utilities/table_properties_collectors/compact_on_deletion_collector_test.cc index cfa44413997..c0768ec5bad 100644 --- a/utilities/table_properties_collectors/compact_on_deletion_collector_test.cc +++ b/utilities/table_properties_collectors/compact_on_deletion_collector_test.cc @@ -20,7 +20,7 @@ #include "util/random.h" #include "utilities/table_properties_collectors/compact_on_deletion_collector.h" -int main(int argc, char** argv) { +int main(int /*argc*/, char** /*argv*/) { const int kWindowSizes[] = {1000, 10000, 10000, 127, 128, 129, 255, 256, 257, 2, 10000}; const int kDeletionTriggers[] = diff --git a/utilities/transactions/pessimistic_transaction_db.cc b/utilities/transactions/pessimistic_transaction_db.cc index cd75209f103..b4bcc34f864 100644 --- a/utilities/transactions/pessimistic_transaction_db.cc +++ b/utilities/transactions/pessimistic_transaction_db.cc @@ -142,6 +142,11 @@ Status PessimisticTransactionDB::Initialize( } s = real_trx->RebuildFromWriteBatch(recovered_trx->batch_); + // WriteCommitted sets this to 0 to disable this check, which is specific to + // WritePrepared txns + assert(recovered_trx->batch_cnt_ == 0 || + real_trx->GetWriteBatch()->SubBatchCnt() == + recovered_trx->batch_cnt_); real_trx->SetState(Transaction::PREPARED); if (!s.ok()) { break; } @@ -368,7 +373,7 @@ Transaction* PessimisticTransactionDB::BeginInternalTransaction( // // Put(), Merge(), and Delete() only lock a single key per call. Write() will // sort its keys before locking them. This guarantees that TransactionDB write -// methods cannot deadlock with eachother (but still could deadlock with a +// methods cannot deadlock with each other (but still could deadlock with a // Transaction).
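The recovery assertion added above compares the sub-batch count persisted at Prepare() time with the count recomputed from the recovered batch; it is a no-op for WriteCommitted because batch_cnt_ stays 0 there. A minimal two-phase transaction flow that produces such prepare records, sketched against the public TransactionDB API (the path and transaction name are illustrative):

```cpp
#include <cassert>

#include "rocksdb/utilities/transaction.h"
#include "rocksdb/utilities/transaction_db.h"

int main() {
  rocksdb::Options options;
  options.create_if_missing = true;
  rocksdb::TransactionDBOptions txn_db_options;

  rocksdb::TransactionDB* db = nullptr;
  rocksdb::Status s = rocksdb::TransactionDB::Open(
      options, txn_db_options, "/tmp/txn_demo", &db);
  assert(s.ok());

  // Each Put locks a single key; Write() sorts keys before locking, so
  // TransactionDB write methods cannot deadlock with each other.
  rocksdb::Transaction* txn = db->BeginTransaction(rocksdb::WriteOptions());
  assert(txn->SetName("xid1").ok());  // a name is required for two-phase commit
  assert(txn->Put("foo", "bar").ok());
  assert(txn->Prepare().ok());  // persists the prepare marker to the WAL
  assert(txn->Commit().ok());
  delete txn;

  delete db;
  return 0;
}
```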
Status PessimisticTransactionDB::Put(const WriteOptions& options, ColumnFamilyHandle* column_family, diff --git a/utilities/transactions/pessimistic_transaction_db.h b/utilities/transactions/pessimistic_transaction_db.h index 1386b5c2229..af4114cc288 100644 --- a/utilities/transactions/pessimistic_transaction_db.h +++ b/utilities/transactions/pessimistic_transaction_db.h @@ -135,6 +135,7 @@ class PessimisticTransactionDB : public TransactionDB { friend class WritePreparedTxnDB; friend class WritePreparedTxnDBMock; friend class TransactionTest_DoubleEmptyWrite_Test; + friend class TransactionTest_DuplicateKeys_Test; friend class TransactionTest_PersistentTwoPhaseTransactionTest_Test; friend class TransactionTest_TwoPhaseLongPrepareTest_Test; friend class TransactionTest_TwoPhaseDoubleRecoveryTest_Test; diff --git a/utilities/transactions/transaction_base.h b/utilities/transactions/transaction_base.h index d336eb3a788..d42a6d1ba7f 100644 --- a/utilities/transactions/transaction_base.h +++ b/utilities/transactions/transaction_base.h @@ -180,7 +180,7 @@ class TransactionBaseImpl : public Transaction { WriteBatchWithIndex* GetWriteBatch() override; - virtual void SetLockTimeout(int64_t timeout) override { /* Do nothing */ + virtual void SetLockTimeout(int64_t /*timeout*/) override { /* Do nothing */ } const Snapshot* GetSnapshot() const override { diff --git a/utilities/transactions/transaction_test.cc b/utilities/transactions/transaction_test.cc index 7df5afa7c4e..9d9d0608258 100644 --- a/utilities/transactions/transaction_test.cc +++ b/utilities/transactions/transaction_test.cc @@ -770,6 +770,42 @@ TEST_P(TransactionTest, CommitTimeBatchFailTest) { delete txn1; } +TEST_P(TransactionTest, LogMarkLeakTest) { + TransactionOptions txn_options; + WriteOptions write_options; + options.write_buffer_size = 1024; + ReOpenNoDelete(); + Random rnd(47); + std::vector<Transaction*> txns; + DBImpl* db_impl = reinterpret_cast<DBImpl*>(db->GetRootDB()); + // At the beginning there should be no log containing prepare data + ASSERT_EQ(db_impl->TEST_FindMinLogContainingOutstandingPrep(), 0); + for (size_t i = 0; i < 100; i++) { + Transaction* txn = db->BeginTransaction(write_options, txn_options); + ASSERT_OK(txn->SetName("xid" + ToString(i))); + ASSERT_OK(txn->Put(Slice("foo" + ToString(i)), Slice("bar"))); + ASSERT_OK(txn->Prepare()); + ASSERT_GT(db_impl->TEST_FindMinLogContainingOutstandingPrep(), 0); + if (rnd.OneIn(5)) { + txns.push_back(txn); + } else { + ASSERT_OK(txn->Commit()); + delete txn; + } + db_impl->TEST_FlushMemTable(true); + } + for (auto txn : txns) { + ASSERT_OK(txn->Commit()); + delete txn; + } + // At the end there should be no log left containing prepare data + ASSERT_EQ(db_impl->TEST_FindMinLogContainingOutstandingPrep(), 0); + // Make sure that the underlying data structures are properly truncated + // and do not leak + ASSERT_EQ(db_impl->TEST_PreparedSectionCompletedSize(), 0); + ASSERT_EQ(db_impl->TEST_LogsWithPrepSize(), 0); +} + TEST_P(TransactionTest, SimpleTwoPhaseTransactionTest) { for (bool cwb4recovery : {true, false}) { ReOpen(); @@ -5015,6 +5051,36 @@ TEST_P(TransactionTest, Optimizations) { } } +// A comparator that uses only the first three bytes +class ThreeBytewiseComparator : public Comparator { + public: + ThreeBytewiseComparator() {} + virtual const char* Name() const override { + return "test.ThreeBytewiseComparator"; + } + virtual int Compare(const Slice& a, const Slice& b) const override { + Slice na = Slice(a.data(), a.size() < 3 ?
a.size() : 3); + Slice nb = Slice(b.data(), b.size() < 3 ? b.size() : 3); + return na.compare(nb); + } + virtual bool Equal(const Slice& a, const Slice& b) const override { + Slice na = Slice(a.data(), a.size() < 3 ? a.size() : 3); + Slice nb = Slice(b.data(), b.size() < 3 ? b.size() : 3); + return na == nb; + } + // The methods below don't seem relevant to this test. Implement them if + // proven otherwise. + void FindShortestSeparator(std::string* start, + const Slice& limit) const override { + const Comparator* bytewise_comp = BytewiseComparator(); + bytewise_comp->FindShortestSeparator(start, limit); + } + void FindShortSuccessor(std::string* key) const override { + const Comparator* bytewise_comp = BytewiseComparator(); + bytewise_comp->FindShortSuccessor(key); + } +}; + // Test that the transactional db can handle duplicate keys in the write batch TEST_P(TransactionTest, DuplicateKeys) { ColumnFamilyOptions cf_options; @@ -5054,35 +5120,6 @@ TEST_P(TransactionTest, DuplicateKeys) { // Test with non-bytewise comparator { - // A comparator that uses only the first three bytes - class ThreeBytewiseComparator : public Comparator { - public: - ThreeBytewiseComparator() {} - virtual const char* Name() const override { - return "test.ThreeBytewiseComparator"; - } - virtual int Compare(const Slice& a, const Slice& b) const override { - Slice na = Slice(a.data(), a.size() < 3 ? a.size() : 3); - Slice nb = Slice(b.data(), b.size() < 3 ? b.size() : 3); - return na.compare(nb); - } - virtual bool Equal(const Slice& a, const Slice& b) const override { - Slice na = Slice(a.data(), a.size() < 3 ? a.size() : 3); - Slice nb = Slice(b.data(), b.size() < 3 ? b.size() : 3); - return na == nb; - } - // This methods below dont seem relevant to this test. Implement them if - // proven othersize.
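For readers skimming the test above: duplicate detection is comparator-aware, so under ThreeBytewiseComparator the keys "key2" and "key2b" compare equal and must be counted as duplicates (hence the "value2b" expectations below). A tiny self-contained illustration of that equivalence (not part of the patch):

```cpp
#include <cassert>
#include <string>

// Mirrors the comparator above: only the first three bytes participate.
// std::string::substr clamps at the string end, so shorter keys also work.
int CompareFirstThree(const std::string& a, const std::string& b) {
  return a.substr(0, 3).compare(b.substr(0, 3));
}

int main() {
  assert(CompareFirstThree("key2", "key2b") == 0);  // duplicates under this cmp
  assert(CompareFirstThree("foo", "bar") != 0);
  return 0;
}
```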
- void FindShortestSeparator(std::string* start, - const Slice& limit) const override { - const Comparator* bytewise_comp = BytewiseComparator(); - bytewise_comp->FindShortestSeparator(start, limit); - } - void FindShortSuccessor(std::string* key) const override { - const Comparator* bytewise_comp = BytewiseComparator(); - bytewise_comp->FindShortSuccessor(key); - } - }; ReOpen(); std::unique_ptr<const Comparator> comp_gc(new ThreeBytewiseComparator()); cf_options.comparator = comp_gc.get(); @@ -5092,6 +5129,8 @@ TEST_P(TransactionTest, DuplicateKeys) { batch.Put(cf_handle, Slice("key"), Slice("value")); // The first three bytes are the same, do it must be counted as duplicate batch.Put(cf_handle, Slice("key2"), Slice("value2")); + // check for 2nd duplicate key in cf with non-default comparator + batch.Put(cf_handle, Slice("key2b"), Slice("value2b")); ASSERT_OK(db->Write(write_options, &batch)); // The value must be the most recent value for all the keys equal to "key", @@ -5099,7 +5138,7 @@ TEST_P(TransactionTest, DuplicateKeys) { ReadOptions ropt; PinnableSlice pinnable_val; ASSERT_OK(db->Get(ropt, cf_handle, "key", &pinnable_val)); - ASSERT_TRUE(pinnable_val == ("value2")); + ASSERT_TRUE(pinnable_val == ("value2b")); // Test duplicate keys with rollback TransactionOptions txn_options; @@ -5109,7 +5148,7 @@ TEST_P(TransactionTest, DuplicateKeys) { ASSERT_OK(txn0->Merge(cf_handle, Slice("key4"), Slice("value4"))); ASSERT_OK(txn0->Rollback()); ASSERT_OK(db->Get(ropt, cf_handle, "key5", &pinnable_val)); - ASSERT_TRUE(pinnable_val == ("value2")); + ASSERT_TRUE(pinnable_val == ("value2b")); delete txn0; delete cf_handle; @@ -5285,6 +5324,212 @@ TEST_P(TransactionTest, DuplicateKeys) { ASSERT_OK(txn0->Commit()); delete txn0; } + + // Test successful recovery after a crash + { + ReOpen(); + TransactionOptions txn_options; + WriteOptions write_options; + ReadOptions ropt; + Transaction* txn0; + PinnableSlice pinnable_val; + Status s; + + std::unique_ptr<const Comparator> comp_gc(new ThreeBytewiseComparator()); + cf_options.comparator = comp_gc.get(); + ASSERT_OK(db->CreateColumnFamily(cf_options, cf_name, &cf_handle)); + delete cf_handle; + std::vector<ColumnFamilyDescriptor> cfds{ + ColumnFamilyDescriptor(kDefaultColumnFamilyName, + ColumnFamilyOptions(options)), + ColumnFamilyDescriptor(cf_name, cf_options), + }; + std::vector<ColumnFamilyHandle*> handles; + ASSERT_OK(ReOpenNoDelete(cfds, &handles)); + + ASSERT_OK(db->Put(write_options, "foo0", "init")); + ASSERT_OK(db->Put(write_options, "foo1", "init")); + ASSERT_OK(db->Put(write_options, handles[1], "foo0", "init")); + ASSERT_OK(db->Put(write_options, handles[1], "foo1", "init")); + + // one entry + txn0 = db->BeginTransaction(write_options, txn_options); + ASSERT_OK(txn0->SetName("xid")); + ASSERT_OK(txn0->Put(Slice("foo0"), Slice("bar0a"))); + ASSERT_OK(txn0->Prepare()); + delete txn0; + // This will check the asserts inside recovery code + db->FlushWAL(true); + reinterpret_cast<PessimisticTransactionDB*>(db)->TEST_Crash(); + ASSERT_OK(ReOpenNoDelete(cfds, &handles)); + txn0 = db->GetTransactionByName("xid"); + ASSERT_TRUE(txn0 != nullptr); + ASSERT_OK(txn0->Commit()); + delete txn0; + s = db->Get(ropt, db->DefaultColumnFamily(), "foo0", &pinnable_val); + ASSERT_OK(s); + ASSERT_TRUE(pinnable_val == ("bar0a")); + + // two entries, no duplicate + txn0 = db->BeginTransaction(write_options, txn_options); + ASSERT_OK(txn0->SetName("xid")); + ASSERT_OK(txn0->Put(handles[1], Slice("foo0"), Slice("bar0b"))); + ASSERT_OK(txn0->Put(handles[1], Slice("fol1"), Slice("bar1b"))); + ASSERT_OK(txn0->Put(Slice("foo0"), Slice("bar0b"))); +
ASSERT_OK(txn0->Put(Slice("foo1"), Slice("bar1b"))); + ASSERT_OK(txn0->Prepare()); + delete txn0; + // This will check the asserts inside recovery code + db->FlushWAL(true); + // Flush only cf 1 + reinterpret_cast(db->GetRootDB()) + ->TEST_FlushMemTable(true, handles[1]); + reinterpret_cast(db)->TEST_Crash(); + ASSERT_OK(ReOpenNoDelete(cfds, &handles)); + txn0 = db->GetTransactionByName("xid"); + ASSERT_TRUE(txn0 != nullptr); + ASSERT_OK(txn0->Commit()); + delete txn0; + pinnable_val.Reset(); + s = db->Get(ropt, db->DefaultColumnFamily(), "foo0", &pinnable_val); + ASSERT_OK(s); + ASSERT_TRUE(pinnable_val == ("bar0b")); + pinnable_val.Reset(); + s = db->Get(ropt, db->DefaultColumnFamily(), "foo1", &pinnable_val); + ASSERT_OK(s); + ASSERT_TRUE(pinnable_val == ("bar1b")); + pinnable_val.Reset(); + s = db->Get(ropt, handles[1], "foo0", &pinnable_val); + ASSERT_OK(s); + ASSERT_TRUE(pinnable_val == ("bar0b")); + pinnable_val.Reset(); + s = db->Get(ropt, handles[1], "fol1", &pinnable_val); + ASSERT_OK(s); + ASSERT_TRUE(pinnable_val == ("bar1b")); + + // one duplicate with ::Put + txn0 = db->BeginTransaction(write_options, txn_options); + ASSERT_OK(txn0->SetName("xid")); + ASSERT_OK(txn0->Put(handles[1], Slice("key-nonkey0"), Slice("bar0c"))); + ASSERT_OK(txn0->Put(handles[1], Slice("key-nonkey1"), Slice("bar1d"))); + ASSERT_OK(txn0->Put(Slice("foo0"), Slice("bar0c"))); + ASSERT_OK(txn0->Put(Slice("foo1"), Slice("bar1c"))); + ASSERT_OK(txn0->Put(Slice("foo0"), Slice("bar0d"))); + ASSERT_OK(txn0->Prepare()); + delete txn0; + // This will check the asserts inside recovery code + db->FlushWAL(true); + // Flush only cf 1 + reinterpret_cast(db->GetRootDB()) + ->TEST_FlushMemTable(true, handles[1]); + reinterpret_cast(db)->TEST_Crash(); + ASSERT_OK(ReOpenNoDelete(cfds, &handles)); + txn0 = db->GetTransactionByName("xid"); + ASSERT_TRUE(txn0 != nullptr); + ASSERT_OK(txn0->Commit()); + delete txn0; + pinnable_val.Reset(); + s = db->Get(ropt, db->DefaultColumnFamily(), "foo0", &pinnable_val); + ASSERT_OK(s); + ASSERT_TRUE(pinnable_val == ("bar0d")); + pinnable_val.Reset(); + s = db->Get(ropt, db->DefaultColumnFamily(), "foo1", &pinnable_val); + ASSERT_OK(s); + ASSERT_TRUE(pinnable_val == ("bar1c")); + pinnable_val.Reset(); + s = db->Get(ropt, handles[1], "key-nonkey2", &pinnable_val); + ASSERT_OK(s); + ASSERT_TRUE(pinnable_val == ("bar1d")); + + // Duplicate with ::Put, ::Delete + txn0 = db->BeginTransaction(write_options, txn_options); + ASSERT_OK(txn0->SetName("xid")); + ASSERT_OK(txn0->Put(handles[1], Slice("key-nonkey0"), Slice("bar0e"))); + ASSERT_OK(txn0->Delete(handles[1], Slice("key-nonkey1"))); + ASSERT_OK(txn0->Put(Slice("foo0"), Slice("bar0e"))); + ASSERT_OK(txn0->Delete(Slice("foo0"))); + ASSERT_OK(txn0->Prepare()); + delete txn0; + // This will check the asserts inside recovery code + db->FlushWAL(true); + // Flush only cf 1 + reinterpret_cast(db->GetRootDB()) + ->TEST_FlushMemTable(true, handles[1]); + reinterpret_cast(db)->TEST_Crash(); + ASSERT_OK(ReOpenNoDelete(cfds, &handles)); + txn0 = db->GetTransactionByName("xid"); + ASSERT_TRUE(txn0 != nullptr); + ASSERT_OK(txn0->Commit()); + delete txn0; + pinnable_val.Reset(); + s = db->Get(ropt, db->DefaultColumnFamily(), "foo0", &pinnable_val); + ASSERT_TRUE(s.IsNotFound()); + pinnable_val.Reset(); + s = db->Get(ropt, handles[1], "key-nonkey2", &pinnable_val); + ASSERT_TRUE(s.IsNotFound()); + + // Duplicate with ::Put, ::SingleDelete + txn0 = db->BeginTransaction(write_options, txn_options); + ASSERT_OK(txn0->SetName("xid")); + 
ASSERT_OK(txn0->Put(handles[1], Slice("key-nonkey0"), Slice("bar0g"))); + ASSERT_OK(txn0->SingleDelete(handles[1], Slice("key-nonkey1"))); + ASSERT_OK(txn0->Put(Slice("foo0"), Slice("bar0e"))); + ASSERT_OK(txn0->SingleDelete(Slice("foo0"))); + ASSERT_OK(txn0->Prepare()); + delete txn0; + // This will check the asserts inside recovery code + db->FlushWAL(true); + // Flush only cf 1 + reinterpret_cast<DBImpl*>(db->GetRootDB()) + ->TEST_FlushMemTable(true, handles[1]); + reinterpret_cast<PessimisticTransactionDB*>(db)->TEST_Crash(); + ASSERT_OK(ReOpenNoDelete(cfds, &handles)); + txn0 = db->GetTransactionByName("xid"); + ASSERT_TRUE(txn0 != nullptr); + ASSERT_OK(txn0->Commit()); + delete txn0; + pinnable_val.Reset(); + s = db->Get(ropt, db->DefaultColumnFamily(), "foo0", &pinnable_val); + ASSERT_TRUE(s.IsNotFound()); + pinnable_val.Reset(); + s = db->Get(ropt, handles[1], "key-nonkey2", &pinnable_val); + ASSERT_TRUE(s.IsNotFound()); + + // Duplicate with ::Put, ::Merge + txn0 = db->BeginTransaction(write_options, txn_options); + ASSERT_OK(txn0->SetName("xid")); + ASSERT_OK(txn0->Put(handles[1], Slice("key-nonkey0"), Slice("bar1i"))); + ASSERT_OK(txn0->Merge(handles[1], Slice("key-nonkey1"), Slice("bar1j"))); + ASSERT_OK(txn0->Put(Slice("foo0"), Slice("bar0f"))); + ASSERT_OK(txn0->Merge(Slice("foo0"), Slice("bar0g"))); + ASSERT_OK(txn0->Prepare()); + delete txn0; + // This will check the asserts inside recovery code + db->FlushWAL(true); + // Flush only cf 1 + reinterpret_cast<DBImpl*>(db->GetRootDB()) + ->TEST_FlushMemTable(true, handles[1]); + reinterpret_cast<PessimisticTransactionDB*>(db)->TEST_Crash(); + ASSERT_OK(ReOpenNoDelete(cfds, &handles)); + txn0 = db->GetTransactionByName("xid"); + ASSERT_TRUE(txn0 != nullptr); + ASSERT_OK(txn0->Commit()); + delete txn0; + pinnable_val.Reset(); + s = db->Get(ropt, db->DefaultColumnFamily(), "foo0", &pinnable_val); + ASSERT_OK(s); + ASSERT_TRUE(pinnable_val == ("bar0f,bar0g")); + pinnable_val.Reset(); + s = db->Get(ropt, handles[1], "key-nonkey2", &pinnable_val); + ASSERT_OK(s); + ASSERT_TRUE(pinnable_val == ("bar1i,bar1j")); + + for (auto h : handles) { + delete h; + } + delete db; + db = nullptr; + } } } // namespace rocksdb diff --git a/utilities/transactions/transaction_test.h b/utilities/transactions/transaction_test.h index 950af6c13c6..beec0df40a9 100644 --- a/utilities/transactions/transaction_test.h +++ b/utilities/transactions/transaction_test.h @@ -101,6 +101,27 @@ class TransactionTestBase : public ::testing::Test { return s; } + Status ReOpenNoDelete(std::vector<ColumnFamilyDescriptor>& cfs, + std::vector<ColumnFamilyHandle*>* handles) { + for (auto h : *handles) { + delete h; + } + handles->clear(); + delete db; + db = nullptr; + env->AssertNoOpenFile(); + env->DropUnsyncedFileData(); + env->ResetState(); + Status s; + if (use_stackable_db_ == false) { + s = TransactionDB::Open(options, txn_db_options, dbname, cfs, handles, + &db); + } else { + s = OpenWithStackableDB(cfs, handles); + } + return s; + } + Status ReOpen() { delete db; DestroyDB(dbname, options); @@ -113,6 +134,24 @@ class TransactionTestBase : public ::testing::Test { return s; } + Status OpenWithStackableDB(std::vector<ColumnFamilyDescriptor>& cfs, + std::vector<ColumnFamilyHandle*>* handles) { + std::vector<size_t> compaction_enabled_cf_indices; + TransactionDB::PrepareWrap(&options, &cfs, &compaction_enabled_cf_indices); + DB* root_db; + Options options_copy(options); + const bool use_seq_per_batch = + txn_db_options.write_policy == WRITE_PREPARED; + Status s = DBImpl::Open(options_copy, dbname, cfs, handles, &root_db, + use_seq_per_batch); + if (s.ok()) { + s = TransactionDB::WrapStackableDB( + new StackableDB(root_db),
txn_db_options, + compaction_enabled_cf_indices, *handles, &db); + } + return s; + } + Status OpenWithStackableDB() { std::vector compaction_enabled_cf_indices; std::vector column_families{ColumnFamilyDescriptor( diff --git a/utilities/transactions/write_prepared_transaction_test.cc b/utilities/transactions/write_prepared_transaction_test.cc index 1e2c8bb4a46..cfba5a7a00d 100644 --- a/utilities/transactions/write_prepared_transaction_test.cc +++ b/utilities/transactions/write_prepared_transaction_test.cc @@ -101,7 +101,7 @@ TEST(PreparedHeap, BasicsTest) { heap.erase(89l); heap.erase(86l); heap.erase(88l); - // Test top remians the same after a ranodm order of many erases + // Test top remains the same after a random order of many erases ASSERT_EQ(64l, heap.top()); heap.pop(); // Test that pop works with a series of random pending erases @@ -240,7 +240,7 @@ TEST(WriteBatchWithIndex, SubBatchCnt) { ASSERT_EQ(batch_cnt, counter.BatchCount()); // Test that RollbackToSavePoint will properly resets the number of - // sub-bathces + // sub-batches for (size_t i = save_points; i > 0; i--) { batch.RollbackToSavePoint(); ASSERT_EQ(batch_cnt_at[i - 1], batch.SubBatchCnt()); @@ -280,7 +280,7 @@ TEST(CommitEntry64b, BasicTest) { const size_t INDEX_SIZE = static_cast(1ull << INDEX_BITS); const CommitEntry64bFormat FORMAT(static_cast(INDEX_BITS)); - // zero-initialized CommitEntry64b should inidcate an empty entry + // zero-initialized CommitEntry64b should indicate an empty entry CommitEntry64b empty_entry64b; uint64_t empty_index = 11ul; CommitEntry empty_entry; @@ -353,7 +353,7 @@ class WritePreparedTransactionTestBase : public TransactionTestBase { protected: // If expect_update is set, check if it actually updated old_commit_map_. If // it did not and yet suggested not to check the next snapshot, do the - // opposite to check if it was not a bad suggstion. + // opposite to check if it was not a bad suggestion. void MaybeUpdateOldCommitMapTestWithNext(uint64_t prepare, uint64_t commit, uint64_t snapshot, uint64_t next_snapshot, @@ -371,7 +371,7 @@ class WritePreparedTransactionTestBase : public TransactionTestBase { } EXPECT_EQ(!expect_update, wp_db->old_commit_map_empty_); if (!check_next && wp_db->old_commit_map_empty_) { - // do the oppotisite to make sure it was not a bad suggestion + // do the opposite to make sure it was not a bad suggestion const bool dont_care_bool = true; wp_db->MaybeUpdateOldCommitMap(prepare, commit, next_snapshot, dont_care_bool); @@ -772,7 +772,7 @@ TEST_P(WritePreparedTransactionTest, CheckAgainstSnapshotsTest) { wp_db->UpdateSnapshots(snapshots, version); ASSERT_EQ(snapshots.size(), wp_db->snapshots_total_); // seq numbers are chosen so that we have two of them between each two - // snapshots. If the diff of two consecuitive seq is more than 5, there is a + // snapshots. If the diff of two consecutive seq is more than 5, there is a // snapshot between them. std::vector seqs = {50l, 55l, 150l, 155l, 250l, 255l, 350l, 355l, 450l, 455l, 550l, 555l, 650l, 655l, @@ -904,7 +904,7 @@ TEST_P(WritePreparedTransactionTest, AdvanceMaxEvictedSeqBasicTest) { // a. max should be updated to new_max ASSERT_EQ(wp_db->max_evicted_seq_, new_max); // b. 
delayed prepared should contain every txn <= max and prepared should - // only contian txns > max + // only contain txns > max auto it = initial_prepared.begin(); for (; it != initial_prepared.end() && *it <= new_max; it++) { ASSERT_EQ(1, wp_db->delayed_prepared_.erase(*it)); @@ -930,6 +930,44 @@ TEST_P(WritePreparedTransactionTest, AdvanceMaxEvictedSeqBasicTest) { } } +// This tests that transactions with duplicate keys perform correctly after max +// has advanced past their prepared sequence numbers. This will not be the case +// if, for example, the txn does not add the prepared seq for the second +// sub-batch to the PrepareHeap structure. +TEST_P(WritePreparedTransactionTest, AdvanceMaxEvictedSeqWithDuplicatesTest) { + WriteOptions write_options; + TransactionOptions txn_options; + Transaction* txn0 = db->BeginTransaction(write_options, txn_options); + ASSERT_OK(txn0->SetName("xid")); + ASSERT_OK(txn0->Put(Slice("key"), Slice("value1"))); + ASSERT_OK(txn0->Put(Slice("key"), Slice("value2"))); + ASSERT_OK(txn0->Prepare()); + + WritePreparedTxnDB* wp_db = dynamic_cast<WritePreparedTxnDB*>(db); + // Ensure that all the prepared sequence numbers will be removed from the + // PrepareHeap. + SequenceNumber new_max = wp_db->COMMIT_CACHE_SIZE; + wp_db->AdvanceMaxEvictedSeq(0, new_max); + + ReadOptions ropt; + PinnableSlice pinnable_val; + auto s = db->Get(ropt, db->DefaultColumnFamily(), "key", &pinnable_val); + ASSERT_TRUE(s.IsNotFound()); + delete txn0; + + wp_db->db_impl_->FlushWAL(true); + wp_db->TEST_Crash(); + ReOpenNoDelete(); + wp_db = dynamic_cast<WritePreparedTxnDB*>(db); + wp_db->AdvanceMaxEvictedSeq(0, new_max); + s = db->Get(ropt, db->DefaultColumnFamily(), "key", &pinnable_val); + ASSERT_TRUE(s.IsNotFound()); + + txn0 = db->GetTransactionByName("xid"); + ASSERT_OK(txn0->Rollback()); + delete txn0; +} + TEST_P(WritePreparedTransactionTest, SeqAdvanceConcurrentTest) { // Given the sequential run of txns, with this timeout we should never see a // deadlock nor a timeout unless we have a key conflict, which should be @@ -1034,9 +1072,9 @@ TEST_P(WritePreparedTransactionTest, SeqAdvanceConcurrentTest) { } if (options.two_write_queues) { // In this case none of the above scheduling tricks to deterministically - // form merged bactches works because the writes go to saparte queues. + // form merged batches works because the writes go to separate queues. // This would result in different write groups in each run of the test. We - // still keep the test since althgouh non-deterministic and hard to debug, + // still keep the test since although non-deterministic and hard to debug, // it is still useful to have. // TODO(myabandeh): Add a deterministic unit test for two_write_queues } @@ -1069,7 +1107,7 @@ TEST_P(WritePreparedTransactionTest, SeqAdvanceConcurrentTest) { } } -// Run a couple of differnet txns among them some uncommitted. Restart the db at +// Run a couple of different txns, among them some uncommitted. Restart the db at // a couple points to check whether the list of uncommitted txns are recovered // properly. TEST_P(WritePreparedTransactionTest, BasicRecoveryTest) { @@ -1284,16 +1322,16 @@ TEST_P(WritePreparedTransactionTest, IsInSnapshotTest) { // only a few snapshots are below the max_evicted_seq_. for (int max_snapshots = 1; max_snapshots < 20; max_snapshots++) { // Leave some gap between the preliminary snapshots and the final snapshot - // that we check. This should test for also different overlapping scnearios + // that we check. 
This should also test for different overlapping scenarios // between the last snapshot and the commits. for (int max_gap = 1; max_gap < 10; max_gap++) { // Since we do not actually write to db, we mock the seq as it would be - // increaased by the db. The only exception is that we need db seq to + // increased by the db. The only exception is that we need db seq to // advance for our snapshots. for which we apply a dummy put each time we // increase our mock of seq. uint64_t seq = 0; // At each step we prepare a txn and then we commit it in the next txn. - // This emulates the consecuitive transactions that write to the same key + // This emulates the consecutive transactions that write to the same key uint64_t cur_txn = 0; // Number of snapshots taken so far int num_snapshots = 0; @@ -1306,7 +1344,7 @@ TEST_P(WritePreparedTransactionTest, IsInSnapshotTest) { // we add a new prepare txn. These do not mean to be committed for // snapshot inspection. std::set<uint64_t> prepared; - // We keep the list of txns comitted before we take the last snaphot. + // We keep the list of txns committed before we take the last snapshot. // These should be the only seq numbers that will be found in the snapshot std::set<uint64_t> committed_before; // The set of commit seq numbers to be excluded from IsInSnapshot queries diff --git a/utilities/transactions/write_prepared_txn.cc b/utilities/transactions/write_prepared_txn.cc index 6f573c3242e..9a8022ae15a 100644 --- a/utilities/transactions/write_prepared_txn.cc +++ b/utilities/transactions/write_prepared_txn.cc @@ -83,7 +83,9 @@ Status WritePreparedTxn::PrepareInternal() { // callback otherwise there is a non-zero chance of max dvancing prepare_seq // and readers assume the data as committed. if (s.ok()) { - wpt_db_->AddPrepared(prepare_seq); + for (size_t i = 0; i < prepare_batch_cnt_; i++) { + wpt_db_->AddPrepared(prepare_seq + i); + } } return s; } @@ -128,9 +130,14 @@ Status WritePreparedTxn::CommitInternal() { assert(s.ok()); commit_batch_cnt = counter.BatchCount(); } - WritePreparedCommitEntryPreReleaseCallback update_commit_map( - wpt_db_, db_impl_, prepare_seq, prepare_batch_cnt_, commit_batch_cnt); + const bool PREP_HEAP_SKIPPED = true; const bool disable_memtable = !includes_data; + const bool do_one_write = + !db_impl_->immutable_db_options().two_write_queues || disable_memtable; + const bool publish_seq = do_one_write; + WritePreparedCommitEntryPreReleaseCallback update_commit_map( + wpt_db_, db_impl_, prepare_seq, prepare_batch_cnt_, commit_batch_cnt, + !PREP_HEAP_SKIPPED, publish_seq); uint64_t seq_used = kMaxSequenceNumber; // Since the prepared batch is directly written to memtable, there is already // a connection between the memtable and its WAL, so there is no need to @@ -141,6 +148,38 @@ zero_log_number, disable_memtable, &seq_used, batch_cnt, &update_commit_map); assert(!s.ok() || seq_used != kMaxSequenceNumber); + if (LIKELY(do_one_write || !s.ok())) { + return s; + } // else do the 2nd write to publish seq + // Note: the 2nd write comes with a performance penalty. So if we have too + // many commits accompanied with CommitTimeWriteBatch and yet we cannot + // enable use_only_the_last_commit_time_batch_for_recovery_ optimization, + // two_write_queues should be disabled to avoid many additional writes here.
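The note closing the hunk above encodes a real tuning decision: with `two_write_queues` enabled, a commit that carries a CommitTimeWriteBatch needs a second write purely to publish the sequence number. A hedged configuration sketch of that trade-off (the helper function is hypothetical; whether it pays off is workload-dependent):

```cpp
#include "rocksdb/options.h"

// Sketch only: pick the queueing mode based on how often commits carry a
// CommitTimeWriteBatch, per the comment in CommitInternal() above.
rocksdb::DBOptions MakeDBOptions(bool mostly_commit_time_batches) {
  rocksdb::DBOptions db_options;
  // With one write queue a commit stays a single write; with two queues the
  // extra publish write shown above is needed.
  db_options.two_write_queues = !mostly_commit_time_batches;
  return db_options;
}
```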
+ class PublishSeqPreReleaseCallback : public PreReleaseCallback { + public: + explicit PublishSeqPreReleaseCallback(DBImpl* db_impl) + : db_impl_(db_impl) {} + virtual Status Callback(SequenceNumber seq, bool is_mem_disabled) override { + assert(is_mem_disabled); + assert(db_impl_->immutable_db_options().two_write_queues); + db_impl_->SetLastPublishedSequence(seq); + return Status::OK(); + } + + private: + DBImpl* db_impl_; + } publish_seq_callback(db_impl_); + WriteBatch empty_batch; + empty_batch.PutLogData(Slice()); + // In the absence of Prepare markers, use Noop as a batch separator + WriteBatchInternal::InsertNoop(&empty_batch); + const bool DISABLE_MEMTABLE = true; + const size_t ONE_BATCH = 1; + const uint64_t NO_REF_LOG = 0; + s = db_impl_->WriteImpl(write_options_, &empty_batch, nullptr, nullptr, + NO_REF_LOG, DISABLE_MEMTABLE, &seq_used, ONE_BATCH, + &publish_seq_callback); + assert(!s.ok() || seq_used != kMaxSequenceNumber); return s; } @@ -202,7 +241,7 @@ Status WritePreparedTxn::RollbackInternal() { return s; } - Status PutCF(uint32_t cf, const Slice& key, const Slice& val) override { + Status PutCF(uint32_t cf, const Slice& key, const Slice& /*val*/) override { return Rollback(cf, key); } @@ -214,7 +253,8 @@ Status WritePreparedTxn::RollbackInternal() { return Rollback(cf, key); } - Status MergeCF(uint32_t cf, const Slice& key, const Slice& val) override { + Status MergeCF(uint32_t cf, const Slice& key, + const Slice& /*val*/) override { return Rollback(cf, key); } @@ -239,14 +279,14 @@ Status WritePreparedTxn::RollbackInternal() { WriteBatchInternal::MarkRollback(&rollback_batch, name_); bool do_one_write = !db_impl_->immutable_db_options().two_write_queues; const bool DISABLE_MEMTABLE = true; - const uint64_t no_log_ref = 0; + const uint64_t NO_REF_LOG = 0; uint64_t seq_used = kMaxSequenceNumber; const size_t ZERO_PREPARES = 0; const size_t ONE_BATCH = 1; WritePreparedCommitEntryPreReleaseCallback update_commit_map( wpt_db_, db_impl_, kMaxSequenceNumber, ZERO_PREPARES, ONE_BATCH); s = db_impl_->WriteImpl(write_options_, &rollback_batch, nullptr, nullptr, - no_log_ref, !DISABLE_MEMTABLE, &seq_used, ONE_BATCH, + NO_REF_LOG, !DISABLE_MEMTABLE, &seq_used, ONE_BATCH, do_one_write ? 
&update_commit_map : nullptr); assert(!s.ok() || seq_used != kMaxSequenceNumber); if (!s.ok()) { @@ -255,7 +295,9 @@ Status WritePreparedTxn::RollbackInternal() { if (do_one_write) { // Mark the txn as rolled back uint64_t& rollback_seq = seq_used; - wpt_db_->RollbackPrepared(GetId(), rollback_seq); + for (size_t i = 0; i < prepare_batch_cnt_; i++) { + wpt_db_->RollbackPrepared(GetId() + i, rollback_seq); + } return s; } // else do the 2nd write for commit uint64_t& prepare_seq = seq_used; @@ -274,13 +316,15 @@ Status WritePreparedTxn::RollbackInternal() { // In the absence of Prepare markers, use Noop as a batch separator WriteBatchInternal::InsertNoop(&empty_batch); s = db_impl_->WriteImpl(write_options_, &empty_batch, nullptr, nullptr, - no_log_ref, DISABLE_MEMTABLE, &seq_used, ONE_BATCH, + NO_REF_LOG, DISABLE_MEMTABLE, &seq_used, ONE_BATCH, &update_commit_map_with_prepare); assert(!s.ok() || seq_used != kMaxSequenceNumber); // Mark the txn as rolled back uint64_t& rollback_seq = seq_used; if (s.ok()) { - wpt_db_->RollbackPrepared(GetId(), rollback_seq); + for (size_t i = 0; i < prepare_batch_cnt_; i++) { + wpt_db_->RollbackPrepared(GetId() + i, rollback_seq); + } } return s; diff --git a/utilities/transactions/write_prepared_txn.h b/utilities/transactions/write_prepared_txn.h index df2fb8c20dc..50ce899302a 100644 --- a/utilities/transactions/write_prepared_txn.h +++ b/utilities/transactions/write_prepared_txn.h @@ -34,8 +34,8 @@ namespace rocksdb { class WritePreparedTxnDB; -// This impl could write to DB also uncomitted data and then later tell apart -// committed data from uncomitted data. Uncommitted data could be after the +// This impl could write to DB also uncommitted data and then later tell apart +// committed data from uncommitted data. Uncommitted data could be after the // Prepare phase in 2PC (WritePreparedTxn) or before that // (WriteUnpreparedTxnImpl). class WritePreparedTxn : public PessimisticTransaction { diff --git a/utilities/transactions/write_prepared_txn_db.cc b/utilities/transactions/write_prepared_txn_db.cc index d91bdccaaba..48f185682ce 100644 --- a/utilities/transactions/write_prepared_txn_db.cc +++ b/utilities/transactions/write_prepared_txn_db.cc @@ -36,7 +36,10 @@ Status WritePreparedTxnDB::Initialize( assert(dbimpl != nullptr); auto rtxns = dbimpl->recovered_transactions(); for (auto rtxn : rtxns) { - AddPrepared(rtxn.second->seq_); + auto cnt = rtxn.second->batch_cnt_ ? 
rtxn.second->batch_cnt_ : 1; + for (size_t i = 0; i < cnt; i++) { + AddPrepared(rtxn.second->seq_ + i); + } } SequenceNumber prev_max = max_evicted_seq_; SequenceNumber last_seq = db_impl_->GetLatestSequenceNumber(); @@ -239,7 +242,7 @@ struct WritePreparedTxnDB::IteratorState { }; namespace { -static void CleanupWritePreparedTxnDBIterator(void* arg1, void* arg2) { +static void CleanupWritePreparedTxnDBIterator(void* arg1, void* /*arg2*/) { delete reinterpret_cast(arg1); } } // anonymous namespace @@ -436,7 +439,7 @@ void WritePreparedTxnDB::AddPrepared(uint64_t seq) { } void WritePreparedTxnDB::RollbackPrepared(uint64_t prep_seq, - uint64_t rollback_seq) { + uint64_t /*rollback_seq*/) { ROCKS_LOG_DETAILS( info_log_, "Txn %" PRIu64 " rolling back with rollback seq of " PRIu64 "", prep_seq, rollback_seq); @@ -475,7 +478,8 @@ void WritePreparedTxnDB::AddCommitted(uint64_t prepare_seq, uint64_t commit_seq, CommitEntry64b evicted_64b; CommitEntry evicted; bool to_be_evicted = GetCommitEntry(indexed_seq, &evicted_64b, &evicted); - if (to_be_evicted) { + if (LIKELY(to_be_evicted)) { + assert(evicted.prep_seq != prepare_seq); auto prev_max = max_evicted_seq_.load(std::memory_order_acquire); ROCKS_LOG_DETAILS(info_log_, "Evicting %" PRIu64 ",%" PRIu64 " with max %" PRIu64, @@ -491,7 +495,11 @@ void WritePreparedTxnDB::AddCommitted(uint64_t prepare_seq, uint64_t commit_seq, } bool succ = ExchangeCommitEntry(indexed_seq, evicted_64b, {prepare_seq, commit_seq}); - if (!succ) { + if (UNLIKELY(!succ)) { + ROCKS_LOG_ERROR(info_log_, + "ExchangeCommitEntry failed on [%" PRIu64 "] %" PRIu64 + ",%" PRIu64 " retrying...", + indexed_seq, prepare_seq, commit_seq); // A very rare event, in which the commit entry is updated before we do. // Here we apply a very simple solution of retrying. if (loop_cnt > 100) { @@ -783,16 +791,21 @@ WritePreparedTxnDB::~WritePreparedTxnDB() { db_impl_->CancelAllBackgroundWork(true /*wait*/); } +void SubBatchCounter::InitWithComp(const uint32_t cf) { + auto cmp = comparators_[cf]; + keys_[cf] = CFKeys(SetComparator(cmp)); +} + void SubBatchCounter::AddKey(const uint32_t cf, const Slice& key) { CFKeys& cf_keys = keys_[cf]; if (cf_keys.size() == 0) { // just inserted - auto cmp = comparators_[cf]; - keys_[cf] = CFKeys(SetComparator(cmp)); + InitWithComp(cf); } auto it = cf_keys.insert(key); if (it.second == false) { // second is false if a element already existed. 
batches_++; keys_.clear(); + InitWithComp(cf); keys_[cf].insert(key); } } diff --git a/utilities/transactions/write_prepared_txn_db.h b/utilities/transactions/write_prepared_txn_db.h index 63b66a75395..5de30ab8f5c 100644 --- a/utilities/transactions/write_prepared_txn_db.h +++ b/utilities/transactions/write_prepared_txn_db.h @@ -20,6 +20,7 @@ #include "rocksdb/db.h" #include "rocksdb/options.h" #include "rocksdb/utilities/transaction_db.h" +#include "util/set_comparator.h" #include "util/string_util.h" #include "utilities/transactions/pessimistic_transaction.h" #include "utilities/transactions/pessimistic_transaction_db.h" @@ -108,17 +109,17 @@ class WritePreparedTxnDB : public PessimisticTransactionDB { virtual void ReleaseSnapshot(const Snapshot* snapshot) override; - // Check whether the transaction that wrote the value with seqeunce number seq + // Check whether the transaction that wrote the value with sequence number seq // is visible to the snapshot with sequence number snapshot_seq bool IsInSnapshot(uint64_t seq, uint64_t snapshot_seq) const; - // Add the trasnaction with prepare sequence seq to the prepared list + // Add the transaction with prepare sequence seq to the prepared list void AddPrepared(uint64_t seq); // Rollback a prepared txn identified with prep_seq. rollback_seq is the seq // with which the additional data is written to cancel the txn effect. It can - // be used to idenitfy the snapshots that overlap with the rolled back txn. + // be used to identify the snapshots that overlap with the rolled back txn. void RollbackPrepared(uint64_t prep_seq, uint64_t rollback_seq); // Add the transaction with prepare sequence prepare_seq and commit sequence - // commit_seq to the commit map. prepare_skipped is set if the prpeare phase + // commit_seq to the commit map. prepare_skipped is set if the prepare phase // is skipped for this commit. loop_cnt is to detect infinite loops. void AddCommitted(uint64_t prepare_seq, uint64_t commit_seq, bool prepare_skipped = false, uint8_t loop_cnt = 0); @@ -157,7 +158,7 @@ class WritePreparedTxnDB : public PessimisticTransactionDB { }; // Prepare Seq (64 bits) = PAD ... PAD PREP PREP ... PREP INDEX INDEX ... - // INDEX Detal Seq (64 bits) = 0 0 0 0 0 0 0 0 0 0 0 0 DELTA DELTA ... + // INDEX Delta Seq (64 bits) = 0 0 0 0 0 0 0 0 0 0 0 0 DELTA DELTA ... // DELTA DELTA Encoded Value = PREP PREP .... PREP PREP DELTA DELTA // ... DELTA DELTA PAD: first bits of a seq that is reserved for tagging and // hence ignored PREP/INDEX: the used bits in a prepare seq number INDEX: the @@ -240,6 +241,8 @@ class WritePreparedTxnDB : public PessimisticTransactionDB { friend class PreparedHeap_Concurrent_Test; friend class WritePreparedTxnDBMock; friend class WritePreparedTransactionTest_AdvanceMaxEvictedSeqBasicTest_Test; + friend class + WritePreparedTransactionTest_AdvanceMaxEvictedSeqWithDuplicatesTest_Test; friend class WritePreparedTransactionTest_BasicRecoveryTest_Test; friend class WritePreparedTransactionTest_IsInSnapshotEmptyMapTest_Test; friend class WritePreparedTransactionTest_OldCommitMapGC_Test; @@ -273,12 +276,13 @@ class WritePreparedTxnDB : public PessimisticTransactionDB { while (!heap_.empty() && !erased_heap_.empty() && // heap_.top() > erased_heap_.top() could happen if we have erased // a non-existent entry. Ideally the user should not do that but we - // should be resiliant againt it. + // should be resilient against it. 
heap_.top() >= erased_heap_.top()) { if (heap_.top() == erased_heap_.top()) { heap_.pop(); } - auto erased __attribute__((__unused__)) = erased_heap_.top(); + uint64_t erased __attribute__((__unused__)); + erased = erased_heap_.top(); erased_heap_.pop(); // No duplicate prepare sequence numbers assert(erased_heap_.empty() || erased_heap_.top() != erased); @@ -328,7 +332,7 @@ class WritePreparedTxnDB : public PessimisticTransactionDB { // the time of updating the max. Thread-safety: this function can be called // concurrently. The concurrent invocations of this function is equivalent to // a serial invocation in which the last invocation is the one with the - // largetst new_max value. + // largest new_max value. void AdvanceMaxEvictedSeq(const SequenceNumber& prev_max, const SequenceNumber& new_max); @@ -340,9 +344,9 @@ class WritePreparedTxnDB : public PessimisticTransactionDB { void ReleaseSnapshotInternal(const SequenceNumber snap_seq); // Update the list of snapshots corresponding to the soon-to-be-updated - // max_eviceted_seq_. Thread-safety: this function can be called concurrently. + // max_evicted_seq_. Thread-safety: this function can be called concurrently. // The concurrent invocations of this function is equivalent to a serial - // invocation in which the last invocation is the one with the largetst + // invocation in which the last invocation is the one with the largest // version value. void UpdateSnapshots(const std::vector<SequenceNumber>& snapshots, const SequenceNumber& version); @@ -381,14 +385,14 @@ class WritePreparedTxnDB : public PessimisticTransactionDB { // Thread-safety is provided with snapshots_mutex_. std::vector<SequenceNumber> snapshots_; // The version of the latest list of snapshots. This can be used to avoid - // rewrittiing a list that is concurrently updated with a more recent version. + // rewriting a list that is concurrently updated with a more recent version. SequenceNumber snapshots_version_ = 0; // A heap of prepared transactions. Thread-safety is provided with // prepared_mutex_. PreparedHeap prepared_txns_; - // 2m entry, 16MB size - static const size_t DEF_COMMIT_CACHE_BITS = static_cast<size_t>(21); + // 8m entry, 64MB size + static const size_t DEF_COMMIT_CACHE_BITS = static_cast<size_t>(23); const size_t COMMIT_CACHE_BITS; const size_t COMMIT_CACHE_SIZE; const CommitEntry64bFormat FORMAT; @@ -406,7 +410,7 @@ class WritePreparedTxnDB : public PessimisticTransactionDB { // maintenance work under the lock. size_t INC_STEP_FOR_MAX_EVICTED = 1; // A map from old snapshots (expected to be used by a few read-only txns) to - // prpared sequence number of the evicted entries from commit_cache_ that + // prepared sequence number of the evicted entries from commit_cache_ that // overlaps with such snapshot. These are the prepared sequence numbers that // the snapshot, to which they are mapped, cannot assume to be committed just // because it is no longer in the commit_cache_.
The vector must be sorted @@ -457,19 +461,22 @@ class WritePreparedCommitEntryPreReleaseCallback : public PreReleaseCallback { SequenceNumber prep_seq, size_t prep_batch_cnt, size_t data_batch_cnt = 0, - bool prep_heap_skipped = false) + bool prep_heap_skipped = false, + bool publish_seq = true) : db_(db), db_impl_(db_impl), prep_seq_(prep_seq), prep_batch_cnt_(prep_batch_cnt), data_batch_cnt_(data_batch_cnt), prep_heap_skipped_(prep_heap_skipped), - includes_data_(data_batch_cnt_ > 0) { + includes_data_(data_batch_cnt_ > 0), + publish_seq_(publish_seq) { assert((prep_batch_cnt_ > 0) != (prep_seq == kMaxSequenceNumber)); // xor assert(prep_batch_cnt_ > 0 || data_batch_cnt_ > 0); } - virtual Status Callback(SequenceNumber commit_seq) override { + virtual Status Callback(SequenceNumber commit_seq, + bool is_mem_disabled) override { assert(includes_data_ || prep_seq_ != kMaxSequenceNumber); const uint64_t last_commit_seq = LIKELY(data_batch_cnt_ <= 1) ? commit_seq @@ -481,7 +488,7 @@ class WritePreparedCommitEntryPreReleaseCallback : public PreReleaseCallback { } // else there was no prepare phase if (includes_data_) { assert(data_batch_cnt_); - // Commit the data that is accompnaied with the commit request + // Commit the data that is accompanied with the commit request const bool PREPARE_SKIPPED = true; for (size_t i = 0; i < data_batch_cnt_; i++) { // For commit seq of each batch use the commit seq of the last batch. @@ -490,7 +497,8 @@ class WritePreparedCommitEntryPreReleaseCallback : public PreReleaseCallback { db_->AddCommitted(commit_seq + i, last_commit_seq, PREPARE_SKIPPED); } } - if (db_impl_->immutable_db_options().two_write_queues) { + if (db_impl_->immutable_db_options().two_write_queues && publish_seq_) { + assert(is_mem_disabled); // implies the 2nd queue // Publish the sequence number. We can do that here assuming the callback // is invoked only from one write queue, which would guarantee that the // publish sequence numbers will be in order, i.e., once a seq is @@ -515,21 +523,10 @@ class WritePreparedCommitEntryPreReleaseCallback : public PreReleaseCallback { // Either because it is commit without prepare or it has a // CommitTimeWriteBatch bool includes_data_; + // Whether the callback should also publish the commit seq number + bool publish_seq_; }; -// A wrapper around Comparator to make it usable in std::set -struct SetComparator { - explicit SetComparator() : user_comparator_(BytewiseComparator()) {} - explicit SetComparator(const Comparator* user_comparator) - : user_comparator_(user_comparator ? user_comparator - : BytewiseComparator()) {} - bool operator()(const Slice& lhs, const Slice& rhs) const { - return user_comparator_->Compare(lhs, rhs) < 0; - } - - private: - const Comparator* user_comparator_; -}; // Count the number of sub-batches inside a batch. A sub-batch does not have // duplicate keys.
struct SubBatchCounter : public WriteBatch::Handler { @@ -541,6 +538,7 @@ struct SubBatchCounter : public WriteBatch::Handler { size_t batches_; size_t BatchCount() { return batches_; } void AddKey(const uint32_t cf, const Slice& key); + void InitWithComp(const uint32_t cf); Status MarkNoop(bool) override { return Status::OK(); } Status MarkEndPrepare(const Slice&) override { return Status::OK(); } Status MarkCommit(const Slice&) override { return Status::OK(); } diff --git a/utilities/ttl/ttl_test.cc b/utilities/ttl/ttl_test.cc index c404b9a988c..7462fbb816d 100644 --- a/utilities/ttl/ttl_test.cc +++ b/utilities/ttl/ttl_test.cc @@ -301,8 +301,8 @@ class TtlTest : public testing::Test { // Keeps key if it is in [kSampleSize_/3, 2*kSampleSize_/3), // Change value if it is in [2*kSampleSize_/3, kSampleSize_) // Eg. kSampleSize_=6. Drop:key0-1...Keep:key2-3...Change:key4-5... - virtual bool Filter(int level, const Slice& key, - const Slice& value, std::string* new_value, + virtual bool Filter(int /*level*/, const Slice& key, const Slice& /*value*/, + std::string* new_value, bool* value_changed) const override { assert(new_value != nullptr); @@ -351,7 +351,7 @@ class TtlTest : public testing::Test { } virtual std::unique_ptr CreateCompactionFilter( - const CompactionFilter::Context& context) override { + const CompactionFilter::Context& /*context*/) override { return std::unique_ptr( new TestFilter(kSampleSize_, kNewValue_)); } diff --git a/utilities/write_batch_with_index/write_batch_with_index_test.cc b/utilities/write_batch_with_index/write_batch_with_index_test.cc index 5b1250a6431..105f7517d29 100644 --- a/utilities/write_batch_with_index/write_batch_with_index_test.cc +++ b/utilities/write_batch_with_index/write_batch_with_index_test.cc @@ -63,7 +63,7 @@ struct TestHandler : public WriteBatch::Handler { seen[column_family_id].push_back(e); return Status::OK(); } - virtual void LogData(const Slice& blob) {} + virtual void LogData(const Slice& /*blob*/) {} virtual Status DeleteCF(uint32_t column_family_id, const Slice& key) { Entry e; e.key = key.ToString();
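To close out SubBatchCounter: as its AddKey implementation earlier in this diff shows, a batch is split into a new sub-batch whenever a key repeats, and the repeat is detected through the `.second` of `std::set::insert`. A standalone sketch of the same counting logic (simplified to a single column family with bytewise comparison; not RocksDB code):

```cpp
#include <set>
#include <string>
#include <vector>

// Count sub-batches: start a new one each time a key is seen twice, clearing
// the seen-set so the duplicate key opens the next sub-batch.
size_t CountSubBatches(const std::vector<std::string>& keys) {
  size_t batches = 1;
  std::set<std::string> seen;
  for (const auto& key : keys) {
    if (!seen.insert(key).second) {  // false => key already in this sub-batch
      batches++;
      seen.clear();
      seen.insert(key);
    }
  }
  return batches;
}
// Example: {"a", "b", "a", "c"} yields 2 sub-batches: {a, b} and {a, c}.
```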