diff --git a/CMakeLists.txt b/CMakeLists.txt
index 090f585..fbf3568 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -17,4 +17,6 @@ FIND_PACKAGE( Boost 1.65 REQUIRED COMPONENTS filesystem regex)
 INCLUDE_DIRECTORIES( ${Boost_INCLUDE_DIR} )
 
 # Include sub-projects.
-add_subdirectory ("DSCSTools")
\ No newline at end of file
+add_subdirectory ("DSCSTools")
+
+install (FILES LICENSE THIRD-PARTY-NOTICE DESTINATION ${CMAKE_BINARY_DIR}/target/license)
\ No newline at end of file
diff --git a/DSCSTools/CMakeLists.txt b/DSCSTools/CMakeLists.txt
index 23399d2..a3ea1fa 100644
--- a/DSCSTools/CMakeLists.txt
+++ b/DSCSTools/CMakeLists.txt
@@ -16,4 +16,4 @@ add_executable (DSCSTools ${SOURCE_FILES})
 target_link_libraries(DSCSTools PUBLIC doboz ${Boost_LIBRARIES})
 
 install (TARGETS DSCSTools DESTINATION ${CMAKE_BINARY_DIR}/target)
-install (DIRECTORY structures/ DESTINATION ${CMAKE_BINARY_DIR}/target)
\ No newline at end of file
+install (DIRECTORY structures/ DESTINATION ${CMAKE_BINARY_DIR}/target/structures)
\ No newline at end of file
diff --git a/DSCSTools/DSCSTools.cpp b/DSCSTools/DSCSTools.cpp
index 59303d6..fcfccc1 100644
--- a/DSCSTools/DSCSTools.cpp
+++ b/DSCSTools/DSCSTools.cpp
@@ -22,6 +22,9 @@ void printUse() {
 	std::cout << "	--mbeextract <source> <target>" << std::endl;
 	std::cout << "		Extracts a .mbe file or a directory of them into CSV, " << std::endl;
 	std::cout << "		as long as its structure is defined in the structure.json file." << std::endl;
+	std::cout << "	--mbepack <source> <target>" << std::endl;
+	std::cout << "		Repacks an .mbe folder containing CSV files back into a .mbe file, " << std::endl;
+	std::cout << "		as long as its structure is found and defined in the structure.json file." << std::endl;
 }
 
 int main(int argc, char** argv) {
@@ -58,6 +61,10 @@ int main(int argc, char** argv) {
 		extractMBE(source, target);
 		std::cout << "Done" << std::endl;
 	}
+	else if (strncmp("--mbepack", argv[1], 10) == 0) {
+		packMBE(source, target);
+		std::cout << "Done" << std::endl;
+	}
 	else {
 		printUse();
 	}
diff --git a/DSCSTools/EXPA.cpp b/DSCSTools/EXPA.cpp
index 9f6061e..4065a1c 100644
--- a/DSCSTools/EXPA.cpp
+++ b/DSCSTools/EXPA.cpp
@@ -14,22 +14,10 @@
 #include <iostream>
 #include <boost/filesystem.hpp>
 #include <boost/property_tree/json_parser.hpp>
+#include <memory>
+#include <boost/algorithm/string.hpp>
 
-template<typename T>
-inline T readFromStream(std::ifstream &stream, uint32_t size = sizeof(T)) {
-	char* buffer = new char[size];
-	stream.read(buffer, size);
-
-	return *reinterpret_cast<T*>(buffer);
-}
-
-template <>
-inline char* readFromStream<char*>(std::ifstream &stream, uint32_t size) {
-	char* buffer = new char[size];
-	stream.read(buffer, size);
-
-	return buffer;
-}
+#include "../libs/csv-parser/parser.hpp"
 
 struct EXPAHeader {
 	uint32_t magicValue;
@@ -63,11 +51,26 @@ struct CHNKEntry {
 	char* string;
 };
 
-void writeEXPAEntry(std::ofstream &output, char* &ptr, std::string type) {
+uint32_t getEntrySize(const std::string &type, uint32_t currentSize) {
+	if (type == "byte")
+		return 1;
+	if (type == "short")
+		return 2 + ((uint32_t) currentSize % 2);
+	if (type == "int")
+		return 4 + ((uint32_t) currentSize % 4);
+	else if (type == "string")
+		return 8 + ((uint32_t) currentSize % 8);
+	else if (type == "int array")
+		return 16 + ((uint32_t) currentSize % 8);
+
+	return 0;
+}
+
+void writeEXPAEntry(std::ofstream &output, char* &ptr, const std::string &type) {
+	// TODO remove boilerplate
 	if (type == "int") {
 		ptr = ptr + ((std::size_t) ptr % 4);
-		output << *reinterpret_cast<int*>(ptr);
+		output << *reinterpret_cast<int32_t*>(ptr);
 		ptr += 4;
 	}
 	else if (type == "string") {
 		ptr = ptr + ((std::size_t) ptr % 8);
 		char* strPtr = *reinterpret_cast<char**>(ptr);
 		output << std::quoted(strPtr == nullptr ? "" : std::string(strPtr + 8), '\"', '\"');
 		ptr += 8;
 	}
+	else if (type == "byte") {
+		ptr = ptr + ((std::size_t) ptr % 1);
+		output << (int32_t) *reinterpret_cast<int8_t*>(ptr);
+		ptr += 1;
+	}
+	else if (type == "short") {
+		ptr = ptr + ((std::size_t) ptr % 2);
+		output << *reinterpret_cast<int16_t*>(ptr);
+		ptr += 2;
+	}
+	else if (type == "int array") {
+		ptr = ptr + ((std::size_t) ptr % 8);
+		uint32_t elemCount = *reinterpret_cast<uint32_t*>(ptr);
+		ptr += 4;
+		ptr = ptr + ((std::size_t) ptr % 8);
+		int32_t* arrPtr = *reinterpret_cast<int32_t**>(ptr);
+
+		for (uint32_t i = 0; i < elemCount; i++)
+			output << arrPtr[i+2] << ((i != (elemCount - 1)) ? " " : "");
+		ptr += 8;
+	}
 }
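Reviewer note: `getEntrySize` computes each field's leading padding as `currentSize % alignment`, and `writeEXPAEntry` advances its read pointer with the same `% alignment` arithmetic, so the packer and the extractor stay consistent with each other. A minimal standalone sketch of how a row's offsets accumulate under this convention (the field order and the `entrySizeAfter` helper are hypothetical, not part of the codebase):

```cpp
#include <cstdint>
#include <iostream>
#include <string>
#include <vector>

// Same convention as getEntrySize in this diff: each field contributes its
// base size plus `currentSize % alignment` bytes of padding in front of it.
uint32_t entrySizeAfter(const std::string &type, uint32_t currentSize) {
	if (type == "byte")   return currentSize + 1;
	if (type == "short")  return currentSize + 2 + currentSize % 2;
	if (type == "int")    return currentSize + 4 + currentSize % 4;
	if (type == "string") return currentSize + 8 + currentSize % 8;
	return currentSize;
}

int main() {
	// Hypothetical row layout; not a real table from the game.
	std::vector<std::string> row = { "byte", "byte", "short", "int", "string" };

	uint32_t size = 0;
	for (const auto &type : row) {
		size = entrySizeAfter(type, size);
		std::cout << type << " -> running size " << size << "\n";
	}
	// packMBE then rounds the whole entry up with `entrySize += entrySize % 8`.
	std::cout << "padded entry size: " << size + size % 8 << "\n";
}
```

Worth flagging that `size % align` is not the textbook padding expression `(align - size % align) % align`; because both the read path and the write path use the identical expression, the two sides still agree with each other.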
+
+boost::property_tree::ptree getStructureFile(boost::filesystem::path source) {
+	boost::property_tree::ptree format;
+
+	boost::property_tree::ptree structure;
+	boost::property_tree::read_json(std::string("structures/structure.json"), structure);
+
+	std::string formatFile;
+	for (auto var : structure) {
+		if (boost::regex_search(source.string(), boost::regex{ var.first })) {
+			formatFile = var.second.data();
+			break;
+		}
+	}
+
+	if (formatFile.empty())
+		return format;
+
+	boost::property_tree::read_json("structures/" + formatFile, format);
+
+	return format;
+}
 
 void extractMBEFile(boost::filesystem::path source, boost::filesystem::path target) {
-	boost::filesystem::ifstream input(source, std::ios::in | std::ios::binary);
-
 	if (!boost::filesystem::exists(target))
 		boost::filesystem::create_directories(target);
 	else if(!boost::filesystem::is_directory(target)) {
@@ -88,20 +132,26 @@ void extractMBEFile(boost::filesystem::path source, boost::filesystem::path targ
 		return;
 	}
 
+	boost::filesystem::ifstream input(source, std::ios::in | std::ios::binary);
 	input.seekg(0, std::ios::end);
 	std::streamoff length = input.tellg();
 	input.seekg(0, std::ios::beg);
 
-	char* data = new char[length];
-	input.read(data, length);
+	std::unique_ptr<char[]> data = std::make_unique<char[]>(length);
+	input.read(data.get(), length);
 
-	EXPAHeader* header = reinterpret_cast<EXPAHeader*>(data);
-	std::vector<EXPATable> tables;
+	EXPAHeader* header = reinterpret_cast<EXPAHeader*>(data.get());
+
+	if (header->magicValue != 0x41505845) { // "EXPA"
+		std::cout << "Error: source file is not in EXPA format." << std::endl;
+		return;
+	}
 
+	std::vector<EXPATable> tables;
 	uint64_t offset = 8;
 
 	for (uint32_t i = 0; i < header->numTables; i++) {
-		EXPATable table = { data + offset };
+		EXPATable table = { data.get() + offset };
 		tables.push_back(table);
 
 		offset += table.nameSize() + 0x0C;
@@ -112,46 +162,24 @@ void extractMBEFile(boost::filesystem::path source, boost::filesystem::path targ
 		offset += table.entryCount() * (table.entrySize() + table.entrySize() % 8);
 	}
 
-	CHNKHeader* chunkHeader = reinterpret_cast<CHNKHeader*>(data + offset);
+	CHNKHeader* chunkHeader = reinterpret_cast<CHNKHeader*>(data.get() + offset);
 	offset += 8;
 
 	for (uint32_t i = 0; i < chunkHeader->numEntry; i++) {
-		uint32_t dataOffset = *reinterpret_cast<uint32_t*>(data + offset);
-		uint32_t size = *reinterpret_cast<uint32_t*>(data + offset + 4);
-		uint64_t ptr = reinterpret_cast<uint64_t>(data + offset);
+		uint32_t dataOffset = *reinterpret_cast<uint32_t*>(data.get() + offset);
+		uint32_t size = *reinterpret_cast<uint32_t*>(data.get() + offset + 4);
+		uint64_t ptr = reinterpret_cast<uint64_t>(data.get() + offset);
 
-		memcpy(data + dataOffset, &ptr, 8);
+		memcpy(data.get() + dataOffset, &ptr, 8);
 		offset += (size + 8);
 	}
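Reviewer note: the CHNK loop above patches each chunk's 8-byte slot inside the EXPA block with the in-memory address of the chunk entry itself; that is why the string reader later dereferences the slot and skips 8 bytes of offset/size header (`strPtr + 8`). A self-contained sketch of the same patching idea, with a hypothetical 32-byte buffer standing in for the loaded file:

```cpp
#include <cstdint>
#include <cstring>
#include <iostream>
#include <string>

int main() {
	// Hypothetical mini layout: an 8-byte string slot at offset 0, then a
	// CHNK-style entry at offset 8 (4-byte target offset, 4-byte size, payload).
	char buffer[32] = {};
	uint32_t dataOffset = 0;	// the slot this entry should be patched into
	uint32_t size = 7;
	std::memcpy(buffer + 8, &dataOffset, 4);
	std::memcpy(buffer + 12, &size, 4);
	std::memcpy(buffer + 16, "example", 7);

	// The patch from the diff: store the entry's own address in the slot,
	// exactly like `memcpy(data.get() + dataOffset, &ptr, 8)`.
	uint64_t ptr = reinterpret_cast<uint64_t>(buffer + 8);
	std::memcpy(buffer + dataOffset, &ptr, 8);

	// Reading a "string" field later: dereference the slot, then skip the
	// 8-byte offset/size header, mirroring `strPtr + 8` in writeEXPAEntry.
	char* strPtr = *reinterpret_cast<char**>(buffer + dataOffset);
	std::cout << std::string(strPtr + 8) << "\n";	// prints "example"
}
```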
 
-	std::vector<std::vector<std::string>> map;
-
-	boost::property_tree::ptree structure;
-	try {
-		boost::property_tree::read_json(std::string("structure.json"), structure);
-	}
-	catch (const boost::property_tree::json_parser_error &error) {
-		std::cout << "Error while reading structure.json | " << error.message() << " in line " << error.line() << "." << std::endl;
-		return;
-	}
-
-	std::string formatFile;
-	for (auto var : structure) {
-		if (boost::regex_search(source.string(), boost::regex{ var.first })) {
-			formatFile = var.second.data();
-			break;
-		}
-	}
-
-	if (formatFile.empty())
-		return;
-
 	boost::property_tree::ptree format;
 	try {
-		boost::property_tree::read_json(formatFile, format);
+		format = getStructureFile(source);
 	}
 	catch (const boost::property_tree::json_parser_error &error) {
-		std::cout << "Error while reading "<< error.filename() << " | " << error.message() << " in line " << error.line() << "." << std::endl;
+		std::cout << "Error while reading structure.json | " << error.message() << " in line " << error.line() << "." << std::endl;
 		return;
 	}
 
@@ -186,7 +214,7 @@ void extractMBEFile(boost::filesystem::path source, boost::filesystem::path targ
 		// write data
 		for (uint32_t i = 0; i < table.entryCount(); i++) {
 			bool first = true;
-			char* localOffset = table.tablePtr + i * table.entrySize() + tableHeaderSize;
+			char* localOffset = table.tablePtr + i * (table.entrySize() + table.entrySize() % 8) + tableHeaderSize;
 
 			for (auto var : formatValue.get()) {
 				if (first)
@@ -200,7 +228,7 @@ void extractMBEFile(boost::filesystem::path source, boost::filesystem::path targ
 		}
 	}
 
-	/*
+	/* 
 		lookup file - JSON
 		{
 			pattern1: structureFilePath1,
@@ -238,11 +266,195 @@ void extractMBE(boost::filesystem::path source, boost::filesystem::path target)
 
 	if (boost::filesystem::is_directory(source))
 		for (auto file : boost::filesystem::directory_iterator(source))
-			extractMBEFile(file, target);	// TODO make parallel? (test if execution time is actually a concern)
+			extractMBEFile(file, target);
 	else if (boost::filesystem::is_regular_file(source))
 		extractMBEFile(source, target);
 	else {
 		std::cout << "Error: input is neither directory nor file." << std::endl;
 		return;
 	}
-}
\ No newline at end of file
+}
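Reviewer note: both extraction and (below) packing now resolve their format through `getStructureFile`, which tries each pattern from `structures/structure.json` in order and takes the first hit. A sketch of that resolution against a hypothetical Windows-style input path; the four backslashes in the JSON source collapse to the regex escape `\\`, i.e. one literal backslash:

```cpp
#include <iostream>
#include <string>
#include <utility>
#include <vector>
#include <boost/regex.hpp>

int main() {
	// Two entries as they would come out of structure.json.
	std::vector<std::pair<std::string, std::string>> structure = {
		{ "text\\\\tournament_name.mbe$", "tournament_name.json" },
		{ "message\\\\[a-zA-Z0-9_]*\\.mbe$", "message.json" },
	};

	std::string path = "data\\message\\talk.mbe";	// hypothetical input path

	// First match wins, mirroring the loop in getStructureFile.
	for (const auto &entry : structure) {
		if (boost::regex_search(path, boost::regex{ entry.first })) {
			std::cout << path << " -> structures/" << entry.second << "\n";
			break;
		}
	}
}
```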
+
+// folder input, file output
+void packMBE(boost::filesystem::path source, boost::filesystem::path target) {
+	if (boost::filesystem::equivalent(source, target)) {
+		std::cout << "Error: input and output path must be different!" << std::endl;
+		return;
+	}
+	if (!boost::filesystem::is_directory(source)) {
+		std::cout << "Error: input path is not a directory." << std::endl;
+		return;
+	}
+	if (!boost::filesystem::exists(target))
+		boost::filesystem::create_directories(target.parent_path());
+	else if (!boost::filesystem::is_regular_file(target)) {
+		std::cout << "Error: target path already exists and is not a file." << std::endl;
+		return;
+	}
+
+	boost::filesystem::ofstream output(target, std::ios::out | std::ios::binary);
+
+	boost::property_tree::ptree format;
+	try {
+		format = getStructureFile(source);
+	}
+	catch (const boost::property_tree::json_parser_error &error) {
+		std::cout << "Error while reading structure.json | " << error.message() << " in line " << error.line() << "." << std::endl;
+		return;
+	}
+
+	std::size_t numTables = format.size();
+
+	// write EXPA Header
+	output.write("EXPA", 4);
+	output.write(reinterpret_cast<char*>(&numTables), 4);
+
+	struct CHNKData {
+		std::string type;
+		std::string data;
+		uint32_t offset;
+	};
+	std::vector<CHNKData> chnkData;
+
+	for (auto table : format) {
+		boost::filesystem::path file = source / (table.first + ".csv");
+		boost::property_tree::ptree localFormat = table.second;
+
+		// write EXPA Table header
+		boost::filesystem::ifstream countInput(file, std::ios::in);
+		aria::csv::CsvParser countParser(countInput);
+
+		uint32_t entrySize = 0;
+		for (auto entry : localFormat)
+			entrySize += getEntrySize(entry.second.data(), entrySize);
+
+		entrySize += entrySize % 8;
+		uint32_t count = (uint32_t) std::distance(countParser.begin(), countParser.end()) - 1;
+		countInput.close();
+
+		uint32_t nameSize = (uint32_t) (table.first.size() + 4) / 4 * 4;
+		std::vector<char> name(nameSize);
+		std::copy(table.first.begin(), table.first.end(), name.begin());
+		std::vector<char> padding((0x0C + nameSize) % 8, 0);
+
+		output.write(reinterpret_cast<char*>(&nameSize), 4);
+		output.write(name.data(), nameSize);
+		output.write(reinterpret_cast<char*>(&entrySize), 4);
+		output.write(reinterpret_cast<char*>(&count), 4);
+		output.write(padding.data(), (0x0C + nameSize) % 8);
+
+		// write EXPA data, cache CHNK data
+		boost::filesystem::ifstream input(file, std::ios::in);
+		aria::csv::CsvParser parser(input);
+
+		bool first = true;
+
+		for (auto &row : parser) {
+			if (first) {
+				if (localFormat.size() != row.size()) {
+					std::cout << "Error: structure element count differs from input element count. The wrong structure may be used?" << std::endl;
+					std::cout << "Expected: " << localFormat.size() << " | Found: " << row.size() << std::endl;
+					return;
+				}
+
+				first = false;
+				continue;
+			}
+
+			auto itr = localFormat.begin();
+			uint32_t entrySize = 0;
+
+			for (auto &col : row) {
+				std::string type = (*itr++).second.data();
+
+				// TODO remove boilerplate
+				if (type == "byte") {
+					int8_t value = std::stoi(col);
+					output.write(reinterpret_cast<char*>(&value), 1);
+					entrySize += 1;
+				}
+				else if (type == "short") {
+					uint32_t paddingSize = entrySize % 2;
+					std::vector<char> padding(paddingSize, 0xCC);
+					output.write(padding.data(), paddingSize);
+
+					int16_t value = std::stoi(col);
+					output.write(reinterpret_cast<char*>(&value), 2);
+					entrySize += 2 + paddingSize;
+				}
+				else if (type == "int") {
+					uint32_t paddingSize = entrySize % 4;
+					std::vector<char> padding(paddingSize, 0xCC);
+					output.write(padding.data(), paddingSize);
+
+					int32_t value = std::stoi(col);
+					output.write(reinterpret_cast<char*>(&value), 4);
+					entrySize += 4 + paddingSize;
+				}
+				else if (type == "string") {
+					if (!col.empty())
+						chnkData.push_back({ type, col, (uint32_t) output.tellp() });
+
+					uint32_t paddingSize = entrySize % 8;
+					std::vector<char> padding(paddingSize, 0xCC);
+					output.write(padding.data(), paddingSize);
+
+					output.write("\0\0\0\0\0\0\0\0", 8);
+					entrySize += 8 + paddingSize;
+				}
+				else if (type == "int array") {
+					if (!col.empty())
+						chnkData.push_back({ type, col, (uint32_t) output.tellp() + 8 + entrySize % 8 });
+
+					uint32_t paddingSize = entrySize % 8;
+					std::vector<char> padding(8, 0xCC);
+					output.write(padding.data(), paddingSize);
+
+					uint32_t arraySize = (uint32_t) std::count(col.begin(), col.end(), ' ') + 1;
+					output.write(reinterpret_cast<char*>(&arraySize), 4);
+					output.write(padding.data(), 4);
+					output.write("\0\0\0\0\0\0\0\0", 8);
+
+					entrySize += 16 + paddingSize;
+				}
+			}
+
+			if (entrySize % 8 != 0) {
+				std::vector<char> padding(entrySize % 8, 0xCC);
+				output.write(padding.data(), entrySize % 8);
+				entrySize += entrySize % 8;
+			}
+		}
+
+	}
+
+	std::size_t chunkCount = chnkData.size();
+	output.write("CHNK", 4);
+	output.write(reinterpret_cast<char*>(&chunkCount), 4);
+
+	for (auto entry : chnkData) {
+		// TODO remove boilerplate
+		if (entry.type == "string") {
+			uint32_t stringSize = (uint32_t) ((entry.data.size() + 5) / 4) * 4;
+			std::vector<char> data(stringSize);
+			std::copy(entry.data.begin(), entry.data.end(), data.begin());
+
+			output.write(reinterpret_cast<char*>(&entry.offset), 4);
+			output.write(reinterpret_cast<char*>(&stringSize), 4);
+			output.write(data.data(), stringSize);
+		}
+		else if (entry.type == "int array") {
+			std::vector<std::string> numbers;
+			boost::split(numbers, entry.data, boost::is_any_of(" "));
+
+			uint32_t size = (uint32_t) numbers.size() * 4;
+			output.write(reinterpret_cast<char*>(&entry.offset), 4);
+			output.write(reinterpret_cast<char*>(&size), 4);
+
+			for (auto number : numbers) {
+				int32_t val = std::stoi(number);
+				output.write(reinterpret_cast<char*>(&val), 4);
+			}
+		}
+	}
+}
\ No newline at end of file
diff --git a/DSCSTools/EXPA.h b/DSCSTools/EXPA.h
index 6538744..f2d6697 100644
--- a/DSCSTools/EXPA.h
+++ b/DSCSTools/EXPA.h
@@ -1,4 +1,6 @@
 #pragma once
 #include <boost/filesystem.hpp>
 
-void extractMBE(boost::filesystem::path source, boost::filesystem::path target);
\ No newline at end of file
+void extractMBE(boost::filesystem::path source, boost::filesystem::path target);
+
+void packMBE(boost::filesystem::path source, boost::filesystem::path target);
\ No newline at end of file
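Reviewer note: with both entry points declared in EXPA.h, the quickest smoke test is a round trip. A minimal hypothetical driver, assuming it runs from the install directory so that `structures/structure.json` resolves; all paths and the include path are illustrative:

```cpp
#include <boost/filesystem.hpp>
#include "EXPA.h"	// hypothetical include path; adjust to the project layout

int main() {
	// extractMBE writes one CSV per table into the target folder;
	// packMBE reads them back through the same structure.json lookup.
	// Note that packMBE resolves the structure from the *source* path,
	// so the CSV folder's path still has to match a structure.json
	// pattern -- hence the message\ segment kept in the path here.
	boost::filesystem::path original = "data\\message\\talk.mbe";
	boost::filesystem::path csvDir   = "out\\message\\talk.mbe";
	boost::filesystem::path repacked = "repacked\\talk.mbe";

	extractMBE(original, csvDir);
	packMBE(csvDir, repacked);
}
```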
diff --git a/DSCSTools/structures/colosseum_event_battle.json b/DSCSTools/structures/colosseum_event_battle.json
new file mode 100644
index 0000000..070de06
--- /dev/null
+++ b/DSCSTools/structures/colosseum_event_battle.json
@@ -0,0 +1,129 @@
+{
+	"rule": {
+		"ID": "int",
+		"unknown1": "short",
+		"unknown2": "short",
+		"unknown3_1": "byte",
+		"unknown3_2": "byte",
+		"unknown4_1": "byte",
+		"unknown4_2": "byte",
+		"unknown5": "short",
+		"unknown6": "short",
+		"unknown7_1": "short",
+		"unknown7_2": "short",
+		"unknown8": "int",
+		"unknown9": "int",
+		"unknown10_1": "short",
+		"unknown10_2": "short",
+		"unknown11_1": "short",
+		"unknown11_2": "short",
+		"unknown12": "int",
+		"unknown13": "int",
+		"unknown14": "int",
+		"unknown15_1": "short",
+		"unknown15_2": "short",
+		"unknown16_1": "short",
+		"unknown16_2": "short",
+		"unknown17": "int",
+		"unknown18": "int",
+		"unknown19_1": "short",
+		"unknown19_2": "short",
+		"unknown20_1": "short",
+		"unknown20_2": "short",
+		"unknown21_1": "short",
+		"unknown21_2": "short",
+		"unknown22": "int",
+		"unknown23": "int",
+		"unknown24": "int",
+		"unknown25": "int",
+		"unknown26": "int",
+		"unknown27": "int",
+		"unknown28_1": "short",
+		"unknown28_2": "short",
+		"unknown29_1": "short",
+		"unknown29_2": "short",
+		"unknown30": "int",
+		"unknown31_1": "short",
+		"unknown31_2": "short",
+		"unknown35": "int array",
+		"unknown38": "int array",
+		"unknown41": "int array",
+		"unknown42_1": "short",
+		"unknown42_2": "short"
+	},
+	"mission": {
+		"ID": "int",
+		"Unknown1": "int",
+		"Unknown2": "int",
+		"Unknown6": "int array",
+		"Unknown7": "int",
+		"Unknown8": "int",
+		"Unknown9": "int",
+		"Unknown10": "int",
+		"Unknown11": "int",
+		"Unknown12": "int",
+		"Unknown13": "int",
+		"Unknown14": "int",
+		"Unknown15": "int",
+		"Unknown16": "int",
+		"Unknown17": "int",
+		"Unknown18": "int",
+		"Unknown19": "int",
+		"Unknown20": "int",
+		"Unknown21": "int",
+		"Unknown22": "int",
+		"Unknown23": "int",
+		"Unknown24": "int",
+		"Unknown25": "int",
+		"Unknown26": "int",
+		"Unknown27": "int",
+		"Unknown28": "int",
+		"Unknown29": "int",
+		"Unknown33": "int array",
+		"Unknown34": "int",
+		"Unknown35": "int",
+		"Unknown36": "int",
+		"Unknown37": "int",
+		"Unknown38": "int",
+		"Unknown39": "int",
+		"Unknown40": "int",
+		"Unknown41": "int",
+		"Unknown42": "int",
+		"Unknown43": "int",
+		"Unknown44": "int",
+		"Unknown45": "int",
+		"Unknown46": "int",
+		"Unknown47": "int",
+		"Unknown48": "int",
+		"Unknown49": "int",
+		"Unknown50": "int",
+		"Unknown51": "int",
+		"Unknown52": "int",
+		"Unknown53": "int",
+		"Unknown54": "int",
+		"Unknown55": "int",
+		"Unknown56": "int",
+		"Unknown60": "int array",
+		"Unknown61": "int",
+		"Unknown62": "int",
+		"Unknown63": "int",
+		"Unknown64": "int",
+		"Unknown65": "int",
+		"Unknown66": "int",
+		"Unknown67": "int",
+		"Unknown68": "int",
+		"Unknown69": "int",
+		"Unknown70": "int",
+		"Unknown71": "int",
+		"Unknown72": "int",
+		"Unknown73": "int",
+		"Unknown74": "int",
+		"Unknown75": "int",
+		"Unknown76": "int",
+		"Unknown77": "int",
+		"Unknown78": "int",
+		"Unknown79": "int",
+		"Unknown80": "int",
+		"Unknown81": "int"
+	}
+}
\ No newline at end of file
diff --git a/DSCSTools/structures/farm_development.json b/DSCSTools/structures/farm_development.json
new file mode 100644
index 0000000..413e3b3
--- /dev/null
+++ b/DSCSTools/structures/farm_development.json
@@ -0,0 +1,109 @@
+{
+	"point": {
+		"id": "int",
+		"Unk1": "int",
+		"Unk2": "int",
+		"Unk3": "int",
+		"Unk4": "int",
+		"Unk5": "int",
+		"Unk6": "int",
+		"Unk7": "int"
+	},
+	"item_table": {
+		"id": "int",
+		"Unk #1": "int",
+		"Item #1": "int",
+		"Amount #1": "int",
+		"Unk #2": "int",
+		"Item #2": "int",
+		"Amount #2": "int",
+		"Unk #3": "int",
"Item #3": "int", + "Amount #3": "int", + "Unk #4": "int", + "Item #4": "int", + "Amount #4": "int", + "Unk #5": "int", + "Item #5": "int", + "Amount #5": "int", + "Unk #6": "int", + "Item #6": "int", + "Amount #6": "int", + "Unk #7": "int", + "Item #7": "int", + "Amount #7": "int", + "Unk #8": "int", + "Item #8": "int", + "Amount #8": "int", + "Unk #9": "int", + "Item #9": "int", + "Amount #9": "int", + "Unk #10": "int", + "Item #10": "int", + "Amount #10": "int", + "Unk #11": "int", + "Item #11": "int", + "Amount #11": "int", + "Unk #12": "int", + "Item #12": "int", + "Amount #12": "int", + "Unk #13": "int", + "Item #13": "int", + "Amount #13": "int", + "Unk #14": "int", + "Item #14": "int", + "Amount #14": "int", + "Unk #15": "int", + "Item #15": "int", + "Amount #15": "int", + "Unk #16": "int", + "Item #16": "int", + "Amount #16": "int", + "Unk #17": "int", + "Item #17": "int", + "Amount #17": "int", + "Unk #18": "int", + "Item #18": "int", + "Amount #18": "int", + "Unk #19": "int", + "Item #19": "int", + "Amount #19": "int", + "Unk #20": "int", + "Item #20": "int", + "Amount #20": "int", + "Unk #21": "int", + "Item #21": "int", + "Amount #21": "int", + "Unk #22": "int", + "Item #22": "int", + "Amount #22": "int", + "Unk #23": "int", + "Item #23": "int", + "Amount #23": "int", + "Unk #24": "int", + "Item #24": "int", + "Amount #24": "int", + "Unk #25": "int", + "Item #25": "int", + "Amount #25": "int", + "Unk #26": "int", + "Item #26": "int", + "Amount #26": "int", + "Unk #27": "int", + "Item #27": "int", + "Amount #27": "int", + "Unk #28": "int", + "Item #28": "int", + "Amount #28": "int", + "Unk #29": "int", + "Item #29": "int", + "Amount #29": "int", + "Unk #30": "int", + "Item #30": "int", + "Amount #30": "int" + }, + "fund": { + "id": "int", + "price": "int" + } +} \ No newline at end of file diff --git a/DSCSTools/structures/message.json b/DSCSTools/structures/message.json new file mode 100644 index 0000000..8cd9b37 --- /dev/null +++ b/DSCSTools/structures/message.json @@ -0,0 +1,12 @@ +{ + "Sheet1": { + "ID": "int", + "Speaker": "int", + "Unknown1": "string", + "English": "string", + "Chinese": "string", + "Unknown2": "string", + "Korean": "string", + "German": "string" + } +} \ No newline at end of file diff --git a/DSCSTools/structures/structure.json b/DSCSTools/structures/structure.json index f58941f..e74869b 100644 --- a/DSCSTools/structures/structure.json +++ b/DSCSTools/structures/structure.json @@ -1,4 +1,6 @@ { + "colosseum_event_battle.mbe": "colosseum_event_battle.json", + "farm_development.mbe": "farm_development.json", "text\\\\eden_group_text.mbe$": "text_para.json", "text\\\\eden_text.mbe$": "text_para.json", "text\\\\mirror_dungeon_text.mbe$": "text_para.json", @@ -7,5 +9,5 @@ "text\\\\yes_no_message.mbe$": "text_para.json", "text\\\\tournament_name.mbe$": "tournament_name.json", "text\\\\[a-zA-Z0-9_]*\\.mbe$": "text.json", - "message\\\\[a-zA-Z0-9_]*\\.mbe$": "text.json" + "message\\\\[a-zA-Z0-9_]*\\.mbe$": "message.json" } \ No newline at end of file diff --git a/README.md b/README.md index 2de8f54..65d458f 100644 --- a/README.md +++ b/README.md @@ -34,7 +34,7 @@ MBE Extract: DSCSTools --mbeextract ``` ## MBE Structure files -In order for the MBE functions to work it need to assume a data structure. For this a `structure.json` must be present in the root folder of the tool. +In order for the MBE functions to work it need to assume a data structure. For this a `structure.json` must be present in the `structures` folder of the tool. 
 It contains simple `regexPattern: structureDefinition.json` associations. The tool matches the currently handled file path against the patterns in structure.json and picks the first match.
 
 The structure definition is another JSON file following this format:
diff --git a/THIRD-PARTY-NOTICE b/THIRD-PARTY-NOTICE
new file mode 100644
index 0000000..216357d
--- /dev/null
+++ b/THIRD-PARTY-NOTICE
@@ -0,0 +1,46 @@
+This software utilizes libraries provided by third parties.
+
+=== Doboz Data Compression Library ===
+https://github.com/nemequ/doboz
+
+Doboz Data Compression Library
+Copyright (C) 2010-2011 Attila T. Afra
+
+This software is provided 'as-is', without any express or implied warranty. In no event will
+the authors be held liable for any damages arising from the use of this software.
+
+Permission is granted to anyone to use this software for any purpose, including commercial
+applications, and to alter it and redistribute it freely, subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the
+   original software. If you use this software in a product, an acknowledgment in the product
+   documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as
+   being the original software.
+3. This notice may not be removed or altered from any source distribution.
+
+
+=== CSV Parser ===
+https://github.com/AriaFallah/csv-parser
+
+MIT License
+
+Copyright (c) 2017 Aria Fallah
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
\ No newline at end of file
diff --git a/libs/csv-parser/LICENSE b/libs/csv-parser/LICENSE
new file mode 100644
index 0000000..b89bac4
--- /dev/null
+++ b/libs/csv-parser/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2017 Aria Fallah
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
\ No newline at end of file
diff --git a/libs/csv-parser/parser.hpp b/libs/csv-parser/parser.hpp
new file mode 100644
index 0000000..f1e2812
--- /dev/null
+++ b/libs/csv-parser/parser.hpp
@@ -0,0 +1,350 @@
+#ifndef ARIA_CSV_H
+#define ARIA_CSV_H
+
+#include <fstream>
+#include <memory>
+#include <stdexcept>
+#include <string>
+#include <vector>
+
+namespace aria {
+	namespace csv {
+		enum class Term : char { CRLF = -2 };
+		enum class FieldType { DATA, ROW_END, CSV_END };
+		using CSV = std::vector<std::vector<std::string>>;
+
+		// Checking for '\n', '\r', and '\r\n' by default
+		inline bool operator==(const char c, const Term t) {
+			switch (t) {
+			case Term::CRLF:
+				return c == '\r' || c == '\n';
+			default:
+				return static_cast<char>(t) == c;
+			}
+		}
+
+		inline bool operator!=(const char c, const Term t) {
+			return !(c == t);
+		}
+
+		// Wraps returned fields so we can also indicate
+		// that we hit row endings or the end of the csv itself
+		struct Field {
+			explicit Field(FieldType t) : type(t), data(nullptr) {}
+			explicit Field(const std::string& str) : type(FieldType::DATA), data(&str) {}
+
+			FieldType type;
+			const std::string *data;
+		};
+
+		// Reads and parses lines from a csv file
+		class CsvParser {
+		private:
+			// CSV state for state machine
+			enum class State {
+				START_OF_FIELD,
+				IN_FIELD,
+				IN_QUOTED_FIELD,
+				IN_ESCAPED_QUOTE,
+				END_OF_ROW,
+				EMPTY
+			};
+			State m_state = State::START_OF_FIELD;
+
+			// Configurable attributes
+			char m_quote = '"';
+			char m_delimiter = ',';
+			Term m_terminator = Term::CRLF;
+			std::istream& m_input;
+
+			// Buffer capacities
+			static constexpr int FIELDBUF_CAP = 1024;
+			static constexpr int INPUTBUF_CAP = 1024 * 128;
+
+			// Buffers
+			std::string m_fieldbuf{};
+			char m_inputbuf[INPUTBUF_CAP]{};
+
+			// Misc
+			bool m_eof = false;
+			size_t m_cursor = INPUTBUF_CAP;
+			size_t m_inputbuf_size = INPUTBUF_CAP;
+			std::streamoff m_scanposition = -INPUTBUF_CAP;
+		public:
+			// Creates the CSV parser which by default, splits on commas,
+			// uses quotes to escape, and handles CSV files that end in either
+			// '\r', '\n', or '\r\n'.
+			explicit CsvParser(std::istream& input) : m_input(input) {
+				// Reserve space upfront to improve performance
+				m_fieldbuf.reserve(FIELDBUF_CAP);
+				if (!m_input.good()) {
+					throw std::runtime_error("Something is wrong with input stream");
+				}
+			}
+
+			// Change the quote character
+			CsvParser& quote(char c) noexcept {
+				m_quote = c;
+				return *this;
+			}
+
+			// Change the delimiter character
+			CsvParser& delimiter(char c) noexcept {
+				m_delimiter = c;
+				return *this;
+			}
+
+			// Change the terminator character
+			CsvParser& terminator(char c) noexcept {
+				m_terminator = static_cast<Term>(c);
+				return *this;
+			}
+
+			// The parser is in the empty state when there are
+			// no more tokens left to read from the input buffer
+			bool empty() {
+				return m_state == State::EMPTY;
+			}
+
+			// Not the actual position in the stream (its buffered) just the
+			// position up to last availiable token
+			std::streamoff position() const {
+				return m_scanposition + static_cast<std::streamoff>(m_cursor);
+			}
+
+			// Reads a single field from the CSV
+			Field next_field() {
+				if (empty()) {
+					return Field(FieldType::CSV_END);
+				}
+				m_fieldbuf.clear();
+
+				// This loop runs until either the parser has
+				// read a full field or until there's no tokens left to read
+				for (;;) {
+					char *maybe_token = top_token();
+
+					// If we're out of tokens to read return whatever's left in the
+					// field and row buffers. If there's nothing left, return null.
+					if (!maybe_token) {
+						m_state = State::EMPTY;
+						return !m_fieldbuf.empty() ? Field(m_fieldbuf) : Field(FieldType::CSV_END);
+					}
+
+					// Parsing the CSV is done using a finite state machine
+					char c = *maybe_token;
+					switch (m_state) {
+					case State::START_OF_FIELD:
+						m_cursor++;
+						if (c == m_terminator) {
+							handle_crlf(c);
+							return Field(FieldType::ROW_END);
+						}
+
+						if (c == m_quote) {
+							m_state = State::IN_QUOTED_FIELD;
+						}
+						else if (c == m_delimiter) {
+							return Field(m_fieldbuf);
+						}
+						else {
+							m_state = State::IN_FIELD;
+							m_fieldbuf += c;
+						}
+
+						break;
+
+					case State::IN_FIELD:
+						m_cursor++;
+						if (c == m_terminator) {
+							handle_crlf(c);
+							m_state = State::END_OF_ROW;
+							return Field(m_fieldbuf);
+						}
+
+						if (c == m_delimiter) {
+							m_state = State::START_OF_FIELD;
+							return Field(m_fieldbuf);
+						}
+						else {
+							m_fieldbuf += c;
+						}
+
+						break;
+
+					case State::IN_QUOTED_FIELD:
+						m_cursor++;
+						if (c == m_quote) {
+							m_state = State::IN_ESCAPED_QUOTE;
+						}
+						else {
+							m_fieldbuf += c;
+						}
+
+						break;
+
+					case State::IN_ESCAPED_QUOTE:
+						m_cursor++;
+						if (c == m_terminator) {
+							handle_crlf(c);
+							m_state = State::END_OF_ROW;
+							return Field(m_fieldbuf);
+						}
+
+						if (c == m_quote) {
+							m_state = State::IN_QUOTED_FIELD;
+							m_fieldbuf += c;
+						}
+						else if (c == m_delimiter) {
+							m_state = State::START_OF_FIELD;
+							return Field(m_fieldbuf);
+						}
+						else {
+							m_state = State::IN_FIELD;
+							m_fieldbuf += c;
+						}
+
+						break;
+
+					case State::END_OF_ROW:
+						m_state = State::START_OF_FIELD;
+						return Field(FieldType::ROW_END);
+
+					case State::EMPTY:
+						throw std::logic_error("You goofed");
+					}
+				}
+			}
+		private:
+			// When the parser hits the end of a line it needs
+			// to check the special case of '\r\n' as a terminator.
+			// If it finds that the previous token was a '\r', and
+			// the next token will be a '\n', it skips the '\n'.
+			void handle_crlf(const char c) {
+				if (m_terminator != Term::CRLF || c != '\r') {
+					return;
+				}
+
+				char *token = top_token();
+				if (token && *token == '\n') {
+					m_cursor++;
+				}
+			}
+
+			// Pulls the next token from the input buffer, but does not move
+			// the cursor forward. If the stream is empty and the input buffer
+			// is also empty return a nullptr.
+			char* top_token() {
+				// Return null if there's nothing left to read
+				if (m_eof && m_cursor == m_inputbuf_size) {
+					return nullptr;
+				}
+
+				// Refill the input buffer if it's been fully read
+				if (m_cursor == m_inputbuf_size) {
+					m_scanposition += static_cast<std::streamoff>(m_cursor);
+					m_cursor = 0;
+					m_input.read(m_inputbuf, INPUTBUF_CAP);
+
+					// Indicate we hit end of file, and resize
+					// input buffer to show that it's not at full capacity
+					if (m_input.eof()) {
+						m_eof = true;
+						m_inputbuf_size = m_input.gcount();
+
+						// Return null if there's nothing left to read
+						if (m_inputbuf_size == 0) {
+							return nullptr;
+						}
+					}
+				}
+
+				return &m_inputbuf[m_cursor];
+			}
+		public:
+			// Iterator implementation for the CSV parser, which reads
+			// from the CSV row by row in the form of a vector of strings
+			class iterator {
+			public:
+				using difference_type = std::ptrdiff_t;
+				using value_type = std::vector<std::string>;
+				using pointer = const std::vector<std::string>*;
+				using reference = const std::vector<std::string>&;
+				using iterator_category = std::input_iterator_tag;
+
+				explicit iterator(CsvParser *p, bool end = false) : m_parser(p) {
+					if (!end) {
+						m_row.reserve(50);
+						m_current_row = 0;
+						next();
+					}
+				}
+
+				iterator& operator++() {
+					next();
+					return *this;
+				}
+
+				iterator operator++(int) {
+					iterator i = (*this);
+					++(*this);
+					return i;
+				}
+
+				bool operator==(const iterator& other) const {
+					return m_current_row == other.m_current_row
+						&& m_row.size() == other.m_row.size();
+				}
+
+				bool operator!=(const iterator& other) const {
+					return !(*this == other);
+				}
+
+				reference operator*() const {
+					return m_row;
+				}
+
+				pointer operator->() const {
+					return &m_row;
+				}
+			private:
+				value_type m_row{};
+				CsvParser *m_parser;
+				int m_current_row = -1;
+
+				void next() {
+					value_type::size_type num_fields = 0;
+					for (;;) {
+						auto field = m_parser->next_field();
+						switch (field.type) {
+						case FieldType::CSV_END:
+							if (num_fields < m_row.size()) {
+								m_row.resize(num_fields);
+							}
+							m_current_row = -1;
+							return;
+						case FieldType::ROW_END:
+							if (num_fields < m_row.size()) {
+								m_row.resize(num_fields);
+							}
+							m_current_row++;
+							return;
+						case FieldType::DATA:
+							if (num_fields < m_row.size()) {
+								m_row[num_fields] = std::move(*field.data);
+							}
+							else {
+								m_row.push_back(std::move(*field.data));
+							}
+							num_fields++;
+						}
+					}
+				}
+			};
+
+			iterator begin() { return iterator(this); };
+			iterator end() { return iterator(this, true); };
+		};
+	}
+}
+#endif
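Reviewer note: for future call sites of this vendored parser, usage mirrors `packMBE`: construct a `CsvParser` over any `std::istream` and range-for over it; each row comes back as a `std::vector<std::string>`. A small sketch (file name hypothetical):

```cpp
#include <fstream>
#include <iostream>
#include "libs/csv-parser/parser.hpp"

int main() {
	// The constructor throws std::runtime_error if the stream is not good,
	// e.g. when the file does not exist.
	std::ifstream input("table.csv");	// hypothetical CSV file
	aria::csv::CsvParser parser(input);

	// In the CSVs produced by --mbeextract, the first row holds the
	// column names from the structure definition; packMBE skips it.
	for (const auto &row : parser) {
		for (const auto &field : row)
			std::cout << field << " | ";
		std::cout << "\n";
	}
}
```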