Skip to content

Commit

Permalink
Now Portcullis filter will be able to specify a system directory for …
Browse files Browse the repository at this point in the history
…the training initial rules. Progress on #41
  • Loading branch information
lucventurini committed Mar 18, 2019
1 parent 74c6e91 commit 42f15f0
Show file tree
Hide file tree
Showing 7 changed files with 127 additions and 29 deletions.
3 changes: 3 additions & 0 deletions deps/boost/project-config.jam
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,9 @@ if ! gcc in [ feature.values <toolset> ]

project : default-build <toolset>gcc ;

path-constant ICU_PATH : /usr ;


# List of --with-<library> and --without-<library>
# options. If left empty, all libraries will be built.
# Options specified on the command line completely
Expand Down
2 changes: 1 addition & 1 deletion doc/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ MKDIR_P = mkdir -p
INSTALL = /usr/bin/install -c -m 644

# Set autoconf variables
prefix = /usr/local
prefix = /home/luca/workspace/portcullis/compiled
PACKAGE_TARNAME = portcullis
top_srcdir = ..
srcdir = .
Expand Down
4 changes: 3 additions & 1 deletion lib/src/junction.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1160,10 +1160,12 @@ shared_ptr<portcullis::Junction> portcullis::Junction::parse(const string& line)
boost::split(parts, line, boost::is_any_of("\t"), boost::token_compress_on);
uint32_t expected_cols = 11 + Junction::STRAND_NAMES.size() + Junction::METRIC_NAMES.size() + Junction::JAD_NAMES.size();
if (parts.size() != expected_cols) {
std::string sparts;
sparts = accumulate(begin(parts), end(parts), sparts);
BOOST_THROW_EXCEPTION(JunctionException() << JunctionErrorInfo(string(
"Could not parse line due to incorrect number of columns. This is probably a version mismatch. Check file and portcullis versions. Expected ")
+ std::to_string(expected_cols) + " columns. Found "
+ std::to_string(parts.size()) + ". Line: " + line));
+ std::to_string(parts.size()) + ". Line:\n" + sparts));
}
// Create intron
IntronPtr intron = make_shared<Intron>(
Expand Down
12 changes: 8 additions & 4 deletions scripts/portcullis/portcullis/rule_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,7 +124,7 @@ def create_training_sets(args):

# Load portcullis junctions into dataframe
print("Loading input junctions ... ", end="", flush=True)
original = DataFrame.read_csv(args.input, sep='\t', header=0)
original = DataFrame.from_csv(args.input, sep='\t', header=0)
fieldnames = [key for key in dict(original.dtypes)]
print("done.", len(original), "junctions loaded.")

Expand Down Expand Up @@ -165,7 +165,9 @@ def create_training_sets(args):
# Create pandas command
pandas_cmd_in, pandas_cmd_out = json2pandas(open(json_file), fieldnames, "df")

# print(pandas_cmd)
if args.verbose:
print(pandas_cmd_in)
print(pandas_cmd_out)

# Execute the pandas command, result should be a filtered dataframe
pos_juncs = eval(pandas_cmd_in)
Expand Down Expand Up @@ -256,7 +258,9 @@ def create_training_sets(args):
# Create pandas command
pandas_cmd_in, pandas_cmd_out = json2pandas(open(json_file), fieldnames, "other_juncs")

#print(pandas_cmd)
if args.verbose:
print(pandas_cmd_in)
print(pandas_cmd_out)

# Execute the pandas command, result should be a filtered dataframe
neg_juncs = eval(pandas_cmd_in)
Expand Down Expand Up @@ -324,7 +328,7 @@ def filter_one(args):
# Load portcullis junctions into dataframe
if args.verbose:
print("Loading input junctions ... ", end="", flush=True)
original = DataFrame.read_csv(args.input, sep='\t', header=0)
original = DataFrame.from_csv(args.input, sep='\t', header=0)
if args.verbose:
print("done.")

Expand Down
130 changes: 109 additions & 21 deletions src/junction_filter.cc
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/ioctl.h>
#include <regex>
#include <fstream>
#include <string>
#include <iostream>
Expand All @@ -29,6 +30,7 @@
using std::boolalpha;
using std::ifstream;
using std::string;
using std::stoi;
using std::pair;
using std::map;
using std::unordered_map;
Expand All @@ -37,6 +39,8 @@ using std::vector;
using std::to_string;
using std::cout;
using std::cerr;
using std::regex_match;
using std::regex_search;

#include <boost/algorithm/string.hpp>
#include <boost/exception/all.hpp>
Expand Down Expand Up @@ -67,12 +71,13 @@ using portcullis::PyHelper;
#include "prepare.hpp"

portcullis::JunctionFilter::JunctionFilter(const path& _prepDir, const path& _junctionFile,
const path& _output, bool _precise): precise(_precise) {
const path& _output, const path& _initial) { // bool _precise): precise(_precise) {
junctionFile = _junctionFile;
prepData.setPrepDir(_prepDir);
modelFile = "";
genuineFile = "";
output = _output;
initial = _initial;
filterFile = "";
referenceFile = "";
saveBad = false;
Expand All @@ -88,6 +93,27 @@ portcullis::JunctionFilter::JunctionFilter(const path& _prepDir, const path& _ju
enn = true;
}


bool sort_jsons(string& json1, string& json2) {
// selftrain_initial_pos.layer3.json < selftrain_initial_pos.layer4.json
std::smatch jmatch1;
std::smatch jmatch2;

int json1_pos;
int json2_pos;

std::regex rgx(".*layer([0-9]*)\\.json");

std::regex_search(json1, jmatch1, rgx);
std::regex_search(json2, jmatch2, rgx);

json1_pos = std::stoi(jmatch1.str(1));
json2_pos = std::stoi(jmatch2.str(1));
return json1_pos < json2_pos;

}


void portcullis::JunctionFilter::filter() {
path outputDir = output.parent_path();
string outputPrefix = output.leaf().string();
Expand Down Expand Up @@ -226,22 +252,47 @@ void portcullis::JunctionFilter::filter() {
path rf_script = path("portcullis") / "rule_filter.py";
vector<string> args;
args.push_back(rf_script.string());

string ruleset = dataDir.string() + (precise ? "/precise" : "/balanced");

args.push_back("--pos_json");
args.push_back(ruleset + "/selftrain_initial_pos.layer1.json");
args.push_back(ruleset + "/selftrain_initial_pos.layer2.json");
args.push_back(ruleset + "/selftrain_initial_pos.layer3.json");

args.push_back("--neg_json");
args.push_back(ruleset + "/selftrain_initial_neg.layer1.json");
args.push_back(ruleset + "/selftrain_initial_neg.layer2.json");
args.push_back(ruleset + "/selftrain_initial_neg.layer3.json");
args.push_back(ruleset + "/selftrain_initial_neg.layer4.json");
args.push_back(ruleset + "/selftrain_initial_neg.layer5.json");
args.push_back(ruleset + "/selftrain_initial_neg.layer6.json");
args.push_back(ruleset + "/selftrain_initial_neg.layer7.json");

string ruleset = dataDir.string() + "/" + initial.string();

// string ruleset = dataDir.string() + (precise ? "/precise" : "/balanced");
vector<string> pos_jsons;
vector<string> neg_jsons;
for (const auto & entry : bfs::directory_iterator(ruleset)) {
std::smatch sm;
std::ostringstream entry_ss;
entry_ss << entry;
std::string entry_path = entry_ss.str();
std::regex r(".*layer[0-9]{1,}\\.json");
if ( std::regex_search( entry_path, sm, r) ) {
// Remove quotes, they break the Python caller
entry_path.erase(remove(entry_path.begin(), entry_path.end(), '\"'), entry_path.end());
if (entry_path.find("neg") != std::string::npos) {

neg_jsons.push_back(entry_path);
} else if (entry_path.find("pos") != std::string::npos) {
pos_jsons.push_back(entry_path);
}
}
}

// Now sort the vectors, and check that they are not empty.
if (neg_jsons.empty() || pos_jsons.empty() ) {
BOOST_THROW_EXCEPTION(JuncFilterException() << JuncFilterErrorInfo(string("Not enough positive and negative layers found in " + ruleset)));
}

sort(neg_jsons.begin(), neg_jsons.end(), sort_jsons);
sort(pos_jsons.begin(), pos_jsons.end(), sort_jsons);

args.push_back("--pos_json");
for (vector<int>::size_type i = 0; i != pos_jsons.size(); ++i) {
args.push_back(pos_jsons[i]);
}

args.push_back("--neg_json");
for (vector<int>::size_type i = 0; i != neg_jsons.size(); ++i) {
args.push_back(neg_jsons[i]);
}

args.push_back("--prefix=" + output.string() + ".selftrain.initialset");

Expand All @@ -258,18 +309,52 @@ void portcullis::JunctionFilter::filter() {

if (verbose) {
string arg_str = boost::algorithm::join(args, " ");
cout << "Executing python script with this command: " << rf_script.string() << " " << arg_str << endl;
cout << "Executing python script with this command: " << arg_str << endl;
}

cout << "Executing python script." << endl;
PyHelper::getInstance().execute(rf_script.string(), (int) args.size(), char_args);
cout << "Executed Python script" << endl << endl;


// Load junction system
JunctionSystem posSystem(path(output.string() + ".selftrain.initialset.pos.junctions.tab"));
JunctionSystem negSystem(path(output.string() + ".selftrain.initialset.neg.junctions.tab"));
posSystem.sort();
negSystem.sort();
JunctionList pos = posSystem.getJunctions();
JunctionList neg = negSystem.getJunctions();
JunctionList neg = negSystem.getJunctions();

// Load junction system
// string pos_string = output.string() + ".selftrain.initialset.pos.junctions.tab";
// string neg_string = output.string() + ".selftrain.initialset.neg.junctions.tab";
// path posPath = (path) pos_string;
// path negPath = (path) neg_string;
// JunctionSystem posSystem;
// JunctionSystem negSystem;
// try {
// cout << "Loading file " << pos_string << endl;
// JunctionSystem posSystem(posPath);
// cout << "Loaded file " << pos_string << endl;
// } catch (const std::exception& ex) {
// BOOST_THROW_EXCEPTION(JuncFilterException() << JuncFilterErrorInfo(string("Error in loading " + pos_string + "\n")));
// }
// try {
// cout << "Loading file " << neg_string << endl;
// JunctionSystem negSystem(negPath);
// cout << "Loaded file " << neg_string << endl;
// } catch (const std::exception& ex) {
// BOOST_THROW_EXCEPTION(JuncFilterException() << JuncFilterErrorInfo(string("Error in loading " + neg_string + "\n")));
// }
// posSystem.sort();
// negSystem.sort();
// JunctionList pos = posSystem.getJunctions();
// JunctionList neg = negSystem.getJunctions();
// cout << "Initial training set consists of " << pos.size() << " positive and " << neg.size() << " negative junctions." << endl << endl;
// if (pos.size() == 0) {
// BOOST_THROW_EXCEPTION(JuncFilterException() << JuncFilterErrorInfo(string("No positive junction recovered from " + pos_string + "\n")));
// } else if (neg.size() == 0) {
// BOOST_THROW_EXCEPTION(JuncFilterException() << JuncFilterErrorInfo(string("No positive junction recovered from " + neg_string + "\n")));
// }

// Ensure positive and negative set have the genuine flag set appropriately
for (auto & j : pos) {
Expand Down Expand Up @@ -660,6 +745,7 @@ int portcullis::JunctionFilter::main(int argc, char *argv[]) {
bool balanced;
uint32_t mincov;
string source;
path initial;
bool no_smote;
bool enn;
double threshold;
Expand Down Expand Up @@ -705,6 +791,8 @@ int portcullis::JunctionFilter::main(int argc, char *argv[]) {
"Only keep junctions with a number of split reads greater than or equal to this number")
("threshold", po::value<double>(&threshold)->default_value(DEFAULT_FILTER_THRESHOLD),
"The threshold score at which we determine a junction to be genuine or not. Increase value towards 1.0 to increase precision, decrease towards 0.0 to increase sensitivity. We generally find that increasing sensitivity helps when using high coverage data, or when the aligner has already performed some form of junction filtering.")
("training_rule", po::value<path>(&initial)->default_value("balanced"),
"Pre-set to use for the self-training. Currently supported: balanced, precise. Default: balanced.")
//("balanced", po::bool_switch(&balanced)->default_value(false),
//"Uses rules that should provide a balanced training set of positive and negative junctions. By default, portcullis tends towards more precise predictions, which is useful for datasets with high coverage. Setting to balanced tends to work better on smaller datasets with less coverage.")
;
Expand Down Expand Up @@ -753,7 +841,7 @@ int portcullis::JunctionFilter::main(int argc, char *argv[]) {
cout << "Running portcullis in junction filter mode" << endl
<< "------------------------------------------" << endl << endl;
// Create the prepare class
JunctionFilter filter(prepDir, junctionFile, output, false);
JunctionFilter filter(prepDir, junctionFile, output, initial);
filter.setSaveBad(saveBad);
filter.setSource(source);
filter.setVerbose(verbose);
Expand Down
3 changes: 2 additions & 1 deletion src/junction_filter.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,7 @@ namespace portcullis {
bool enn;
bool precise;
bool verbose;
path initial;


public:
Expand All @@ -120,7 +121,7 @@ namespace portcullis {
JunctionFilter(const path& _prepDir,
const path& _junctionFile,
const path& _output,
bool precise);
const path& _initial);

virtual ~JunctionFilter() {
}
Expand Down
2 changes: 1 addition & 1 deletion src/portcullis.cc
Original file line number Diff line number Diff line change
Expand Up @@ -353,7 +353,7 @@ int mainFull(int argc, char *argv[]) {
<< "-------------------" << endl << endl;
path filtOut = outputDir.string() + "/3-filt/portcullis_filtered";
path juncTab = juncDir.string() + "/portcullis_all.junctions.tab";
JunctionFilter filter(prepDir, juncTab, filtOut, false);
JunctionFilter filter(prepDir, juncTab, filtOut, "balanced");
filter.setVerbose(verbose);
filter.setSource(source);
filter.setMaxLength(max_length);
Expand Down

0 comments on commit 42f15f0

Please sign in to comment.