-
Notifications
You must be signed in to change notification settings - Fork 40
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* Update version number to 2.5.0-SNAPSHOT * Update Swagger to 3.4.5 * Update dependency versions * Update OpenCSV version * Change from using the Reflections library to FastClasspathScanner, which is both faster and supports Java 9. Fixes #51. * New servlet to retrieve default values (e.g. default ContentExtractor) * Fix Issue #49 (Plankton should have default content extractor selected) * Upgrade to Elasticsearch 5.6.4 * Rename UimaContentExtractor to PlainTextContentExtractor * Add null check in to prevent collection reader hanging if metadata is missing * Move away from IOUtils, which can cause freezing on large documents - Fixes Issue #50 * Remove deprecated APIs * Add NCA copyright * Add NCA copyright * Add NCA copyright * Add NCA copyright * Add new Vulnerability type to type system * New annotators: CVE, Epoch Time, IPv6 and Lenient URL * New collection readers: CSV Folder, MBOX, SQL Cell, SQL DB Cell, SQL Row * New consumers: Elastic-Kibana, Gremlin * Add copyright statement * Update version * Remove use of deprecated function * Update dependency versions * Update dependency version * Update test resources * Update dependencies * Fix issue with child nodes in JSoup * Dependency updates * Wrap Stream in a try-resource block so that it is closed after use. I believe this would have been closed when it went out of scope anyway, but the approach here ensures that is the case. * Wrap ResultSet in a try-resource block so that it is closed after use. * Wrap Stream in a try-resource block so that it is closed after use. * Wrap Stream in a try-resource block so that it is closed after use. * Wrap ResultSet in a try-resource block so that it is closed after use. * Wrap resources in a try-resource block so that it is closed after use. * Bug fix for cases where a file path is used rather than a URI * Wrap ResultSet in a try-resource block so that it is closed after use. * Ensure SQL resources are closed * Change client variable to be of type TransportClient * Remove incorrect Javadoc * Make variable final
- Loading branch information
1 parent
1278c4f
commit bf1ed82
Showing
115 changed files
with
4,541 additions
and
1,074 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -53,4 +53,7 @@ local.properties | |
target/ | ||
|
||
# Baleen specific | ||
testing.log | ||
testing.log | ||
|
||
# IntelliJ IDEA | ||
*.iml |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
34 changes: 34 additions & 0 deletions
34
baleen-annotators/src/main/java/uk/gov/dstl/baleen/annotators/regex/Cve.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,34 @@ | ||
//NCA (c) Crown Copyright 2017 | ||
package uk.gov.dstl.baleen.annotators.regex; | ||
|
||
import com.google.common.collect.ImmutableSet; | ||
import org.apache.uima.jcas.JCas; | ||
import uk.gov.dstl.baleen.annotators.regex.helpers.AbstractRegexAnnotator; | ||
import uk.gov.dstl.baleen.core.pipelines.orderers.AnalysisEngineAction; | ||
import uk.gov.dstl.baleen.types.common.Vulnerability; | ||
|
||
import java.util.Collections; | ||
import java.util.regex.Matcher; | ||
|
||
/** | ||
* Extracts CVE (Common Vulnerabilities and Exposures) references from text using a regular expression, | ||
* and annotate them as Vulnerability entities. | ||
* | ||
* @baleen.javadoc | ||
*/ | ||
public class Cve extends AbstractRegexAnnotator<Vulnerability> { | ||
|
||
public Cve(){ | ||
super("\\bCVE-[0-9]{4}-[0-9]+\\b", false, 1.0); | ||
} | ||
|
||
@Override | ||
protected Vulnerability create(JCas jCas, Matcher matcher) { | ||
return new Vulnerability(jCas); | ||
} | ||
|
||
@Override | ||
public AnalysisEngineAction getAction() { | ||
return new AnalysisEngineAction(Collections.emptySet(), ImmutableSet.of(Vulnerability.class)); | ||
} | ||
} |
93 changes: 93 additions & 0 deletions
93
baleen-annotators/src/main/java/uk/gov/dstl/baleen/annotators/regex/EpochTime.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,93 @@ | ||
//NCA (c) Crown Copyright 2017 | ||
package uk.gov.dstl.baleen.annotators.regex; | ||
|
||
import com.google.common.collect.ImmutableSet; | ||
import org.apache.uima.fit.descriptor.ConfigurationParameter; | ||
import org.apache.uima.jcas.JCas; | ||
import uk.gov.dstl.baleen.annotators.regex.helpers.AbstractRegexAnnotator; | ||
import uk.gov.dstl.baleen.core.pipelines.orderers.AnalysisEngineAction; | ||
import uk.gov.dstl.baleen.types.semantic.Temporal; | ||
|
||
import java.util.Collections; | ||
import java.util.regex.Matcher; | ||
import java.util.regex.Pattern; | ||
|
||
/** | ||
* Extract UNIX Epoch timestamps from text and annotate them as Temporal entities | ||
* | ||
* @baleen.javadoc | ||
*/ | ||
public class EpochTime extends AbstractRegexAnnotator<Temporal> { | ||
|
||
/** | ||
* What's the earliest timestamp that is acceptable? | ||
* | ||
* By default, must be after 1st January 2000. | ||
* | ||
* @baleen.config 946684800 | ||
*/ | ||
public static final String PARAM_EARLIEST = "earliest"; | ||
@ConfigurationParameter(name = PARAM_EARLIEST, defaultValue = "946684800") | ||
private long earliest; | ||
|
||
/** | ||
* What's the latest timestamp that is acceptable? | ||
* A value of -1 will indicate no maximum. | ||
* | ||
* @baleen.config -1 | ||
*/ | ||
public static final String PARAM_LATEST = "latest"; | ||
@ConfigurationParameter(name = PARAM_LATEST, defaultValue = "-1") | ||
private long latest; | ||
|
||
/** | ||
* Is the timestamp in milliseconds (rather than seconds). | ||
* Milliseconds will be converted into seconds (floored), | ||
* as Baleen does not support timestamps of millisecond resolution. | ||
* | ||
* @baleen.config false | ||
*/ | ||
public static final String PARAM_MILLIS = "millis"; | ||
@ConfigurationParameter(name = PARAM_MILLIS, defaultValue = "false") | ||
private boolean millis; | ||
|
||
public EpochTime(){ | ||
super(Pattern.compile("\\b\\d+\\b"), 1.0); | ||
} | ||
|
||
@Override | ||
protected Temporal create(JCas jCas, Matcher matcher) { | ||
Long l; | ||
try { | ||
l = Long.parseLong(matcher.group()); | ||
}catch(NumberFormatException nfe){ | ||
return null; | ||
} | ||
|
||
if(millis){ | ||
l = l / 1000; | ||
} | ||
|
||
if(l < earliest) | ||
return null; | ||
|
||
if(latest >= 0 && l > latest) | ||
return null; | ||
|
||
Temporal t = new Temporal(jCas); | ||
|
||
t.setScope("SINGLE"); | ||
t.setTemporalType("DATETIME"); | ||
t.setPrecision("EXACT"); | ||
|
||
t.setTimestampStart(l); | ||
t.setTimestampStop(l + 1); | ||
|
||
return t; | ||
} | ||
|
||
@Override | ||
public AnalysisEngineAction getAction() { | ||
return new AnalysisEngineAction(Collections.emptySet(), ImmutableSet.of(Temporal.class)); | ||
} | ||
} |
41 changes: 41 additions & 0 deletions
41
baleen-annotators/src/main/java/uk/gov/dstl/baleen/annotators/regex/IpV6.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,41 @@ | ||
//NCA (c) Crown Copyright 2017 | ||
package uk.gov.dstl.baleen.annotators.regex; | ||
|
||
import com.google.common.collect.ImmutableSet; | ||
import com.google.common.net.InetAddresses; | ||
import org.apache.uima.jcas.JCas; | ||
import uk.gov.dstl.baleen.annotators.regex.helpers.AbstractRegexAnnotator; | ||
import uk.gov.dstl.baleen.core.pipelines.orderers.AnalysisEngineAction; | ||
import uk.gov.dstl.baleen.types.common.CommsIdentifier; | ||
|
||
import java.util.Collections; | ||
import java.util.regex.Matcher; | ||
|
||
/** | ||
* Extract IPv6 addresses using RegEx, and validate it using the Java InetAddresses class. | ||
* | ||
* @baleen.javadoc | ||
*/ | ||
public class IpV6 extends AbstractRegexAnnotator<CommsIdentifier> { | ||
private static String IPV6 = "([0-9a-fA-F]{1,4}:){1,3}(:[0-9a-fA-F]{1,4}){1,4}|([0-9a-fA-F]{1,4}:){1,2}(:[0-9a-fA-F]{1,4}){1,5}|[0-9a-fA-F]{1,4}:((:[0-9a-fA-F]{1,4}){1,6})|:((:[0-9a-fA-F]{1,4}){1,7}|:)|fe80:(:[0-9a-fA-F]{0,4}){0,4}%[0-9a-zA-Z]{1,}|::(ffff(:0{1,4}){0,1}:){0,1}((25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])\\.){3,3}(25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])|([0-9a-fA-F]{1,4}:){1,4}:((25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])\\.){3,3}(25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])|([0-9a-fA-F]{1,4}:){7,7}[0-9a-fA-F]{1,4}|(([0-9a-fA-F]{1,4}:){1,7}:|([0-9a-fA-F]{1,4}:){1,6}:[0-9a-fA-F]{1,4}|([0-9a-fA-F]{1,4}:){1,5}(:[0-9a-fA-F]{1,4}){1,2}|([0-9a-fA-F]{1,4}:){1,4}(:[0-9a-fA-F]{1,4}){1,3})"; | ||
|
||
public IpV6(){ | ||
super(IPV6, false, 1.0); | ||
} | ||
|
||
@Override | ||
protected CommsIdentifier create(JCas jCas, Matcher matcher) { | ||
//Validate IP | ||
if(!InetAddresses.isInetAddress(matcher.group())) | ||
return null; | ||
|
||
CommsIdentifier ip = new CommsIdentifier(jCas); | ||
ip.setSubType("ipv6address"); | ||
return ip; | ||
} | ||
|
||
@Override | ||
public AnalysisEngineAction getAction() { | ||
return new AnalysisEngineAction(Collections.emptySet(), ImmutableSet.of(CommsIdentifier.class)); | ||
} | ||
} |
40 changes: 40 additions & 0 deletions
40
baleen-annotators/src/main/java/uk/gov/dstl/baleen/annotators/regex/LenientUrl.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,40 @@ | ||
//NCA (c) Crown Copyright 2017 | ||
package uk.gov.dstl.baleen.annotators.regex; | ||
|
||
import com.google.common.collect.ImmutableSet; | ||
import org.apache.uima.jcas.JCas; | ||
import uk.gov.dstl.baleen.annotators.regex.helpers.AbstractRegexAnnotator; | ||
import uk.gov.dstl.baleen.core.pipelines.orderers.AnalysisEngineAction; | ||
import uk.gov.dstl.baleen.types.common.Url; | ||
|
||
import java.util.Collections; | ||
import java.util.regex.Matcher; | ||
|
||
/** | ||
* Extract URLs in a more lenient fashion than the standard URL extractor, | ||
* by making the http or https optional. | ||
* | ||
* The regex used to perform this extraction is: | ||
* <code>\b(?<!@)(http:\/\/www\.|https:\/\/www\.|http:\/\/|https:\/\/)?[a-z0-9]+([\-\.]{1}[a-z0-9]+)*\.[a-z]{2,5}(:[0-9]{1,5})?([?\/]\S*)?\b</code> | ||
* | ||
* @baleen.javadoc | ||
*/ | ||
public class LenientUrl extends AbstractRegexAnnotator<Url> { | ||
|
||
private static final String URL = "\\b(?<!@)(http:\\/\\/www\\.|https:\\/\\/www\\.|http:\\/\\/|https:\\/\\/)?[a-z0-9]+([\\-\\.]{1}[a-z0-9]+)*\\.[a-z]{2,5}(:[0-9]{1,5})?([?\\/]\\S*)?\\b"; | ||
|
||
public LenientUrl(){ | ||
super(URL, false, 1.0); | ||
} | ||
|
||
|
||
@Override | ||
protected Url create(JCas jCas, Matcher matcher) { | ||
return new Url(jCas); | ||
} | ||
|
||
@Override | ||
public AnalysisEngineAction getAction() { | ||
return new AnalysisEngineAction(Collections.emptySet(), ImmutableSet.of(Url.class)); | ||
} | ||
} |
Oops, something went wrong.