Skip to content

Commit

Permalink
Merged (#61)
Browse files Browse the repository at this point in the history
* Update version number to 2.5.0-SNAPSHOT

* Update Swagger to 3.4.5

* Update dependency versions

* Update OpenCSV version

* Change from using the Reflections library to FastClasspathScanner, which is both faster and supports Java 9. Fixes #51.

* New servlet to retrieve default values (e.g. default ContentExtractor)

* Fix Issue #49 (Plankton should have default content extractor selected)

* Upgrade to Elasticsearch 5.6.4

* Rename UimaContentExtractor to PlainTextContentExtractor

* Add null check to prevent the collection reader hanging if metadata is missing

* Move away from IOUtils, which can cause freezing on large documents - Fixes Issue #50

* Remove deprecated APIs

* Add NCA copyright

* Add NCA copyright

* Add NCA copyright

* Add NCA copyright

* Add new Vulnerability type to type system

* New annotators: CVE, Epoch Time, IPv6 and Lenient URL

* New collection readers: CSV Folder, MBOX, SQL Cell, SQL DB Cell, SQL Row

* New consumers: Elastic-Kibana, Gremlin

* Add copyright statement

* Update version

* Remove use of deprecated function

* Update dependency versions

* Update dependency version

* Update test resources

* Update dependencies

* Fix issue with child nodes in JSoup

* Dependency updates

* Wrap Stream in a try-resource block so that it is closed after use.

I believe this would have been closed when it went out of scope anyway,
but the approach here ensures that is the case.

* Wrap ResultSet in a try-resource block so that it is closed after use.

* Wrap Stream in a try-resource block so that it is closed after use.

* Wrap Stream in a try-resource block so that it is closed after use.

* Wrap ResultSet in a try-resource block so that it is closed after use.

* Wrap resources in a try-with-resources block so that they are closed after use.

* Bug fix for cases where a file path is used rather than a URI

* Wrap ResultSet in a try-resource block so that it is closed after use.

* Ensure SQL resources are closed

* Change client variable to be of type TransportClient

* Remove incorrect Javadoc

* Make variable final
  • Loading branch information
jbaker-nca authored and JohnDaws committed Mar 12, 2018
1 parent 1278c4f commit bf1ed82
Show file tree
Hide file tree
Showing 115 changed files with 4,541 additions and 1,074 deletions.
5 changes: 4 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -53,4 +53,7 @@ local.properties
target/

# Baleen specific
testing.log
testing.log

# IntelliJ IDEA
*.iml
2 changes: 1 addition & 1 deletion BUILD.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,4 +10,4 @@
2. Run `mvn package` from the Baleen project directory
3. Optionally run `mvn javadoc:aggregate-jar` to build Javadoc
4. The Baleen JAR will be built and saved in the target directory under the top level project directory
5. Run Baleen by running `java -jar baleen-2.4.0.jar` and then navigating to <http://localhost:6413>
5. Run Baleen by running `java -jar baleen-2.5.0-SNAPSHOT.jar` and then navigating to <http://localhost:6413>
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ Baleen includes an in-built server, which hosts full documentation and guides on
To get started, you will need to launch this server and read this documentation.
To launch the server, run the following command.

> java -jar baleen-2.4.0.jar
> java -jar baleen-2.5.0-SNAPSHOT.jar
Once running, the server can be accessed at [http://localhost:6413](http://localhost:6413).

Expand Down
2 changes: 1 addition & 1 deletion baleen-annotators/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
<parent>
<groupId>uk.gov.dstl.baleen</groupId>
<artifactId>baleen</artifactId>
<version>2.4.1-SNAPSHOT</version>
<version>2.5.0-SNAPSHOT</version>
</parent>
<artifactId>baleen-annotators</artifactId>
<name>Baleen Annotators</name>
Expand Down
Original file line number Diff line number Diff line change
@@ -1,16 +1,9 @@
//Dstl (c) Crown Copyright 2017
//Modified by NCA (c) Crown Copyright 2017
package uk.gov.dstl.baleen.annotators.cleaners;

import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;

import com.google.common.collect.ArrayListMultimap;
import com.google.common.collect.Multimap;
import org.apache.uima.UIMAException;
import org.apache.uima.UimaContext;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
Expand All @@ -20,10 +13,6 @@
import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.tcas.Annotation;
import org.apache.uima.resource.ResourceInitializationException;

import com.google.common.collect.ArrayListMultimap;
import com.google.common.collect.Multimap;

import uk.gov.dstl.baleen.core.pipelines.orderers.AnalysisEngineAction;
import uk.gov.dstl.baleen.core.utils.ReflectionUtils;
import uk.gov.dstl.baleen.exceptions.BaleenException;
Expand All @@ -33,6 +22,11 @@
import uk.gov.dstl.baleen.uima.utils.TypeSystemSingleton;
import uk.gov.dstl.baleen.uima.utils.TypeUtils;

import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;

/**
* Coreference entities where a series of entities of the same type appears in brackets.
*
Expand Down Expand Up @@ -116,7 +110,7 @@ public void doProcess(JCas jCas) throws AnalysisEngineProcessException {

@Override
public AnalysisEngineAction getAction() {
Set<Class<? extends Entity>> types = ReflectionUtils.getInstance().getSubTypesOf(Entity.class);
Set<Class<? extends Entity>> types = ReflectionUtils.getSubTypes(Entity.class);
types.removeAll(classTypes);

Set<Class<? extends Annotation>> annotations = new HashSet<>();
Expand Down
Original file line number Diff line number Diff line change
@@ -1,14 +1,7 @@
//Dstl (c) Crown Copyright 2017
//Modified by NCA (c) Crown Copyright 2017
package uk.gov.dstl.baleen.annotators.cleaners;

import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.apache.uima.UIMAException;
import org.apache.uima.UimaContext;
import org.apache.uima.cas.FSIterator;
Expand All @@ -18,14 +11,15 @@
import org.apache.uima.jcas.tcas.Annotation;
import org.apache.uima.resource.ResourceInitializationException;
import org.maltparser.core.helper.HashSet;

import uk.gov.dstl.baleen.annotators.cleaners.helpers.AbstractNestedEntities;
import uk.gov.dstl.baleen.core.pipelines.orderers.AnalysisEngineAction;
import uk.gov.dstl.baleen.core.utils.ReflectionUtils;
import uk.gov.dstl.baleen.exceptions.BaleenException;
import uk.gov.dstl.baleen.types.semantic.Entity;
import uk.gov.dstl.baleen.uima.utils.TypeSystemSingleton;
import uk.gov.dstl.baleen.uima.utils.TypeUtils;

import java.util.*;
/**
* Remove entities which are contained within other entities of the same type
*
Expand Down Expand Up @@ -101,7 +95,7 @@ public void doDestroy() {

@Override
public AnalysisEngineAction getAction() {
Set<Class<? extends Entity>> types = ReflectionUtils.getInstance().getSubTypesOf(Entity.class);
Set<Class<? extends Entity>> types = ReflectionUtils.getSubTypes(Entity.class);
types.removeAll(classTypes);

Set<Class<? extends Annotation>> annotations = new HashSet<>();
Expand Down
Original file line number Diff line number Diff line change
@@ -1,13 +1,7 @@
//Dstl (c) Crown Copyright 2017
//Modified by NCA (c) Crown Copyright 2017
package uk.gov.dstl.baleen.annotators.cleaners;

import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

import org.apache.uima.UIMAException;
import org.apache.uima.UimaContext;
import org.apache.uima.cas.FSIterator;
Expand All @@ -16,7 +10,6 @@
import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.tcas.Annotation;
import org.apache.uima.resource.ResourceInitializationException;

import uk.gov.dstl.baleen.annotators.cleaners.helpers.AbstractNestedEntities;
import uk.gov.dstl.baleen.core.pipelines.orderers.AnalysisEngineAction;
import uk.gov.dstl.baleen.core.utils.ReflectionUtils;
Expand All @@ -25,6 +18,8 @@
import uk.gov.dstl.baleen.uima.utils.TypeSystemSingleton;
import uk.gov.dstl.baleen.uima.utils.TypeUtils;

import java.util.*;

/**
* Remove entities which are contained within other entities of any type.
* <p>
Expand Down Expand Up @@ -100,7 +95,7 @@ public void doDestroy() {

@Override
public AnalysisEngineAction getAction() {
Set<Class<? extends Entity>> types = ReflectionUtils.getInstance().getSubTypesOf(Entity.class);
Set<Class<? extends Entity>> types = ReflectionUtils.getSubTypes(Entity.class);
types.removeAll(classTypes);

Set<Class<? extends Annotation>> annotations = new HashSet<>();
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
//NCA (c) Crown Copyright 2017
package uk.gov.dstl.baleen.annotators.regex;

import com.google.common.collect.ImmutableSet;
import org.apache.uima.jcas.JCas;
import uk.gov.dstl.baleen.annotators.regex.helpers.AbstractRegexAnnotator;
import uk.gov.dstl.baleen.core.pipelines.orderers.AnalysisEngineAction;
import uk.gov.dstl.baleen.types.common.Vulnerability;

import java.util.Collections;
import java.util.regex.Matcher;

/**
 * Finds CVE (Common Vulnerabilities and Exposures) references in text using a regular
 * expression, and annotates each one as a Vulnerability entity.
 *
 * @baleen.javadoc
 */
public class Cve extends AbstractRegexAnnotator<Vulnerability> {

    /**
     * The literal prefix CVE-, a four digit block, a hyphen and then one or more digits,
     * bounded on both sides by word boundaries.
     */
    private static final String CVE_REGEX = "\\bCVE-[0-9]{4}-[0-9]+\\b";

    /**
     * Hand the CVE pattern to the parent regex annotator.
     */
    public Cve() {
        // NOTE(review): false and 1.0 are presumably the case-sensitivity flag and the
        // confidence - confirm against AbstractRegexAnnotator.
        super(CVE_REGEX, false, 1.0);
    }

    /**
     * Declares that this annotator takes no annotation types as input and outputs Vulnerability.
     */
    @Override
    public AnalysisEngineAction getAction() {
        return new AnalysisEngineAction(Collections.emptySet(), ImmutableSet.of(Vulnerability.class));
    }

    /**
     * Builds the annotation for a match; the matched text itself is not inspected here.
     *
     * @param jCas the CAS the new annotation belongs to
     * @param matcher the matcher positioned on the current match (unused)
     * @return a new Vulnerability created against jCas
     */
    @Override
    protected Vulnerability create(JCas jCas, Matcher matcher) {
        return new Vulnerability(jCas);
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
//NCA (c) Crown Copyright 2017
package uk.gov.dstl.baleen.annotators.regex;

import com.google.common.collect.ImmutableSet;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.jcas.JCas;
import uk.gov.dstl.baleen.annotators.regex.helpers.AbstractRegexAnnotator;
import uk.gov.dstl.baleen.core.pipelines.orderers.AnalysisEngineAction;
import uk.gov.dstl.baleen.types.semantic.Temporal;

import java.util.Collections;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Extract UNIX Epoch timestamps from text and annotate them as Temporal entities.
 *
 * Every standalone run of digits is a candidate; candidates are then filtered using the
 * configured earliest and latest bounds.
 *
 * @baleen.javadoc
 */
public class EpochTime extends AbstractRegexAnnotator<Temporal> {

    /**
     * What's the earliest timestamp (in seconds) that is acceptable?
     *
     * By default, must be on or after 1st January 2000.
     *
     * @baleen.config 946684800
     */
    public static final String PARAM_EARLIEST = "earliest";
    @ConfigurationParameter(name = PARAM_EARLIEST, defaultValue = "946684800")
    private long earliest;

    /**
     * What's the latest timestamp (in seconds) that is acceptable?
     * Any negative value, such as the default of -1, will indicate no maximum.
     *
     * @baleen.config -1
     */
    public static final String PARAM_LATEST = "latest";
    @ConfigurationParameter(name = PARAM_LATEST, defaultValue = "-1")
    private long latest;

    /**
     * Is the timestamp in milliseconds (rather than seconds).
     * Milliseconds will be converted into seconds (floored),
     * as Baleen does not support timestamps of millisecond resolution.
     * The conversion happens before the earliest and latest bounds are applied.
     *
     * @baleen.config false
     */
    public static final String PARAM_MILLIS = "millis";
    @ConfigurationParameter(name = PARAM_MILLIS, defaultValue = "false")
    private boolean millis;

    /**
     * Configure the parent annotator to match any standalone run of digits.
     */
    public EpochTime() {
        super(Pattern.compile("\\b\\d+\\b"), 1.0);
    }

    /**
     * Convert a matched run of digits into a Temporal covering a single second.
     *
     * @param jCas the CAS the new annotation belongs to
     * @param matcher the matcher positioned on the current run of digits
     * @return the populated Temporal, or null if the value cannot be parsed as a long,
     *         falls outside the configured bounds, or has no representable following second
     */
    @Override
    protected Temporal create(JCas jCas, Matcher matcher) {
        // Primitive long: the value is only used arithmetically, so boxing buys nothing
        long seconds;
        try {
            seconds = Long.parseLong(matcher.group());
        } catch (NumberFormatException ignored) {
            // Too many digits to fit in a long, so it is not a usable timestamp
            return null;
        }

        if (millis) {
            seconds = seconds / 1000;
        }

        if (seconds < earliest) {
            return null;
        }

        if (latest >= 0 && seconds > latest) {
            return null;
        }

        if (seconds == Long.MAX_VALUE) {
            // seconds + 1 would wrap round to Long.MIN_VALUE and give a stop before the start
            return null;
        }

        Temporal t = new Temporal(jCas);

        t.setScope("SINGLE");
        t.setTemporalType("DATETIME");
        t.setPrecision("EXACT");

        // The stop is one second after the start
        t.setTimestampStart(seconds);
        t.setTimestampStop(seconds + 1);

        return t;
    }

    /**
     * Declares that this annotator takes no annotation types as input and outputs Temporal.
     */
    @Override
    public AnalysisEngineAction getAction() {
        return new AnalysisEngineAction(Collections.emptySet(), ImmutableSet.of(Temporal.class));
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
//NCA (c) Crown Copyright 2017
package uk.gov.dstl.baleen.annotators.regex;

import com.google.common.collect.ImmutableSet;
import com.google.common.net.InetAddresses;
import org.apache.uima.jcas.JCas;
import uk.gov.dstl.baleen.annotators.regex.helpers.AbstractRegexAnnotator;
import uk.gov.dstl.baleen.core.pipelines.orderers.AnalysisEngineAction;
import uk.gov.dstl.baleen.types.common.CommsIdentifier;

import java.util.Collections;
import java.util.regex.Matcher;

/**
 * Extract IPv6 addresses using RegEx, and validate each candidate using Guava's InetAddresses class.
 *
 * @baleen.javadoc
 */
public class IpV6 extends AbstractRegexAnnotator<CommsIdentifier> {
    /**
     * Candidate pattern for IPv6 addresses. It is deliberately permissive; every match is
     * re-checked in {@link #create(JCas, Matcher)} before an annotation is produced.
     * Declared final so the shared pattern cannot be reassigned at runtime.
     */
    private static final String IPV6 = "([0-9a-fA-F]{1,4}:){1,3}(:[0-9a-fA-F]{1,4}){1,4}|([0-9a-fA-F]{1,4}:){1,2}(:[0-9a-fA-F]{1,4}){1,5}|[0-9a-fA-F]{1,4}:((:[0-9a-fA-F]{1,4}){1,6})|:((:[0-9a-fA-F]{1,4}){1,7}|:)|fe80:(:[0-9a-fA-F]{0,4}){0,4}%[0-9a-zA-Z]{1,}|::(ffff(:0{1,4}){0,1}:){0,1}((25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])\\.){3,3}(25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])|([0-9a-fA-F]{1,4}:){1,4}:((25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])\\.){3,3}(25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])|([0-9a-fA-F]{1,4}:){7,7}[0-9a-fA-F]{1,4}|(([0-9a-fA-F]{1,4}:){1,7}:|([0-9a-fA-F]{1,4}:){1,6}:[0-9a-fA-F]{1,4}|([0-9a-fA-F]{1,4}:){1,5}(:[0-9a-fA-F]{1,4}){1,2}|([0-9a-fA-F]{1,4}:){1,4}(:[0-9a-fA-F]{1,4}){1,3})";

    /**
     * Hand the IPv6 candidate pattern to the parent regex annotator.
     */
    public IpV6() {
        // NOTE(review): false and 1.0 are presumably the case-sensitivity flag and the
        // confidence - confirm against AbstractRegexAnnotator.
        super(IPV6, false, 1.0);
    }

    /**
     * Validate the matched text and, if it is a genuine address, create the annotation.
     *
     * @param jCas the CAS the new annotation belongs to
     * @param matcher the matcher positioned on the current candidate
     * @return a CommsIdentifier with sub type "ipv6address", or null if InetAddresses rejects the text
     */
    @Override
    protected CommsIdentifier create(JCas jCas, Matcher matcher) {
        //Validate IP
        if (!InetAddresses.isInetAddress(matcher.group())) {
            return null;
        }

        CommsIdentifier ip = new CommsIdentifier(jCas);
        ip.setSubType("ipv6address");
        return ip;
    }

    /**
     * Declares that this annotator takes no annotation types as input and outputs CommsIdentifier.
     */
    @Override
    public AnalysisEngineAction getAction() {
        return new AnalysisEngineAction(Collections.emptySet(), ImmutableSet.of(CommsIdentifier.class));
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
//NCA (c) Crown Copyright 2017
package uk.gov.dstl.baleen.annotators.regex;

import com.google.common.collect.ImmutableSet;
import org.apache.uima.jcas.JCas;
import uk.gov.dstl.baleen.annotators.regex.helpers.AbstractRegexAnnotator;
import uk.gov.dstl.baleen.core.pipelines.orderers.AnalysisEngineAction;
import uk.gov.dstl.baleen.types.common.Url;

import java.util.Collections;
import java.util.regex.Matcher;

/**
 * A more lenient alternative to the standard URL extractor, in which the leading
 * http or https is optional.
 *
 * The regex used to perform this extraction is:
 * {@code \b(?<!@)(http:\/\/www\.|https:\/\/www\.|http:\/\/|https:\/\/)?[a-z0-9]+([\-\.]{1}[a-z0-9]+)*\.[a-z]{2,5}(:[0-9]{1,5})?([?\/]\S*)?\b}
 *
 * @baleen.javadoc
 */
public class LenientUrl extends AbstractRegexAnnotator<Url> {

    /**
     * The extraction pattern. The negative look-behind for @ at the start stops a match
     * from beginning immediately after an @ character.
     */
    private static final String LENIENT_URL_REGEX = "\\b(?<!@)(http:\\/\\/www\\.|https:\\/\\/www\\.|http:\\/\\/|https:\\/\\/)?[a-z0-9]+([\\-\\.]{1}[a-z0-9]+)*\\.[a-z]{2,5}(:[0-9]{1,5})?([?\\/]\\S*)?\\b";

    /**
     * Hand the lenient URL pattern to the parent regex annotator.
     */
    public LenientUrl() {
        // NOTE(review): false and 1.0 are presumably the case-sensitivity flag and the
        // confidence - confirm against AbstractRegexAnnotator.
        super(LENIENT_URL_REGEX, false, 1.0);
    }

    /**
     * Declares that this annotator takes no annotation types as input and outputs Url.
     */
    @Override
    public AnalysisEngineAction getAction() {
        return new AnalysisEngineAction(Collections.emptySet(), ImmutableSet.of(Url.class));
    }

    /**
     * Builds the annotation for a match; the matched text itself is not inspected here.
     *
     * @param jCas the CAS the new annotation belongs to
     * @param matcher the matcher positioned on the current match (unused)
     * @return a new Url created against jCas
     */
    @Override
    protected Url create(JCas jCas, Matcher matcher) {
        return new Url(jCas);
    }
}
Loading

0 comments on commit bf1ed82

Please sign in to comment.