From 8169d827acdcd2659c513a18881c57565941da00 Mon Sep 17 00:00:00 2001 From: Konstantinos Servis Date: Thu, 14 Jun 2018 13:59:02 +0300 Subject: [PATCH] Fix unnecessarily-created JSON object as a result of the PII Enrichment (closes snowplow/snowplow#3636) --- .../pii/PiiPseudonymizerEnrichment.scala | 10 ++-- .../pii/PiiPseudonymizerEnrichmentSpec.scala | 46 +++++++++++++++++++ 2 files changed, 53 insertions(+), 3 deletions(-) diff --git a/src/main/scala/com.snowplowanalytics.snowplow.enrich/common/enrichments/registry/pii/PiiPseudonymizerEnrichment.scala b/src/main/scala/com.snowplowanalytics.snowplow.enrich/common/enrichments/registry/pii/PiiPseudonymizerEnrichment.scala index ff950f0..2ebd067 100644 --- a/src/main/scala/com.snowplowanalytics.snowplow.enrich/common/enrichments/registry/pii/PiiPseudonymizerEnrichment.scala +++ b/src/main/scala/com.snowplowanalytics.snowplow.enrich/common/enrichments/registry/pii/PiiPseudonymizerEnrichment.scala @@ -21,7 +21,7 @@ import scala.collection.mutable.MutableList // Scala libraries import org.json4s -import org.json4s.{DefaultFormats, JValue} +import org.json4s.{DefaultFormats, Diff, JValue} import org.json4s.JsonAST._ import org.json4s.JsonDSL._ import org.json4s.jackson.JsonMethods @@ -275,7 +275,11 @@ final case class PiiJson(fieldMutator: Mutator, schemaCriterion: SchemaCriterion val documentContext2 = documentContext.map( jsonPath, new ScrambleMapFunction(strategy, modifiedFields, fieldMutator.fieldName, jsonPath, schema)) - (JsonMethods.fromJsonNode(documentContext2.json[JsonNode]), modifiedFields.toList) + // make sure it is a structure preserving method, see #3636 + val transformedJValue = JsonMethods.fromJsonNode(documentContext.json[JsonNode]()) + val Diff(_, erroneouslyAdded, _) = jValue diff transformedJValue + val Diff(_, withoutCruft, _) = erroneouslyAdded diff transformedJValue + (withoutCruft, modifiedFields.toList) } } @@ -295,7 +299,7 @@ private final class ScrambleMapFunction(strategy: PiiStrategy, case t: TextNode => val originalValue = t.asText() val newValue = strategy.scramble(originalValue) - val _ = modifiedFields += JsonModifiedField(fieldName, originalValue, newValue, jsonPath, schema) + modifiedFields += JsonModifiedField(fieldName, originalValue, newValue, jsonPath, schema) newValue case default: AnyRef => default } diff --git a/src/test/scala/com.snowplowanalytics.snowplow.enrich.common/enrichments/registry/pii/PiiPseudonymizerEnrichmentSpec.scala b/src/test/scala/com.snowplowanalytics.snowplow.enrich.common/enrichments/registry/pii/PiiPseudonymizerEnrichmentSpec.scala index f1d1114..806b672 100644 --- a/src/test/scala/com.snowplowanalytics.snowplow.enrich.common/enrichments/registry/pii/PiiPseudonymizerEnrichmentSpec.scala +++ b/src/test/scala/com.snowplowanalytics.snowplow.enrich.common/enrichments/registry/pii/PiiPseudonymizerEnrichmentSpec.scala @@ -34,6 +34,8 @@ import common.loaders.{CollectorApi, CollectorContext, CollectorPayload, Collect import common.outputs.EnrichedEvent import utils.TestResourcesRepositoryRef import common.SpecHelpers.toNameValuePairs +import common.utils.TestResourcesRepositoryRef +import utils.ScalazJson4sUtils // Iglu import iglu.client.SchemaCriterion @@ -54,6 +56,7 @@ class PiiPseudonymizerEnrichmentSpec extends Specification with ValidationMatche Hashing configured JSON fields in POJO should work when multiple fields are matched through schemacriterion $e5 Hashing configured JSON fields in POJO should silently ignore unsupported types $e6 Hashing configured JSON and scalar fields in POJO emits a correct pii_transformation event $e7 + Hashing configured JSON fields in POJO should not create new fields $e8 """ def commonSetup(enrichmentMap: EnrichmentMap): List[ValidatedEnrichedEvent] = { @@ -471,4 +474,47 @@ class PiiPseudonymizerEnrichmentSpec extends Specification with ValidationMatche (((unstructEventJ \ "data") \ "data" \ "myVar2").extract[String] must_== "awesome") } } + + def e8 = { + val enrichmentMap = Map( + ("ip_lookups" -> ipEnrichment), + ("pii_enrichment_config" -> PiiPseudonymizerEnrichment( + List( + PiiJson( + fieldMutator = JsonMutators.get("contexts").get, + schemaCriterion = SchemaCriterion.parse("iglu:com.acme/email_sent/jsonschema/1-0-0").toOption.get, + jsonPath = "$.['emailAddress', 'nonExistentEmailAddress']" + ) + ), + true, + PiiStrategyPseudonymize("SHA-256", hashFunction = DigestUtils.sha256Hex(_: Array[Byte]), "pepper123") + )) + ) + val output = commonSetup(enrichmentMap = enrichmentMap) + val expected = new EnrichedEvent() + expected.app_id = "ads" + expected.user_id = "john@acme.com" + expected.user_ipaddress = "70.46.123.145" + expected.ip_domain = null + expected.user_fingerprint = "its_you_again!" + expected.geo_city = "Delray Beach" + expected.etl_tstamp = "1970-01-18 08:40:00.000" + expected.collector_tstamp = "2017-07-14 03:39:39.000" + output.size must_== 1 + val out = output(0) + out must beSuccessful.like { + case enrichedEvent => { + implicit val formats = org.json4s.DefaultFormats + val contextJ = parse(enrichedEvent.contexts) + (((contextJ \ "data")(0) \ "data" \ "emailAddress") + .extract[String] must_== "72f323d5359eabefc69836369e4cabc6257c43ab6419b05dfb2211d0e44284c6") and + (ScalazJson4sUtils.fieldExists(((contextJ \ "data")(0) \ "data"), "nonExistentEmailAddress") must_== false) and + (((contextJ \ "data")(0) \ "data" \ "emailAddress2") + .extract[String] must_== "bob@acme.com") and + (((contextJ \ "data")(1) \ "data" \ "emailAddress") + .extract[String] must_== "tim@acme.com") and + (((contextJ \ "data")(1) \ "data" \ "emailAddress2").extract[String] must_== "tom@acme.com") + } + } + } }