From b7cd91c7b0b87ba0953cbe7feff3048f0191061c Mon Sep 17 00:00:00 2001 From: Benjamin Benoist Date: Mon, 21 Sep 2020 18:18:47 +0200 Subject: [PATCH] Common: fix PII enrichment adding empty objects instead of missing properties (close #351) --- .../pii/PiiPseudonymizerEnrichment.scala | 38 +++++++-- .../pii/PiiPseudonymizerEnrichmentSpec.scala | 77 ++++++++++++++----- 2 files changed, 91 insertions(+), 24 deletions(-) diff --git a/modules/common/src/main/scala/com.snowplowanalytics.snowplow.enrich/common/enrichments/registry/pii/PiiPseudonymizerEnrichment.scala b/modules/common/src/main/scala/com.snowplowanalytics.snowplow.enrich/common/enrichments/registry/pii/PiiPseudonymizerEnrichment.scala index 25f8286e4..c1991b0d9 100644 --- a/modules/common/src/main/scala/com.snowplowanalytics.snowplow.enrich/common/enrichments/registry/pii/PiiPseudonymizerEnrichment.scala +++ b/modules/common/src/main/scala/com.snowplowanalytics.snowplow.enrich/common/enrichments/registry/pii/PiiPseudonymizerEnrichment.scala @@ -133,6 +133,37 @@ object PiiPseudonymizerEnrichment extends ParseableEnrichment { .get(fieldName) .map(_.asRight) .getOrElse(s"The specified json field $fieldName is not supported".asLeft) + + /** Helper to remove fields that were wrongly added and are not in the original JSON. See #351. */ + private[pii] def removeAddedFields(hashed: Json, original: Json): Json = + hashed.asObject.map(_.toMap) match { + case Some(hashed_fields) => // hashed is JSON object + original.asObject.map(_.toMap) match { + case Some(orig_fields) => + val newMap = + hashed_fields + .collect { + case (k, v) if orig_fields.isDefinedAt(k) => + (k, removeAddedFields(v, orig_fields.get(k).get)) + } + Json.fromFields(newMap) + case None => + hashed // should never happen. Would mean change of type of one field + } + case None => + hashed.asArray match { + case Some(hashed_arr) => // hashed is array (can contain JSON objects) + original.asArray match { + case Some(orig_arr) => + // we can use zip because there should never be new fields in an array, only in an object + val values = hashed_arr.zip(orig_arr).map { case (hashed, orig) => removeAddedFields(hashed, orig) } + Json.fromValues(values) + case None => + Json.fromValues(hashed_arr) // should never happen. Would mean change of type of one field + } + case None => hashed // hashed is neither JSON object nor array + } + } } /** @@ -204,7 +235,8 @@ final case class PiiJson( ) } .getOrElse((parsed, List.empty[JsonModifiedField])) - } yield (substituted.noSpaces, modifiedFields.toList)).getOrElse((null, List.empty)) + } yield (PiiPseudonymizerEnrichment.removeAddedFields(substituted, parsed).noSpaces, modifiedFields.toList)) + .getOrElse((null, List.empty)) /** Map context top fields with strategy if they match. */ private def mapContextTopFields(tuple: (String, Json), strategy: PiiStrategy): (String, (Json, List[JsonModifiedField])) = @@ -272,10 +304,6 @@ final case class PiiJson( jsonPath, new ScrambleMapFunction(strategy, modifiedFields, fieldMutator.fieldName, jsonPath, schema) ) - // make sure it is a structure preserving method, see #3636 - //val transformedJValue = JsonMethods.fromJsonNode(documentContext.json[JsonNode]()) - //val Diff(_, erroneouslyAdded, _) = jValue diff transformedJValue - //val Diff(_, withoutCruft, _) = erroneouslyAdded diff transformedJValue (jacksonToCirce(documentContext2.json[JsonNode]()), modifiedFields.toList) } } diff --git a/modules/common/src/test/scala/com.snowplowanalytics.snowplow.enrich.common/enrichments/registry/pii/PiiPseudonymizerEnrichmentSpec.scala b/modules/common/src/test/scala/com.snowplowanalytics.snowplow.enrich.common/enrichments/registry/pii/PiiPseudonymizerEnrichmentSpec.scala index 8f5419eae..32afa53a5 100644 --- a/modules/common/src/test/scala/com.snowplowanalytics.snowplow.enrich.common/enrichments/registry/pii/PiiPseudonymizerEnrichmentSpec.scala +++ b/modules/common/src/test/scala/com.snowplowanalytics.snowplow.enrich.common/enrichments/registry/pii/PiiPseudonymizerEnrichmentSpec.scala @@ -56,6 +56,7 @@ class PiiPseudonymizerEnrichmentSpec extends Specification with ValidatedMatcher Hashing configured JSON fields in POJO should silently ignore unsupported types $e6 Hashing configured JSON and scalar fields in POJO emits a correct pii_transformation event $e7 Hashing configured JSON fields in POJO should not create new fields $e8 + removeAddedFields should remove fields added by PII enrichment $e9 """ def commonSetup(enrichmentReg: EnrichmentRegistry[Id]): List[Validated[BadRow, EnrichedEvent]] = { @@ -740,30 +741,68 @@ class PiiPseudonymizerEnrichmentSpec extends Specification with ValidatedMatcher ).some ) val output = commonSetup(enrichmentReg) - val expected = new EnrichedEvent() - expected.app_id = "ads" - expected.user_id = "john@acme.com" - expected.user_ipaddress = "70.46.123.145" - expected.ip_domain = null - expected.user_fingerprint = "its_you_again!" - expected.geo_city = "Delray Beach" - expected.etl_tstamp = "1970-01-18 08:40:00.000" - expected.collector_tstamp = "2017-07-14 03:39:39.000" val size = output.size must_== 1 val validOut = output.head must beValid.like { case enrichedEvent => - val contextJ = parse(enrichedEvent.contexts).toOption.get.hcursor.downField("data") - val firstElem = contextJ.downArray.downField("data") - val secondElem = contextJ.downArray.right.downField("data") + val context = parse(enrichedEvent.contexts).toOption.get.hcursor.downField("data").downArray + val data = context.downField("data") - (firstElem.get[String]("emailAddress") must beRight( - "72f323d5359eabefc69836369e4cabc6257c43ab6419b05dfb2211d0e44284c6" - )) and - (firstElem.downField("data").get[String]("nonExistentEmailAddress") must beLeft) and - (firstElem.get[String]("emailAddress2") must beRight("bob@acme.com")) and - (secondElem.get[String]("emailAddress") must beRight("tim@acme.com")) and - (secondElem.get[String]("emailAddress2") must beRight("tom@acme.com")) + val one = data.get[String]("emailAddress") must beRight("72f323d5359eabefc69836369e4cabc6257c43ab6419b05dfb2211d0e44284c6") + val two = data.get[String]("emailAddress2") must beRight("bob@acme.com") + val three = data.downField("nonExistentEmailAddress").focus must beNone + + one and two and three } size and validOut } + + def e9 = { + val orig = json""" + { + "schema" : "iglu:com.snowplowanalytics.snowplow/contexts/jsonschema/1-0-0", + "data" : [ + { + "schema" : "iglu:com.acme/email_sent/jsonschema/1-0-0", + "data" : { + "emailAddress" : "foo@bar.com", + "emailAddress2" : "bob@acme.com" + } + } + ] + } + """ + + val hashed = json""" + { + "schema" : "iglu:com.snowplowanalytics.snowplow/contexts/jsonschema/1-0-0", + "data" : [ + { + "schema" : "iglu:com.acme/email_sent/jsonschema/1-0-0", + "data" : { + "emailAddress" : "72f323d5359eabefc69836369e4cabc6257c43ab6419b05dfb2211d0e44284c6", + "emailAddress2" : "bob@acme.com", + "nonExistentEmailAddress" : {} + } + } + ] + } + """ + + val expected = json""" + { + "schema" : "iglu:com.snowplowanalytics.snowplow/contexts/jsonschema/1-0-0", + "data" : [ + { + "schema" : "iglu:com.acme/email_sent/jsonschema/1-0-0", + "data" : { + "emailAddress" : "72f323d5359eabefc69836369e4cabc6257c43ab6419b05dfb2211d0e44284c6", + "emailAddress2" : "bob@acme.com" + } + } + ] + } + """ + + PiiPseudonymizerEnrichment.removeAddedFields(hashed, orig) must beEqualTo(expected) + } }