diff --git a/modules/common/src/main/scala/com.snowplowanalytics.snowplow.enrich/common/enrichments/registry/pii/PiiPseudonymizerEnrichment.scala b/modules/common/src/main/scala/com.snowplowanalytics.snowplow.enrich/common/enrichments/registry/pii/PiiPseudonymizerEnrichment.scala index 25f8286e4..c1991b0d9 100644 --- a/modules/common/src/main/scala/com.snowplowanalytics.snowplow.enrich/common/enrichments/registry/pii/PiiPseudonymizerEnrichment.scala +++ b/modules/common/src/main/scala/com.snowplowanalytics.snowplow.enrich/common/enrichments/registry/pii/PiiPseudonymizerEnrichment.scala @@ -133,6 +133,37 @@ object PiiPseudonymizerEnrichment extends ParseableEnrichment { .get(fieldName) .map(_.asRight) .getOrElse(s"The specified json field $fieldName is not supported".asLeft) + + /** Removes fields of `hashed` that are absent from `original` (wrongly added by PII hashing), keeping field order. See #351. */ + private[pii] def removeAddedFields(hashed: Json, original: Json): Json = + hashed.asObject.map(_.toList) match { + case Some(hashed_fields) => // hashed is JSON object; a List (unlike a Map) keeps the field order + original.asObject.map(_.toMap) match { + case Some(orig_fields) => + val newFields = + hashed_fields + .collect { + case (k, v) if orig_fields.contains(k) => + (k, removeAddedFields(v, orig_fields(k))) + } + Json.fromFields(newFields) + case None => + hashed // should never happen. Would mean change of type of one field + } + case None => + hashed.asArray match { + case Some(hashed_arr) => // hashed is array (can contain JSON objects) + original.asArray match { + case Some(orig_arr) => + // we can use zip because there should never be new fields in an array, only in an object + val values = hashed_arr.zip(orig_arr).map { case (h, o) => removeAddedFields(h, o) } + Json.fromValues(values) + case None => + Json.fromValues(hashed_arr) // should never happen.
Would mean change of type of one field + } + case None => hashed // hashed is neither JSON object nor array + } + } } /** @@ -204,7 +235,8 @@ final case class PiiJson( ) } .getOrElse((parsed, List.empty[JsonModifiedField])) - } yield (substituted.noSpaces, modifiedFields.toList)).getOrElse((null, List.empty)) + } yield (PiiPseudonymizerEnrichment.removeAddedFields(substituted, parsed).noSpaces, modifiedFields.toList)) + .getOrElse((null, List.empty)) /** Map context top fields with strategy if they match. */ private def mapContextTopFields(tuple: (String, Json), strategy: PiiStrategy): (String, (Json, List[JsonModifiedField])) = @@ -272,10 +304,6 @@ final case class PiiJson( jsonPath, new ScrambleMapFunction(strategy, modifiedFields, fieldMutator.fieldName, jsonPath, schema) ) - // make sure it is a structure preserving method, see #3636 - //val transformedJValue = JsonMethods.fromJsonNode(documentContext.json[JsonNode]()) - //val Diff(_, erroneouslyAdded, _) = jValue diff transformedJValue - //val Diff(_, withoutCruft, _) = erroneouslyAdded diff transformedJValue (jacksonToCirce(documentContext2.json[JsonNode]()), modifiedFields.toList) } } diff --git a/modules/common/src/test/resources/iglu-schemas/schemas/com.acme/email_sent/jsonschema/1-0-0 b/modules/common/src/test/resources/iglu-schemas/schemas/com.acme/email_sent/jsonschema/1-0-0 index 087d4e6cd..18dd216f5 100644 --- a/modules/common/src/test/resources/iglu-schemas/schemas/com.acme/email_sent/jsonschema/1-0-0 +++ b/modules/common/src/test/resources/iglu-schemas/schemas/com.acme/email_sent/jsonschema/1-0-0 @@ -14,6 +14,9 @@ }, "emailAddress2": { "type": "string" + }, + "emailAddress3": { + "type": "string" } }, "required": ["emailAddress", "emailAddress2"], diff --git a/modules/common/src/test/resources/iglu-schemas/schemas/com.acme/email_sent/jsonschema/2-0-0 b/modules/common/src/test/resources/iglu-schemas/schemas/com.acme/email_sent/jsonschema/2-0-0 new file mode 100644 index 000000000..eca4ca19d ---
/dev/null +++ b/modules/common/src/test/resources/iglu-schemas/schemas/com.acme/email_sent/jsonschema/2-0-0 @@ -0,0 +1,24 @@ +{ + "$schema": "http://iglucentral.com/schemas/com.snowplowanalytics.self-desc/schema/jsonschema/1-0-0#", + "description": "Schema for acme stuff", + "self": { + "vendor": "com.acme", + "name": "email_sent", + "format": "jsonschema", + "version": "2-0-0" + }, + "type": "object", + "properties": { + "emailAddress": { + "type": "string" + }, + "emailAddress2": { + "type": "string" + }, + "emailAddress3": { + "type": ["string", "null"] + } + }, + "required": ["emailAddress", "emailAddress2"], + "additionalProperties": false +} diff --git a/modules/common/src/test/resources/iglu-schemas/schemas/com.test/array/jsonschema/1-0-0 b/modules/common/src/test/resources/iglu-schemas/schemas/com.test/array/jsonschema/1-0-0 index 97e2490a3..b2310754d 100644 --- a/modules/common/src/test/resources/iglu-schemas/schemas/com.test/array/jsonschema/1-0-0 +++ b/modules/common/src/test/resources/iglu-schemas/schemas/com.test/array/jsonschema/1-0-0 @@ -28,6 +28,10 @@ "type": "string" } } + }, + "field4": { + "type": "string", + "maxLength": 64 } }, "required": ["field"], diff --git a/modules/common/src/test/scala/com.snowplowanalytics.snowplow.enrich.common/enrichments/EnrichmentManagerSpec.scala b/modules/common/src/test/scala/com.snowplowanalytics.snowplow.enrich.common/enrichments/EnrichmentManagerSpec.scala index b76e01c12..7fd1a4289 100644 --- a/modules/common/src/test/scala/com.snowplowanalytics.snowplow.enrich.common/enrichments/EnrichmentManagerSpec.scala +++ b/modules/common/src/test/scala/com.snowplowanalytics.snowplow.enrich.common/enrichments/EnrichmentManagerSpec.scala @@ -17,21 +17,23 @@ package enrichments import cats.Id import cats.implicits._ import cats.data.NonEmptyList - import io.circe.literal._ - import org.joda.time.DateTime - import com.snowplowanalytics.snowplow.badrows._ -import com.snowplowanalytics.iglu.core.{SchemaKey, SchemaVer} - 
+import com.snowplowanalytics.iglu.core.{SchemaCriterion, SchemaKey, SchemaVer} import loaders._ import adapters.RawEvent +import com.snowplowanalytics.snowplow.enrich.common.enrichments.registry.pii.{ + JsonMutators, + PiiJson, + PiiPseudonymizerEnrichment, + PiiStrategyPseudonymize +} import com.snowplowanalytics.snowplow.enrich.common.outputs.EnrichedEvent import utils.Clock._ import utils.ConversionUtils import enrichments.registry.{IabEnrichment, JavascriptScriptEnrichment, YauaaEnrichment} - +import org.apache.commons.codec.digest.DigestUtils import org.specs2.mutable.Specification import org.specs2.matcher.EitherMatchers @@ -87,7 +89,7 @@ class EnrichmentManagerSpec extends Specification with EitherMatchers { "data": { "emailAddress": "hello@world.com", "emailAddress2": "foo@bar.org", - "emailAddress3": "foo@bar.org" + "unallowedAdditionalField": "foo@bar.org" } } }""" @@ -267,6 +269,314 @@ class EnrichmentManagerSpec extends Specification with EitherMatchers { enriched.value must beRight } + "emit an EnrichedEvent if a PII value that needs to be hashed is an empty string" >> { + val parameters = Map( + "e" -> "ue", + "tv" -> "js-0.13.1", + "p" -> "web", + "co" -> """ + { + "schema": "iglu:com.snowplowanalytics.snowplow/contexts/jsonschema/1-0-0", + "data": [ + { + "schema":"iglu:com.acme/email_sent/jsonschema/1-0-0", + "data": { + "emailAddress": "hello@world.com", + "emailAddress2": "foo@bar.org" + } + } + ] + } + """, + "ue_pr" -> """ + { + "schema":"iglu:com.snowplowanalytics.snowplow/unstruct_event/jsonschema/1-0-0", + "data":{ + "schema":"iglu:com.acme/email_sent/jsonschema/1-0-0", + "data": { + "emailAddress": "hello@world.com", + "emailAddress2": "foo@bar.org", + "emailAddress3": "" + } + } + }""" + ) + val rawEvent = RawEvent(api, parameters, None, source, context) + val enrichmentReg = EnrichmentRegistry[Id]( + piiPseudonymizer = PiiPseudonymizerEnrichment( + List( + PiiJson( + fieldMutator = JsonMutators("unstruct_event"), + schemaCriterion = 
SchemaCriterion("com.acme", "email_sent", "jsonschema", 1, 0, 0), + jsonPath = "$.emailAddress3" + ) + ), + false, + PiiStrategyPseudonymize( + "MD5", + hashFunction = DigestUtils.sha256Hex(_: Array[Byte]), + "pepper123" + ) + ).some + ) + val enriched = EnrichmentManager.enrichEvent( + enrichmentReg, + client, + processor, + timestamp, + rawEvent + ) + enriched.value must beRight + } + + "emit an EnrichedEvent if a PII value that needs to be hashed is null" >> { + val parameters = Map( + "e" -> "ue", + "tv" -> "js-0.13.1", + "p" -> "web", + "co" -> """ + { + "schema": "iglu:com.snowplowanalytics.snowplow/contexts/jsonschema/1-0-0", + "data": [ + { + "schema":"iglu:com.acme/email_sent/jsonschema/1-0-0", + "data": { + "emailAddress": "hello@world.com", + "emailAddress2": "foo@bar.org" + } + } + ] + } + """, + "ue_pr" -> """ + { + "schema":"iglu:com.snowplowanalytics.snowplow/unstruct_event/jsonschema/1-0-0", + "data":{ + "schema":"iglu:com.acme/email_sent/jsonschema/2-0-0", + "data": { + "emailAddress": "hello@world.com", + "emailAddress2": "foo@bar.org", + "emailAddress3": null + } + } + }""" + ) + val rawEvent = RawEvent(api, parameters, None, source, context) + val enrichmentReg = EnrichmentRegistry[Id]( + piiPseudonymizer = PiiPseudonymizerEnrichment( + List( + PiiJson( + fieldMutator = JsonMutators("unstruct_event"), + schemaCriterion = SchemaCriterion("com.acme", "email_sent", "jsonschema", 1, 0, 0), + jsonPath = "$.emailAddress3" + ) + ), + false, + PiiStrategyPseudonymize( + "MD5", + hashFunction = DigestUtils.sha256Hex(_: Array[Byte]), + "pepper123" + ) + ).some + ) + val enriched = EnrichmentManager.enrichEvent( + enrichmentReg, + client, + processor, + timestamp, + rawEvent + ) + enriched.value must beRight + } + + "fail to emit an EnrichedEvent if a PII value that needs to be hashed is an empty object" >> { + val parameters = Map( + "e" -> "ue", + "tv" -> "js-0.13.1", + "p" -> "web", + "co" -> """ + { + "schema": 
"iglu:com.snowplowanalytics.snowplow/contexts/jsonschema/1-0-0", + "data": [ + { + "schema":"iglu:com.acme/email_sent/jsonschema/1-0-0", + "data": { + "emailAddress": "hello@world.com", + "emailAddress2": "foo@bar.org" + } + } + ] + } + """, + "ue_pr" -> """ + { + "schema":"iglu:com.snowplowanalytics.snowplow/unstruct_event/jsonschema/1-0-0", + "data":{ + "schema":"iglu:com.acme/email_sent/jsonschema/1-0-0", + "data": { + "emailAddress": "hello@world.com", + "emailAddress2": "foo@bar.org", + "emailAddress3": {} + } + } + }""" + ) + val rawEvent = RawEvent(api, parameters, None, source, context) + val enrichmentReg = EnrichmentRegistry[Id]( + piiPseudonymizer = PiiPseudonymizerEnrichment( + List( + PiiJson( + fieldMutator = JsonMutators("unstruct_event"), + schemaCriterion = SchemaCriterion("com.acme", "email_sent", "jsonschema", 1, 0, 0), + jsonPath = "$.emailAddress3" + ) + ), + false, + PiiStrategyPseudonymize( + "MD5", + hashFunction = DigestUtils.sha256Hex(_: Array[Byte]), + "pepper123" + ) + ).some + ) + val enriched = EnrichmentManager.enrichEvent( + enrichmentReg, + client, + processor, + timestamp, + rawEvent + ) + enriched.value must beLeft + } + + "fail to emit an EnrichedEvent if a context PII value that needs to be hashed is an empty object" >> { + val parameters = Map( + "e" -> "ue", + "tv" -> "js-0.13.1", + "p" -> "web", + "co" -> """ + { + "schema": "iglu:com.snowplowanalytics.snowplow/contexts/jsonschema/1-0-0", + "data": [ + { + "schema":"iglu:com.acme/email_sent/jsonschema/1-0-0", + "data": { + "emailAddress": "hello@world.com", + "emailAddress2": "foo@bar.org", + "emailAddress3": {} + } + } + ] + } + """, + "ue_pr" -> """ + { + "schema":"iglu:com.snowplowanalytics.snowplow/unstruct_event/jsonschema/1-0-0", + "data":{ + "schema":"iglu:com.acme/email_sent/jsonschema/1-0-0", + "data": { + "emailAddress": "hello@world.com", + "emailAddress2": "foo@bar.org" + } + } + }""" + ) + val rawEvent = RawEvent(api, parameters, None, source, context) + val 
enrichmentReg = EnrichmentRegistry[Id]( + piiPseudonymizer = PiiPseudonymizerEnrichment( + List( + PiiJson( + fieldMutator = JsonMutators("contexts"), + schemaCriterion = SchemaCriterion("com.acme", "email_sent", "jsonschema", 1, 0, 0), + jsonPath = "$.emailAddress3" + ) + ), + false, + PiiStrategyPseudonymize( + "MD5", + hashFunction = DigestUtils.sha256Hex(_: Array[Byte]), + "pepper123" + ) + ).some + ) + def enriched = + EnrichmentManager.enrichEvent( + enrichmentReg, + client, + processor, + timestamp, + rawEvent + ) + enriched.value must beLeft + } + + "fail to emit an EnrichedEvent if a PII value needs to be hashed in both co and ue and is invalid in one of them" >> { + val parameters = Map( + "e" -> "ue", + "tv" -> "js-0.13.1", + "p" -> "web", + "co" -> """ + { + "schema": "iglu:com.snowplowanalytics.snowplow/contexts/jsonschema/1-0-0", + "data": [ + { + "schema":"iglu:com.acme/email_sent/jsonschema/1-0-0", + "data": { + "emailAddress": "hello@world.com", + "emailAddress2": "foo@bar.org", + "emailAddress3": {} + } + } + ] + } + """, + "ue_pr" -> """ + { + "schema":"iglu:com.snowplowanalytics.snowplow/unstruct_event/jsonschema/1-0-0", + "data":{ + "schema":"iglu:com.acme/email_sent/jsonschema/1-0-0", + "data": { + "emailAddress": "hello@world.com", + "emailAddress2": "foo@bar.org", + "emailAddress3": "" + } + } + }""" + ) + val rawEvent = RawEvent(api, parameters, None, source, context) + val enrichmentReg = EnrichmentRegistry[Id]( + piiPseudonymizer = PiiPseudonymizerEnrichment( + List( + PiiJson( + fieldMutator = JsonMutators("contexts"), + schemaCriterion = SchemaCriterion("com.acme", "email_sent", "jsonschema", 1, 0, 0), + jsonPath = "$.emailAddress3" + ), + PiiJson( + fieldMutator = JsonMutators("unstruct_event"), + schemaCriterion = SchemaCriterion("com.acme", "email_sent", "jsonschema", 1, 0, 0), + jsonPath = "$.emailAddress3" + ) + ), + false, + PiiStrategyPseudonymize( + "MD5", + hashFunction = DigestUtils.sha256Hex(_: Array[Byte]), + "pepper123" + ) 
+ ).some + ) + def enriched = + EnrichmentManager.enrichEvent( + enrichmentReg, + client, + processor, + timestamp, + rawEvent + ) + enriched.value must beLeft + } + "have a preference of 'ua' query string parameter over user agent of HTTP header" >> { val qs_ua = "Mozilla/5.0 (X11; Linux x86_64; rv:75.0) Gecko/20100101 Firefox/75.0" val parameters = Map( diff --git a/modules/common/src/test/scala/com.snowplowanalytics.snowplow.enrich.common/enrichments/registry/pii/PiiPseudonymizerEnrichmentSpec.scala b/modules/common/src/test/scala/com.snowplowanalytics.snowplow.enrich.common/enrichments/registry/pii/PiiPseudonymizerEnrichmentSpec.scala index 4837631f1..32afa53a5 100644 --- a/modules/common/src/test/scala/com.snowplowanalytics.snowplow.enrich.common/enrichments/registry/pii/PiiPseudonymizerEnrichmentSpec.scala +++ b/modules/common/src/test/scala/com.snowplowanalytics.snowplow.enrich.common/enrichments/registry/pii/PiiPseudonymizerEnrichmentSpec.scala @@ -56,6 +56,7 @@ class PiiPseudonymizerEnrichmentSpec extends Specification with ValidatedMatcher Hashing configured JSON fields in POJO should silently ignore unsupported types $e6 Hashing configured JSON and scalar fields in POJO emits a correct pii_transformation event $e7 Hashing configured JSON fields in POJO should not create new fields $e8 + removeAddedFields should remove fields added by PII enrichment $e9 """ def commonSetup(enrichmentReg: EnrichmentRegistry[Id]): List[Validated[BadRow, EnrichedEvent]] = { @@ -118,7 +119,8 @@ class PiiPseudonymizerEnrichmentSpec extends Specification with ValidatedMatcher | "data": { | "field" : ["hello", "world"], | "field2" : null, - | "field3": null + | "field3": null, + | "field4": "" | } | } | ] @@ -363,6 +365,11 @@ class PiiPseudonymizerEnrichmentSpec extends Specification with ValidatedMatcher fieldMutator = JsonMutators("unstruct_event"), schemaCriterion = SchemaCriterion("com.mailgun", "message_clicked", "jsonschema", 1, 0, 0), jsonPath = "$.ip" + ), + PiiJson( + 
fieldMutator = JsonMutators("contexts"), + schemaCriterion = SchemaCriterion("com.test", "array", "jsonschema", 1, 0, 0), + jsonPath = "$.field4" ) ), false, @@ -447,7 +454,12 @@ class PiiPseudonymizerEnrichmentSpec extends Specification with ValidatedMatcher .downField("field3") .focus must beSome.like { case json => json.isNull }) - first and second and third + // Test that empty string in Pii field gets hashed + val fourth = contextJThirdElement + .downField("data") + .get[String]("field4") must beRight("7a3477dad66e666bd203b834c54b6dfe8b546bdbc5283462ad14052abfb06600") + + first and second and third and fourth } size and validOut @@ -729,30 +741,68 @@ class PiiPseudonymizerEnrichmentSpec extends Specification with ValidatedMatcher ).some ) val output = commonSetup(enrichmentReg) - val expected = new EnrichedEvent() - expected.app_id = "ads" - expected.user_id = "john@acme.com" - expected.user_ipaddress = "70.46.123.145" - expected.ip_domain = null - expected.user_fingerprint = "its_you_again!" 
- expected.geo_city = "Delray Beach" - expected.etl_tstamp = "1970-01-18 08:40:00.000" - expected.collector_tstamp = "2017-07-14 03:39:39.000" val size = output.size must_== 1 val validOut = output.head must beValid.like { case enrichedEvent => - val contextJ = parse(enrichedEvent.contexts).toOption.get.hcursor.downField("data") - val firstElem = contextJ.downArray.downField("data") - val secondElem = contextJ.downArray.right.downField("data") + val context = parse(enrichedEvent.contexts).toOption.get.hcursor.downField("data").downArray + val data = context.downField("data") - (firstElem.get[String]("emailAddress") must beRight( - "72f323d5359eabefc69836369e4cabc6257c43ab6419b05dfb2211d0e44284c6" - )) and - (firstElem.downField("data").get[String]("nonExistentEmailAddress") must beLeft) and - (firstElem.get[String]("emailAddress2") must beRight("bob@acme.com")) and - (secondElem.get[String]("emailAddress") must beRight("tim@acme.com")) and - (secondElem.get[String]("emailAddress2") must beRight("tom@acme.com")) + val one = data.get[String]("emailAddress") must beRight("72f323d5359eabefc69836369e4cabc6257c43ab6419b05dfb2211d0e44284c6") + val two = data.get[String]("emailAddress2") must beRight("bob@acme.com") + val three = data.downField("nonExistentEmailAddress").focus must beNone + + one and two and three } size and validOut } + + def e9 = { + val orig = json""" + { + "schema" : "iglu:com.snowplowanalytics.snowplow/contexts/jsonschema/1-0-0", + "data" : [ + { + "schema" : "iglu:com.acme/email_sent/jsonschema/1-0-0", + "data" : { + "emailAddress" : "foo@bar.com", + "emailAddress2" : "bob@acme.com" + } + } + ] + } + """ + + val hashed = json""" + { + "schema" : "iglu:com.snowplowanalytics.snowplow/contexts/jsonschema/1-0-0", + "data" : [ + { + "schema" : "iglu:com.acme/email_sent/jsonschema/1-0-0", + "data" : { + "emailAddress" : "72f323d5359eabefc69836369e4cabc6257c43ab6419b05dfb2211d0e44284c6", + "emailAddress2" : "bob@acme.com", + "nonExistentEmailAddress" : {} + 
} + } + ] + } + """ + + val expected = json""" + { + "schema" : "iglu:com.snowplowanalytics.snowplow/contexts/jsonschema/1-0-0", + "data" : [ + { + "schema" : "iglu:com.acme/email_sent/jsonschema/1-0-0", + "data" : { + "emailAddress" : "72f323d5359eabefc69836369e4cabc6257c43ab6419b05dfb2211d0e44284c6", + "emailAddress2" : "bob@acme.com" + } + } + ] + } + """ + + PiiPseudonymizerEnrichment.removeAddedFields(hashed, orig) must beEqualTo(expected) + } }