Skip to content

Commit

Permalink
Common: fix PII enrichment adding empty objects instead of missing pr…
Browse files Browse the repository at this point in the history
…operties (close #351)
  • Loading branch information
benjben committed Sep 21, 2020
1 parent ca8b3b7 commit b7cd91c
Show file tree
Hide file tree
Showing 2 changed files with 91 additions and 24 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,37 @@ object PiiPseudonymizerEnrichment extends ParseableEnrichment {
.get(fieldName)
.map(_.asRight)
.getOrElse(s"The specified json field $fieldName is not supported".asLeft)

/** Helper to remove fields that were wrongly added and are not in the original JSON. See #351. */
private[pii] def removeAddedFields(hashed: Json, original: Json): Json =
hashed.asObject.map(_.toMap) match {
case Some(hashed_fields) => // hashed is JSON object
original.asObject.map(_.toMap) match {
case Some(orig_fields) =>
val newMap =
hashed_fields
.collect {
case (k, v) if orig_fields.isDefinedAt(k) =>
(k, removeAddedFields(v, orig_fields.get(k).get))
}
Json.fromFields(newMap)
case None =>
hashed // should never happen. Would mean change of type of one field
}
case None =>
hashed.asArray match {
case Some(hashed_arr) => // hashed is array (can contain JSON objects)
original.asArray match {
case Some(orig_arr) =>
// we can use zip because there should never be new fields in an array, only in an object
val values = hashed_arr.zip(orig_arr).map { case (hashed, orig) => removeAddedFields(hashed, orig) }
Json.fromValues(values)
case None =>
Json.fromValues(hashed_arr) // should never happen. Would mean change of type of one field
}
case None => hashed // hashed is neither JSON object nor array
}
}
}

/**
Expand Down Expand Up @@ -204,7 +235,8 @@ final case class PiiJson(
)
}
.getOrElse((parsed, List.empty[JsonModifiedField]))
} yield (substituted.noSpaces, modifiedFields.toList)).getOrElse((null, List.empty))
} yield (PiiPseudonymizerEnrichment.removeAddedFields(substituted, parsed).noSpaces, modifiedFields.toList))
.getOrElse((null, List.empty))

/** Map context top fields with strategy if they match. */
private def mapContextTopFields(tuple: (String, Json), strategy: PiiStrategy): (String, (Json, List[JsonModifiedField])) =
Expand Down Expand Up @@ -272,10 +304,6 @@ final case class PiiJson(
jsonPath,
new ScrambleMapFunction(strategy, modifiedFields, fieldMutator.fieldName, jsonPath, schema)
)
// make sure it is a structure preserving method, see #3636
//val transformedJValue = JsonMethods.fromJsonNode(documentContext.json[JsonNode]())
//val Diff(_, erroneouslyAdded, _) = jValue diff transformedJValue
//val Diff(_, withoutCruft, _) = erroneouslyAdded diff transformedJValue
(jacksonToCirce(documentContext2.json[JsonNode]()), modifiedFields.toList)
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ class PiiPseudonymizerEnrichmentSpec extends Specification with ValidatedMatcher
Hashing configured JSON fields in POJO should silently ignore unsupported types $e6
Hashing configured JSON and scalar fields in POJO emits a correct pii_transformation event $e7
Hashing configured JSON fields in POJO should not create new fields $e8
removeAddedFields should remove fields added by PII enrichment $e9
"""

def commonSetup(enrichmentReg: EnrichmentRegistry[Id]): List[Validated[BadRow, EnrichedEvent]] = {
Expand Down Expand Up @@ -740,30 +741,68 @@ class PiiPseudonymizerEnrichmentSpec extends Specification with ValidatedMatcher
).some
)
val output = commonSetup(enrichmentReg)
val expected = new EnrichedEvent()
expected.app_id = "ads"
expected.user_id = "john@acme.com"
expected.user_ipaddress = "70.46.123.145"
expected.ip_domain = null
expected.user_fingerprint = "its_you_again!"
expected.geo_city = "Delray Beach"
expected.etl_tstamp = "1970-01-18 08:40:00.000"
expected.collector_tstamp = "2017-07-14 03:39:39.000"
val size = output.size must_== 1
val validOut = output.head must beValid.like {
case enrichedEvent =>
val contextJ = parse(enrichedEvent.contexts).toOption.get.hcursor.downField("data")
val firstElem = contextJ.downArray.downField("data")
val secondElem = contextJ.downArray.right.downField("data")
val context = parse(enrichedEvent.contexts).toOption.get.hcursor.downField("data").downArray
val data = context.downField("data")

(firstElem.get[String]("emailAddress") must beRight(
"72f323d5359eabefc69836369e4cabc6257c43ab6419b05dfb2211d0e44284c6"
)) and
(firstElem.downField("data").get[String]("nonExistentEmailAddress") must beLeft) and
(firstElem.get[String]("emailAddress2") must beRight("bob@acme.com")) and
(secondElem.get[String]("emailAddress") must beRight("tim@acme.com")) and
(secondElem.get[String]("emailAddress2") must beRight("tom@acme.com"))
val one = data.get[String]("emailAddress") must beRight("72f323d5359eabefc69836369e4cabc6257c43ab6419b05dfb2211d0e44284c6")
val two = data.get[String]("emailAddress2") must beRight("bob@acme.com")
val three = data.downField("nonExistentEmailAddress").focus must beNone

one and two and three
}
size and validOut
}

def e9 = {
val orig = json"""
{
"schema" : "iglu:com.snowplowanalytics.snowplow/contexts/jsonschema/1-0-0",
"data" : [
{
"schema" : "iglu:com.acme/email_sent/jsonschema/1-0-0",
"data" : {
"emailAddress" : "foo@bar.com",
"emailAddress2" : "bob@acme.com"
}
}
]
}
"""

val hashed = json"""
{
"schema" : "iglu:com.snowplowanalytics.snowplow/contexts/jsonschema/1-0-0",
"data" : [
{
"schema" : "iglu:com.acme/email_sent/jsonschema/1-0-0",
"data" : {
"emailAddress" : "72f323d5359eabefc69836369e4cabc6257c43ab6419b05dfb2211d0e44284c6",
"emailAddress2" : "bob@acme.com",
"nonExistentEmailAddress" : {}
}
}
]
}
"""

val expected = json"""
{
"schema" : "iglu:com.snowplowanalytics.snowplow/contexts/jsonschema/1-0-0",
"data" : [
{
"schema" : "iglu:com.acme/email_sent/jsonschema/1-0-0",
"data" : {
"emailAddress" : "72f323d5359eabefc69836369e4cabc6257c43ab6419b05dfb2211d0e44284c6",
"emailAddress2" : "bob@acme.com"
}
}
]
}
"""

PiiPseudonymizerEnrichment.removeAddedFields(hashed, orig) must beEqualTo(expected)
}
}

0 comments on commit b7cd91c

Please sign in to comment.