mandiant · mr-tz · Jun 13, 2024 · Jun 12, 2024 · Jun 12, 2024 · Jun 12, 2024
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
@@ -37,7 +37,7 @@ jobs:
     - name: Install dependencies
       run: |
         pip install -r requirements.txt
-        pip install -e .[dev]
+        pip install -e .[dev,scripts]
     - name: Lint with ruff
       run: pre-commit run ruff
     - name: Lint with isort
@@ -65,7 +65,7 @@ jobs:
     - name: Install capa
       run: |
         pip install -r requirements.txt
-        pip install -e .[dev]
+        pip install -e .[dev,scripts]
     - name: Run rule linter
       run: python scripts/lint.py rules/
 
@@ -102,7 +102,7 @@ jobs:
     - name: Install capa
       run: |
         pip install -r requirements.txt
-        pip install -e .[dev]
+        pip install -e .[dev,scripts]
     - name: Run tests (fast)
       # this set of tests runs about 80% of the cases in 20% of the time,
       # and should catch most errors quickly.
@@ -139,7 +139,7 @@ jobs:
       if: ${{ env.BN_SERIAL != 0 }}
       run: |
         pip install -r requirements.txt
-        pip install -e .[dev]
+        pip install -e .[dev,scripts]
     - name: install Binary Ninja
       if: ${{ env.BN_SERIAL != 0 }}
       run: |
@@ -198,7 +198,7 @@ jobs:
     - name: Install capa
       run: |
         pip install -r requirements.txt
-        pip install -e .[dev] 
+        pip install -e .[dev,scripts]
     - name: Run tests
       run: | 
         mkdir ./.github/ghidra/project

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -13,6 +13,7 @@
 - optimize rule matching #2080 @williballenthin
 - add aarch64 as a valid architecture #2144 mehunhoff@google.com @williballenthin
 - relax dependency version requirements for the capa library #2053 @williballenthin
+- add scripts dependency group and update documentation #2145 @mr-tz
 
 ### Breaking Changes
 

diff --git a/doc/installation.md b/doc/installation.md
@@ -107,7 +107,8 @@ We use the following tools to ensure consistent code style and formatting:
 
 To install these development dependencies, run:
 
-`$ pip install -e /local/path/to/src[dev]`
+- `$ pip install -e /local/path/to/src[dev]` or
+- `$ pip install -e /local/path/to/src[dev,scripts]` to also install all script dependencies
 
 We use [pre-commit](https://pre-commit.com/) so that it's trivial to run the same linters & configuration locally as in CI.
 

diff --git a/pyproject.toml b/pyproject.toml
@@ -138,9 +138,6 @@ dev = [
     "black==24.4.2",
     "isort==5.13.2",
     "mypy==1.10.0",
-    "psutil==5.9.2",
-    "stix2==3.0.1",
-    "requests==2.31.0",
     "mypy-protobuf==3.6.0",
     # type stubs for mypy
     "types-backports==0.1.3",
@@ -162,6 +159,13 @@ build = [
     "setuptools==70.0.0",
     "build==1.2.1"
 ]
+scripts = [
+    "jschema_to_python==1.2.3",
+    "psutil==5.9.2",
+    "stix2==3.0.1",
+    "sarif_om==1.0.4",
+    "requests==2.31.0",
+]
 
 [tool.deptry]
 extend_exclude = [

diff --git a/scripts/capa2sarif.py b/scripts/capa2sarif.py
@@ -72,9 +72,7 @@ def _parse_args() -> argparse.Namespace:
         help="Compatibility for Radare r2sarif plugin v2.0",
     )
     parser.add_argument("-t", "--tag", help="Filter on rule meta field values (ruleid)")
-    parser.add_argument(
-        "--version", action="version", version=f"%(prog)s {__version__}"
-    )
+    parser.add_argument("--version", action="version", version=f"%(prog)s {__version__}")
 
     return parser.parse_args()
 
@@ -89,24 +87,18 @@ def main() -> int:
         with Path(args.capa_output).open() as capa_output:
             json_data = json.load(capa_output)
     except ValueError:
-        logger.error(
-            "Input data was not valid JSON, input should be a capa json output file."
-        )
+        logger.error("Input data was not valid JSON, input should be a capa json output file.")
         return -1
     except json.JSONDecodeError:
         # An exception has occurred
-        logger.error(
-            "Input data was not valid JSON, input should be a capa json output file."
-        )
+        logger.error("Input data was not valid JSON, input should be a capa json output file.")
         return -2
 
     # Marshall json into Sarif
     # Create baseline sarif structure to be populated from json data
-    sarif_structure: Optional[dict] = _sarif_boilerplate(
-        json_data["meta"], json_data["rules"]
-    )
+    sarif_structure: Optional[dict] = _sarif_boilerplate(json_data["meta"], json_data["rules"])
     if sarif_structure is None:
-        logger.errort("An Error has occured creating default sarif structure.")
+        logger.error("An Error has occured creating default sarif structure.")
         return -3
 
     _populate_artifact(sarif_structure, json_data["meta"])
@@ -120,9 +112,7 @@ def main() -> int:
 
         # artifacts must include a description as well with a text field.
         if "artifacts" in sarif_structure["runs"][0]:
-            sarif_structure["runs"][0]["artifacts"][0]["description"] = {
-                "text": "placeholder"
-            }
+            sarif_structure["runs"][0]["artifacts"][0]["description"] = {"text": "placeholder"}
 
         # For better compliance with Ghidra table. Iteration through properties['additionalProperties']
         """
@@ -170,13 +160,9 @@ def _sarif_boilerplate(data_meta: dict, data_rules: dict) -> Optional[dict]:
                 "id": id,
                 "name": data_rules[key]["meta"]["name"],
                 "shortDescription": {"text": data_rules[key]["meta"]["name"]},
-                "messageStrings": {
-                    "default": {"text": data_rules[key]["meta"]["name"]}
-                },
+                "messageStrings": {"default": {"text": data_rules[key]["meta"]["name"]}},
                 "properties": {
-                    "namespace": data_rules[key]["meta"]["namespace"]
-                    if "namespace" in data_rules[key]["meta"]
-                    else [],
+                    "namespace": data_rules[key]["meta"]["namespace"] if "namespace" in data_rules[key]["meta"] else [],
                     "scopes": data_rules[key]["meta"]["scopes"],
                     "references": data_rules[key]["meta"]["references"],
                     "lib": data_rules[key]["meta"]["lib"],
@@ -258,39 +244,36 @@ def _populate_invocations(sarif_log: dict, meta_data: dict) -> None:
 
 def _enumerate_evidence(node: dict, related_count: int) -> List[dict]:
     related_locations = []
-    if node.get("success") and node.get("node").get("type") != "statement":
+    if node.get("success") and node.get("node", {}).get("type") != "statement":
         label = ""
-        if node.get("node").get("type") == "feature":
-            if node.get("node").get("feature").get("type") == "api":
-                label = "api: " + node.get("node").get("feature").get("api")
-            elif node.get("node").get("feature").get("type") == "match":
-                label = "match: " + node.get("node").get("feature").get("match")
-            elif node.get("node").get("feature").get("type") == "number":
-                label = f"number: {node.get('node').get('feature').get('description')} ({node.get('node').get('feature').get('number')})"
-            elif node.get("node").get("feature").get("type") == "offset":
-                label = f"offset: {node.get('node').get('feature').get('description')} ({node.get('node').get('feature').get('offset')})"
-            elif node.get("node").get("feature").get("type") == "mnemonic":
-                label = f"mnemonic: {node.get('node').get('feature').get('mnemonic')}"
-            elif node.get("node").get("feature").get("type") == "characteristic":
-                label = f"characteristic: {node.get('node').get('feature').get('characteristic')}"
-            elif node.get("node").get("feature").get("type") == "os":
-                label = f"os: {node.get('node').get('feature').get('os')}"
-            elif node.get("node").get("feature").get("type") == "operand number":
-                label = f"operand: ({node.get('node').get('feature').get('index')} ) {node.get('node').get('feature').get('description')} ({node.get('node').get('feature').get('operand_number')})"
+        if node.get("node", {}).get("type") == "feature":
+            if node.get("node", {}).get("feature", {}).get("type") == "api":
+                label = "api: " + node.get("node", {}).get("feature", {}).get("api")
+            elif node.get("node", {}).get("feature", {}).get("type") == "match":
+                label = "match: " + node.get("node", {}).get("feature", {}).get("match")
+            elif node.get("node", {}).get("feature", {}).get("type") == "number":
+                label = f"number: {node.get('node', {}).get('feature', {}).get('description')} ({node.get('node', {}).get('feature', {}).get('number')})"
+            elif node.get("node", {}).get("feature", {}).get("type") == "offset":
+                label = f"offset: {node.get('node', {}).get('feature', {}).get('description')} ({node.get('node', {}).get('feature', {}).get('offset')})"
+            elif node.get("node", {}).get("feature", {}).get("type") == "mnemonic":
+                label = f"mnemonic: {node.get('node', {}).get('feature', {}).get('mnemonic')}"
+            elif node.get("node", {}).get("feature", {}).get("type") == "characteristic":
+                label = f"characteristic: {node.get('node', {}).get('feature', {}).get('characteristic')}"
+            elif node.get("node", {}).get("feature", {}).get("type") == "os":
+                label = f"os: {node.get('node', {}).get('feature', {}).get('os')}"
+            elif node.get("node", {}).get("feature", {}).get("type") == "operand number":
+                label = f"operand: ({node.get('node', {}).get('feature', {}).get('index')} ) {node.get('node', {}).get('feature', {}).get('description')} ({node.get('node', {}).get('feature', {}).get('operand_number')})"
             else:
                 logger.error(
                     "Not implemented %s",
-                    node.get("node").get("feature").get("type"),
-                    file=sys.stderr,
+                    node.get("node", {}).get("feature", {}).get("type"),
                 )
                 return []
         else:
-            logger.error(
-                "Not implemented %s", node.get("node").get("type"), file=sys.stderr
-            )
+            logger.error("Not implemented %s", node.get("node", {}).get("type"))
             return []
 
-        for loc in node.get("locations"):
+        for loc in node.get("locations", []):
             if loc["type"] != "absolute":
                 continue
 
@@ -303,8 +286,8 @@ def _enumerate_evidence(node: dict, related_count: int) -> List[dict]:
             )
             related_count += 1
 
-    if node.get("success") and node.get("node").get("type") == "statement":
-        for child in node.get("children"):
+    if node.get("success") and node.get("node", {}).get("type") == "statement":
+        for child in node.get("children", []):
             related_locations += _enumerate_evidence(child, related_count)
 
     return related_locations

diff --git a/tests/test_scripts.py b/tests/test_scripts.py
@@ -40,7 +40,10 @@ def get_rule_path():
     [
         pytest.param("capa2yara.py", [get_rules_path()]),
         pytest.param("capafmt.py", [get_rule_path()]),
-        pytest.param("capa2sarif.py", [Path(__file__).resolve().parent / "data" / "rd" / "Practical Malware Analysis Lab 01-01.dll_.json"]),
+        pytest.param(
+            "capa2sarif.py",
+            [Path(__file__).resolve().parent / "data" / "rd" / "Practical Malware Analysis Lab 01-01.dll_.json"],
+        ),
         # testing some variations of linter script
         pytest.param("lint.py", ["-t", "create directory", get_rules_path()]),
         # `create directory` rule has native and .NET example PEs