@@ -0,0 +1,26 @@
"""Added context column into match table
Revision ID: f623e1057b00
Revises: 702d19cfa063
Create Date: 2024-11-13 15:14:14.618258
"""
from alembic import op
import sqlalchemy as sa


# revision identifiers, used by Alembic.
revision = "f623e1057b00"
down_revision = "702d19cfa063"
branch_labels = None
depends_on = None


def upgrade() -> None:
# ### commands auto generated by Alembic - please adjust! ###
op.add_column("match", sa.Column("context", sa.JSON(), nullable=False))
# ### end Alembic commands ###


def downgrade() -> None:
# ### commands auto generated by Alembic - please adjust! ###
op.drop_column("match", "context")
# ### end Alembic commands ###
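
Note: adding a NOT NULL column with no server default will fail on most backends if the match table already contains rows. A minimal backfill-friendly sketch of upgrade(), assuming an empty object is an acceptable default for pre-existing rows:

def upgrade() -> None:
    # Add the column as nullable so existing rows pass the constraint,
    # backfill them, then tighten the column to NOT NULL.
    op.add_column("match", sa.Column("context", sa.JSON(), nullable=True))
    op.execute("UPDATE match SET context = '{}' WHERE context IS NULL")
    op.alter_column("match", "context", existing_type=sa.JSON(), nullable=False)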
3 changes: 3 additions & 0 deletions src/models/match.py
@@ -22,3 +22,6 @@ class Match(SQLModel, table=True):
)
)
job: Job = Relationship(back_populates="matches")
context: Dict[str, Dict[str, Dict[str, str]]] = Field(
sa_column=Column(JSON, nullable=False)
)
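
For reference, the persisted context value is the three-level mapping produced by get_match_contexts in the tasks.py diff below: rule name → string identifier → base64-encoded byte windows. A hypothetical stored value (rule and identifier names are made up):

{
    "suspicious_strings": {
        "$cmd": {
            "before": "TVqQAAMAAAAEAAAA",  # bytes preceding the match, base64
            "matching": "Y21kLmV4ZQ==",    # the matched bytes, base64
            "after": "AAAAAAAAAAAA",       # bytes following the match, base64
        }
    }
}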
56 changes: 52 additions & 4 deletions src/tasks.py
@@ -1,4 +1,5 @@
from typing import List, Optional, cast
import base64
from typing import List, Optional, cast, Dict
import logging
from rq import get_current_job, Queue # type: ignore
from redis import Redis
@@ -68,7 +69,12 @@ def get_datasets(self) -> List[str]:
return list(result["result"]["datasets"].keys())

def update_metadata(
self, job: JobId, orig_name: str, path: str, matches: List[str]
self,
job: JobId,
orig_name: str,
path: str,
matches: List[str],
context: Dict[str, Dict[str, Dict[str, str]]],
) -> None:
"""Saves matches to the database, and runs appropriate metadata
plugins.
@@ -93,7 +99,9 @@ def update_metadata(
del metadata["path"]

# Update the database.
match = Match(file=orig_name, meta=metadata, matches=matches)
match = Match(
file=orig_name, meta=metadata, matches=matches, context=context
)
self.db.add_match(job, match)

def execute_yara(self, job: Job, files: List[str]) -> None:
@@ -108,10 +116,18 @@ def execute_yara(self, job: Job, files: List[str]) -> None:
path = self.plugins.filter(orig_name)
if not path:
continue

matches = rule.match(path)
if matches:
with open(path, "rb") as file:
data = file.read()

self.update_metadata(
job.id, orig_name, path, [r.rule for r in matches]
job.id,
orig_name,
path,
[r.rule for r in matches],
get_match_contexts(data, matches),
)
num_matches += 1
except yara.Error:
@@ -290,3 +306,35 @@ def run_yara_batch(job_id: JobId, iterator: str, batch_size: int) -> None:

agent.execute_yara(job, pop_result.files)
agent.add_tasks_in_progress(job, -1)


def get_match_contexts(
data: bytes, matches: List[yara.Match]
) -> Dict[str, Dict[str, Dict[str, str]]]:
context = {}
for yara_match in matches:
match_context = {}
for string_match in yara_match.strings:
first = string_match.instances[0]

(before, matching, after) = read_bytes_with_context(
data, first.offset, first.matched_length
)
match_context[string_match.identifier] = {
"before": base64.b64encode(before).decode("utf-8"),
"matching": base64.b64encode(matching).decode("utf-8"),
"after": base64.b64encode(after).decode("utf-8"),
}

context[yara_match.rule] = match_context
return context
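
Because the stored windows are base64-encoded, consumers have to decode them before display. A minimal sketch reusing the module's base64 and Dict imports (decode_context is a hypothetical helper, not part of this PR):

def decode_context(
    context: Dict[str, Dict[str, Dict[str, str]]]
) -> Dict[str, Dict[str, Dict[str, bytes]]]:
    # Invert the encoding done by get_match_contexts:
    # rule -> identifier -> {before, matching, after} as raw bytes.
    return {
        rule: {
            identifier: {k: base64.b64decode(v) for k, v in parts.items()}
            for identifier, parts in strings.items()
        }
        for rule, strings in context.items()
    }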


def read_bytes_with_context(
data: bytes, offset: int, length: int, context: int = 32
) -> tuple[bytes, bytes, bytes]:
"""Return `matched_length` bytes from `offset`, along with `byte_range` bytes before and after the match."""
before = data[max(0, offset - context) : offset]
matching = data[offset : offset + length]
after = data[offset + length : offset + length + context]
return before, matching, after
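
A quick worked example of the slicing, with a 4-byte window for brevity:

# Match of length 2 at offset 4 in a 10-byte buffer:
#   before   = data[0:4]  -> b"0123"
#   matching = data[4:6]  -> b"45"
#   after    = data[6:10] -> b"6789"
read_bytes_with_context(b"0123456789", offset=4, length=2, context=4)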