Skip to content

deidentifier

Classes:

Functions:

Deidentifier

Deidentifier()

Methods:

Source code in src/hiperhealth/skills/privacy/deidentifier.py
def __init__(self) -> None:
    """
    title: Initialize the Presidio Analyzer and Anonymizer engines.
    """
    self.analyzer = AnalyzerEngine()
    self.anonymizer = AnonymizerEngine()  # type: ignore[no-untyped-call]

add_custom_recognizer

add_custom_recognizer(
    entity_name: str,
    regex_pattern: str,
    score: float = 0.85,
    language: str = 'en',
) -> None

If a recognizer for the same entity_name already exists, this method replaces the old one to prevent duplicate definitions.

      Args:

entity_name: The name for the new entity (e.g., "CUSTOM_ID"). regex_pattern: The regex pattern to detect the entity. score: The confidence score for the detection (0.0 to 1.0). language: The language for the recognizer registry. parameters: entity_name: type: str description: Value for entity_name. regex_pattern: type: str description: Value for regex_pattern. score: type: float description: Value for score. language: type: str description: Value for language.

Source code in src/hiperhealth/skills/privacy/deidentifier.py
def add_custom_recognizer(
    self,
    entity_name: str,
    regex_pattern: str,
    score: float = 0.85,
    language: str = 'en',
) -> None:
    """
    title: Add a custom PII entity recognizer using a regular expression.
    summary: |-
      If a recognizer for the same entity_name already exists, this method
              replaces the old one to prevent duplicate definitions.

              Args:
      entity_name: The name for the new entity (e.g., "CUSTOM_ID").
                  regex_pattern: The regex pattern to detect the entity.
      score: The confidence score for the detection (0.0 to 1.0).
                  language: The language for the recognizer registry.
    parameters:
      entity_name:
        type: str
        description: Value for entity_name.
      regex_pattern:
        type: str
        description: Value for regex_pattern.
      score:
        type: float
        description: Value for score.
      language:
        type: str
        description: Value for language.
    """
    if not (0.0 <= score <= 1.0):
        raise ValueError('Score must be between 0.0 and 1.0.')

    # To prevent duplicates, remove any existing PatternRecognizer with the
    # same name. This is done by rebuilding the list of recognizers.
    existing_recognizers = self.analyzer.registry.get_recognizers(
        language=language, all_fields=True
    )

    recognizers_to_keep = []
    for rec in existing_recognizers:
        # Using type() ensures we only check generic PatternRecognizers and
        # not specialized subclasses like the built-in CreditCardRecognizer
        if type(rec) is not PatternRecognizer:
            recognizers_to_keep.append(rec)
            continue

        # Keep recognizers that don't match the name of the new one.
        if entity_name not in rec.supported_entities:
            recognizers_to_keep.append(rec)

    # Replace the registry's list with the filtered list.
    self.analyzer.registry.recognizers = recognizers_to_keep

    # Add the new recognizer to the updated list.
    custom_recognizer = PatternRecognizer(
        supported_entity=entity_name,
        patterns=[
            Pattern(name=entity_name, regex=regex_pattern, score=score)
        ],
    )
    self.analyzer.registry.add_recognizer(custom_recognizer)
    logger.info(f"Custom recognizer '{entity_name}' added successfully.")

analyze

analyze(
    text: str,
    entities: Optional[list[str]] = None,
    language: str = 'en',
) -> list[RecognizerResult]
Source code in src/hiperhealth/skills/privacy/deidentifier.py
def analyze(
    self,
    text: str,
    entities: Optional[list[str]] = None,
    language: str = 'en',
) -> list[RecognizerResult]:
    """
    title: Analyze text to detect and locate PII entities.
    parameters:
      text:
        type: str
        description: Value for text.
      entities:
        type: Optional[list[str]]
        description: Value for entities.
      language:
        type: str
        description: Value for language.
    returns:
      type: list[RecognizerResult]
      description: Return value.
    """
    return self.analyzer.analyze(
        text=text, entities=entities, language=language
    )

deidentify

deidentify(
    text: str, strategy: str = 'mask', language: str = 'en'
) -> str
Source code in src/hiperhealth/skills/privacy/deidentifier.py
def deidentify(
    self, text: str, strategy: str = 'mask', language: str = 'en'
) -> str:
    """
    title: Anonymize detected PII in the text using a specified strategy.
    parameters:
      text:
        type: str
        description: Value for text.
      strategy:
        type: str
        description: Value for strategy.
      language:
        type: str
        description: Value for language.
    returns:
      type: str
      description: Return value.
    """
    # First, ensure the provided strategy is supported.
    supported_strategies = ['mask', 'hash']
    if strategy not in supported_strategies:
        raise ValueError(
            f"Unsupported strategy: '{strategy}'. "
            f'Available options are: {", ".join(supported_strategies)}'
        )

    analyzer_results = self.analyze(text, language=language)

    if not analyzer_results:
        return text

    # The 'mask' strategy is handled manually to ensure the mask's length
    # dynamically matches the original PII token's length.
    if strategy == 'mask':
        sorted_results = sorted(
            analyzer_results, key=lambda x: x.end, reverse=True
        )
        anonymized_text = text
        for res in sorted_results:
            anonymized_text = (
                anonymized_text[: res.start]
                + '*' * (res.end - res.start)
                + anonymized_text[res.end :]
            )
        return anonymized_text

    # Other strategies are handled by the AnonymizerEngine.
    operators = {
        'hash': {
            'DEFAULT': OperatorConfig('hash', {'hash_type': 'sha256'})
        }
    }

    anonymized_result = self.anonymizer.anonymize(
        text=text,
        analyzer_results=analyzer_results,  # type: ignore[arg-type]
        operators=operators.get(strategy),
    )
    return anonymized_result.text

PrivacySkill

PrivacySkill()

Bases: BaseSkill

Methods:

Source code in src/hiperhealth/skills/privacy/deidentifier.py
def __init__(self) -> None:
    """
    title: Initialize the built-in privacy skill and deidentifier.
    """
    super().__init__(
        SkillMetadata(
            name='hiperhealth.privacy',
            version='0.4.0',
            stages=(Stage.SCREENING, Stage.INTAKE),
            description=(
                'De-identifies PII from patient data before processing.'
            ),
        )
    )
    self._deidentifier = Deidentifier()

check_requirements

check_requirements(
    stage: str, ctx: PipelineContext
) -> list[Inquiry]

Override to return a list of Inquiry objects describing what additional data the skill needs. The default implementation returns an empty list (no extra data needed). Inquiries use three priority levels: - required: must have before this stage can run - supplementary: improves results, available now - deferred: only available after a future pipeline step parameters: stage: type: str ctx: type: PipelineContext returns: type: list[Inquiry]

Source code in src/hiperhealth/pipeline/skill.py
def check_requirements(
    self, stage: str, ctx: PipelineContext
) -> list[Inquiry]:
    """
    title: Determine what information is needed before execution.
    summary: |-
      Override to return a list of Inquiry objects describing
      what additional data the skill needs.  The default
      implementation returns an empty list (no extra data needed).
      Inquiries use three priority levels:
      - required: must have before this stage can run
      - supplementary: improves results, available now
      - deferred: only available after a future pipeline step
    parameters:
      stage:
        type: str
      ctx:
        type: PipelineContext
    returns:
      type: list[Inquiry]
    """
    return []

execute

execute(
    stage: str, ctx: PipelineContext
) -> PipelineContext
Source code in src/hiperhealth/skills/privacy/deidentifier.py
def execute(self, stage: str, ctx: PipelineContext) -> PipelineContext:
    """
    title: De-identify PII in patient data.
    parameters:
      stage:
        type: str
        description: Value for stage.
      ctx:
        type: PipelineContext
        description: Value for ctx.
    returns:
      type: PipelineContext
      description: Return value.
    """
    if ctx.patient:
        ctx.patient = dict(
            deidentify_patient_record(
                dict(ctx.patient),
                self._deidentifier,
            )
        )
    return ctx

post

post(stage: str, ctx: PipelineContext) -> PipelineContext
Source code in src/hiperhealth/pipeline/skill.py
def post(self, stage: str, ctx: PipelineContext) -> PipelineContext:
    """
    title: Called after the stage's main execution.
    parameters:
      stage:
        type: str
      ctx:
        type: PipelineContext
    returns:
      type: PipelineContext
    """
    return ctx

pre

pre(stage: str, ctx: PipelineContext) -> PipelineContext
Source code in src/hiperhealth/pipeline/skill.py
def pre(self, stage: str, ctx: PipelineContext) -> PipelineContext:
    """
    title: Called before the stage's main execution.
    parameters:
      stage:
        type: str
      ctx:
        type: PipelineContext
    returns:
      type: PipelineContext
    """
    return ctx

deidentify_patient_record

deidentify_patient_record(
    record: dict[str, object],
    deidentifier: Deidentifier,
    keys_to_deidentify: frozenset[str] | None = None,
) -> dict[str, object]
Source code in src/hiperhealth/skills/privacy/deidentifier.py
def deidentify_patient_record(
    record: dict[str, object],
    deidentifier: Deidentifier,
    keys_to_deidentify: frozenset[str] | None = None,
) -> dict[str, object]:
    """
    title: Recursively find and de-identify string values in a patient record.
    summary: |-
      Args:
              record: The patient data dictionary.
              deidentifier: An instance of the Deidentifier class.
    parameters:
      record:
        type: dict[str, object]
        description: Value for record.
      deidentifier:
        type: Deidentifier
        description: Value for deidentifier.
      keys_to_deidentify:
        type: frozenset[str] | None
        description: Value for keys_to_deidentify.
    returns:
      type: dict[str, object]
      description: Return value.
    """
    effective_keys = (
        keys_to_deidentify
        if keys_to_deidentify is not None
        else _DEFAULT_KEYS_TO_DEIDENTIFY
    )

    for key, value in record.items():
        if isinstance(value, dict):
            deidentify_patient_record(value, deidentifier, effective_keys)
        elif isinstance(value, str) and key in effective_keys:
            record[key] = deidentifier.deidentify(value)

    return record