# Source code for crawler.dicom

""" A simple regex based parser for DICOM (DCMTK output to be more precise). """
import re
from typing import List, Dict, Tuple

# Attribute names under which parsed DICOM values are stored in a result
# dictionary. Each constant is used as a value of the TAGS lookup table,
# so these strings become the keys of the dictionaries returned by the parser.
PATIENT_NAME = "PatientName"
PATIENT_BIRTHDATE = "PatientBirthDate"
PATIENT_ID = "PatientID"
PATIENT_SEX = "PatientSex"
STUDY_DATE = "StudyDate"
SERIES_DATE = "SeriesDate"
SERIES_TIME = "SeriesTime"
MODALITY = "Modality"
BODY_PART_EXAMINED = "BodyPartExamined"
STUDY_DESCRIPTION = "StudyDescription"
SERIES_DESCRIPTION = "SeriesDescription"
ACCESSION_NUMBER = "AccessionNumber"
STUDY_ID = "StudyID"
SERIES_NUMBER = "SeriesNumber"
INSTANCE_NUMBER = "InstanceNumber"
REFERRING_PHYSICIAN_NAME = "ReferringPhysicianName"
INSTANCE_AVAILABILITY = "InstanceAvailability"
INSTITUTION_NAME = "InstitutionName"
STUDY_INSTANCE_UID = "StudyInstanceUID"
SERIES_INSTANCE_UID = "SeriesInstanceUID"
SPECIFIC_CHARACTER_SET = "SpecificCharacterSet"
QUERY_RETRIEVE_LEVEL = "QueryRetrieveLevel"
RETRIEVE_AE_TITLE = "RetrieveAETitle"
STATION_NAME = "StationName"
PROTOCOL_NAME = "ProtocolName"

# Lookup table from the textual tag "(gggg,eeee)" as it appears in a parsed
# line to the attribute name stored in the result dictionary.
# Keys are written with lowercase hexadecimal digits and are matched as exact
# strings, so a tag that is spelled differently or is not listed here is not
# resolvable through this table.
TAGS = {
    "(0010,0010)": PATIENT_NAME,
    "(0010,0030)": PATIENT_BIRTHDATE,
    "(0010,0020)": PATIENT_ID,
    "(0010,0040)": PATIENT_SEX,
    "(0008,0020)": STUDY_DATE,
    "(0008,0021)": SERIES_DATE,
    "(0008,0031)": SERIES_TIME,
    "(0008,0060)": MODALITY,
    "(0018,0015)": BODY_PART_EXAMINED,
    "(0008,1010)": STATION_NAME,
    "(0008,1030)": STUDY_DESCRIPTION,
    "(0008,103e)": SERIES_DESCRIPTION,
    "(0008,0050)": ACCESSION_NUMBER,
    "(0020,0010)": STUDY_ID,
    "(0020,0011)": SERIES_NUMBER,
    "(0020,0013)": INSTANCE_NUMBER,
    "(0008,0090)": REFERRING_PHYSICIAN_NAME,
    "(0008,0056)": INSTANCE_AVAILABILITY,
    "(0008,0080)": INSTITUTION_NAME,
    "(0018,1030)": PROTOCOL_NAME,
    "(0020,000d)": STUDY_INSTANCE_UID,
    "(0020,000e)": SERIES_INSTANCE_UID,
    "(0008,0005)": SPECIFIC_CHARACTER_SET,
    "(0008,0052)": QUERY_RETRIEVE_LEVEL,
    "(0008,0054)": RETRIEVE_AE_TITLE,
}

# Matches a line that consists solely of the "I:" prefix followed by optional
# whitespace.
# NOTE(review): no function visible in this module references this pattern
# (_is_start_or_end checks for a dashed-line prefix instead) - confirm whether
# it is still used elsewhere before removing it.
START_OR_END = re.compile(r"^I:\s*$")


def get_results(strings: List[str]) -> List[Dict[str, str]]:
    """
    Get list of results found.

    A single result is a dictionary where the keys are the resolved DICOM
    tag names and the values are the DICOM values.

    The lines are consumed in order. Every valid element line adds one entry
    to the result currently being collected. A separator line closes the
    current result, provided it holds at least one entry. Whatever is still
    being collected after the last line is closed as well. The first
    collected result is not part of the returned list.

    :param strings: list of strings
    :return: list of results (result is a dictionary)
    :raises KeyError: if a valid element line carries a tag that cannot be
        resolved to a name
    """
    results = []
    current = {}
    for line in strings:
        if _is_valid(line):
            current[_get_tag(line)] = _get_value(line)
        if _is_start_or_end(line) and current:
            results.append(current)
            # Start a fresh dictionary instead of copying and clearing.
            current = {}

    # The last result is not necessarily followed by a separator line, so
    # flush it here. Skip the flush when nothing was collected; otherwise an
    # input that ends on a separator would yield a spurious empty result.
    if current:
        results.append(current)

    # NOTE(review): the first collected block is presumably the echoed query
    # rather than a response - confirm against the producing tool's output.
    return results[1:]
def _is_start_or_end(line: str) -> bool: """ Returns True if it is the start or end of a DICOM header. """ return line.startswith("I: ---------------------------") or line.startswith( "W: ---------------------------" ) def _is_valid(line: str) -> bool: return ( line.startswith("I:") and "(" in line and ")" in line and "[" in line and "]" in line ) or ( line.startswith("W:") and "(" in line and ")" in line and "[" in line and "]" in line ) def _get_tag_value(line: str) -> Tuple[str, str]: return _get_tag(line), _get_value(line) def _get_tag(line: str) -> str: """ Returns the resolved tag value of the line. It first gets the content between the first round brackets and then makes a lookup to get the tag value. For example on this line I: (0010,0040) CS [F ] tag value would be (0010,0040) and resolved would be 'Modality'. """ return TAGS[line[3:14]] def _get_value(line: str) -> str: """ Returns the value of the line, which is everything between the first and last square bracket. :param line: a line of the Dicom file :return: value """ start = line.find("[") + 1 end = line.rfind("]") return line[start:end].strip(" \t\r\n\0")