# app/services.py
import requests
import os
import tarfile
import json
import re
import logging
from flask import current_app
from collections import defaultdict
from pathlib import Path
# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# Constants
FHIR_REGISTRY_BASE_URL = "https://packages.fhir.org"
DOWNLOAD_DIR_NAME = "fhir_packages"
CANONICAL_PACKAGE = ("hl7.fhir.r4.core", "4.0.1") # Define the canonical FHIR package
# --- Helper Functions ---
def _get_download_dir():
"""Gets the absolute path to the download directory, creating it if needed."""
instance_path = None
try:
# Try to get instance_path from Flask app context if available
instance_path = current_app.instance_path
logger.debug(f"Using instance path from current_app: {instance_path}")
except RuntimeError:
# Fallback if no app context (e.g., running script directly)
logger.warning("No app context for instance_path, constructing relative path.")
# Assume services.py is in /app, instance folder sibling to /app
instance_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'instance'))
logger.debug(f"Constructed instance path: {instance_path}")
if not instance_path:
logger.error("Fatal Error: Could not determine instance path.")
return None
download_dir = os.path.join(instance_path, DOWNLOAD_DIR_NAME)
try:
os.makedirs(download_dir, exist_ok=True)
# Add check for flask config path
if 'FHIR_PACKAGES_DIR' not in current_app.config:
current_app.config['FHIR_PACKAGES_DIR'] = download_dir
logger.info(f"Set current_app.config['FHIR_PACKAGES_DIR'] to {download_dir}")
return download_dir
except OSError as e:
logger.error(f"Fatal Error creating dir {download_dir}: {e}", exc_info=True)
return None
except RuntimeError: # Catch if current_app doesn't exist here either
logger.warning("No app context available to set FHIR_PACKAGES_DIR config.")
# Still attempt to create and return the path for non-Flask use cases
try:
os.makedirs(download_dir, exist_ok=True)
return download_dir
except OSError as e:
logger.error(f"Fatal Error creating dir {download_dir}: {e}", exc_info=True)
return None
def sanitize_filename_part(text):
"""Basic sanitization for name/version parts of filename."""
safe_text = "".join(c if c.isalnum() or c in ['.', '-'] else '_' for c in text)
safe_text = re.sub(r'_+', '_', safe_text)
safe_text = safe_text.strip('_-.')
return safe_text if safe_text else "invalid_name"
def _construct_tgz_filename(name, version):
"""Constructs the standard filename using the sanitized parts."""
return f"{sanitize_filename_part(name)}-{sanitize_filename_part(version)}.tgz"
def find_and_extract_sd(tgz_path, resource_identifier):
"""Helper to find and extract SD json from a given tgz path by ID, Name, or Type."""
sd_data = None
found_path = None
if not tgz_path or not os.path.exists(tgz_path):
logger.error(f"File not found in find_and_extract_sd: {tgz_path}")
return None, None
try:
with tarfile.open(tgz_path, "r:gz") as tar:
logger.debug(f"Searching for SD matching '{resource_identifier}' in {os.path.basename(tgz_path)}")
for member in tar:
if not (member.isfile() and member.name.startswith('package/') and member.name.lower().endswith('.json')):
continue
if os.path.basename(member.name).lower() in ['package.json', '.index.json', 'validation-summary.json', 'validation-oo.json']:
continue
fileobj = None
try:
fileobj = tar.extractfile(member)
if fileobj:
content_bytes = fileobj.read()
# Handle potential BOM (Byte Order Mark)
content_string = content_bytes.decode('utf-8-sig')
data = json.loads(content_string)
if isinstance(data, dict) and data.get('resourceType') == 'StructureDefinition':
sd_id = data.get('id')
sd_name = data.get('name')
sd_type = data.get('type') # The type the SD describes (e.g., Patient)
# Match if requested identifier matches ID, Name, or the Base Type the SD describes
# Case-insensitive matching might be safer for identifiers
if resource_identifier and (resource_identifier.lower() == str(sd_type).lower() or
resource_identifier.lower() == str(sd_id).lower() or
resource_identifier.lower() == str(sd_name).lower()):
sd_data = data
found_path = member.name
logger.info(f"Found matching SD for '{resource_identifier}' at path: {found_path} (Matched on Type/ID/Name)")
break # Stop searching once found
except json.JSONDecodeError as e:
logger.warning(f"Could not parse JSON in {member.name}: {e}")
except UnicodeDecodeError as e:
logger.warning(f"Could not decode UTF-8 in {member.name}: {e}")
except tarfile.TarError as e:
logger.warning(f"Tar error reading member {member.name}: {e}")
# Potentially break or continue depending on severity preference
except Exception as e:
logger.warning(f"Could not read/parse potential SD {member.name}: {e}")
finally:
if fileobj:
fileobj.close()
if sd_data is None:
logger.info(f"SD matching '{resource_identifier}' not found within archive {os.path.basename(tgz_path)} - caller may attempt fallback")
except tarfile.ReadError as e:
logger.error(f"Tar ReadError (possibly corrupted file) reading {tgz_path}: {e}")
# Decide if this should raise or return None
return None, None # Or raise custom error
except tarfile.TarError as e:
logger.error(f"TarError reading {tgz_path} in find_and_extract_sd: {e}")
raise tarfile.TarError(f"Error reading package archive: {e}") from e
except FileNotFoundError as e:
logger.error(f"FileNotFoundError reading {tgz_path} in find_and_extract_sd: {e}")
raise
except Exception as e:
logger.error(f"Unexpected error in find_and_extract_sd for {tgz_path}: {e}", exc_info=True)
raise
return sd_data, found_path
def save_package_metadata(name, version, dependency_mode, dependencies, complies_with_profiles=None, imposed_profiles=None):
"""Saves the dependency mode, imported dependencies, and profile relationships as metadata alongside the package."""
download_dir = _get_download_dir()
if not download_dir:
logger.error("Could not get download directory for metadata saving.")
return False
metadata = {
'package_name': name,
'version': version,
'dependency_mode': dependency_mode,
'imported_dependencies': dependencies,
'complies_with_profiles': complies_with_profiles or [],
'imposed_profiles': imposed_profiles or []
}
metadata_filename = f"{sanitize_filename_part(name)}-{sanitize_filename_part(version)}.metadata.json"
metadata_path = os.path.join(download_dir, metadata_filename)
try:
with open(metadata_path, 'w', encoding='utf-8') as f: # Specify encoding
json.dump(metadata, f, indent=2)
logger.info(f"Saved metadata for {name}#{version} at {metadata_path}")
return True
except Exception as e:
logger.error(f"Failed to save metadata for {name}#{version}: {e}")
return False
def get_package_metadata(name, version):
"""Retrieves the metadata for a given package."""
download_dir = _get_download_dir()
if not download_dir:
logger.error("Could not get download directory for metadata retrieval.")
return None
metadata_filename = f"{sanitize_filename_part(name)}-{sanitize_filename_part(version)}.metadata.json"
metadata_path = os.path.join(download_dir, metadata_filename)
if os.path.exists(metadata_path):
try:
with open(metadata_path, 'r', encoding='utf-8') as f: # Specify encoding
return json.load(f)
except Exception as e:
logger.error(f"Failed to read metadata for {name}#{version}: {e}")
return None
return None
# --- New navigate_fhir_path ---
def navigate_fhir_path(resource, path):
"""Navigate a FHIR resource path, handling arrays, nested structures, and choice types."""
keys = path.split('.')
# Remove the root resource type if present (e.g., Patient.name -> name)
if keys and resource and isinstance(resource, dict) and keys[0] == resource.get('resourceType'):
keys = keys[1:]
current = resource
for i, key in enumerate(keys):
is_last_key = (i == len(keys) - 1)
# logger.debug(f"Navigating: key='{key}', is_last={is_last_key}, current_type={type(current)}") # Uncomment for debug
if current is None:
# logger.debug(f"Navigation stopped, current became None before processing key '{key}'.")
return None
if isinstance(current, dict):
# Handle direct key access
if key in current:
current = current.get(key) # Use .get() for safety
# Handle choice type e.g., value[x]
elif '[x]' in key:
base_key = key.replace('[x]', '')
found_choice = False
for k, v in current.items():
if k.startswith(base_key):
current = v
found_choice = True
break
if not found_choice:
# logger.debug(f"Choice key '{key}' (base: {base_key}) not found in dict keys: {list(current.keys())}")
return None
else:
# logger.debug(f"Key '{key}' not found in dict keys: {list(current.keys())}")
return None
        elif isinstance(current, list):
            # A list here means the *previous* key resolved to a repeating element, so the
            # current key (and any keys after it) still has to be applied to each list item.
            # Full FHIRPath list semantics are more elaborate; for validation purposes we only
            # need the values that can be found in any of the items.
            results_from_list = []
            remaining_path = '.'.join(keys[i:])  # Rest of the path, including the current key
            # logger.debug(f"List encountered for key '{key}'. Searching elements for remaining path: '{remaining_path}'")
            for item in current:
                # Recursively navigate into each item using the remaining path
                sub_result = navigate_fhir_path(item, remaining_path)
                if sub_result is not None:
                    # Collect every non-None result so callers can check cardinality or values
                    if isinstance(sub_result, list):
                        results_from_list.extend(sub_result)
                    else:
                        results_from_list.append(sub_result)
            if not results_from_list:
                # logger.debug(f"Remaining path '{remaining_path}' not found in any list items.")
                return None  # Path not found in any list element
            # Return the list of found values/sub-structures; the validator can check that it is
            # non-empty and, for type-specific checks, inspect its contents.
            # logger.debug(f"Found results in list for '{remaining_path}': {results_from_list}")
            return results_from_list
else:
# Current is not a dict or list, cannot navigate further
# logger.debug(f"Cannot navigate further, current is not dict/list (key='{key}').")
return None
# logger.debug(f"Final result for path '{path}': {current}")
return current
# --- End New navigate_fhir_path ---
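# Illustrative sketch of navigate_fhir_path behaviour (hypothetical resource):
#   patient = {"resourceType": "Patient", "birthDate": "1970-01-01",
#              "name": [{"family": "Doe", "given": ["Jane"]}]}
#   navigate_fhir_path(patient, "Patient.birthDate")   -> "1970-01-01"
#   navigate_fhir_path(patient, "Patient.name")        -> the list of name elements
#   navigate_fhir_path(patient, "Patient.name.given")  -> ["Jane"] (values gathered across the repeating name elements)
#   navigate_fhir_path(patient, "Patient.gender")      -> None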
def validate_resource_against_profile(package_name, version, resource, include_dependencies=True):
"""Validate a single FHIR resource against a package's StructureDefinitions."""
logger.debug(f"Starting validation for resource: {resource.get('resourceType')}/{resource.get('id')} against {package_name}#{version}")
try:
# Find the resource's type
resource_type = resource.get('resourceType')
if not resource_type:
return {'valid': False, 'errors': ['Resource is missing resourceType.'], 'warnings': []}
# Get StructureDefinition
# Ensure download dir is fetched and config potentially set
download_dir = _get_download_dir()
if not download_dir:
return {'valid': False, 'errors': ['Could not determine FHIR package directory.'], 'warnings': []}
# Construct path using helper for consistency
tgz_filename = _construct_tgz_filename(package_name, version)
# Use absolute path from download_dir
tgz_path = os.path.join(download_dir, tgz_filename)
logger.debug(f"Attempting to load SD for type '{resource_type}' from tgz: {tgz_path}")
sd_data, sd_path_in_tar = find_and_extract_sd(tgz_path, resource_type)
if not sd_data:
logger.error(f"No StructureDefinition found for type '{resource_type}' in package {package_name}#{version} at {tgz_path}")
# Try falling back to canonical package if not the one requested? Maybe not here.
return {'valid': False, 'errors': [f"StructureDefinition for resource type '{resource_type}' not found in package {package_name}#{version}."], 'warnings': []}
logger.debug(f"Found SD for '{resource_type}' in tar at '{sd_path_in_tar}'")
# Prefer snapshot if available, otherwise use differential
elements = sd_data.get('snapshot', {}).get('element', [])
if not elements:
elements = sd_data.get('differential', {}).get('element', [])
logger.debug("Using differential elements for validation (snapshot missing).")
if not elements:
logger.error(f"StructureDefinition {sd_data.get('id', resource_type)} has no snapshot or differential elements.")
return {'valid': False, 'errors': [f"StructureDefinition '{sd_data.get('id', resource_type)}' is invalid (no elements)."], 'warnings': []}
must_support_paths = []
for element in elements:
if element.get('mustSupport', False):
path = element.get('path', '')
if path:
must_support_paths.append(path)
errors = []
warnings = []
# --- Revised Required Field Validation (min >= 1) ---
logger.debug(f"Checking required fields for {resource_type} based on SD {sd_data.get('id')}...")
element_definitions = {e.get('path'): e for e in elements if e.get('path')} # Cache elements by path
for element in elements:
path = element.get('path', '')
min_val = element.get('min', 0)
# Skip base element (e.g., "Patient") as it's always present if resourceType matches
if '.' not in path:
continue
if min_val >= 1:
logger.debug(f"Checking required path: {path} (min={min_val})")
# --- START: Parent Presence Check ---
parent_path = '.'.join(path.split('.')[:-1])
parent_is_present_or_not_applicable = True # Assume true unless parent is optional AND absent
# Check only if parent_path is a valid element path (not just the root type)
if '.' in parent_path:
parent_element_def = element_definitions.get(parent_path)
if parent_element_def:
parent_min_val = parent_element_def.get('min', 0)
# If the parent element itself is optional (min: 0)...
if parent_min_val == 0:
# ...check if the parent element actually exists in the instance data
parent_value = navigate_fhir_path(resource, parent_path)
if parent_value is None or (isinstance(parent_value, (list, str, dict)) and not parent_value):
# Optional parent is missing, so child cannot be required. Skip the check for this element.
parent_is_present_or_not_applicable = False
logger.debug(f"-> Requirement check for '{path}' skipped: Optional parent '{parent_path}' is absent.")
else:
# This case indicates an issue with the SD structure or path generation, but we'll be lenient
logger.warning(f"Could not find definition for parent path '{parent_path}' while checking requirement for '{path}'. Proceeding with check.")
# --- END: Parent Presence Check ---
# Only proceed with checking the element itself if its optional parent is present,
# or if the parent is required, or if it's a top-level element.
if parent_is_present_or_not_applicable:
value = navigate_fhir_path(resource, path)
# 1. Check for presence (is it None or an empty container?)
is_missing_or_empty = False
if value is None:
is_missing_or_empty = True
logger.debug(f"-> Path '{path}' value is None.")
elif isinstance(value, (list, str, dict)) and not value:
is_missing_or_empty = True
logger.debug(f"-> Path '{path}' value is an empty {type(value).__name__}.")
elif isinstance(value, bool) and value is False: pass # Valid presence
elif isinstance(value, (int, float)) and value == 0: pass # Valid presence
if is_missing_or_empty:
# Log the error only if the parent context allowed the check
errors.append(f"Required field '{path}' is missing or empty.")
logger.warning(f"Validation Error: Required field '{path}' missing or empty (Context: Parent '{parent_path}' required or present).")
continue # Skip further checks for this element if missing
# 2. Check specific FHIR types if present (value is not None/empty)
# (This part of the logic remains the same as before)
element_types = element.get('type', [])
type_codes = {t.get('code') for t in element_types if t.get('code')}
is_codeable_concept = 'CodeableConcept' in type_codes
is_reference = 'Reference' in type_codes
is_coding = 'Coding' in type_codes
if is_codeable_concept and isinstance(value, dict):
codings = value.get('coding')
if not value.get('text'):
if not isinstance(codings, list) or not any(isinstance(c, dict) and c.get('code') and c.get('system') for c in codings):
errors.append(f"Required CodeableConcept '{path}' lacks text or a valid coding (must include system and code).")
logger.warning(f"Validation Error: Required CC '{path}' invalid structure.")
elif is_coding and isinstance(value, dict):
if not value.get('code') or not value.get('system'):
errors.append(f"Required Coding '{path}' lacks a system or code.")
logger.warning(f"Validation Error: Required Coding '{path}' invalid structure.")
elif is_reference and isinstance(value, dict):
if not value.get('reference') and not value.get('identifier'):
errors.append(f"Required Reference '{path}' lacks a reference or identifier.")
logger.warning(f"Validation Error: Required Reference '{path}' invalid structure.")
# --- Revised Must-Support Field Validation ---
logger.debug(f"Checking must-support fields for {resource_type}...")
unique_must_support_paths = sorted(list(set(must_support_paths))) # Avoid duplicate checks if in both snapshot/diff
for path in unique_must_support_paths:
# Skip base element
if '.' not in path:
continue
logger.debug(f"Checking must-support path: {path}")
value = navigate_fhir_path(resource, path)
# 1. Check for presence
is_missing_or_empty = False
if value is None:
is_missing_or_empty = True
logger.debug(f"-> Path '{path}' value is None.")
elif isinstance(value, (list, str, dict)) and not value:
is_missing_or_empty = True
logger.debug(f"-> Path '{path}' value is an empty {type(value).__name__}.")
elif isinstance(value, bool) and value is False:
pass
elif isinstance(value, (int, float)) and value == 0:
pass
if is_missing_or_empty:
warnings.append(f"Must-support field '{path}' is missing or empty.")
logger.info(f"Validation Warning: Must-support field '{path}' missing or empty.") # Use INFO for MS warnings
continue
# 2. Check specific FHIR types (similar logic to required checks)
element_def = next((e for e in elements if e.get('path') == path), None)
if element_def:
element_types = element_def.get('type', [])
type_codes = {t.get('code') for t in element_types if t.get('code')}
is_codeable_concept = 'CodeableConcept' in type_codes
is_reference = 'Reference' in type_codes
is_coding = 'Coding' in type_codes
if is_codeable_concept and isinstance(value, dict):
codings = value.get('coding')
if not value.get('text'):
if not isinstance(codings, list) or not any(isinstance(c, dict) and c.get('code') and c.get('system') for c in codings):
warnings.append(f"Must-support CodeableConcept '{path}' lacks text or a valid coding (must include system and code).")
logger.info(f"Validation Warning: Must-support CC '{path}' invalid structure.")
elif is_coding and isinstance(value, dict):
if not value.get('code') or not value.get('system'):
warnings.append(f"Must-support Coding '{path}' lacks a system or code.")
logger.info(f"Validation Warning: Must-support Coding '{path}' invalid structure.")
elif is_reference and isinstance(value, dict):
if not value.get('reference') and not value.get('identifier'):
warnings.append(f"Must-support Reference '{path}' lacks a reference or identifier.")
logger.info(f"Validation Warning: Must-support Reference '{path}' invalid structure.")
# --- Dependency Validation ---
if include_dependencies:
logger.debug("Checking dependencies...")
metadata_path = Path(download_dir) / f"{sanitize_filename_part(package_name)}-{sanitize_filename_part(version)}.metadata.json"
if metadata_path.exists():
try:
with open(metadata_path, 'r', encoding='utf-8') as f:
metadata = json.load(f)
for dep in metadata.get('imported_dependencies', []):
dep_name = dep.get('name')
dep_version = dep.get('version')
if not dep_name or not dep_version:
logger.warning(f"Skipping invalid dependency entry: {dep}")
continue
logger.debug(f"Recursively validating against dependency: {dep_name}#{dep_version}")
# Pass include_dependencies=False to prevent infinite loops
dep_result = validate_resource_against_profile(dep_name, dep_version, resource, include_dependencies=False)
if not dep_result['valid']:
errors.extend([f"(Dependency {dep_name}#{dep_version}): {e}" for e in dep_result['errors']])
# Carry over warnings from dependencies as well
warnings.extend([f"(Dependency {dep_name}#{dep_version}): {w}" for w in dep_result['warnings']])
except Exception as e:
logger.error(f"Failed to load or process metadata {metadata_path} for dependencies: {e}")
errors.append(f"Failed to process dependency metadata for {package_name}#{version}.")
else:
logger.warning(f"Metadata file not found, cannot validate dependencies: {metadata_path}")
final_valid_state = len(errors) == 0
logger.info(f"Validation result for {resource_type}/{resource.get('id')} against {package_name}#{version}: Valid={final_valid_state}, Errors={len(errors)}, Warnings={len(warnings)}")
return {
'valid': final_valid_state,
'errors': errors,
'warnings': warnings
}
except FileNotFoundError:
# Specific handling if the tgz file itself wasn't found earlier
logger.error(f"Validation failed: Package file not found for {package_name}#{version}")
return {'valid': False, 'errors': [f"Package file for {package_name}#{version} not found."], 'warnings': []}
except tarfile.TarError as e:
logger.error(f"Validation failed due to TarError for {package_name}#{version}: {e}")
return {'valid': False, 'errors': [f"Error reading package archive for {package_name}#{version}: {e}"], 'warnings': []}
except Exception as e:
logger.error(f"Unexpected error during validation of {resource.get('resourceType')}/{resource.get('id')} against {package_name}#{version}: {e}", exc_info=True)
return {'valid': False, 'errors': [f'Unexpected validation error: {str(e)}'], 'warnings': []}
def validate_bundle_against_profile(package_name, version, bundle, include_dependencies=True):
"""Validate a FHIR Bundle against a package's StructureDefinitions."""
try:
if not isinstance(bundle, dict) or bundle.get('resourceType') != 'Bundle':
return {'valid': False, 'errors': ['Not a valid Bundle resource.'], 'warnings': [], 'results': {}}
results = {}
all_errors = []
all_warnings = []
bundle_valid = True
# Validate each entry's resource
logger.info(f"Validating Bundle/{bundle.get('id', 'N/A')} against {package_name}#{version}. Entries: {len(bundle.get('entry', []))}")
for i, entry in enumerate(bundle.get('entry', [])):
resource = entry.get('resource')
entry_id = f"Entry {i}"
resource_id_str = None
if not resource:
all_errors.append(f"{entry_id}: Missing 'resource' key in entry.")
bundle_valid = False
continue
if not isinstance(resource, dict):
all_errors.append(f"{entry_id}: 'resource' key does not contain a valid FHIR resource (must be a dictionary).")
bundle_valid = False
continue
resource_type = resource.get('resourceType')
resource_id = resource.get('id')
resource_id_str = f"{resource_type}/{resource_id}" if resource_type and resource_id else resource_type or f"Unnamed Resource in {entry_id}"
entry_id = f"Entry {i} ({resource_id_str})" # More descriptive ID
logger.debug(f"Validating {entry_id}...")
result = validate_resource_against_profile(package_name, version, resource, include_dependencies)
results[entry_id] = result # Store result keyed by descriptive entry ID
if not result['valid']:
bundle_valid = False
all_errors.extend([f"{entry_id}: {e}" for e in result['errors']])
all_warnings.extend([f"{entry_id}: {w}" for w in result['warnings']])
# Validate Bundle structure itself (can add more checks based on profile if needed)
if not bundle.get('type'):
all_errors.append("Bundle resource itself is missing the required 'type' field.")
bundle_valid = False
logger.info(f"Bundle validation finished. Overall Valid: {bundle_valid}, Total Errors: {len(all_errors)}, Total Warnings: {len(all_warnings)}")
return {
'valid': bundle_valid,
'errors': all_errors,
'warnings': all_warnings,
'results': results # Contains individual resource validation results
}
except Exception as e:
logger.error(f"Unexpected error during bundle validation: {str(e)}", exc_info=True)
return {'valid': False, 'errors': [f'Unexpected bundle validation error: {str(e)}'], 'warnings': [], 'results': {}}
def download_package(name, version):
"""Downloads a single FHIR package. Returns (save_path, error_message)"""
download_dir = _get_download_dir()
if not download_dir:
return None, "Could not get/create download directory."
package_id = f"{name}#{version}"
package_url = f"{FHIR_REGISTRY_BASE_URL}/{name}/{version}"
filename = _construct_tgz_filename(name, version)
save_path = os.path.join(download_dir, filename)
if os.path.exists(save_path):
# Optional: Add size check or hash check for existing files?
logger.info(f"Package already exists locally: {filename}")
return save_path, None
logger.info(f"Downloading: {package_id} from {package_url} -> {filename}")
try:
# Use a session for potential keep-alive benefits
with requests.Session() as session:
with session.get(package_url, stream=True, timeout=90) as r:
r.raise_for_status() # Raises HTTPError for bad responses (4xx or 5xx)
# Check content type? Should be application/gzip or similar
content_type = r.headers.get('Content-Type', '').lower()
if 'gzip' not in content_type and 'tar' not in content_type:
logger.warning(f"Unexpected Content-Type '{content_type}' for {package_url}")
# Write to temp file first? Prevents partial downloads being seen as complete.
# temp_save_path = save_path + ".part"
with open(save_path, 'wb') as f:
logger.debug(f"Opened {save_path} for writing.")
bytes_downloaded = 0
for chunk in r.iter_content(chunk_size=8192):
# filter out keep-alive new chunks
if chunk:
f.write(chunk)
bytes_downloaded += len(chunk)
logger.debug(f"Finished writing {bytes_downloaded} bytes to {save_path}")
# os.rename(temp_save_path, save_path) # Move temp file to final location
# Basic check after download
if not os.path.exists(save_path) or os.path.getsize(save_path) == 0:
err_msg = f"Download failed for {package_id}: Saved file is missing or empty."
logger.error(err_msg)
# Clean up empty file?
try: os.remove(save_path)
except OSError: pass
return None, err_msg
logger.info(f"Success: Downloaded {filename}")
return save_path, None
except requests.exceptions.HTTPError as e:
# Handle specific HTTP errors like 404 Not Found
err_msg = f"HTTP error downloading {package_id}: {e}"
logger.error(err_msg)
return None, err_msg
except requests.exceptions.ConnectionError as e:
err_msg = f"Connection error downloading {package_id}: {e}"
logger.error(err_msg)
return None, err_msg
except requests.exceptions.Timeout as e:
err_msg = f"Timeout downloading {package_id}: {e}"
logger.error(err_msg)
return None, err_msg
except requests.exceptions.RequestException as e:
err_msg = f"General download error for {package_id}: {e}"
logger.error(err_msg)
return None, err_msg
except OSError as e:
err_msg = f"File save error for {filename}: {e}"
logger.error(err_msg)
# Clean up partial file if it exists
if os.path.exists(save_path):
try: os.remove(save_path)
except OSError: pass
return None, err_msg
except Exception as e:
err_msg = f"Unexpected download error for {package_id}: {e}"
logger.error(err_msg, exc_info=True)
# Clean up partial file
if os.path.exists(save_path):
try: os.remove(save_path)
except OSError: pass
return None, err_msg
def extract_dependencies(tgz_path):
"""Extracts dependencies dict from package.json. Returns (dep_dict or None on error, error_message)"""
package_json_path = "package/package.json"
dependencies = None # Default to None
error_message = None
if not tgz_path or not os.path.exists(tgz_path):
return None, f"File not found at {tgz_path}"
try:
with tarfile.open(tgz_path, "r:gz") as tar:
# Check if package.json exists before trying to extract
try:
package_json_member = tar.getmember(package_json_path)
except KeyError:
# This is common for core packages like hl7.fhir.r4.core
logger.info(f"'{package_json_path}' not found in {os.path.basename(tgz_path)}. Assuming no dependencies.")
return {}, None # Return empty dict, no error
package_json_fileobj = tar.extractfile(package_json_member)
if package_json_fileobj:
try:
# Read bytes and decode carefully
content_bytes = package_json_fileobj.read()
content_string = content_bytes.decode('utf-8-sig')
package_data = json.loads(content_string)
dependencies = package_data.get('dependencies', {})
if not isinstance(dependencies, dict):
logger.error(f"Invalid 'dependencies' format in {package_json_path} (expected dict, got {type(dependencies)}).")
dependencies = None
error_message = f"Invalid 'dependencies' format in {package_json_path}."
except json.JSONDecodeError as e:
error_message = f"JSON parse error in {package_json_path}: {e}"
logger.error(error_message)
dependencies = None
except UnicodeDecodeError as e:
error_message = f"Encoding error reading {package_json_path}: {e}"
logger.error(error_message)
dependencies = None
finally:
package_json_fileobj.close()
else:
# Should not happen if getmember succeeded, but handle defensively
error_message = f"Could not extract {package_json_path} despite being listed in tar."
logger.error(error_message)
dependencies = None
except tarfile.ReadError as e: # Often indicates corrupted file
error_message = f"Tar ReadError (possibly corrupted) for {os.path.basename(tgz_path)}: {e}"
logger.error(error_message)
dependencies = None
except tarfile.TarError as e:
error_message = f"TarError processing {os.path.basename(tgz_path)}: {e}"
logger.error(error_message)
dependencies = None
except FileNotFoundError: # Should be caught by initial check, but include
error_message = f"Package file not found during dependency extraction: {tgz_path}"
logger.error(error_message)
dependencies = None
except Exception as e:
error_message = f"Unexpected error extracting deps from {os.path.basename(tgz_path)}: {e}"
logger.error(error_message, exc_info=True)
dependencies = None
return dependencies, error_message
def extract_used_types(tgz_path):
"""Extracts all resource types and referenced types from the package resources."""
used_types = set()
if not tgz_path or not os.path.exists(tgz_path):
logger.error(f"Cannot extract used types: File not found at {tgz_path}")
return used_types # Return empty set
try:
with tarfile.open(tgz_path, "r:gz") as tar:
for member in tar:
# Process only JSON files within the 'package/' directory
if not (member.isfile() and member.name.startswith('package/') and member.name.lower().endswith('.json')):
continue
# Skip metadata files
if os.path.basename(member.name).lower() in ['package.json', '.index.json', 'validation-summary.json', 'validation-oo.json']:
continue
fileobj = None
try:
fileobj = tar.extractfile(member)
if fileobj:
content_bytes = fileobj.read()
content_string = content_bytes.decode('utf-8-sig')
data = json.loads(content_string)
if not isinstance(data, dict): continue # Skip if not a valid JSON object
resource_type = data.get('resourceType')
if not resource_type: continue # Skip if no resourceType
# Add the resource type itself
used_types.add(resource_type)
# --- StructureDefinition Specific Extraction ---
if resource_type == 'StructureDefinition':
# Add the type this SD defines/constrains
sd_type = data.get('type')
if sd_type: used_types.add(sd_type)
# Add the base definition type if it's a profile
base_def = data.get('baseDefinition')
if base_def:
base_type = base_def.split('/')[-1]
# Avoid adding primitive types like 'Element', 'Resource' etc. if not needed
if base_type and base_type[0].isupper():
used_types.add(base_type)
# Extract types from elements (snapshot or differential)
elements = data.get('snapshot', {}).get('element', []) or data.get('differential', {}).get('element', [])
for element in elements:
if isinstance(element, dict) and 'type' in element:
for t in element.get('type', []):
# Add code (element type)
code = t.get('code')
if code and code[0].isupper(): used_types.add(code)
# Add targetProfile types (Reference targets)
for profile_uri in t.get('targetProfile', []):
if profile_uri:
profile_type = profile_uri.split('/')[-1]
if profile_type and profile_type[0].isupper(): used_types.add(profile_type)
# Add types from contentReference
content_ref = element.get('contentReference')
if content_ref and content_ref.startswith('#'):
# This usually points to another element path within the same SD
# Trying to resolve this fully can be complex.
# We might infer types based on the path referenced if needed.
pass
# --- General Resource Type Extraction ---
else:
# Look for meta.profile for referenced profiles -> add profile type
profiles = data.get('meta', {}).get('profile', [])
for profile_uri in profiles:
if profile_uri:
profile_type = profile_uri.split('/')[-1]
if profile_type and profile_type[0].isupper(): used_types.add(profile_type)
# ValueSet: Check compose.include.system (often points to CodeSystem)
if resource_type == 'ValueSet':
for include in data.get('compose', {}).get('include', []):
system = include.get('system')
# Heuristic: If it looks like a FHIR core codesystem URL, extract type
if system and system.startswith('http://hl7.org/fhir/'):
type_name = system.split('/')[-1]
# Check if it looks like a ResourceType
if type_name and type_name[0].isupper() and not type_name.startswith('sid'): # Avoid things like sid/us-ssn
used_types.add(type_name)
# Could add more heuristics for other terminology servers
# CapabilityStatement: Check rest.resource.type and rest.resource.profile
if resource_type == 'CapabilityStatement':
for rest_item in data.get('rest', []):
for resource_item in rest_item.get('resource', []):
res_type = resource_item.get('type')
if res_type and res_type[0].isupper(): used_types.add(res_type)
profile_uri = resource_item.get('profile')
if profile_uri:
profile_type = profile_uri.split('/')[-1]
if profile_type and profile_type[0].isupper(): used_types.add(profile_type)
# --- Generic recursive search for 'reference' fields? ---
# This could be expensive. Let's rely on SDs for now.
# def find_references(obj):
# if isinstance(obj, dict):
# for k, v in obj.items():
# if k == 'reference' and isinstance(v, str):
# ref_type = v.split('/')[0]
# if ref_type and ref_type[0].isupper(): used_types.add(ref_type)
# else:
# find_references(v)
# elif isinstance(obj, list):
# for item in obj:
# find_references(item)
# find_references(data)
except json.JSONDecodeError as e:
logger.warning(f"Could not parse JSON in {member.name} for used types: {e}")
except UnicodeDecodeError as e:
logger.warning(f"Could not decode {member.name} for used types: {e}")
except Exception as e:
logger.warning(f"Could not process member {member.name} for used types: {e}")
finally:
if fileobj:
fileobj.close()
except tarfile.ReadError as e:
logger.error(f"Tar ReadError extracting used types from {tgz_path}: {e}")
except tarfile.TarError as e:
logger.error(f"TarError extracting used types from {tgz_path}: {e}")
except FileNotFoundError:
logger.error(f"Package file not found for used type extraction: {tgz_path}")
except Exception as e:
logger.error(f"Error extracting used types from {tgz_path}: {e}", exc_info=True)
# Filter out potential primitives or base types that aren't resources?
# E.g., 'string', 'boolean', 'Element', 'BackboneElement', 'Resource'
core_non_resource_types = {'string', 'boolean', 'integer', 'decimal', 'uri', 'url', 'canonical',
'base64Binary', 'instant', 'date', 'dateTime', 'time', 'code', 'oid', 'id',
'markdown', 'unsignedInt', 'positiveInt', 'xhtml',
'Element', 'BackboneElement', 'Resource', 'DomainResource', 'DataType'}
final_used_types = {t for t in used_types if t not in core_non_resource_types and t[0].isupper()}
logger.debug(f"Extracted used types from {os.path.basename(tgz_path)}: {final_used_types}")
return final_used_types
def map_types_to_packages(used_types, all_dependencies):
"""Maps used types to the packages that provide them based on dependency lists."""
type_to_package = {}
processed_types = set()
# Pass 1: Exact matches in dependencies
for (pkg_name, pkg_version), deps in all_dependencies.items():
for dep_name, dep_version in deps.items():
# Simple heuristic: if type name is in dependency package name
# This is weak, needs improvement. Ideally, packages declare exported types.
for t in used_types:
# Exact match or common pattern (e.g., USCorePatient -> us.core)
# Need a better mapping strategy - this is very basic.
# Example: If 'USCorePatient' is used, and 'us.core' is a dependency.
# A more robust approach would involve loading the .index.json from dependency packages.
# For now, let's just use a simplified direct check:
# If a dependency name contains the type name (lowercase)
if t not in type_to_package and t.lower() in dep_name.lower():
type_to_package[t] = (dep_name, dep_version)
processed_types.add(t)
logger.debug(f"Mapped type '{t}' to dependency package '{dep_name}' based on name heuristic.")
# Pass 2: Check the package itself
for (pkg_name, pkg_version), deps in all_dependencies.items():
for t in used_types:
if t not in type_to_package and t.lower() in pkg_name.lower():
type_to_package[t] = (pkg_name, pkg_version)
processed_types.add(t)
logger.debug(f"Mapped type '{t}' to source package '{pkg_name}' based on name heuristic.")
# Fallback: map remaining types to the canonical package if not already mapped
canonical_name, canonical_version = CANONICAL_PACKAGE
unmapped_types = used_types - processed_types
if unmapped_types:
logger.info(f"Using canonical package {canonical_name}#{canonical_version} as fallback for unmapped types: {unmapped_types}")
for t in unmapped_types:
type_to_package[t] = CANONICAL_PACKAGE
logger.debug(f"Final type-to-package mapping: {type_to_package}")
return type_to_package
def import_package_and_dependencies(initial_name, initial_version, dependency_mode='recursive'):
"""Orchestrates recursive download and dependency extraction based on the dependency mode."""
logger.info(f"Starting import for {initial_name}#{initial_version} with dependency_mode={dependency_mode}")
results = {
'requested': (initial_name, initial_version),
'processed': set(), # Tuples (name, version) successfully processed (downloaded + deps extracted)
'downloaded': {}, # Dict {(name, version): save_path} for successfully downloaded
'all_dependencies': {}, # Dict {(name, version): {dep_name: dep_ver}} stores extracted deps for each processed pkg
'dependencies': [], # List of unique {"name": X, "version": Y} across all processed packages
'errors': [] # List of error messages encountered
}
# Queue stores (name, version) tuples to process
pending_queue = [(initial_name, initial_version)]
# Lookup stores (name, version) tuples that have been added to queue or processed, prevents cycles/re-queuing
queued_or_processed_lookup = set([(initial_name, initial_version)])
all_found_dependencies = set() # Store unique dep tuples {(name, version)} found
# --- Main Processing Loop ---
while pending_queue:
name, version = pending_queue.pop(0)
package_id_tuple = (name, version)
# Already successfully processed? Skip. (Shouldn't happen with lookup check before queueing, but safety)
if package_id_tuple in results['processed']:
logger.debug(f"Skipping already processed package: {name}#{version}")
continue
logger.info(f"Processing package from queue: {name}#{version}")
# --- Download ---
save_path, dl_error = download_package(name, version)
if dl_error:
error_msg = f"Download failed for {name}#{version}: {dl_error}"
results['errors'].append(error_msg)
logger.error(error_msg)
# Do not add to processed, leave in lookup to prevent re-queueing a known failure
continue # Move to next item in queue
else:
results['downloaded'][package_id_tuple] = save_path
logger.info(f"Successfully downloaded/verified {name}#{version} at {save_path}")
# --- Extract Dependencies ---
dependencies, dep_error = extract_dependencies(save_path)
if dep_error:
# Log error but potentially continue processing other packages if deps are just missing
error_msg = f"Dependency extraction failed for {name}#{version}: {dep_error}"
results['errors'].append(error_msg)
logger.error(error_msg)
# Mark as processed even if dep extraction fails, as download succeeded
results['processed'].add(package_id_tuple)
# Don't queue dependencies if extraction failed
continue
elif dependencies is None:
# This indicates a more severe error during extraction (e.g., corrupted tar)
error_msg = f"Dependency extraction returned critical error for {name}#{version}. Aborting dependency processing for this package."
results['errors'].append(error_msg)
logger.error(error_msg)
results['processed'].add(package_id_tuple) # Mark processed
continue
# Store extracted dependencies for this package
results['all_dependencies'][package_id_tuple] = dependencies
results['processed'].add(package_id_tuple) # Mark as successfully processed
logger.debug(f"Successfully processed {name}#{version}. Dependencies found: {list(dependencies.keys())}")
# Add unique dependencies to the overall list and potentially the queue
current_package_deps = []
for dep_name, dep_version in dependencies.items():
if isinstance(dep_name, str) and isinstance(dep_version, str) and dep_name and dep_version:
dep_tuple = (dep_name, dep_version)
current_package_deps.append({"name": dep_name, "version": dep_version}) # For metadata
if dep_tuple not in all_found_dependencies:
all_found_dependencies.add(dep_tuple)
results['dependencies'].append({"name": dep_name, "version": dep_version}) # Add to overall unique list
# --- Queue Dependencies Based on Mode ---
# Check if not already queued or processed
if dep_tuple not in queued_or_processed_lookup:
should_queue = False
if dependency_mode == 'recursive':
should_queue = True
elif dependency_mode == 'patch-canonical' and dep_tuple == CANONICAL_PACKAGE:
should_queue = True
elif dependency_mode == 'tree-shaking':
# Tree shaking requires calculating used types *after* initial pkg is processed
# This logic needs adjustment - calculate used types only once for the root package.
# Let's defer full tree-shaking queuing logic for now, treat as 'none'.
# TODO: Implement tree-shaking queuing properly outside the loop based on initial package's used types.
pass
if should_queue:
logger.debug(f"Adding dependency to queue ({dependency_mode}): {dep_name}#{dep_version}")
pending_queue.append(dep_tuple)
queued_or_processed_lookup.add(dep_tuple)
else:
logger.warning(f"Skipping invalid dependency entry in {name}#{version}: name='{dep_name}', version='{dep_version}'")
# --- Save Metadata (after successful download and dep extraction) ---
# We need profile relationship info which comes from process_package_file
# Let's call it here if needed for metadata, though it duplicates effort if called later.
# Alternative: Save basic metadata first, update later?
# Let's just save what we have now. Profile relations can be added by a separate process.
save_package_metadata(name, version, dependency_mode, current_package_deps)
# TODO: Rework metadata saving if compliesWith/imposedBy is needed during import.
# --- Post-Loop Processing (e.g., for Tree Shaking) ---
if dependency_mode == 'tree-shaking' and (initial_name, initial_version) in results['downloaded']:
logger.info("Performing tree-shaking dependency analysis...")
root_save_path = results['downloaded'][(initial_name, initial_version)]
used_types = extract_used_types(root_save_path)
if used_types:
type_to_package = map_types_to_packages(used_types, results['all_dependencies'])
logger.debug(f"Tree-shaking mapping: {type_to_package}")
tree_shaken_deps_to_ensure = set(type_to_package.values())
# Ensure canonical package is included if tree-shaking mode implies it
if CANONICAL_PACKAGE not in tree_shaken_deps_to_ensure:
logger.debug(f"Adding canonical package {CANONICAL_PACKAGE} to tree-shaking set.")
tree_shaken_deps_to_ensure.add(CANONICAL_PACKAGE)
initial_package_tuple = (initial_name, initial_version)
if initial_package_tuple in tree_shaken_deps_to_ensure:
tree_shaken_deps_to_ensure.remove(initial_package_tuple) # Don't queue self
additional_processing_needed = False
for dep_tuple in tree_shaken_deps_to_ensure:
if dep_tuple not in results['processed'] and dep_tuple not in queued_or_processed_lookup:
logger.info(f"Queueing missing tree-shaken dependency: {dep_tuple[0]}#{dep_tuple[1]}")
pending_queue.append(dep_tuple)
queued_or_processed_lookup.add(dep_tuple)
additional_processing_needed = True
# If tree-shaking added new packages, re-run the processing loop
if additional_processing_needed:
logger.info("Re-running processing loop for tree-shaken dependencies...")
# This recursive call structure isn't ideal, better to refactor loop.
# For now, let's just run the loop again conceptually.
# This requires refactoring the main loop logic to be callable.
# --- TEMPORARY WORKAROUND: Just log and state limitation ---
logger.warning("Tree-shaking identified additional dependencies. Manual re-run or refactoring needed to process them.")
results['errors'].append("Tree-shaking identified further dependencies; re-run required for full processing.")
# TODO: Refactor the while loop into a callable function to handle recursive/iterative processing.
proc_count = len(results['processed'])
dl_count = len(results['downloaded'])
err_count = len(results['errors'])
logger.info(f"Import finished for {initial_name}#{initial_version}. Processed: {proc_count}, Downloaded: {dl_count}, Errors: {err_count}")
# Make sure unique list of deps is accurate
results['dependencies'] = [ {"name": d[0], "version": d[1]} for d in all_found_dependencies]
return results
def process_package_file(tgz_path):
"""Extracts types, profile status, MS elements, examples, and profile relationships from a downloaded .tgz package."""
logger.info(f"Processing package file details: {tgz_path}")
results = {
'resource_types_info': [], # List of dicts about each Resource/Profile
'must_support_elements': {}, # Dict: { 'ResourceName/ProfileId': ['path1', 'path2'] }
'examples': {}, # Dict: { 'ResourceName/ProfileId': ['example_path1'] }
'complies_with_profiles': [], # List of canonical URLs
'imposed_profiles': [], # List of canonical URLs
'errors': []
}
# Use defaultdict for easier aggregation
# Key: SD ID if profile, otherwise ResourceType. Value: dict with info.
resource_info = defaultdict(lambda: {
'name': None, # The key (SD ID or ResourceType)
'type': None, # Base FHIR type (e.g., Patient)
'is_profile': False,
'ms_flag': False, # Does this SD define *any* MS elements?
'ms_paths': set(), # Specific MS element paths defined *in this SD*
'examples': set(), # Paths to example files linked to this type/profile
'sd_processed': False # Flag to avoid reprocessing MS flags for the same SD key
})
if not tgz_path or not os.path.exists(tgz_path):
results['errors'].append(f"Package file not found: {tgz_path}")
logger.error(f"Package file not found during processing: {tgz_path}")
return results
try:
with tarfile.open(tgz_path, "r:gz") as tar:
members = tar.getmembers() # Get all members once
logger.debug(f"Found {len(members)} members in {os.path.basename(tgz_path)}")
# --- Pass 1: Process StructureDefinitions ---
logger.debug("Processing StructureDefinitions...")
for member in members:
# Basic filtering
if not member.isfile() or not member.name.startswith('package/') or not member.name.lower().endswith('.json'):
continue
base_filename_lower = os.path.basename(member.name).lower()
if base_filename_lower in ['package.json', '.index.json', 'validation-summary.json', 'validation-oo.json']:
continue
fileobj = None
try:
fileobj = tar.extractfile(member)
if not fileobj: continue
content_bytes = fileobj.read()
content_string = content_bytes.decode('utf-8-sig')
data = json.loads(content_string)
if not isinstance(data, dict) or data.get('resourceType') != 'StructureDefinition':
continue # Only interested in SDs in this pass
# --- Process the StructureDefinition ---
profile_id = data.get('id') or data.get('name') # Use ID, fallback to name
sd_type = data.get('type') # The base FHIR type (e.g., Patient)
sd_base = data.get('baseDefinition')
is_profile_sd = bool(sd_base) # It's a profile if it has a baseDefinition
if not profile_id:
logger.warning(f"StructureDefinition in {member.name} missing 'id' and 'name', skipping.")
continue
if not sd_type:
logger.warning(f"StructureDefinition '{profile_id}' in {member.name} missing 'type', skipping.")
continue
entry_key = profile_id # Use the SD's ID as the key
entry = resource_info[entry_key]
# Only process once per entry_key
if entry.get('sd_processed'): continue
entry['name'] = entry_key
entry['type'] = sd_type
entry['is_profile'] = is_profile_sd
# Extract compliesWithProfile and imposeProfile extensions
complies_with = []
imposed = []
for ext in data.get('extension', []):
ext_url = ext.get('url')
value = ext.get('valueCanonical')
if value:
if ext_url == 'http://hl7.org/fhir/StructureDefinition/structuredefinition-compliesWithProfile':
complies_with.append(value)
elif ext_url == 'http://hl7.org/fhir/StructureDefinition/structuredefinition-imposeProfile':
imposed.append(value)
# Add to overall results (unique)
results['complies_with_profiles'].extend(c for c in complies_with if c not in results['complies_with_profiles'])
results['imposed_profiles'].extend(i for i in imposed if i not in results['imposed_profiles'])
# Find Must Support elements defined *in this specific SD*
has_ms_in_this_sd = False
ms_paths_in_this_sd = set()
# Check differential first, then snapshot if needed? Or combine? Let's combine.
elements = data.get('snapshot', {}).get('element', []) + data.get('differential', {}).get('element', [])
# De-duplicate elements based on path if combining snapshot and differential (though usually only one is primary)
processed_element_paths = set()
unique_elements = []
for el in elements:
el_path = el.get('path')
if el_path and el_path not in processed_element_paths:
unique_elements.append(el)
processed_element_paths.add(el_path)
elif not el_path: # Include elements without paths? Maybe not.
pass
for element in unique_elements:
if isinstance(element, dict) and element.get('mustSupport') is True:
element_path = element.get('path')
if element_path:
ms_paths_in_this_sd.add(element_path)
has_ms_in_this_sd = True
else:
logger.warning(f"Found mustSupport=true without path in element of {entry_key} ({member.name})")
if ms_paths_in_this_sd:
entry['ms_paths'] = ms_paths_in_this_sd
entry['ms_flag'] = True # Set flag if this SD defines MS elements
logger.debug(f"Found {len(ms_paths_in_this_sd)} MS elements defined in SD {entry_key}")
entry['sd_processed'] = True # Mark this SD as processed
except json.JSONDecodeError as e:
logger.warning(f"Could not parse JSON SD in {member.name}: {e}")
except UnicodeDecodeError as e:
logger.warning(f"Could not decode SD in {member.name}: {e}")
except Exception as e:
logger.warning(f"Could not process SD member {member.name}: {e}", exc_info=False) # Keep log cleaner
finally:
if fileobj: fileobj.close()
# --- Pass 2: Process Examples ---
logger.debug("Processing Examples...")
for member in members:
# Basic filtering
if not member.isfile() or not member.name.startswith('package/'): # Allow non-JSON examples too
continue
member_name_lower = member.name.lower()
base_filename_lower = os.path.basename(member_name_lower)
if base_filename_lower in ['package.json', '.index.json', 'validation-summary.json', 'validation-oo.json']:
continue
# Heuristic for identifying examples
# Check directory name or filename conventions
is_example = 'example' in member.name.split('/') or 'example' in base_filename_lower.split('-') or 'example' in base_filename_lower.split('.')
if not is_example: continue
logger.debug(f"Processing potential example file: {member.name}")
is_json = member_name_lower.endswith('.json')
fileobj = None
associated_key = None
try:
if is_json:
fileobj = tar.extractfile(member)
if not fileobj: continue
content_bytes = fileobj.read()
content_string = content_bytes.decode('utf-8-sig')
data = json.loads(content_string)
if not isinstance(data, dict): continue
resource_type = data.get('resourceType')
if not resource_type: continue
# Try to associate example with a profile using meta.profile
profile_meta = data.get('meta', {}).get('profile', [])
found_profile_match = False
if profile_meta and isinstance(profile_meta, list):
for profile_url in profile_meta:
# Extract profile ID from canonical URL
profile_id_from_meta = profile_url.split('/')[-1]
if profile_id_from_meta in resource_info:
associated_key = profile_id_from_meta
found_profile_match = True
logger.debug(f"Example {member.name} associated with profile {associated_key} via meta.profile")
break # Use first match
# If no profile match, associate with the base resource type SD (if any)
if not found_profile_match:
# Find SD where type matches the example's resourceType and is_profile is False
matching_base_sd_keys = [k for k, v in resource_info.items() if v.get('type') == resource_type and not v.get('is_profile') and v.get('sd_processed')]
if matching_base_sd_keys:
associated_key = matching_base_sd_keys[0] # Use the first matching base SD key
logger.debug(f"Example {member.name} associated with base type SD {associated_key}")
else:
# Fallback: If no SD processed for this base type yet, use the type itself as key
associated_key = resource_type
logger.debug(f"Example {member.name} associated with resource type {associated_key} (no specific SD found/processed yet)")
else:
# For non-JSON examples, try to guess based on filename
# e.g., patient-example.xml -> Patient
# e.g., us-core-patient-example.xml -> us-core-patient (if profile exists)
                        guessed_profile_id = None
                        if '-' in base_filename_lower:
                            # Try progressively longer dash-joined prefixes against known profile IDs
                            # (e.g. "us", then "us-core", then "us-core-patient") and keep the longest match.
                            parts = base_filename_lower.split('-')
                            for end in range(1, len(parts) + 1):
                                candidate = '-'.join(parts[:end])
                                if candidate in resource_info:
                                    guessed_profile_id = candidate
if guessed_profile_id:
associated_key = guessed_profile_id
logger.debug(f"Non-JSON Example {member.name} associated with profile {associated_key} via filename heuristic")
else:
# Fallback to guessing base type
guessed_type = base_filename_lower.split('-')[0].split('.')[0].capitalize()
matching_base_sd_keys = [k for k, v in resource_info.items() if v.get('type') == guessed_type and not v.get('is_profile') and v.get('sd_processed')]
if matching_base_sd_keys:
associated_key = matching_base_sd_keys[0]
logger.debug(f"Non-JSON Example {member.name} associated with base type SD {associated_key} via filename heuristic")
elif guessed_type:
associated_key = guessed_type
logger.debug(f"Non-JSON Example {member.name} associated with resource type {associated_key} via filename heuristic (no specific SD found/processed yet)")
# Add example path to the associated resource/profile info
if associated_key:
# Ensure the entry exists even if no SD was processed (for base types)
if associated_key not in resource_info:
resource_info[associated_key]['name'] = associated_key
# Try to infer type if possible (might be None)
resource_info[associated_key]['type'] = data.get('resourceType') if is_json else associated_key
resource_info[associated_key]['examples'].add(member.name)
else:
logger.warning(f"Could not associate example {member.name} with any known resource or profile.")
except json.JSONDecodeError as e:
logger.warning(f"Could not parse JSON example in {member.name}: {e}")
except UnicodeDecodeError as e:
logger.warning(f"Could not decode example in {member.name}: {e}")
except Exception as e:
logger.warning(f"Could not process example member {member.name}: {e}", exc_info=False)
finally:
if fileobj: fileobj.close()
# --- Final Formatting ---
final_list = []
final_ms_elements = {}
final_examples = {}
logger.debug(f"Finalizing results from resource_info keys: {list(resource_info.keys())}")
# Make sure all base resource types mentioned (even without explicit SDs) are included
all_types_mentioned = set(v['type'] for v in resource_info.values() if v.get('type'))
for type_name in all_types_mentioned:
if type_name not in resource_info:
# Add a basic entry if a type was mentioned (e.g., by an example) but had no SD
if type_name and type_name[0].isupper(): # Basic check it looks like a resource type
logger.debug(f"Adding basic entry for resource type '{type_name}' mentioned but without processed SD.")
resource_info[type_name]['name'] = type_name
resource_info[type_name]['type'] = type_name
resource_info[type_name]['is_profile'] = False
for key, info in resource_info.items():
display_name = info.get('name') or key
base_type = info.get('type')
# Skip if essential info is missing (shouldn't happen with defaultdict + population)
if not display_name or not base_type:
logger.warning(f"Skipping formatting for incomplete key: {key} - Info: {info}")
continue
logger.debug(f"Formatting item '{display_name}': type='{base_type}', profile='{info.get('is_profile', False)}', ms_flag='{info.get('ms_flag', False)}'")
final_list.append({
'name': display_name, # This is the SD ID or ResourceType
'type': base_type, # The base FHIR resource type
'is_profile': info.get('is_profile', False),
'must_support': info.get('ms_flag', False) # Does this SD *define* MS elements?
})
if info['ms_paths']:
final_ms_elements[display_name] = sorted(list(info['ms_paths']))
if info['examples']:
final_examples[display_name] = sorted(list(info['examples']))
# Sort profiles after base types, then alphabetically
results['resource_types_info'] = sorted(final_list, key=lambda x: (x.get('is_profile', False), x.get('name', '')))
results['must_support_elements'] = final_ms_elements
results['examples'] = final_examples
# Ensure relationship lists are unique (done during addition now)
# results['complies_with_profiles'] = sorted(list(set(results['complies_with_profiles'])))
# results['imposed_profiles'] = sorted(list(set(results['imposed_profiles'])))
except tarfile.ReadError as e:
err_msg = f"Tar ReadError processing package file {tgz_path}: {e}"
logger.error(err_msg)
results['errors'].append(err_msg)
except tarfile.TarError as e:
err_msg = f"TarError processing package file {tgz_path}: {e}"
logger.error(err_msg)
results['errors'].append(err_msg)
except FileNotFoundError:
err_msg = f"Package file not found during processing: {tgz_path}"
logger.error(err_msg)
results['errors'].append(err_msg)
except Exception as e:
err_msg = f"Unexpected error processing package file {tgz_path}: {e}"
logger.error(err_msg, exc_info=True)
results['errors'].append(err_msg)
# Logging counts
final_types_count = len(results['resource_types_info'])
ms_defining_count = sum(1 for r in results['resource_types_info'] if r['must_support']) # Count SDs defining MS
total_ms_paths = sum(len(v) for v in results['must_support_elements'].values())
total_examples = sum(len(v) for v in results['examples'].values())
logger.info(f"Package processing finished for {os.path.basename(tgz_path)}: "
f"{final_types_count} Resources/Profiles identified; "
f"{ms_defining_count} define MS elements ({total_ms_paths} total MS paths); "
f"{total_examples} examples found. "
f"CompliesWith: {len(results['complies_with_profiles'])}, Imposed: {len(results['imposed_profiles'])}")
return results
# --- Example Usage (if running script directly) ---
if __name__ == '__main__':
    # Configure logging for direct script execution. force=True (Python 3.8+) is needed
    # because the module-level basicConfig call has already attached a handler, which
    # would otherwise make this call a no-op.
    logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', force=True)
logger.info("Running services.py directly for testing.")
    # Minimal stand-ins for the Flask app context (config / instance path). These mock
    # classes are illustrative only and are never installed as current_app below; the
    # current_app dependency is bypassed instead by overriding _get_download_dir.
class MockFlaskConfig(dict):
pass
class MockFlaskCurrentApp:
config = MockFlaskConfig()
# Calculate instance path relative to this file
instance_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'instance'))
    # current_app is only available inside a Flask application/request context, so rather
    # than wiring up a full mock app here, this test bypasses that dependency: it creates
    # the download directory itself and temporarily overrides _get_download_dir below.
    # (Longer term, reducing the Flask dependency in the core logic would make this unnecessary.)
test_download_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'instance', DOWNLOAD_DIR_NAME))
os.makedirs(test_download_dir, exist_ok=True)
logger.info(f"Using test download directory: {test_download_dir}")
# Override the helper function for testing context
original_get_download_dir = _get_download_dir
def mock_get_download_dir():
# In test, don't rely on current_app if possible
# Ensure config exists if needed by validation code
if not hasattr(mock_get_download_dir, 'config'):
mock_get_download_dir.config = {'FHIR_PACKAGES_DIR': test_download_dir}
return test_download_dir
_get_download_dir = mock_get_download_dir
    # Attach the FHIR_PACKAGES_DIR config to the mock up-front (the hasattr guard inside
    # mock_get_download_dir would otherwise set it lazily on the first call).
_get_download_dir.config = {'FHIR_PACKAGES_DIR': test_download_dir}
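    # A heavier-weight alternative (not used here) would be to push a real application
    # context instead of monkey-patching, roughly:
    #   from flask import Flask
    #   test_app = Flask(__name__, instance_path=os.path.dirname(test_download_dir))
    #   test_app.config['FHIR_PACKAGES_DIR'] = test_download_dir
    #   with test_app.app_context():
    #       ...  # run the import/validation calls here
    # The simple override above keeps this test script free of that extra wiring.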
# --- Test Case 1: Import AU Core Patient Package ---
pkg_name = "hl7.fhir.au.core"
pkg_version = "1.0.1" # Use a specific version known to exist
logger.info(f"\n--- Testing Import: {pkg_name}#{pkg_version} ---")
import_results = import_package_and_dependencies(pkg_name, pkg_version, dependency_mode='recursive')
# print("Import Results:", json.dumps(import_results, default=lambda o: '<not serializable>', indent=2))
if not import_results['errors'] and (pkg_name, pkg_version) in import_results['downloaded']:
logger.info(f"Import successful for {pkg_name}#{pkg_version}")
# --- Test Case 2: Validate Patient Resource ---
logger.info(f"\n--- Testing Validation: Patient Example ---")
patient_resource = {
"resourceType": "Patient",
"id": "banks-mia-leanne",
"meta": { "profile": ["http://hl7.org.au/fhir/core/StructureDefinition/au-core-patient"] },
"identifier": [{
"type": {"coding": [{"system": "http://terminology.hl7.org/CodeSystem/v2-0203", "code": "NI"}], "text": "IHI"},
"system": "http://ns.electronichealth.net.au/id/hi/ihi/1.0",
"value": "8003608333647261"
}],
"name": [{"use": "usual", "family": "Banks", "given": ["Mia", "Leanne"]}],
"telecom": [{"system": "phone", "value": "0491574632", "use": "mobile"}],
"gender": "female",
"birthDate": "1983-08-25",
"address": [{"line": ["50 Sebastien St"], "city": "Minjary", "state": "NSW", "postalCode": "2720", "country": "AU"}]
# Missing communication on purpose to test warnings/errors if required by profile
}
validation_result = validate_resource_against_profile(pkg_name, pkg_version, patient_resource)
print("\nPatient Validation Result:")
print(json.dumps(validation_result, indent=2))
# --- Test Case 3: Validate Allergy Resource ---
logger.info(f"\n--- Testing Validation: Allergy Example ---")
allergy_resource = {
"resourceType": "AllergyIntolerance",
"id": "lactose",
"meta": {"profile": ["http://hl7.org.au/fhir/core/StructureDefinition/au-core-allergyintolerance"]},
"clinicalStatus": {"coding": [{"system": "http://terminology.hl7.org/CodeSystem/allergyintolerance-clinical", "code": "active"}]},
"verificationStatus": {"coding": [{"system": "http://terminology.hl7.org/CodeSystem/allergyintolerance-verification", "code": "confirmed"}]},
"code": {"coding": [{"system": "http://snomed.info/sct", "code": "782415009", "display": "Intolerance to lactose"}]},
"patient": {"reference": "Patient/banks-mia-leanne"},
"onsetDateTime": "2022", # Example of choice type
"reaction": [{
"manifestation": [{"coding": [{"system": "http://snomed.info/sct", "code": "21522001", "display": "Abdominal pain"}]}],
"severity": "mild"
}]
}
validation_result_allergy = validate_resource_against_profile(pkg_name, pkg_version, allergy_resource)
print("\nAllergy Validation Result:")
print(json.dumps(validation_result_allergy, indent=2))
else:
logger.error(f"Import failed for {pkg_name}#{pkg_version}, cannot proceed with validation tests.")
print("Import Errors:", import_results['errors'])
    # Restore the original helper so the module is left unmodified after the test run
_get_download_dir = original_get_download_dir
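    # Typically run by executing this module directly; it assumes a writable instance/
    # directory alongside the package and outbound network access for the downloads above.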