# app/services.py
import requests
import os
import tarfile
import json
import re
import logging
from flask import current_app
from collections import defaultdict
from pathlib import Path

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Constants
FHIR_REGISTRY_BASE_URL = "https://packages.fhir.org"
DOWNLOAD_DIR_NAME = "fhir_packages"
CANONICAL_PACKAGE = ("hl7.fhir.r4.core", "4.0.1")  # Define the canonical FHIR package

# --- Helper Functions ---

def _get_download_dir():
    """Gets the absolute path to the download directory, creating it if needed."""
    instance_path = None
    try:
        # Try to get instance_path from Flask app context if available
        instance_path = current_app.instance_path
        logger.debug(f"Using instance path from current_app: {instance_path}")
    except RuntimeError:
        # Fallback if no app context (e.g., running script directly)
        logger.warning("No app context for instance_path, constructing relative path.")
        # Assume services.py is in /app, instance folder sibling to /app
        instance_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'instance'))
        logger.debug(f"Constructed instance path: {instance_path}")

    if not instance_path:
        logger.error("Fatal Error: Could not determine instance path.")
        return None

    download_dir = os.path.join(instance_path, DOWNLOAD_DIR_NAME)
    try:
        os.makedirs(download_dir, exist_ok=True)
        # Add check for flask config path
        if 'FHIR_PACKAGES_DIR' not in current_app.config:
            current_app.config['FHIR_PACKAGES_DIR'] = download_dir
            logger.info(f"Set current_app.config['FHIR_PACKAGES_DIR'] to {download_dir}")
        return download_dir
    except OSError as e:
        logger.error(f"Fatal Error creating dir {download_dir}: {e}", exc_info=True)
        return None
    except RuntimeError:
        # Catch if current_app doesn't exist here either
        logger.warning("No app context available to set FHIR_PACKAGES_DIR config.")
        # Still attempt to create and return the path for non-Flask use cases
        try:
            os.makedirs(download_dir, exist_ok=True)
            return download_dir
        except OSError as e:
            logger.error(f"Fatal Error creating dir {download_dir}: {e}", exc_info=True)
            return None

def sanitize_filename_part(text):
    """Basic sanitization for name/version parts of filename."""
    safe_text = "".join(c if c.isalnum() or c in ['.', '-'] else '_' for c in text)
    safe_text = re.sub(r'_+', '_', safe_text)
    safe_text = safe_text.strip('_-.')
    return safe_text if safe_text else "invalid_name"

def _construct_tgz_filename(name, version):
    """Constructs the standard filename using the sanitized parts."""
    return f"{sanitize_filename_part(name)}-{sanitize_filename_part(version)}.tgz"

def find_and_extract_sd(tgz_path, resource_identifier):
    """Helper to find and extract SD json from a given tgz path by ID, Name, or Type."""
    sd_data = None
    found_path = None
    if not tgz_path or not os.path.exists(tgz_path):
        logger.error(f"File not found in find_and_extract_sd: {tgz_path}")
        return None, None
    try:
        with tarfile.open(tgz_path, "r:gz") as tar:
            logger.debug(f"Searching for SD matching '{resource_identifier}' in {os.path.basename(tgz_path)}")
            for member in tar:
                if not (member.isfile() and member.name.startswith('package/') and member.name.lower().endswith('.json')):
                    continue
                if os.path.basename(member.name).lower() in ['package.json', '.index.json', 'validation-summary.json', 'validation-oo.json']:
                    continue
                fileobj = None
                try:
                    fileobj = tar.extractfile(member)
                    if fileobj:
                        content_bytes = fileobj.read()
                        # Handle potential BOM (Byte Order Mark)
                        content_string = content_bytes.decode('utf-8-sig')
                        data = json.loads(content_string)
                        if isinstance(data, dict) and data.get('resourceType') == 'StructureDefinition':
                            sd_id = data.get('id')
                            sd_name = data.get('name')
                            sd_type = data.get('type')  # The type the SD describes (e.g., Patient)
                            # Match if requested identifier matches ID, Name, or the Base Type the SD describes
                            # Case-insensitive matching might be safer for identifiers
                            if resource_identifier and (resource_identifier.lower() == str(sd_type).lower()
                                                        or resource_identifier.lower() == str(sd_id).lower()
                                                        or resource_identifier.lower() == str(sd_name).lower()):
                                sd_data = data
                                found_path = member.name
                                logger.info(f"Found matching SD for '{resource_identifier}' at path: {found_path} (Matched on Type/ID/Name)")
                                break  # Stop searching once found
                except json.JSONDecodeError as e:
                    logger.warning(f"Could not parse JSON in {member.name}: {e}")
                except UnicodeDecodeError as e:
                    logger.warning(f"Could not decode UTF-8 in {member.name}: {e}")
                except tarfile.TarError as e:
                    logger.warning(f"Tar error reading member {member.name}: {e}")
                    # Potentially break or continue depending on severity preference
                except Exception as e:
                    logger.warning(f"Could not read/parse potential SD {member.name}: {e}")
                finally:
                    if fileobj:
                        fileobj.close()

            if sd_data is None:
                logger.info(f"SD matching '{resource_identifier}' not found within archive {os.path.basename(tgz_path)} - caller may attempt fallback")
    except tarfile.ReadError as e:
        logger.error(f"Tar ReadError (possibly corrupted file) reading {tgz_path}: {e}")
        # Decide if this should raise or return None
        return None, None  # Or raise custom error
    except tarfile.TarError as e:
        logger.error(f"TarError reading {tgz_path} in find_and_extract_sd: {e}")
        raise tarfile.TarError(f"Error reading package archive: {e}") from e
    except FileNotFoundError as e:
        logger.error(f"FileNotFoundError reading {tgz_path} in find_and_extract_sd: {e}")
        raise
    except Exception as e:
        logger.error(f"Unexpected error in find_and_extract_sd for {tgz_path}: {e}", exc_info=True)
        raise
    return sd_data, found_path

def save_package_metadata(name, version, dependency_mode, dependencies, complies_with_profiles=None, imposed_profiles=None):
    """Saves the dependency mode, imported dependencies, and profile relationships as metadata alongside the package."""
    download_dir = _get_download_dir()
    if not download_dir:
        logger.error("Could not get download directory for metadata saving.")
        return False
    metadata = {
        'package_name': name,
        'version': version,
        'dependency_mode': dependency_mode,
        'imported_dependencies': dependencies,
        'complies_with_profiles': complies_with_profiles or [],
        'imposed_profiles': imposed_profiles or []
    }
    metadata_filename = f"{sanitize_filename_part(name)}-{sanitize_filename_part(version)}.metadata.json"
    metadata_path = os.path.join(download_dir, metadata_filename)
    try:
        with open(metadata_path, 'w', encoding='utf-8') as f:  # Specify encoding
            json.dump(metadata, f, indent=2)
        logger.info(f"Saved metadata for {name}#{version} at {metadata_path}")
        return True
    except Exception as e:
        logger.error(f"Failed to save metadata for {name}#{version}: {e}")
        return False

def get_package_metadata(name, version):
    """Retrieves the metadata for a given package."""
    download_dir = _get_download_dir()
    if not download_dir:
        logger.error("Could not get download directory for metadata retrieval.")
        return None
    metadata_filename = f"{sanitize_filename_part(name)}-{sanitize_filename_part(version)}.metadata.json"
    metadata_path = os.path.join(download_dir, metadata_filename)
    if os.path.exists(metadata_path):
        try:
            with open(metadata_path, 'r', encoding='utf-8') as f:  # Specify encoding
                return json.load(f)
        except Exception as e:
            logger.error(f"Failed to read metadata for {name}#{version}: {e}")
            return None
    return None

# --- New navigate_fhir_path ---
def navigate_fhir_path(resource, path):
    """Navigate a FHIR resource path, handling arrays, nested structures, and choice types."""
    keys = path.split('.')
    # Remove the root resource type if present (e.g., Patient.name -> name)
    if keys and resource and isinstance(resource, dict) and keys[0] == resource.get('resourceType'):
        keys = keys[1:]

    current = resource
    for i, key in enumerate(keys):
        is_last_key = (i == len(keys) - 1)
        # logger.debug(f"Navigating: key='{key}', is_last={is_last_key}, current_type={type(current)}")  # Uncomment for debug

        if current is None:
            # logger.debug(f"Navigation stopped, current became None before processing key '{key}'.")
            return None

        if isinstance(current, dict):
            # Handle direct key access
            if key in current:
                current = current.get(key)  # Use .get() for safety
            # Handle choice type e.g., value[x]
            elif '[x]' in key:
                base_key = key.replace('[x]', '')
                found_choice = False
                for k, v in current.items():
                    if k.startswith(base_key):
                        current = v
                        found_choice = True
                        break
                if not found_choice:
                    # logger.debug(f"Choice key '{key}' (base: {base_key}) not found in dict keys: {list(current.keys())}")
                    return None
            else:
                # logger.debug(f"Key '{key}' not found in dict keys: {list(current.keys())}")
                return None

        elif isinstance(current, list):
            # If it's the last key, the path refers to the list itself.
            # The validation logic needs to handle checking the list.
            if is_last_key:
                # logger.debug(f"Path ends on a list for key '{key}'. Returning list: {current}")
                return current  # Return the list itself for the validator to check

            # --- If not the last key, we need to look inside list elements ---
            # This is tricky. FHIRPath has complex list navigation.
            # For simple validation (does element X exist?), we might assume
            # we just need to find *one* item in the list that has the subsequent path.
            # Let's try finding the first match within the list.
            found_in_list = False
            results_from_list = []
            remaining_path = '.'.join(keys[i:])  # The rest of the path including current key
            # logger.debug(f"List encountered for key '{key}'. Searching elements for remaining path: '{remaining_path}'")
            for item in current:
                # Recursively navigate into the item using the *remaining* path
                sub_result = navigate_fhir_path(item, remaining_path)
                if sub_result is not None:
                    # Collect all non-None results if validating cardinality or specific values later
                    if isinstance(sub_result, list):
                        results_from_list.extend(sub_result)
                    else:
                        results_from_list.append(sub_result)
                    # For basic existence check, finding one is enough, but let's collect all
                    # found_in_list = True
                    # break  # Or collect all? Let's collect for now.

            if not results_from_list:
                # logger.debug(f"Remaining path '{remaining_path}' not found in any list items.")
                return None  # Path not found in any list element

            # What to return? The first result? All results?
            # If the final part of the path should be a single value, return first.
            # If it could be multiple (e.g., Patient.name.given returns multiple strings), return list.
            # Let's return the list of found items. The validator can check if it's non-empty.
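            # For instance, navigating {"name": [{"period": {"start": "2020"}}]} with the
            # path "name.period.start" searches each list item with the remaining path
            # "period.start" and pools the non-None hits into ["2020"].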
            # logger.debug(f"Found results in list for '{remaining_path}': {results_from_list}")
            return results_from_list  # Return list of found values/sub-structures

        else:
            # Current is not a dict or list, cannot navigate further
            # logger.debug(f"Cannot navigate further, current is not dict/list (key='{key}').")
            return None

    # logger.debug(f"Final result for path '{path}': {current}")
    return current
# --- End New navigate_fhir_path ---

def validate_resource_against_profile(package_name, version, resource, include_dependencies=True):
    """Validate a single FHIR resource against a package's StructureDefinitions."""
    logger.debug(f"Starting validation for resource: {resource.get('resourceType')}/{resource.get('id')} against {package_name}#{version}")
    try:
        # Find the resource's type
        resource_type = resource.get('resourceType')
        if not resource_type:
            return {'valid': False, 'errors': ['Resource is missing resourceType.'], 'warnings': []}

        # Get StructureDefinition
        # Ensure download dir is fetched and config potentially set
        download_dir = _get_download_dir()
        if not download_dir:
            return {'valid': False, 'errors': ['Could not determine FHIR package directory.'], 'warnings': []}

        # Construct path using helper for consistency
        tgz_filename = _construct_tgz_filename(package_name, version)
        # Use absolute path from download_dir
        tgz_path = os.path.join(download_dir, tgz_filename)
        logger.debug(f"Attempting to load SD for type '{resource_type}' from tgz: {tgz_path}")
        sd_data, sd_path_in_tar = find_and_extract_sd(tgz_path, resource_type)

        if not sd_data:
            logger.error(f"No StructureDefinition found for type '{resource_type}' in package {package_name}#{version} at {tgz_path}")
            # Try falling back to canonical package if not the one requested? Maybe not here.
            return {'valid': False, 'errors': [f"StructureDefinition for resource type '{resource_type}' not found in package {package_name}#{version}."], 'warnings': []}
        logger.debug(f"Found SD for '{resource_type}' in tar at '{sd_path_in_tar}'")

        # Prefer snapshot if available, otherwise use differential
        elements = sd_data.get('snapshot', {}).get('element', [])
        if not elements:
            elements = sd_data.get('differential', {}).get('element', [])
            logger.debug("Using differential elements for validation (snapshot missing).")
        if not elements:
            logger.error(f"StructureDefinition {sd_data.get('id', resource_type)} has no snapshot or differential elements.")
            return {'valid': False, 'errors': [f"StructureDefinition '{sd_data.get('id', resource_type)}' is invalid (no elements)."], 'warnings': []}

        must_support_paths = []
        for element in elements:
            if element.get('mustSupport', False):
                path = element.get('path', '')
                if path:
                    must_support_paths.append(path)

        errors = []
        warnings = []

        # --- Revised Required Field Validation (min >= 1) ---
        logger.debug(f"Checking required fields for {resource_type} based on SD {sd_data.get('id')}...")
        element_definitions = {e.get('path'): e for e in elements if e.get('path')}  # Cache elements by path

        for element in elements:
            path = element.get('path', '')
            min_val = element.get('min', 0)

            # Skip base element (e.g., "Patient") as it's always present if resourceType matches
            if '.' not in path:
                continue

            if min_val >= 1:
                logger.debug(f"Checking required path: {path} (min={min_val})")

                # --- START: Parent Presence Check ---
                parent_path = '.'.join(path.split('.')[:-1])
                parent_is_present_or_not_applicable = True  # Assume true unless parent is optional AND absent

                # Check only if parent_path is a valid element path (not just the root type)
                if '.' in parent_path:
                    parent_element_def = element_definitions.get(parent_path)
                    if parent_element_def:
                        parent_min_val = parent_element_def.get('min', 0)
                        # If the parent element itself is optional (min: 0)...
                        if parent_min_val == 0:
                            # ...check if the parent element actually exists in the instance data
                            parent_value = navigate_fhir_path(resource, parent_path)
                            if parent_value is None or (isinstance(parent_value, (list, str, dict)) and not parent_value):
                                # Optional parent is missing, so child cannot be required. Skip the check for this element.
                                parent_is_present_or_not_applicable = False
                                logger.debug(f"-> Requirement check for '{path}' skipped: Optional parent '{parent_path}' is absent.")
                    else:
                        # This case indicates an issue with the SD structure or path generation, but we'll be lenient
                        logger.warning(f"Could not find definition for parent path '{parent_path}' while checking requirement for '{path}'. Proceeding with check.")
                # --- END: Parent Presence Check ---

                # Only proceed with checking the element itself if its optional parent is present,
                # or if the parent is required, or if it's a top-level element.
                if parent_is_present_or_not_applicable:
                    value = navigate_fhir_path(resource, path)

                    # 1. Check for presence (is it None or an empty container?)
                    is_missing_or_empty = False
                    if value is None:
                        is_missing_or_empty = True
                        logger.debug(f"-> Path '{path}' value is None.")
                    elif isinstance(value, (list, str, dict)) and not value:
                        is_missing_or_empty = True
                        logger.debug(f"-> Path '{path}' value is an empty {type(value).__name__}.")
                    elif isinstance(value, bool) and value is False:
                        pass  # Valid presence
                    elif isinstance(value, (int, float)) and value == 0:
                        pass  # Valid presence

                    if is_missing_or_empty:
                        # Log the error only if the parent context allowed the check
                        errors.append(f"Required field '{path}' is missing or empty.")
                        logger.warning(f"Validation Error: Required field '{path}' missing or empty (Context: Parent '{parent_path}' required or present).")
                        continue  # Skip further checks for this element if missing

                    # 2. Check specific FHIR types if present (value is not None/empty)
                    # (This part of the logic remains the same as before)
                    element_types = element.get('type', [])
                    type_codes = {t.get('code') for t in element_types if t.get('code')}
                    is_codeable_concept = 'CodeableConcept' in type_codes
                    is_reference = 'Reference' in type_codes
                    is_coding = 'Coding' in type_codes

                    if is_codeable_concept and isinstance(value, dict):
                        codings = value.get('coding')
                        if not value.get('text'):
                            if not isinstance(codings, list) or not any(isinstance(c, dict) and c.get('code') and c.get('system') for c in codings):
                                errors.append(f"Required CodeableConcept '{path}' lacks text or a valid coding (must include system and code).")
                                logger.warning(f"Validation Error: Required CC '{path}' invalid structure.")
                    elif is_coding and isinstance(value, dict):
                        if not value.get('code') or not value.get('system'):
                            errors.append(f"Required Coding '{path}' lacks a system or code.")
                            logger.warning(f"Validation Error: Required Coding '{path}' invalid structure.")
                    elif is_reference and isinstance(value, dict):
                        if not value.get('reference') and not value.get('identifier'):
                            errors.append(f"Required Reference '{path}' lacks a reference or identifier.")
                            logger.warning(f"Validation Error: Required Reference '{path}' invalid structure.")

        # --- Revised Must-Support Field Validation ---
        logger.debug(f"Checking must-support fields for {resource_type}...")
        unique_must_support_paths = sorted(list(set(must_support_paths)))  # Avoid duplicate checks if in both snapshot/diff

        for path in unique_must_support_paths:
            # Skip base element
            if '.' not in path:
                continue
            logger.debug(f"Checking must-support path: {path}")
            value = navigate_fhir_path(resource, path)

            # 1. Check for presence
            is_missing_or_empty = False
            if value is None:
                is_missing_or_empty = True
                logger.debug(f"-> Path '{path}' value is None.")
            elif isinstance(value, (list, str, dict)) and not value:
                is_missing_or_empty = True
                logger.debug(f"-> Path '{path}' value is an empty {type(value).__name__}.")
            elif isinstance(value, bool) and value is False:
                pass
            elif isinstance(value, (int, float)) and value == 0:
                pass

            if is_missing_or_empty:
                warnings.append(f"Must-support field '{path}' is missing or empty.")
                logger.info(f"Validation Warning: Must-support field '{path}' missing or empty.")  # Use INFO for MS warnings
                continue

            # 2. Check specific FHIR types (similar logic to required checks)
            element_def = next((e for e in elements if e.get('path') == path), None)
            if element_def:
                element_types = element_def.get('type', [])
                type_codes = {t.get('code') for t in element_types if t.get('code')}
                is_codeable_concept = 'CodeableConcept' in type_codes
                is_reference = 'Reference' in type_codes
                is_coding = 'Coding' in type_codes

                if is_codeable_concept and isinstance(value, dict):
                    codings = value.get('coding')
                    if not value.get('text'):
                        if not isinstance(codings, list) or not any(isinstance(c, dict) and c.get('code') and c.get('system') for c in codings):
                            warnings.append(f"Must-support CodeableConcept '{path}' lacks text or a valid coding (must include system and code).")
                            logger.info(f"Validation Warning: Must-support CC '{path}' invalid structure.")
                elif is_coding and isinstance(value, dict):
                    if not value.get('code') or not value.get('system'):
                        warnings.append(f"Must-support Coding '{path}' lacks a system or code.")
                        logger.info(f"Validation Warning: Must-support Coding '{path}' invalid structure.")
                elif is_reference and isinstance(value, dict):
                    if not value.get('reference') and not value.get('identifier'):
                        warnings.append(f"Must-support Reference '{path}' lacks a reference or identifier.")
                        logger.info(f"Validation Warning: Must-support Reference '{path}' invalid structure.")

        # --- Dependency Validation ---
        if include_dependencies:
            logger.debug("Checking dependencies...")
            metadata_path = Path(download_dir) / f"{sanitize_filename_part(package_name)}-{sanitize_filename_part(version)}.metadata.json"
            if metadata_path.exists():
                try:
                    with open(metadata_path, 'r', encoding='utf-8') as f:
                        metadata = json.load(f)
                    for dep in metadata.get('imported_dependencies', []):
                        dep_name = dep.get('name')
                        dep_version = dep.get('version')
                        if not dep_name or not dep_version:
                            logger.warning(f"Skipping invalid dependency entry: {dep}")
                            continue
                        logger.debug(f"Recursively validating against dependency: {dep_name}#{dep_version}")
                        # Pass include_dependencies=False to prevent infinite loops
                        dep_result = validate_resource_against_profile(dep_name, dep_version, resource, include_dependencies=False)
                        if not dep_result['valid']:
                            errors.extend([f"(Dependency {dep_name}#{dep_version}): {e}" for e in dep_result['errors']])
                        # Carry over warnings from dependencies as well
                        warnings.extend([f"(Dependency {dep_name}#{dep_version}): {w}" for w in dep_result['warnings']])
                except Exception as e:
                    logger.error(f"Failed to load or process metadata {metadata_path} for dependencies: {e}")
                    errors.append(f"Failed to process dependency metadata for {package_name}#{version}.")
            else:
                logger.warning(f"Metadata file not found, cannot validate dependencies: {metadata_path}")

        final_valid_state = len(errors) == 0
        logger.info(f"Validation result for {resource_type}/{resource.get('id')} against {package_name}#{version}: Valid={final_valid_state}, Errors={len(errors)}, Warnings={len(warnings)}")
        return {
            'valid': final_valid_state,
            'errors': errors,
            'warnings': warnings
        }

    except FileNotFoundError:
        # Specific handling if the tgz file itself wasn't found earlier
        logger.error(f"Validation failed: Package file not found for {package_name}#{version}")
        return {'valid': False, 'errors': [f"Package file for {package_name}#{version} not found."], 'warnings': []}
    except tarfile.TarError as e:
        logger.error(f"Validation failed due to TarError for {package_name}#{version}: {e}")
        return {'valid': False, 'errors': [f"Error reading package archive for {package_name}#{version}: {e}"], 'warnings': []}
    except Exception as e:
        logger.error(f"Unexpected error during validation of {resource.get('resourceType')}/{resource.get('id')} against {package_name}#{version}: {e}", exc_info=True)
        return {'valid': False, 'errors': [f'Unexpected validation error: {str(e)}'], 'warnings': []}

def validate_bundle_against_profile(package_name, version, bundle, include_dependencies=True):
    """Validate a FHIR Bundle against a package's StructureDefinitions."""
    try:
        if not isinstance(bundle, dict) or bundle.get('resourceType') != 'Bundle':
            return {'valid': False, 'errors': ['Not a valid Bundle resource.'], 'warnings': [], 'results': {}}

        results = {}
        all_errors = []
        all_warnings = []
        bundle_valid = True

        # Validate each entry's resource
        logger.info(f"Validating Bundle/{bundle.get('id', 'N/A')} against {package_name}#{version}. Entries: {len(bundle.get('entry', []))}")
        for i, entry in enumerate(bundle.get('entry', [])):
            resource = entry.get('resource')
            entry_id = f"Entry {i}"
            resource_id_str = None
            if not resource:
                all_errors.append(f"{entry_id}: Missing 'resource' key in entry.")
                bundle_valid = False
                continue
            if not isinstance(resource, dict):
                all_errors.append(f"{entry_id}: 'resource' key does not contain a valid FHIR resource (must be a dictionary).")
                bundle_valid = False
                continue

            resource_type = resource.get('resourceType')
            resource_id = resource.get('id')
            resource_id_str = f"{resource_type}/{resource_id}" if resource_type and resource_id else resource_type or f"Unnamed Resource in {entry_id}"
            entry_id = f"Entry {i} ({resource_id_str})"  # More descriptive ID

            logger.debug(f"Validating {entry_id}...")
            result = validate_resource_against_profile(package_name, version, resource, include_dependencies)
            results[entry_id] = result  # Store result keyed by descriptive entry ID
            if not result['valid']:
                bundle_valid = False
                all_errors.extend([f"{entry_id}: {e}" for e in result['errors']])
            all_warnings.extend([f"{entry_id}: {w}" for w in result['warnings']])

        # Validate Bundle structure itself (can add more checks based on profile if needed)
        if not bundle.get('type'):
            all_errors.append("Bundle resource itself is missing the required 'type' field.")
            bundle_valid = False

        logger.info(f"Bundle validation finished. Overall Valid: {bundle_valid}, Total Errors: {len(all_errors)}, Total Warnings: {len(all_warnings)}")
        return {
            'valid': bundle_valid,
            'errors': all_errors,
            'warnings': all_warnings,
            'results': results  # Contains individual resource validation results
        }
    except Exception as e:
        logger.error(f"Unexpected error during bundle validation: {str(e)}", exc_info=True)
        return {'valid': False, 'errors': [f'Unexpected bundle validation error: {str(e)}'], 'warnings': [], 'results': {}}

def download_package(name, version):
    """Downloads a single FHIR package. Returns (save_path, error_message)"""
    download_dir = _get_download_dir()
    if not download_dir:
        return None, "Could not get/create download directory."

    package_id = f"{name}#{version}"
    package_url = f"{FHIR_REGISTRY_BASE_URL}/{name}/{version}"
    filename = _construct_tgz_filename(name, version)
    save_path = os.path.join(download_dir, filename)

    if os.path.exists(save_path):
        # Optional: Add size check or hash check for existing files?
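        # A minimal form of that check (a sketch only, left inactive here): treat a
        # zero-byte file as a failed earlier download and fall through to re-fetch it
        # instead of returning it as-is.
        #   if os.path.getsize(save_path) == 0:
        #       logger.warning(f"Existing file {filename} is empty; re-downloading.")
        #       os.remove(save_path)
        #   else:
        #       return save_path, None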
        logger.info(f"Package already exists locally: {filename}")
        return save_path, None

    logger.info(f"Downloading: {package_id} from {package_url} -> {filename}")
    try:
        # Use a session for potential keep-alive benefits
        with requests.Session() as session:
            with session.get(package_url, stream=True, timeout=90) as r:
                r.raise_for_status()  # Raises HTTPError for bad responses (4xx or 5xx)
                # Check content type? Should be application/gzip or similar
                content_type = r.headers.get('Content-Type', '').lower()
                if 'gzip' not in content_type and 'tar' not in content_type:
                    logger.warning(f"Unexpected Content-Type '{content_type}' for {package_url}")

                # Write to temp file first? Prevents partial downloads being seen as complete.
                # temp_save_path = save_path + ".part"
                with open(save_path, 'wb') as f:
                    logger.debug(f"Opened {save_path} for writing.")
                    bytes_downloaded = 0
                    for chunk in r.iter_content(chunk_size=8192):
                        # filter out keep-alive new chunks
                        if chunk:
                            f.write(chunk)
                            bytes_downloaded += len(chunk)
                    logger.debug(f"Finished writing {bytes_downloaded} bytes to {save_path}")
                # os.rename(temp_save_path, save_path)  # Move temp file to final location

        # Basic check after download
        if not os.path.exists(save_path) or os.path.getsize(save_path) == 0:
            err_msg = f"Download failed for {package_id}: Saved file is missing or empty."
            logger.error(err_msg)
            # Clean up empty file?
            try:
                os.remove(save_path)
            except OSError:
                pass
            return None, err_msg

        logger.info(f"Success: Downloaded {filename}")
        return save_path, None

    except requests.exceptions.HTTPError as e:
        # Handle specific HTTP errors like 404 Not Found
        err_msg = f"HTTP error downloading {package_id}: {e}"
        logger.error(err_msg)
        return None, err_msg
    except requests.exceptions.ConnectionError as e:
        err_msg = f"Connection error downloading {package_id}: {e}"
        logger.error(err_msg)
        return None, err_msg
    except requests.exceptions.Timeout as e:
        err_msg = f"Timeout downloading {package_id}: {e}"
        logger.error(err_msg)
        return None, err_msg
    except requests.exceptions.RequestException as e:
        err_msg = f"General download error for {package_id}: {e}"
        logger.error(err_msg)
        return None, err_msg
    except OSError as e:
        err_msg = f"File save error for {filename}: {e}"
        logger.error(err_msg)
        # Clean up partial file if it exists
        if os.path.exists(save_path):
            try:
                os.remove(save_path)
            except OSError:
                pass
        return None, err_msg
    except Exception as e:
        err_msg = f"Unexpected download error for {package_id}: {e}"
        logger.error(err_msg, exc_info=True)
        # Clean up partial file
        if os.path.exists(save_path):
            try:
                os.remove(save_path)
            except OSError:
                pass
        return None, err_msg

def extract_dependencies(tgz_path):
    """Extracts dependencies dict from package.json. Returns (dep_dict or None on error, error_message)"""
    package_json_path = "package/package.json"
    dependencies = None  # Default to None
    error_message = None
    if not tgz_path or not os.path.exists(tgz_path):
        return None, f"File not found at {tgz_path}"
    try:
        with tarfile.open(tgz_path, "r:gz") as tar:
            # Check if package.json exists before trying to extract
            try:
                package_json_member = tar.getmember(package_json_path)
            except KeyError:
                # This is common for core packages like hl7.fhir.r4.core
                logger.info(f"'{package_json_path}' not found in {os.path.basename(tgz_path)}. Assuming no dependencies.")
                return {}, None  # Return empty dict, no error

            package_json_fileobj = tar.extractfile(package_json_member)
            if package_json_fileobj:
                try:
                    # Read bytes and decode carefully
                    content_bytes = package_json_fileobj.read()
                    content_string = content_bytes.decode('utf-8-sig')
                    package_data = json.loads(content_string)
                    dependencies = package_data.get('dependencies', {})
                    if not isinstance(dependencies, dict):
                        logger.error(f"Invalid 'dependencies' format in {package_json_path} (expected dict, got {type(dependencies)}).")
                        dependencies = None
                        error_message = f"Invalid 'dependencies' format in {package_json_path}."
                except json.JSONDecodeError as e:
                    error_message = f"JSON parse error in {package_json_path}: {e}"
                    logger.error(error_message)
                    dependencies = None
                except UnicodeDecodeError as e:
                    error_message = f"Encoding error reading {package_json_path}: {e}"
                    logger.error(error_message)
                    dependencies = None
                finally:
                    package_json_fileobj.close()
            else:
                # Should not happen if getmember succeeded, but handle defensively
                error_message = f"Could not extract {package_json_path} despite being listed in tar."
                logger.error(error_message)
                dependencies = None
    except tarfile.ReadError as e:
        # Often indicates corrupted file
        error_message = f"Tar ReadError (possibly corrupted) for {os.path.basename(tgz_path)}: {e}"
        logger.error(error_message)
        dependencies = None
    except tarfile.TarError as e:
        error_message = f"TarError processing {os.path.basename(tgz_path)}: {e}"
        logger.error(error_message)
        dependencies = None
    except FileNotFoundError:
        # Should be caught by initial check, but include
        error_message = f"Package file not found during dependency extraction: {tgz_path}"
        logger.error(error_message)
        dependencies = None
    except Exception as e:
        error_message = f"Unexpected error extracting deps from {os.path.basename(tgz_path)}: {e}"
        logger.error(error_message, exc_info=True)
        dependencies = None

    return dependencies, error_message

def extract_used_types(tgz_path):
    """Extracts all resource types and referenced types from the package resources."""
    used_types = set()
    if not tgz_path or not os.path.exists(tgz_path):
        logger.error(f"Cannot extract used types: File not found at {tgz_path}")
        return used_types  # Return empty set
    try:
        with tarfile.open(tgz_path, "r:gz") as tar:
            for member in tar:
                # Process only JSON files within the 'package/' directory
                if not (member.isfile() and member.name.startswith('package/') and member.name.lower().endswith('.json')):
                    continue
                # Skip metadata files
                if os.path.basename(member.name).lower() in ['package.json', '.index.json', 'validation-summary.json', 'validation-oo.json']:
                    continue

                fileobj = None
                try:
                    fileobj = tar.extractfile(member)
                    if fileobj:
                        content_bytes = fileobj.read()
                        content_string = content_bytes.decode('utf-8-sig')
                        data = json.loads(content_string)

                        if not isinstance(data, dict):
                            continue  # Skip if not a valid JSON object
                        resource_type = data.get('resourceType')
                        if not resource_type:
                            continue  # Skip if no resourceType

                        # Add the resource type itself
                        used_types.add(resource_type)

                        # --- StructureDefinition Specific Extraction ---
                        if resource_type == 'StructureDefinition':
                            # Add the type this SD defines/constrains
                            sd_type = data.get('type')
                            if sd_type:
                                used_types.add(sd_type)
                            # Add the base definition type if it's a profile
                            base_def = data.get('baseDefinition')
                            if base_def:
                                base_type = base_def.split('/')[-1]
                                # Avoid adding primitive types like 'Element', 'Resource' etc. if not needed
                                if base_type and base_type[0].isupper():
                                    used_types.add(base_type)

                            # Extract types from elements (snapshot or differential)
                            elements = data.get('snapshot', {}).get('element', []) or data.get('differential', {}).get('element', [])
                            for element in elements:
                                if isinstance(element, dict) and 'type' in element:
                                    for t in element.get('type', []):
                                        # Add code (element type)
                                        code = t.get('code')
                                        if code and code[0].isupper():
                                            used_types.add(code)
                                        # Add targetProfile types (Reference targets)
                                        for profile_uri in t.get('targetProfile', []):
                                            if profile_uri:
                                                profile_type = profile_uri.split('/')[-1]
                                                if profile_type and profile_type[0].isupper():
                                                    used_types.add(profile_type)
                                    # Add types from contentReference
                                    content_ref = element.get('contentReference')
                                    if content_ref and content_ref.startswith('#'):
                                        # This usually points to another element path within the same SD
                                        # Trying to resolve this fully can be complex.
                                        # We might infer types based on the path referenced if needed.
                                        pass

                        # --- General Resource Type Extraction ---
                        else:
                            # Look for meta.profile for referenced profiles -> add profile type
                            profiles = data.get('meta', {}).get('profile', [])
                            for profile_uri in profiles:
                                if profile_uri:
                                    profile_type = profile_uri.split('/')[-1]
                                    if profile_type and profile_type[0].isupper():
                                        used_types.add(profile_type)

                            # ValueSet: Check compose.include.system (often points to CodeSystem)
                            if resource_type == 'ValueSet':
                                for include in data.get('compose', {}).get('include', []):
                                    system = include.get('system')
                                    # Heuristic: If it looks like a FHIR core codesystem URL, extract type
                                    if system and system.startswith('http://hl7.org/fhir/'):
                                        type_name = system.split('/')[-1]
                                        # Check if it looks like a ResourceType
                                        if type_name and type_name[0].isupper() and not type_name.startswith('sid'):  # Avoid things like sid/us-ssn
                                            used_types.add(type_name)
                                # Could add more heuristics for other terminology servers

                            # CapabilityStatement: Check rest.resource.type and rest.resource.profile
                            if resource_type == 'CapabilityStatement':
                                for rest_item in data.get('rest', []):
                                    for resource_item in rest_item.get('resource', []):
                                        res_type = resource_item.get('type')
                                        if res_type and res_type[0].isupper():
                                            used_types.add(res_type)
                                        profile_uri = resource_item.get('profile')
                                        if profile_uri:
                                            profile_type = profile_uri.split('/')[-1]
                                            if profile_type and profile_type[0].isupper():
                                                used_types.add(profile_type)

                        # --- Generic recursive search for 'reference' fields? ---
                        # This could be expensive. Let's rely on SDs for now.
                        # def find_references(obj):
                        #     if isinstance(obj, dict):
                        #         for k, v in obj.items():
                        #             if k == 'reference' and isinstance(v, str):
                        #                 ref_type = v.split('/')[0]
                        #                 if ref_type and ref_type[0].isupper(): used_types.add(ref_type)
                        #             else:
                        #                 find_references(v)
                        #     elif isinstance(obj, list):
                        #         for item in obj:
                        #             find_references(item)
                        # find_references(data)
                except json.JSONDecodeError as e:
                    logger.warning(f"Could not parse JSON in {member.name} for used types: {e}")
                except UnicodeDecodeError as e:
                    logger.warning(f"Could not decode {member.name} for used types: {e}")
                except Exception as e:
                    logger.warning(f"Could not process member {member.name} for used types: {e}")
                finally:
                    if fileobj:
                        fileobj.close()
    except tarfile.ReadError as e:
        logger.error(f"Tar ReadError extracting used types from {tgz_path}: {e}")
    except tarfile.TarError as e:
        logger.error(f"TarError extracting used types from {tgz_path}: {e}")
    except FileNotFoundError:
        logger.error(f"Package file not found for used type extraction: {tgz_path}")
    except Exception as e:
        logger.error(f"Error extracting used types from {tgz_path}: {e}", exc_info=True)

    # Filter out potential primitives or base types that aren't resources?
    # E.g., 'string', 'boolean', 'Element', 'BackboneElement', 'Resource'
    core_non_resource_types = {'string', 'boolean', 'integer', 'decimal', 'uri', 'url', 'canonical', 'base64Binary',
                               'instant', 'date', 'dateTime', 'time', 'code', 'oid', 'id', 'markdown', 'unsignedInt',
                               'positiveInt', 'xhtml', 'Element', 'BackboneElement', 'Resource', 'DomainResource', 'DataType'}
    final_used_types = {t for t in used_types if t not in core_non_resource_types and t[0].isupper()}
    logger.debug(f"Extracted used types from {os.path.basename(tgz_path)}: {final_used_types}")
    return final_used_types

def map_types_to_packages(used_types, all_dependencies):
    """Maps used types to the packages that provide them based on dependency lists."""
    type_to_package = {}
    processed_types = set()

    # Pass 1: Exact matches in dependencies
    for (pkg_name, pkg_version), deps in all_dependencies.items():
        for dep_name, dep_version in deps.items():
            # Simple heuristic: if type name is in dependency package name
            # This is weak, needs improvement. Ideally, packages declare exported types.
            for t in used_types:
                # Exact match or common pattern (e.g., USCorePatient -> us.core)
                # Need a better mapping strategy - this is very basic.
                # Example: If 'USCorePatient' is used, and 'us.core' is a dependency.
                # A more robust approach would involve loading the .index.json from dependency packages.
                # For now, let's just use a simplified direct check:
                # If a dependency name contains the type name (lowercase)
                if t not in type_to_package and t.lower() in dep_name.lower():
                    type_to_package[t] = (dep_name, dep_version)
                    processed_types.add(t)
                    logger.debug(f"Mapped type '{t}' to dependency package '{dep_name}' based on name heuristic.")

    # Pass 2: Check the package itself
    for (pkg_name, pkg_version), deps in all_dependencies.items():
        for t in used_types:
            if t not in type_to_package and t.lower() in pkg_name.lower():
                type_to_package[t] = (pkg_name, pkg_version)
                processed_types.add(t)
                logger.debug(f"Mapped type '{t}' to source package '{pkg_name}' based on name heuristic.")

    # Fallback: map remaining types to the canonical package if not already mapped
    canonical_name, canonical_version = CANONICAL_PACKAGE
    unmapped_types = used_types - processed_types
    if unmapped_types:
        logger.info(f"Using canonical package {canonical_name}#{canonical_version} as fallback for unmapped types: {unmapped_types}")
        for t in unmapped_types:
            type_to_package[t] = CANONICAL_PACKAGE

    logger.debug(f"Final type-to-package mapping: {type_to_package}")
    return type_to_package

def import_package_and_dependencies(initial_name, initial_version, dependency_mode='recursive'):
    """Orchestrates recursive download and dependency extraction based on the dependency mode."""
    logger.info(f"Starting import for {initial_name}#{initial_version} with dependency_mode={dependency_mode}")
    results = {
        'requested': (initial_name, initial_version),
        'processed': set(),       # Tuples (name, version) successfully processed (downloaded + deps extracted)
        'downloaded': {},         # Dict {(name, version): save_path} for successfully downloaded
        'all_dependencies': {},   # Dict {(name, version): {dep_name: dep_ver}} stores extracted deps for each processed pkg
        'dependencies': [],       # List of unique {"name": X, "version": Y} across all processed packages
        'errors': []              # List of error messages encountered
    }
    # Queue stores (name, version) tuples to process
    pending_queue = [(initial_name, initial_version)]
    # Lookup stores (name, version) tuples that have been added to queue or processed, prevents cycles/re-queuing
    queued_or_processed_lookup = set([(initial_name, initial_version)])
    all_found_dependencies = set()  # Store unique dep tuples {(name, version)} found

    # --- Main Processing Loop ---
    while pending_queue:
        name, version = pending_queue.pop(0)
        package_id_tuple = (name, version)

        # Already successfully processed? Skip. (Shouldn't happen with lookup check before queueing, but safety)
        if package_id_tuple in results['processed']:
            logger.debug(f"Skipping already processed package: {name}#{version}")
            continue

        logger.info(f"Processing package from queue: {name}#{version}")

        # --- Download ---
        save_path, dl_error = download_package(name, version)
        if dl_error:
            error_msg = f"Download failed for {name}#{version}: {dl_error}"
            results['errors'].append(error_msg)
            logger.error(error_msg)
            # Do not add to processed, leave in lookup to prevent re-queueing a known failure
            continue  # Move to next item in queue
        else:
            results['downloaded'][package_id_tuple] = save_path
            logger.info(f"Successfully downloaded/verified {name}#{version} at {save_path}")

        # --- Extract Dependencies ---
        dependencies, dep_error = extract_dependencies(save_path)
        if dep_error:
            # Log error but potentially continue processing other packages if deps are just missing
            error_msg = f"Dependency extraction failed for {name}#{version}: {dep_error}"
            results['errors'].append(error_msg)
            logger.error(error_msg)
            # Mark as processed even if dep extraction fails, as download succeeded
            results['processed'].add(package_id_tuple)
            # Don't queue dependencies if extraction failed
            continue
        elif dependencies is None:
            # This indicates a more severe error during extraction (e.g., corrupted tar)
            error_msg = f"Dependency extraction returned critical error for {name}#{version}. Aborting dependency processing for this package."
            results['errors'].append(error_msg)
            logger.error(error_msg)
            results['processed'].add(package_id_tuple)  # Mark processed
            continue

        # Store extracted dependencies for this package
        results['all_dependencies'][package_id_tuple] = dependencies
        results['processed'].add(package_id_tuple)  # Mark as successfully processed
        logger.debug(f"Successfully processed {name}#{version}. Dependencies found: {list(dependencies.keys())}")

        # Add unique dependencies to the overall list and potentially the queue
        current_package_deps = []
        for dep_name, dep_version in dependencies.items():
            if isinstance(dep_name, str) and isinstance(dep_version, str) and dep_name and dep_version:
                dep_tuple = (dep_name, dep_version)
                current_package_deps.append({"name": dep_name, "version": dep_version})  # For metadata
                if dep_tuple not in all_found_dependencies:
                    all_found_dependencies.add(dep_tuple)
                    results['dependencies'].append({"name": dep_name, "version": dep_version})  # Add to overall unique list

                # --- Queue Dependencies Based on Mode ---
                # Check if not already queued or processed
                if dep_tuple not in queued_or_processed_lookup:
                    should_queue = False
                    if dependency_mode == 'recursive':
                        should_queue = True
                    elif dependency_mode == 'patch-canonical' and dep_tuple == CANONICAL_PACKAGE:
                        should_queue = True
                    elif dependency_mode == 'tree-shaking':
                        # Tree shaking requires calculating used types *after* initial pkg is processed
                        # This logic needs adjustment - calculate used types only once for the root package.
                        # Let's defer full tree-shaking queuing logic for now, treat as 'none'.
                        # TODO: Implement tree-shaking queuing properly outside the loop based on initial package's used types.
pass if should_queue: logger.debug(f"Adding dependency to queue ({dependency_mode}): {dep_name}#{dep_version}") pending_queue.append(dep_tuple) queued_or_processed_lookup.add(dep_tuple) else: logger.warning(f"Skipping invalid dependency entry in {name}#{version}: name='{dep_name}', version='{dep_version}'") # --- Save Metadata (after successful download and dep extraction) --- # We need profile relationship info which comes from process_package_file # Let's call it here if needed for metadata, though it duplicates effort if called later. # Alternative: Save basic metadata first, update later? # Let's just save what we have now. Profile relations can be added by a separate process. save_package_metadata(name, version, dependency_mode, current_package_deps) # TODO: Rework metadata saving if compliesWith/imposedBy is needed during import. # --- Post-Loop Processing (e.g., for Tree Shaking) --- if dependency_mode == 'tree-shaking' and (initial_name, initial_version) in results['downloaded']: logger.info("Performing tree-shaking dependency analysis...") root_save_path = results['downloaded'][(initial_name, initial_version)] used_types = extract_used_types(root_save_path) if used_types: type_to_package = map_types_to_packages(used_types, results['all_dependencies']) logger.debug(f"Tree-shaking mapping: {type_to_package}") tree_shaken_deps_to_ensure = set(type_to_package.values()) # Ensure canonical package is included if tree-shaking mode implies it if CANONICAL_PACKAGE not in tree_shaken_deps_to_ensure: logger.debug(f"Adding canonical package {CANONICAL_PACKAGE} to tree-shaking set.") tree_shaken_deps_to_ensure.add(CANONICAL_PACKAGE) initial_package_tuple = (initial_name, initial_version) if initial_package_tuple in tree_shaken_deps_to_ensure: tree_shaken_deps_to_ensure.remove(initial_package_tuple) # Don't queue self additional_processing_needed = False for dep_tuple in tree_shaken_deps_to_ensure: if dep_tuple not in results['processed'] and dep_tuple not in queued_or_processed_lookup: logger.info(f"Queueing missing tree-shaken dependency: {dep_tuple[0]}#{dep_tuple[1]}") pending_queue.append(dep_tuple) queued_or_processed_lookup.add(dep_tuple) additional_processing_needed = True # If tree-shaking added new packages, re-run the processing loop if additional_processing_needed: logger.info("Re-running processing loop for tree-shaken dependencies...") # This recursive call structure isn't ideal, better to refactor loop. # For now, let's just run the loop again conceptually. # This requires refactoring the main loop logic to be callable. # --- TEMPORARY WORKAROUND: Just log and state limitation --- logger.warning("Tree-shaking identified additional dependencies. Manual re-run or refactoring needed to process them.") results['errors'].append("Tree-shaking identified further dependencies; re-run required for full processing.") # TODO: Refactor the while loop into a callable function to handle recursive/iterative processing. proc_count = len(results['processed']) dl_count = len(results['downloaded']) err_count = len(results['errors']) logger.info(f"Import finished for {initial_name}#{initial_version}. 
Processed: {proc_count}, Downloaded: {dl_count}, Errors: {err_count}") # Make sure unique list of deps is accurate results['dependencies'] = [ {"name": d[0], "version": d[1]} for d in all_found_dependencies] return results def process_package_file(tgz_path): """Extracts types, profile status, MS elements, examples, and profile relationships from a downloaded .tgz package.""" logger.info(f"Processing package file details: {tgz_path}") results = { 'resource_types_info': [], # List of dicts about each Resource/Profile 'must_support_elements': {}, # Dict: { 'ResourceName/ProfileId': ['path1', 'path2'] } 'examples': {}, # Dict: { 'ResourceName/ProfileId': ['example_path1'] } 'complies_with_profiles': [], # List of canonical URLs 'imposed_profiles': [], # List of canonical URLs 'errors': [] } # Use defaultdict for easier aggregation # Key: SD ID if profile, otherwise ResourceType. Value: dict with info. resource_info = defaultdict(lambda: { 'name': None, # The key (SD ID or ResourceType) 'type': None, # Base FHIR type (e.g., Patient) 'is_profile': False, 'ms_flag': False, # Does this SD define *any* MS elements? 'ms_paths': set(), # Specific MS element paths defined *in this SD* 'examples': set(), # Paths to example files linked to this type/profile 'sd_processed': False # Flag to avoid reprocessing MS flags for the same SD key }) if not tgz_path or not os.path.exists(tgz_path): results['errors'].append(f"Package file not found: {tgz_path}") logger.error(f"Package file not found during processing: {tgz_path}") return results try: with tarfile.open(tgz_path, "r:gz") as tar: members = tar.getmembers() # Get all members once logger.debug(f"Found {len(members)} members in {os.path.basename(tgz_path)}") # --- Pass 1: Process StructureDefinitions --- logger.debug("Processing StructureDefinitions...") for member in members: # Basic filtering if not member.isfile() or not member.name.startswith('package/') or not member.name.lower().endswith('.json'): continue base_filename_lower = os.path.basename(member.name).lower() if base_filename_lower in ['package.json', '.index.json', 'validation-summary.json', 'validation-oo.json']: continue fileobj = None try: fileobj = tar.extractfile(member) if not fileobj: continue content_bytes = fileobj.read() content_string = content_bytes.decode('utf-8-sig') data = json.loads(content_string) if not isinstance(data, dict) or data.get('resourceType') != 'StructureDefinition': continue # Only interested in SDs in this pass # --- Process the StructureDefinition --- profile_id = data.get('id') or data.get('name') # Use ID, fallback to name sd_type = data.get('type') # The base FHIR type (e.g., Patient) sd_base = data.get('baseDefinition') is_profile_sd = bool(sd_base) # It's a profile if it has a baseDefinition if not profile_id: logger.warning(f"StructureDefinition in {member.name} missing 'id' and 'name', skipping.") continue if not sd_type: logger.warning(f"StructureDefinition '{profile_id}' in {member.name} missing 'type', skipping.") continue entry_key = profile_id # Use the SD's ID as the key entry = resource_info[entry_key] # Only process once per entry_key if entry.get('sd_processed'): continue entry['name'] = entry_key entry['type'] = sd_type entry['is_profile'] = is_profile_sd # Extract compliesWithProfile and imposeProfile extensions complies_with = [] imposed = [] for ext in data.get('extension', []): ext_url = ext.get('url') value = ext.get('valueCanonical') if value: if ext_url == 
'http://hl7.org/fhir/StructureDefinition/structuredefinition-compliesWithProfile': complies_with.append(value) elif ext_url == 'http://hl7.org/fhir/StructureDefinition/structuredefinition-imposeProfile': imposed.append(value) # Add to overall results (unique) results['complies_with_profiles'].extend(c for c in complies_with if c not in results['complies_with_profiles']) results['imposed_profiles'].extend(i for i in imposed if i not in results['imposed_profiles']) # Find Must Support elements defined *in this specific SD* has_ms_in_this_sd = False ms_paths_in_this_sd = set() # Check differential first, then snapshot if needed? Or combine? Let's combine. elements = data.get('snapshot', {}).get('element', []) + data.get('differential', {}).get('element', []) # De-duplicate elements based on path if combining snapshot and differential (though usually only one is primary) processed_element_paths = set() unique_elements = [] for el in elements: el_path = el.get('path') if el_path and el_path not in processed_element_paths: unique_elements.append(el) processed_element_paths.add(el_path) elif not el_path: # Include elements without paths? Maybe not. pass for element in unique_elements: if isinstance(element, dict) and element.get('mustSupport') is True: element_path = element.get('path') if element_path: ms_paths_in_this_sd.add(element_path) has_ms_in_this_sd = True else: logger.warning(f"Found mustSupport=true without path in element of {entry_key} ({member.name})") if ms_paths_in_this_sd: entry['ms_paths'] = ms_paths_in_this_sd entry['ms_flag'] = True # Set flag if this SD defines MS elements logger.debug(f"Found {len(ms_paths_in_this_sd)} MS elements defined in SD {entry_key}") entry['sd_processed'] = True # Mark this SD as processed except json.JSONDecodeError as e: logger.warning(f"Could not parse JSON SD in {member.name}: {e}") except UnicodeDecodeError as e: logger.warning(f"Could not decode SD in {member.name}: {e}") except Exception as e: logger.warning(f"Could not process SD member {member.name}: {e}", exc_info=False) # Keep log cleaner finally: if fileobj: fileobj.close() # --- Pass 2: Process Examples --- logger.debug("Processing Examples...") for member in members: # Basic filtering if not member.isfile() or not member.name.startswith('package/'): # Allow non-JSON examples too continue member_name_lower = member.name.lower() base_filename_lower = os.path.basename(member_name_lower) if base_filename_lower in ['package.json', '.index.json', 'validation-summary.json', 'validation-oo.json']: continue # Heuristic for identifying examples # Check directory name or filename conventions is_example = 'example' in member.name.split('/') or 'example' in base_filename_lower.split('-') or 'example' in base_filename_lower.split('.') if not is_example: continue logger.debug(f"Processing potential example file: {member.name}") is_json = member_name_lower.endswith('.json') fileobj = None associated_key = None try: if is_json: fileobj = tar.extractfile(member) if not fileobj: continue content_bytes = fileobj.read() content_string = content_bytes.decode('utf-8-sig') data = json.loads(content_string) if not isinstance(data, dict): continue resource_type = data.get('resourceType') if not resource_type: continue # Try to associate example with a profile using meta.profile profile_meta = data.get('meta', {}).get('profile', []) found_profile_match = False if profile_meta and isinstance(profile_meta, list): for profile_url in profile_meta: # Extract profile ID from canonical URL profile_id_from_meta = 
profile_url.split('/')[-1] if profile_id_from_meta in resource_info: associated_key = profile_id_from_meta found_profile_match = True logger.debug(f"Example {member.name} associated with profile {associated_key} via meta.profile") break # Use first match # If no profile match, associate with the base resource type SD (if any) if not found_profile_match: # Find SD where type matches the example's resourceType and is_profile is False matching_base_sd_keys = [k for k, v in resource_info.items() if v.get('type') == resource_type and not v.get('is_profile') and v.get('sd_processed')] if matching_base_sd_keys: associated_key = matching_base_sd_keys[0] # Use the first matching base SD key logger.debug(f"Example {member.name} associated with base type SD {associated_key}") else: # Fallback: If no SD processed for this base type yet, use the type itself as key associated_key = resource_type logger.debug(f"Example {member.name} associated with resource type {associated_key} (no specific SD found/processed yet)") else: # For non-JSON examples, try to guess based on filename # e.g., patient-example.xml -> Patient # e.g., us-core-patient-example.xml -> us-core-patient (if profile exists) guessed_profile_id = None if '-' in base_filename_lower: # Try matching parts against known profile IDs parts = base_filename_lower.split('-') potential_id = parts[0] if potential_id in resource_info: guessed_profile_id = potential_id else: # Try combining parts? e.g., us-core if len(parts) > 1: potential_id_2 = f"{parts[0]}-{parts[1]}" if potential_id_2 in resource_info: guessed_profile_id = potential_id_2 if guessed_profile_id: associated_key = guessed_profile_id logger.debug(f"Non-JSON Example {member.name} associated with profile {associated_key} via filename heuristic") else: # Fallback to guessing base type guessed_type = base_filename_lower.split('-')[0].split('.')[0].capitalize() matching_base_sd_keys = [k for k, v in resource_info.items() if v.get('type') == guessed_type and not v.get('is_profile') and v.get('sd_processed')] if matching_base_sd_keys: associated_key = matching_base_sd_keys[0] logger.debug(f"Non-JSON Example {member.name} associated with base type SD {associated_key} via filename heuristic") elif guessed_type: associated_key = guessed_type logger.debug(f"Non-JSON Example {member.name} associated with resource type {associated_key} via filename heuristic (no specific SD found/processed yet)") # Add example path to the associated resource/profile info if associated_key: # Ensure the entry exists even if no SD was processed (for base types) if associated_key not in resource_info: resource_info[associated_key]['name'] = associated_key # Try to infer type if possible (might be None) resource_info[associated_key]['type'] = data.get('resourceType') if is_json else associated_key resource_info[associated_key]['examples'].add(member.name) else: logger.warning(f"Could not associate example {member.name} with any known resource or profile.") except json.JSONDecodeError as e: logger.warning(f"Could not parse JSON example in {member.name}: {e}") except UnicodeDecodeError as e: logger.warning(f"Could not decode example in {member.name}: {e}") except Exception as e: logger.warning(f"Could not process example member {member.name}: {e}", exc_info=False) finally: if fileobj: fileobj.close() # --- Final Formatting --- final_list = [] final_ms_elements = {} final_examples = {} logger.debug(f"Finalizing results from resource_info keys: {list(resource_info.keys())}") # Make sure all base resource types mentioned 
(even without explicit SDs) are included all_types_mentioned = set(v['type'] for v in resource_info.values() if v.get('type')) for type_name in all_types_mentioned: if type_name not in resource_info: # Add a basic entry if a type was mentioned (e.g., by an example) but had no SD if type_name and type_name[0].isupper(): # Basic check it looks like a resource type logger.debug(f"Adding basic entry for resource type '{type_name}' mentioned but without processed SD.") resource_info[type_name]['name'] = type_name resource_info[type_name]['type'] = type_name resource_info[type_name]['is_profile'] = False for key, info in resource_info.items(): display_name = info.get('name') or key base_type = info.get('type') # Skip if essential info is missing (shouldn't happen with defaultdict + population) if not display_name or not base_type: logger.warning(f"Skipping formatting for incomplete key: {key} - Info: {info}") continue logger.debug(f"Formatting item '{display_name}': type='{base_type}', profile='{info.get('is_profile', False)}', ms_flag='{info.get('ms_flag', False)}'") final_list.append({ 'name': display_name, # This is the SD ID or ResourceType 'type': base_type, # The base FHIR resource type 'is_profile': info.get('is_profile', False), 'must_support': info.get('ms_flag', False) # Does this SD *define* MS elements? }) if info['ms_paths']: final_ms_elements[display_name] = sorted(list(info['ms_paths'])) if info['examples']: final_examples[display_name] = sorted(list(info['examples'])) # Sort profiles after base types, then alphabetically results['resource_types_info'] = sorted(final_list, key=lambda x: (x.get('is_profile', False), x.get('name', ''))) results['must_support_elements'] = final_ms_elements results['examples'] = final_examples # Ensure relationship lists are unique (done during addition now) # results['complies_with_profiles'] = sorted(list(set(results['complies_with_profiles']))) # results['imposed_profiles'] = sorted(list(set(results['imposed_profiles']))) except tarfile.ReadError as e: err_msg = f"Tar ReadError processing package file {tgz_path}: {e}" logger.error(err_msg) results['errors'].append(err_msg) except tarfile.TarError as e: err_msg = f"TarError processing package file {tgz_path}: {e}" logger.error(err_msg) results['errors'].append(err_msg) except FileNotFoundError: err_msg = f"Package file not found during processing: {tgz_path}" logger.error(err_msg) results['errors'].append(err_msg) except Exception as e: err_msg = f"Unexpected error processing package file {tgz_path}: {e}" logger.error(err_msg, exc_info=True) results['errors'].append(err_msg) # Logging counts final_types_count = len(results['resource_types_info']) ms_defining_count = sum(1 for r in results['resource_types_info'] if r['must_support']) # Count SDs defining MS total_ms_paths = sum(len(v) for v in results['must_support_elements'].values()) total_examples = sum(len(v) for v in results['examples'].values()) logger.info(f"Package processing finished for {os.path.basename(tgz_path)}: " f"{final_types_count} Resources/Profiles identified; " f"{ms_defining_count} define MS elements ({total_ms_paths} total MS paths); " f"{total_examples} examples found. 
" f"CompliesWith: {len(results['complies_with_profiles'])}, Imposed: {len(results['imposed_profiles'])}") return results # --- Example Usage (if running script directly) --- if __name__ == '__main__': # Configure logger for direct script execution logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s') logger.info("Running services.py directly for testing.") # Mock Flask app context minimally for config/instance path class MockFlaskConfig(dict): pass class MockFlaskCurrentApp: config = MockFlaskConfig() # Calculate instance path relative to this file instance_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'instance')) # Need to manually set current_app for testing outside Flask request context # This is tricky. Let's bypass current_app dependency in _get_download_dir for direct testing. # OR, provide a mock. Best approach is to structure code to reduce Flask dependency in core logic. # For testing, let's override _get_download_dir or manually create the dir test_download_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'instance', DOWNLOAD_DIR_NAME)) os.makedirs(test_download_dir, exist_ok=True) logger.info(f"Using test download directory: {test_download_dir}") # Override the helper function for testing context original_get_download_dir = _get_download_dir def mock_get_download_dir(): # In test, don't rely on current_app if possible # Ensure config exists if needed by validation code if not hasattr(mock_get_download_dir, 'config'): mock_get_download_dir.config = {'FHIR_PACKAGES_DIR': test_download_dir} return test_download_dir _get_download_dir = mock_get_download_dir # Add the FHIR_PACKAGES_DIR to the mock config directly _get_download_dir.config = {'FHIR_PACKAGES_DIR': test_download_dir} # --- Test Case 1: Import AU Core Patient Package --- pkg_name = "hl7.fhir.au.core" pkg_version = "1.0.1" # Use a specific version known to exist logger.info(f"\n--- Testing Import: {pkg_name}#{pkg_version} ---") import_results = import_package_and_dependencies(pkg_name, pkg_version, dependency_mode='recursive') # print("Import Results:", json.dumps(import_results, default=lambda o: '', indent=2)) if not import_results['errors'] and (pkg_name, pkg_version) in import_results['downloaded']: logger.info(f"Import successful for {pkg_name}#{pkg_version}") # --- Test Case 2: Validate Patient Resource --- logger.info(f"\n--- Testing Validation: Patient Example ---") patient_resource = { "resourceType": "Patient", "id": "banks-mia-leanne", "meta": { "profile": ["http://hl7.org.au/fhir/core/StructureDefinition/au-core-patient"] }, "identifier": [{ "type": {"coding": [{"system": "http://terminology.hl7.org/CodeSystem/v2-0203", "code": "NI"}], "text": "IHI"}, "system": "http://ns.electronichealth.net.au/id/hi/ihi/1.0", "value": "8003608333647261" }], "name": [{"use": "usual", "family": "Banks", "given": ["Mia", "Leanne"]}], "telecom": [{"system": "phone", "value": "0491574632", "use": "mobile"}], "gender": "female", "birthDate": "1983-08-25", "address": [{"line": ["50 Sebastien St"], "city": "Minjary", "state": "NSW", "postalCode": "2720", "country": "AU"}] # Missing communication on purpose to test warnings/errors if required by profile } validation_result = validate_resource_against_profile(pkg_name, pkg_version, patient_resource) print("\nPatient Validation Result:") print(json.dumps(validation_result, indent=2)) # --- Test Case 3: Validate Allergy Resource --- logger.info(f"\n--- Testing Validation: Allergy Example ---") 

        # --- Test Case 2: Validate Patient Resource ---
        logger.info(f"\n--- Testing Validation: Patient Example ---")
        patient_resource = {
            "resourceType": "Patient",
            "id": "banks-mia-leanne",
            "meta": {"profile": ["http://hl7.org.au/fhir/core/StructureDefinition/au-core-patient"]},
            "identifier": [{
                "type": {"coding": [{"system": "http://terminology.hl7.org/CodeSystem/v2-0203", "code": "NI"}], "text": "IHI"},
                "system": "http://ns.electronichealth.net.au/id/hi/ihi/1.0",
                "value": "8003608333647261"
            }],
            "name": [{"use": "usual", "family": "Banks", "given": ["Mia", "Leanne"]}],
            "telecom": [{"system": "phone", "value": "0491574632", "use": "mobile"}],
            "gender": "female",
            "birthDate": "1983-08-25",
            "address": [{"line": ["50 Sebastien St"], "city": "Minjary", "state": "NSW", "postalCode": "2720", "country": "AU"}]
            # Missing communication on purpose to test warnings/errors if required by profile
        }
        validation_result = validate_resource_against_profile(pkg_name, pkg_version, patient_resource)
        print("\nPatient Validation Result:")
        print(json.dumps(validation_result, indent=2))

        # --- Test Case 3: Validate Allergy Resource ---
        logger.info(f"\n--- Testing Validation: Allergy Example ---")
        allergy_resource = {
            "resourceType": "AllergyIntolerance",
            "id": "lactose",
            "meta": {"profile": ["http://hl7.org.au/fhir/core/StructureDefinition/au-core-allergyintolerance"]},
            "clinicalStatus": {"coding": [{"system": "http://terminology.hl7.org/CodeSystem/allergyintolerance-clinical", "code": "active"}]},
            "verificationStatus": {"coding": [{"system": "http://terminology.hl7.org/CodeSystem/allergyintolerance-verification", "code": "confirmed"}]},
            "code": {"coding": [{"system": "http://snomed.info/sct", "code": "782415009", "display": "Intolerance to lactose"}]},
            "patient": {"reference": "Patient/banks-mia-leanne"},
            "onsetDateTime": "2022", # Example of choice type
            "reaction": [{
                "manifestation": [{"coding": [{"system": "http://snomed.info/sct", "code": "21522001", "display": "Abdominal pain"}]}],
                "severity": "mild"
            }]
        }
        validation_result_allergy = validate_resource_against_profile(pkg_name, pkg_version, allergy_resource)
        print("\nAllergy Validation Result:")
        print(json.dumps(validation_result_allergy, indent=2))
    else:
        logger.error(f"Import failed for {pkg_name}#{pkg_version}, cannot proceed with validation tests.")
        print("Import Errors:", import_results['errors'])

    # Restore original function if necessary
    _get_download_dir = original_get_download_dir
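
# The manual save/override/restore of _get_download_dir in the block above works, but it is
# easy to forget the restore step. As an alternative sketch (not used by this script), the
# standard library's unittest.mock.patch scopes the override to a with-block. The helper
# below is illustrative only and assumes import_package_and_dependencies keeps the same
# signature used above.
def _example_import_with_patched_download_dir(test_dir, pkg_name="hl7.fhir.au.core", pkg_version="1.0.1"):
    """Illustrative only: run an import with _get_download_dir patched to return test_dir."""
    from unittest.mock import patch
    with patch(f"{__name__}._get_download_dir", return_value=test_dir):
        return import_package_and_dependencies(pkg_name, pkg_version, dependency_mode='recursive')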