# app/modules/fhir_ig_importer/services.py

import requests
import os
import tarfile
import json
import re
import logging

from flask import current_app

# Constants
FHIR_REGISTRY_BASE_URL = "https://packages.fhir.org"
DOWNLOAD_DIR_NAME = "fhir_packages"


# --- Helper Functions ---

def _get_download_dir():
    """Gets the absolute path to the download directory, creating it if needed."""
    logger = logging.getLogger(__name__)
    try:
        instance_path = current_app.instance_path
        # logger.debug(f"Using instance path from current_app: {instance_path}")  # Can be noisy
    except RuntimeError:
        logger.warning("No app context for instance_path, constructing relative path.")
        instance_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..', '..', 'instance'))
        logger.debug(f"Constructed instance path: {instance_path}")
    download_dir = os.path.join(instance_path, DOWNLOAD_DIR_NAME)
    try:
        os.makedirs(download_dir, exist_ok=True)
        return download_dir
    except OSError as e:
        logger.error(f"Fatal Error: Could not create dir {download_dir}: {e}", exc_info=True)
        return None

def sanitize_filename_part(text):
    """Basic sanitization for creating filenames."""
    # Replace invalid characters with underscores; keep '.', '-'
    safe_text = "".join(c if c.isalnum() or c in ['.', '-'] else '_' for c in text)
    # Collapse runs of underscores into a single one
    safe_text = re.sub(r'_+', '_', safe_text)
    # Remove leading/trailing underscores, hyphens, and periods
    safe_text = safe_text.strip('_-.')
    return safe_text if safe_text else "invalid_name"  # Ensure the result is never empty

def _construct_tgz_filename(name, version):
    """Constructs the standard .tgz filename for a package."""
    return f"{sanitize_filename_part(name)}-{sanitize_filename_part(version)}.tgz"
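
# For example, _construct_tgz_filename("hl7.fhir.us.core", "6.1.0") returns
# "hl7.fhir.us.core-6.1.0.tgz" (the package coordinates shown are illustrative).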


# --- Helper to Find/Extract SD ---
def _find_and_extract_sd(tgz_path, resource_type_to_find):
    """Finds and extracts the StructureDefinition JSON for a type from a given tgz path."""
    sd_data = None
    found_path = None
    logger = current_app.logger if current_app else logging.getLogger(__name__)  # Use the app logger if possible
    try:
        with tarfile.open(tgz_path, "r:gz") as tar:
            logger.debug(f"Searching for SD type '{resource_type_to_find}' in {tgz_path}")
            # Prioritize canonical paths like 'package/StructureDefinition-[Type].json'
            potential_paths = [
                f'package/StructureDefinition-{resource_type_to_find.lower()}.json',
                f'package/StructureDefinition-{resource_type_to_find}.json'
            ]
            member_found = None
            for potential_path in potential_paths:
                try:
                    member_found = tar.getmember(potential_path)
                    if member_found:
                        break
                except KeyError:
                    pass

            # If the canonical paths failed, iterate over all JSON members
            if not member_found:
                for member in tar:
                    if member.isfile() and member.name.startswith('package/') and member.name.lower().endswith('.json'):
                        filename_lower = os.path.basename(member.name).lower()
                        if filename_lower in ['package.json', '.index.json', 'validation-summary.json', 'validation-oo.json']:
                            continue
                        sd_fileobj = None
                        try:
                            sd_fileobj = tar.extractfile(member)
                            if sd_fileobj:
                                content_string = sd_fileobj.read().decode('utf-8-sig')
                                data = json.loads(content_string)
                                if isinstance(data, dict) and data.get('resourceType') == 'StructureDefinition' and data.get('type') == resource_type_to_find:
                                    member_found = member
                                    break
                        except Exception:
                            pass  # Skip unreadable or unparseable members and keep scanning
                        finally:
                            if sd_fileobj:
                                sd_fileobj.close()

            if member_found:
                sd_fileobj = None
                try:
                    sd_fileobj = tar.extractfile(member_found)
                    if sd_fileobj:
                        content_string = sd_fileobj.read().decode('utf-8-sig')
                        sd_data = json.loads(content_string)
                        found_path = member_found.name
                        logger.info(f"Found matching SD at path: {found_path}")
                except Exception as e:
                    logger.warning(f"Could not read/parse member {member_found.name} after finding it: {e}")
                    sd_data = None
                    found_path = None
                finally:
                    if sd_fileobj:
                        sd_fileobj.close()

    except tarfile.TarError as e:
        logger.error(f"TarError reading {tgz_path}: {e}")
        raise
    except FileNotFoundError:
        logger.error(f"FileNotFoundError reading {tgz_path}")
        raise
    except Exception as e:
        logger.error(f"Unexpected error in _find_and_extract_sd for {tgz_path}: {e}", exc_info=True)
        raise
    return sd_data, found_path
# --- End Helper ---
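
# Illustrative use of the helper above (a sketch only; this module does not
# call it directly, and the package coordinates and resource type are example values):
#
#   tgz = os.path.join(_get_download_dir(), _construct_tgz_filename("hl7.fhir.us.core", "6.1.0"))
#   sd_data, sd_path = _find_and_extract_sd(tgz, "Patient")
#   if sd_data:
#       print(f"Found {sd_data.get('name')} at {sd_path}")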


# --- Core Service Functions ---

def download_package(name, version):
    """Downloads a single FHIR package. Returns (save_path, error_message)."""
    logger = logging.getLogger(__name__)
    download_dir = _get_download_dir()
    if not download_dir:
        return None, "Could not get/create download directory."

    package_id = f"{name}#{version}"
    package_url = f"{FHIR_REGISTRY_BASE_URL}/{name}/{version}"
    filename = _construct_tgz_filename(name, version)
    save_path = os.path.join(download_dir, filename)

    if os.path.exists(save_path):
        logger.info(f"Package already exists: {filename}")
        return save_path, None

    logger.info(f"Downloading: {package_id} -> {filename}")
    try:
        with requests.get(package_url, stream=True, timeout=90) as r:
            r.raise_for_status()
            with open(save_path, 'wb') as f:
                for chunk in r.iter_content(chunk_size=8192):
                    f.write(chunk)
            logger.info(f"Success: Downloaded {filename}")
            return save_path, None
    except requests.exceptions.RequestException as e:
        err_msg = f"Download error for {package_id}: {e}"
        logger.error(err_msg)
        return None, err_msg
    except OSError as e:
        err_msg = f"File save error for {filename}: {e}"
        logger.error(err_msg)
        return None, err_msg
    except Exception as e:
        err_msg = f"Unexpected download error for {package_id}: {e}"
        logger.error(err_msg, exc_info=True)
        return None, err_msg
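
# For example, download_package("hl7.fhir.au.base", "4.1.0") fetches
# https://packages.fhir.org/hl7.fhir.au.base/4.1.0 and saves it as
# hl7.fhir.au.base-4.1.0.tgz in the instance download directory
# (the package coordinates shown are illustrative).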


def extract_dependencies(tgz_path):
    """Extracts the dependencies dict from package.json. Returns (dep_dict, error_message)."""
    logger = logging.getLogger(__name__)
    package_json_path = "package/package.json"
    dependencies = {}
    error_message = None
    if not tgz_path or not os.path.exists(tgz_path):
        return None, f"File not found at {tgz_path}"
    try:
        with tarfile.open(tgz_path, "r:gz") as tar:
            package_json_member = tar.getmember(package_json_path)  # Raises KeyError if not found
            package_json_fileobj = tar.extractfile(package_json_member)
            if package_json_fileobj:
                try:
                    package_data = json.loads(package_json_fileobj.read().decode('utf-8-sig'))
                    dependencies = package_data.get('dependencies', {})
                finally:
                    package_json_fileobj.close()
            else:
                raise FileNotFoundError(f"Could not extract {package_json_path}")
    except KeyError:
        # A missing package.json means no declared dependencies; not fatal.
        error_message = f"'{package_json_path}' not found in {os.path.basename(tgz_path)}."
        logger.warning(error_message)
    except (json.JSONDecodeError, UnicodeDecodeError) as e:
        error_message = f"Parse error in {package_json_path} from {os.path.basename(tgz_path)}: {e}"
        logger.error(error_message)
        dependencies = None  # Parsing failed
    except (tarfile.TarError, FileNotFoundError) as e:
        error_message = f"Archive error for {os.path.basename(tgz_path)}: {e}"
        logger.error(error_message)
        dependencies = None  # Archive read failed
    except Exception as e:
        error_message = f"Unexpected error extracting dependencies: {e}"
        logger.error(error_message, exc_info=True)
        dependencies = None
    return dependencies, error_message
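
# Shape of the package.json fragment this function reads (an illustrative
# example following the FHIR NPM package convention, not a specific package):
#
#   {
#     "name": "example.fhir.ig",
#     "version": "0.1.0",
#     "dependencies": { "hl7.fhir.r4.core": "4.0.1" }
#   }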


# --- Recursive Import Orchestrator ---
def import_package_and_dependencies(initial_name, initial_version):
    """Orchestrates recursive download and dependency extraction."""
    logger = logging.getLogger(__name__)
    logger.info(f"Starting recursive import for {initial_name}#{initial_version}")
    results = {
        'requested': (initial_name, initial_version),
        'processed': set(),
        'downloaded': {},
        'all_dependencies': {},
        'errors': []
    }
    pending_queue = [(initial_name, initial_version)]
    processed_lookup = set()

    while pending_queue:
        name, version = pending_queue.pop(0)
        package_id_tuple = (name, version)

        if package_id_tuple in processed_lookup:
            continue

        logger.info(f"Processing: {name}#{version}")
        processed_lookup.add(package_id_tuple)

        # 1. Download
        save_path, dl_error = download_package(name, version)

        if dl_error:
            results['errors'].append(f"Download failed for {name}#{version}: {dl_error}")
            if package_id_tuple == results['requested']:
                logger.error("Aborting import: Initial package download failed.")
                break  # Stop processing the queue if the initial download fails
            continue  # Skip dependency extraction for this failed download

        # Download succeeded (or the file already existed)
        results['downloaded'][package_id_tuple] = save_path

        # 2. Extract dependencies from the downloaded file
        dependencies, dep_error = extract_dependencies(save_path)

        if dep_error:
            # Recorded as downloaded, but not as processed; queue processing continues.
            results['errors'].append(f"Dependency extraction failed for {name}#{version}: {dep_error}")
        elif dependencies is not None:  # An empty dict still counts as a successful extraction
            results['all_dependencies'][package_id_tuple] = dependencies
            results['processed'].add(package_id_tuple)  # Downloaded and dependencies extracted
            logger.debug(f"Dependencies for {name}#{version}: {list(dependencies.keys())}")
            # Queue any new dependencies
            for dep_name, dep_version in dependencies.items():
                # Basic validation of the dependency entry format
                if isinstance(dep_name, str) and isinstance(dep_version, str) and dep_name and dep_version:
                    dep_tuple = (dep_name, dep_version)
                    # Skip entries already processed or already queued
                    if dep_tuple not in processed_lookup and dep_tuple not in pending_queue:
                        pending_queue.append(dep_tuple)
                        logger.debug(f"Added to queue: {dep_name}#{dep_version}")
                else:
                    logger.warning(f"Skipping invalid dependency entry '{dep_name}': '{dep_version}' in {name}#{version}")

    # Final summary log
    proc_count = len(results['processed'])
    dl_count = len(results['downloaded'])
    err_count = len(results['errors'])
    logger.info(f"Import finished. Processed: {proc_count}, Downloaded/Verified: {dl_count}, Errors: {err_count}")
    return results
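
# Sketch of how a caller might consume the results dict (illustrative only;
# the package coordinates are example values):
#
#   results = import_package_and_dependencies("hl7.fhir.us.core", "6.1.0")
#   for (pkg_name, pkg_version), path in results['downloaded'].items():
#       print(f"{pkg_name}#{pkg_version} -> {path}")
#   if results['errors']:
#       print(f"Completed with {len(results['errors'])} error(s)")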


# --- Package File Content Processor ---
def process_package_file(tgz_path):
    """Extracts resource types, MustSupport elements, and examples from a downloaded .tgz package (single pass)."""
    logger = logging.getLogger(__name__)
    logger.info(f"Processing package file details: {tgz_path}")
    results = {'resource_types_info': [], 'must_support_elements': {}, 'examples': {}, 'errors': []}
    # Temp dict: {'Type': {'ms_flag': False, 'ms_paths': [], 'examples': [], 'sd_processed': False}}
    resource_info = {}

    if not os.path.exists(tgz_path):
        results['errors'].append(f"Package file not found: {tgz_path}")
        return results

    try:
        with tarfile.open(tgz_path, "r:gz") as tar:
            for member in tar:
                if not member.isfile():
                    continue
                member_name_lower = member.name.lower()
                base_filename_lower = os.path.basename(member_name_lower)
                fileobj = None

                # JSON files inside package/ (excluding known non-resource files)
                if member.name.startswith('package/') and member_name_lower.endswith('.json') and \
                        base_filename_lower not in ['package.json', '.index.json', 'validation-summary.json']:
                    try:
                        fileobj = tar.extractfile(member)
                        if not fileobj:
                            continue
                        content_string = fileobj.read().decode('utf-8-sig')
                        data = json.loads(content_string)

                        if isinstance(data, dict) and 'resourceType' in data:
                            resource_type = data['resourceType']
                            # Ensure an entry exists for this type
                            type_entry = resource_info.setdefault(resource_type, {'ms_flag': False, 'ms_paths': [], 'examples': [], 'sd_processed': False})

                            # If it's an example file, record it
                            if member.name.startswith('package/example/'):
                                type_entry['examples'].append(member.name)
                                logger.debug(f"Found example for {resource_type}: {member.name}")

                            # If it's a StructureDefinition, collect MustSupport flags (only once per type)
                            if resource_type == 'StructureDefinition':
                                sd_type = data.get('type')
                                if sd_type and sd_type in resource_info and not resource_info[sd_type]['sd_processed']:
                                    ms_paths_for_type = []
                                    has_ms = False
                                    # Scan both snapshot and differential element lists
                                    for element_list in [data.get('snapshot', {}).get('element', []), data.get('differential', {}).get('element', [])]:
                                        for element in element_list:
                                            if isinstance(element, dict) and element.get('mustSupport') is True:
                                                element_path = element.get('path')
                                                if element_path:
                                                    ms_paths_for_type.append(element_path)
                                                    has_ms = True
                                    if ms_paths_for_type:
                                        resource_info[sd_type]['ms_paths'] = sorted(set(ms_paths_for_type))  # Unique, sorted paths
                                    if has_ms:
                                        resource_info[sd_type]['ms_flag'] = True
                                    resource_info[sd_type]['sd_processed'] = True  # Mark this type's SD as processed
                                    logger.debug(f"Processed SD for {sd_type}, MS found: {has_ms}")
                                elif sd_type and sd_type not in resource_info:
                                    logger.warning(f"SD {member.name} defines type '{sd_type}' which wasn't found as a resource type key.")
                    except Exception as e:
                        logger.warning(f"Could not read/parse member {member.name}: {e}")
                    finally:
                        if fileobj:
                            fileobj.close()

                # Also record XML and HTML examples
                elif (member.name.startswith('package/example/') or ('example' in base_filename_lower and member.name.startswith('package/'))) \
                        and (member_name_lower.endswith('.xml') or member_name_lower.endswith('.html')):
                    # Guess the resource type from the filename (an imperfect heuristic)
                    guessed_type = base_filename_lower.split('-', 1)[0].capitalize()
                    if guessed_type in resource_info:  # Only add if the type is already known
                        resource_info[guessed_type]['examples'].append(member.name)
                        logger.debug(f"Found non-JSON example for {guessed_type}: {member.name}")

    except Exception as e:
        err_msg = f"Error processing package file {tgz_path}: {e}"
        logger.error(err_msg, exc_info=True)
        results['errors'].append(err_msg)

    # Format final results
    results['resource_types_info'] = sorted(
        [{'name': rt, 'must_support': info['ms_flag']} for rt, info in resource_info.items()],
        key=lambda x: x['name']
    )
    results['must_support_elements'] = {rt: info['ms_paths'] for rt, info in resource_info.items() if info['ms_paths']}
    results['examples'] = {rt: sorted(info['examples']) for rt, info in resource_info.items() if info['examples']}

    # Log final counts
    final_types_count = len(results['resource_types_info'])
    ms_count = sum(1 for r in results['resource_types_info'] if r['must_support'])
    total_ms_paths = sum(len(v) for v in results['must_support_elements'].values())
    total_examples = sum(len(v) for v in results['examples'].values())
    logger.info(f"Extracted {final_types_count} types ({ms_count} with MS; {total_ms_paths} MS paths; {total_examples} examples) from {tgz_path}")

    return results
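
# Illustrative shape of the dict returned above (values are example data only):
#
#   {
#     'resource_types_info': [{'name': 'Patient', 'must_support': True}, ...],
#     'must_support_elements': {'Patient': ['Patient.identifier', ...]},
#     'examples': {'Patient': ['package/example/Patient-example.json', ...]},
#     'errors': []
#   }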


# --- Removed old/unused functions ---
# def _fetch_package_metadata(package_name, package_version): ... (REMOVED)
# def resolve_all_dependencies(initial_package_name, initial_package_version): ... (REMOVED)
# def process_ig_import(package_name, package_version): ... (OLD orchestrator - REMOVED)