adhd200-cat12.8.1/code/xml2csv.py

#!/usr/bin/env python3
# call with xml2csv <basename> <path/to/subject>
#
# writes header only for new files

import sys
import shutil
from xml.etree.ElementTree import parse as xmlparse
import csv
from pathlib import Path
from tempfile import (
    TemporaryDirectory,
    TemporaryFile,
)
import re


# Leading columns of every CSV written by this script.  The order of the
# entries below is the order of the columns in the output, so re-ordering
# here re-orders the CSV.
csv_fieldnames = [
    # record identification
    'SubjectID',
    'filepath',
    # values read from the report's <qualityratings> section
    'res_ECR',
    'NCR',      # noise to contrast ratio
    'ICR',      # inhomogeneity to contrast ratio
    'IQR',      # image quality rating
    # values read from the report's <subjectmeasures> section
    'GM',       # total gray matter volume
    'WM',       # total white matter volume
    'CSF',      # total cerebrospinal fluid volume
    'TIV',      # total intracranial volume (GM+WM+CSF)
    'TSA',      # total surface area
    # surface Euler number: raw measure and its rating
    'EulerNr',  # SurfaceEulerNumber found below <qualitymeasures>
    'EulerR',   # SurfaceEulerNumber found below <qualityratings>
    'SIQR',     # read from qualityratings/SIQR
]

# Abbreviation that is not exported as a column above:
# WMH: total white matter hyperintensities volume

def val2out(str_float):
    """Render a value as a fixed-point string with four decimal places.

    ``str_float`` may be a number or any string that ``float()`` accepts;
    it is converted to ``float`` first so that all output values share the
    same formatting.
    """
    number = float(str_float)
    return format(number, '.4f')

def get_basic_catlog(filepath, sub, outfilebase):
    """Extract the QC measures of one CAT report and append them to a CSV.

    Parameters
    ----------
    filepath : pathlib.Path
        CAT report XML file to read.
    sub : str
        Subject identifier, stored in the ``SubjectID`` column.
    outfilebase : str
        Output base name.  The record is appended to
        ``<outfilebase>_QC.csv``; the header row is written only when that
        file does not exist yet.

    Returns
    -------
    dict
        The record that was written, keyed by CSV column name.  The tissue
        columns ``CSF``/``GM``/``WM`` are only present when their volume is
        greater than zero.

    Raises
    ------
    AttributeError
        If one of the required elements is missing from the report.
    """
    # The raw report is patched before parsing: every `item...` is rewritten
    # to `item>...` (presumably to repair truncated tags that would otherwise
    # break the XML parser -- confirm against a sample report).  This is a
    # literal replacement, so no regular expression is needed.
    patched = filepath.read_bytes().replace(b'item...', b'item>...')
    with TemporaryFile() as tf:
        tf.write(patched)
        tf.seek(0)
        catreport = xmlparse(tf).getroot()

    # CSV column -> location of the value inside the report
    scalar_sources = (
        ('res_ECR', 'qualityratings/res_ECR'),
        ('NCR', 'qualityratings/NCR'),
        ('ICR', 'qualityratings/ICR'),
        ('IQR', 'qualityratings/IQR'),
        ('EulerNr', 'qualitymeasures//SurfaceEulerNumber'),
        ('EulerR', 'qualityratings//SurfaceEulerNumber'),
        ('SIQR', 'qualityratings/SIQR'),
        ('TIV', 'subjectmeasures/vol_TIV'),
        ('TSA', 'subjectmeasures/surf_TSA'),
    )

    # build CSV record
    catlog = {
        'SubjectID': sub,
        'filepath': filepath,
    }
    for column, xml_path in scalar_sources:
        catlog[column] = val2out(catreport.find(xml_path).text)

    # get tissue volumes; vol_abs_CGW holds whitespace-separated values that
    # are assigned to CSF, GM and WM in that order.  Non-positive volumes are
    # left out of the record (the CSV cell stays empty).
    abs_volumes = catreport.find('subjectmeasures/vol_abs_CGW').text.strip('[]')
    for tissue, volume in zip(('CSF', 'GM', 'WM'), abs_volumes.split()):
        if float(volume) > 0:
            catlog[tissue] = val2out(volume)

    # write QC only file
    destfile = Path('{}_QC.csv'.format(outfilebase))
    need_header = not destfile.is_file()
    # newline='' is required by the csv module so that it controls the line
    # endings itself (otherwise rows end in \r\r\n on Windows)
    with destfile.open('a', newline='') as catlog_data:
        writer = csv.DictWriter(catlog_data, fieldnames=csv_fieldnames)
        # if there was no CSV, write the header
        if need_header:
            writer.writeheader()
        writer.writerow(catlog)
    return catlog


def xml2csv(infile, outfilebase, catlog_templ, data_tag,
            additional_extractor=None):
    """Append one CSV row per atlas found in a CAT ROI XML file.

    For every child element (atlas) of the XML root, the ``;``-separated
    values stored under ``<atlas>/data/<data_tag>`` are paired with the ROI
    names listed in ``<atlas>/names/item`` and appended, together with a copy
    of ``catlog_templ``, to ``<outfilebase>_rois_<atlas>.csv``.  The header
    row is written only when that file is missing or empty.

    NOTE(review): the output file name does not contain ``data_tag``, so
    calls with different ``data_tag`` values append rows to the same files.

    Parameters
    ----------
    infile : path-like or file object
        ROI XML file to parse.
    outfilebase : str
        Output base name used to build the per-atlas CSV file names.
    catlog_templ : dict
        Record providing the leading (non-ROI) columns; it is copied, not
        modified.
    data_tag : str
        Name of the element below ``<atlas>/data`` holding the ROI values.
    additional_extractor : callable, optional
        Called as ``additional_extractor(root, atlas_tag, rois, record,
        columns)`` to add further values to ``record`` and further column
        names to ``columns`` (both modified in place).

    Raises
    ------
    AttributeError
        If an atlas has no ``data/<data_tag>`` element or the element is
        empty.  Nothing is written for that atlas in this case.
    """
    # load XML
    root_node = xmlparse(infile).getroot()
    # iterate over the atlases found in the XML
    for child in root_node:
        atlas = child.tag
        destfile = Path('{}_rois_{}.csv'.format(outfilebase, atlas))
        # get ROI names
        rois = [
            name.text
            for name in root_node.findall(atlas + '/names/item')
        ]
        # this list will define the output columns; extractors may extend it
        roi_names = list(rois)

        # Build the complete record BEFORE touching the output file, so that
        # a failure cannot leave an empty, header-less CSV behind.
        data_node = root_node.find(atlas + '/data/' + data_tag)
        if data_node is None or data_node.text is None:
            raise AttributeError(
                "no '{}' data for atlas '{}' in {}".format(
                    data_tag, atlas, infile))
        catlog = catlog_templ.copy()
        # values are matched to the ROI names by position
        roi_values = data_node.text.strip('[]').split(';')
        for roi, value in zip(roi_names, roi_values):
            catlog[roi] = val2out(value)
        if additional_extractor:
            additional_extractor(
                root_node, atlas, rois, catlog, roi_names)

        # an empty file (e.g. left over from an aborted run) has no header
        need_header = (
            not destfile.is_file() or destfile.stat().st_size == 0
        )
        # newline='' lets the csv module control the line endings itself
        with destfile.open('a', newline='') as catlog_data:
            writer = csv.DictWriter(
                catlog_data,
                fieldnames=csv_fieldnames + roi_names
            )
            if need_header:
                writer.writeheader()
            # write CSV row
            writer.writerow(catlog)


def add_WM_CSF(root_node, tag, rois, catlog, roi_names):
    """Add the per-ROI WM and CSF volumes of one atlas to a CSV record.

    For each of the elements ``<tag>/data/Vwm`` and ``<tag>/data/Vcsf`` that
    exists below ``root_node`` and has non-empty text, the ``;``-separated
    values are stored in ``catlog`` under ``<roi>_WM`` / ``<roi>_CSF`` and
    those column names are appended to ``roi_names``.  Both ``catlog`` and
    ``roi_names`` are modified in place; nothing is returned.
    """
    # (element name below <tag>/data, suffix of the resulting columns);
    # WM is handled first so its columns precede the CSF columns
    tissue_sources = (('Vwm', '_WM'), ('Vcsf', '_CSF'))
    for element, suffix in tissue_sources:
        xml_path = tag + '/data/' + element
        # skip tissue classes the atlas provides no values for
        if not root_node.findtext(xml_path):
            continue
        columns = [roi + suffix for roi in rois]
        values = root_node.find(xml_path).text.strip('[]').split(';')
        for column, value in zip(columns, values):
            catlog[column] = val2out(value)
        # add at the end of the output columns
        roi_names.extend(columns)


# Fail with a usage hint instead of an IndexError traceback when arguments
# are missing (additional arguments are ignored, as before).
if len(sys.argv) < 3:
    sys.exit('usage: xml2csv <basename> <path/to/subject>')

# output base name
base_name = sys.argv[1]

# path to the report
path2data = Path(sys.argv[2])

# extract subject identifier from path: the first path component
# NOTE(review): for an absolute path this is the filesystem root, so the
# script presumably expects a relative path starting with the subject
# directory -- confirm with the caller.
sub = path2data.parts[0]

# write QC table; the record of the last report processed is reused as the
# template (leading columns) of all ROI tables below
catlog = None
for path in path2data.glob('**/report/**/*'):
    if path.suffix == '.xml':
        # load report XML
        catlog = get_basic_catlog(path, sub, base_name)

# write ROI tables: (glob pattern, data tags to export, extra extractor)
roi_jobs = (
    # volume atlases: GM volume plus WM/CSF volumes where available
    ('**/label/**/catROI_*', ('Vgm',), add_WM_CSF),
    # surface atlases: one row per measure
    (
        '**/label/**/catROIs_*',
        (
            'thickness',
            'gyrification',
            'toroGI20mm',
            'area',
            'gmv',
            'depth',
            'fractaldimension',
        ),
        None,
    ),
)
for pattern, data_tags, extractor in roi_jobs:
    for path in path2data.glob(pattern):
        if path.suffix != '.xml':
            continue
        # the ROI rows need the QC record as template; without a report
        # there is none (this used to surface as a NameError)
        if catlog is None:
            sys.exit(
                'no CAT report XML found below {}; '
                'cannot write ROI tables'.format(path2data)
            )
        for data_tag in data_tags:
            xml2csv(path, base_name, catlog, data_tag, extractor)