85 lines
3.5 KiB
Bash
Executable file
85 lines
3.5 KiB
Bash
Executable file
#!/bin/bash
#
# Per-subject compute job.
#
# Positional arguments:
#   $1  dssource       location the analysis dataset is cloned from
#   $2  pushgitremote  Git remote that receives this job's result branch
#   $3  subd           subject directory below inputs/AOMIC_ID1000; if the
#                      dataset is organised by site, subd includes the site
#
# Environment:
#   JOBID       suffix of the result branch ("job-<JOBID>")
#   DSLOCKFILE  lock file that serialises the final "git push"
#
# The job assumes that it is a good idea to run everything in PWD;
# the job manager should make sure that is true.

# fail whenever something is fishy, use -x to get verbose logfiles
set -e -u -x

usage="usage: $0 <dssource> <pushgitremote> <subd>"

dssource="${1:?$usage}"
pushgitremote="${2:?$usage}"
# if dataset has sites, subd includes it
subd="${3:?$usage}"
# last path component of subd; quoted so that whitespace or glob
# characters in the argument cannot split or expand it
subid=$(basename -- "$subd")

# Both variables are only consumed much later (branch creation and the
# final locked push). Check them now so that a misconfigured job fails
# before any cloning or computation has been done.
: "${JOBID:?JOBID must be set and non-empty}"
: "${DSLOCKFILE:?DSLOCKFILE must be set and non-empty}"

# Obtain the analysis dataset; it carries the inputs along with it.
# The clone source is deliberately NOT the location that results are
# pushed to: otherwise many concurrent jobs would contend for the same
# location and create a throughput bottleneck.
datalad clone "$dssource" ds

# Everything below runs in the context of the superdataset.
cd ds

# Only the main tracking branch (plus the actual file content) is pushed
# back to the output store. This avoids accumulating temporary git-annex
# availability information, and avoids a synchronization bottleneck from
# having to consolidate the git-annex branch across jobs. Final
# availability information can be established via an eventual
# "git-annex fsck -f cat12.8.1_out-storage".
#
# This remote is never fetched: it accumulates a large number of branches
# and we want to avoid a progressive slowdown. Instead, each job only ever
# pushes one unique branch (subject AND process specific name).
git remote add outputstore "$pushgitremote"

# All results of this job are collected on a dedicated branch.
git checkout -b "job-$JOBID"

# We pull down the input subject manually in order to discover relevant
# files. We do this outside the recorded call, because on a potential
# re-run we want to be able to do fine-grained recomputing of individual
# outputs. The recorded calls will have specific paths that will enable
# recomputation outside the scope of the original Condor setup.
datalad get -n "inputs/AOMIC_ID1000/${subd}"

# The meat of the matter:
# look for T1w files in the input data for the given participant and run
# one recorded "containers-run" per file. It is critical for
# reproducibility that the command given to "containers-run" does not rely
# on any property of the immediate computational environment (env vars,
# services, etc).
#
# Implementation notes:
# - "-exec ... {} +" is used instead of "-exec ... \;". With "\;" find
#   ignores the exit status of the executed command, so a failed
#   containers-run would go unnoticed and the job would still push and
#   report SUCCESS. With "+", a non-zero exit of the helper shell makes
#   find itself exit non-zero, which "set -e" turns into a job failure.
# - The helper shell runs with -e, so the first failing containers-run
#   stops the loop.
# - Matched paths are handed over as positional parameters rather than
#   being substituted into the script text, so unusual characters in a
#   path cannot change the meaning of the script.
# - {inputs[0]} and {outputs[0]} are placeholders filled in by datalad,
#   not shell syntax.
# - The output directory is the third path component of the match
#   (inputs/AOMIC_ID1000/<odir>/...).
find "inputs/AOMIC_ID1000/${subd}" \
  -name "${subid}*T1w.nii.gz" \
  -exec sh -e -u -c '
    for t1w in "$@"; do
      odir=$(printf "%s\n" "$t1w" | cut -d / -f3)
      datalad containers-run \
        -m "Compute $odir" \
        -n cat12-8-1 \
        --explicit \
        -o "$odir" \
        -i "$t1w" \
        sh -e -u -x -c "
          mkdir -p {outputs[0]} || true \
          && cp {inputs[0]} {outputs[0]} \
          && /singularity -b code/cat_standalone_segment_enigma_subdir_rp_MSA.m {outputs[0]}/*.nii.gz \
          && /singularity -b code/pipeline/batches/cat_standalone_batch-surfext.m {outputs[0]}/surf/lh.central.* \
          && /singularity -b code/pipeline/batches/cat_standalone_batch-thick1.m {outputs[0]}/surf/lh.thickness.* \
          && /singularity -b code/pipeline/batches/cat_standalone_batch-thick2.m {outputs[0]}/surf/lh.thickness.* \
          && rm -f {outputs[0]}/*.nii* \
          && gzip {outputs[0]}/*/*.nii \
          "
    done
  ' sh {} +

# Remove big files from the results after hashing, before pushing to RIA.
# NOTE(review): if one of these globs matches nothing, bash passes the
# pattern through literally -- confirm how "datalad drop" reacts to that.
datalad drop --what filecontent --reckless kill \
  sub-*/mri/iy* sub-*/mri/y* sub-*/mri/anon_m* sub-*/*/*.pdf sub-*/surf/*sphere*

#### Maybe remove symlinks without data and commit ???
# rm -f sub-*/mri/iy* sub-*/mri/y* # sub-*/mri/anon_m* sub-*/*/*.pdf # sub-*/surf/*sphere*
# datalad save -m "remove symlinks without data"

# File content first -- does not need a lock, no interaction with Git.
datalad push --to cat12.8.1_out-storage

# And then the output branch, serialised across jobs via the lock file.
# DSLOCKFILE is quoted so that a path containing whitespace or glob
# characters reaches flock as one argument instead of shifting the
# command that flock is asked to run.
flock --verbose "$DSLOCKFILE" git push outputstore

echo SUCCESS
# job handler should clean up workspace