    subroutine get_u_snapshots(snapshots_el,nDof,n_RVEs,snapshots_inel,n_snaps_inel,n_snaps,ncpus,n_SV_el,n_SV_inel) bind(C,name='get_u_snapshots')
        !this routine returns the left singular values of the elastic and inelastic snapshots of the nodal displacements and additionally
        !outputs the singular values of the inelastic snapshots; all operations are performed in parallel
        
        !DEC$ ATTRIBUTES DLLEXPORT :: get_u_snapshots
        
        use omp_lib
        use, intrinsic :: iso_c_binding
        
        implicit none
        
        integer(C_INT),intent(in),value:: nDof,n_RVEs,n_snaps_inel,ncpus
        real(C_DOUBLE),dimension(nDof,n_RVEs),intent(out),target:: snapshots_el
        real(C_DOUBLE),dimension(nDof,n_snaps_inel),intent(out),target:: snapshots_inel
        integer(C_INT),dimension(n_RVEs),intent(in):: n_snaps
        integer(C_INT),intent(out):: n_SV_el,n_SV_inel
        real(C_DOUBLE),dimension(:,:),pointer:: snapshots_inel_i
        integer(C_INT):: i,k,l,m,error,n_snapshots_inel
        character(len=20)::training_number
        !for SVD
        real(C_DOUBLE),dimension(min(nDof,n_RVEs)):: SV_snapshots_el
        real(C_DOUBLE),dimension(:),allocatable:: SV_snapshots_inel
        real(C_DOUBLE),dimension(:,:),pointer::VT=>Null(),U=>Null()
        real(C_DOUBLE),dimension(:),allocatable::work
        integer(C_INT)::lwork,info
        !get the current time
        character(len=10):: time
        
        write(*,*) 'Start reading Displacement Snapshots using ',ncpus,'cpus.'
        call date_and_time(time=time)
        write(*,*) 'Time: '//time
        
        n_snapshots_inel=0
        call omp_set_num_threads(ncpus)
        !$omp parallel do &
        !$omp private(i,m,k,l,training_number,error,snapshots_inel_i) &
        !$omp shared(n_snaps,snapshots_el,snapshots_inel,n_snapshots_inel)
            do i=1,size(n_snaps)
                
                !at first open the file with the displacement snapshots data
                write(training_number,*) i !internally write the training number
                open(unit=100+i,file='training_data-u-'//TRIM(ADJUSTL(training_number))//'.txt',action='READ',status='old',iostat=error)
                
                !assume the first displacement snapshot to be elastic
                read(100+i,*,iostat=error) snapshots_el(:,i)
                
                !now set the local pointer to the global inelastic snapshot array
                !$omp critical
                   snapshots_inel_i=>snapshots_inel(:,n_snapshots_inel+1:n_snapshots_inel+n_snaps(i)-1)
                   n_snapshots_inel=n_snapshots_inel+n_snaps(i)-1
                !$omp end critical
                
                !now read the inelastic snapshots
                read(100+i,*,iostat=error) snapshots_inel_i
                
                !close the file
                close(unit=100+i)
                
            end do
        !$omp end parallel do
        
        !compute the SVD of the elastic and inelastic snapshots
        
        write(*,*) 'Start computing SVD of Displacement Snapshots using ',ncpus,'cpus.'
        call date_and_time(time=time)
        write(*,*) 'Time: '//time
        
        call omp_set_num_threads(ncpus)
        call mkl_set_num_threads(ncpus)
        call mkl_set_num_threads_local(ncpus)
        call mkl_set_dynamic(.True.)
        
        allocate(work(1))
        lwork=-1
        call dgesvd ('O','N',size(snapshots_el,1),size(snapshots_el,2),snapshots_el,size(snapshots_el,1),SV_snapshots_el,U,size(snapshots_el,1),VT,size(snapshots_el,2),work,lwork,info)
        lwork=work(1)
        deallocate(work); allocate(work(lwork))
        call dgesvd ('O','N',size(snapshots_el,1),size(snapshots_el,2),snapshots_el,size(snapshots_el,1),SV_snapshots_el,U,size(snapshots_el,1),VT,size(snapshots_el,2),work,lwork,info)
        deallocate(work)
        
        n_SV_el=0
        do i=1,size(snapshots_el,2)
           if (SV_snapshots_el(i)/SV_snapshots_el(1)>0.01) then
              n_SV_el=n_SV_el+1
           else
              exit
           end if
        end do
         
       !modify snapshots_inel by substracting the orthogonal share of snapshots_el
       snapshots_inel(:,1:n_snapshots_inel)=snapshots_inel(:,1:n_snapshots_inel)-matmul(snapshots_el(:,1:n_SV_el),matmul(transpose(snapshots_el(:,1:n_SV_el)),snapshots_inel(:,1:n_snapshots_inel)))
       
       allocate(SV_snapshots_inel(min(size(snapshots_inel,1),n_snapshots_inel)),work(1))
       lwork=-1
       call dgesvd ('O','N',size(snapshots_inel,1),n_snapshots_inel,snapshots_inel,size(snapshots_inel,1),SV_snapshots_inel,U,size(snapshots_inel,1),VT,size(SV_snapshots_inel),work,lwork,info)
       lwork=work(1)
       deallocate(work); allocate(work(lwork))
       call dgesvd ('O','N',size(snapshots_inel,1),n_snapshots_inel,snapshots_inel,size(snapshots_inel,1),SV_snapshots_inel,U,size(snapshots_inel,1),VT,size(SV_snapshots_inel),work,lwork,info)
       deallocate(work)
       
       n_SV_inel=0
       do i=1,size(SV_snapshots_inel)
          if (SV_snapshots_inel(i)/SV_snapshots_inel(1)>0.000001_C_DOUBLE) then
             n_SV_inel=n_SV_inel+1
          else
             exit
          end if
       end do
       
       write(*,*) 'End computing SVD of Displacement Snapshots using ',ncpus,'cpus.'
       call date_and_time(time=time)
       write(*,*) 'Time: '//time
    
    end subroutine get_u_snapshots    
    
    
    subroutine get_f_snapshots(F_Snaps,NGP,n_F_Snaps,n_snaps,n_RVEs,n_F_SV,ncpus) bind(C,name='get_f_snapshots')
        !this routine returns the singular values of the internal force snapshots of the integration
        !points, by first collecting the snapshots in F_Snaps and afterwards computing the SVD of it
        !(both in Parallel using openMP resp. MKL one api)
        
        !DEC$ ATTRIBUTES DLLEXPORT :: get_f_snapshots
        
        use omp_lib
        use, intrinsic :: iso_c_binding
        
        implicit none
        
        integer(C_INT),intent(in),value:: NGP,n_F_Snaps,n_RVEs,ncpus
        real(C_DOUBLE),dimension(NGP,n_F_Snaps),intent(out),target:: F_Snaps
        integer(C_INT),dimension(n_RVEs),intent(in):: n_snaps
        integer(C_INT),intent(out):: n_F_SV
        real(C_DOUBLE),dimension(min(NGP,n_F_Snaps)):: SV_F_Snaps
        real(C_DOUBLE):: Volume
        integer(C_INT):: i,n,error
        character(len=20)::training_number
        real(C_DOUBLE),dimension(:,:),pointer:: F_Snaps_i
        !for SVD
        real(C_DOUBLE),dimension(:,:),pointer::VT=>Null()
        real(C_DOUBLE),dimension(:,:),pointer::U=>Null()
        real(C_DOUBLE),dimension(:),allocatable::work
        integer(C_INT)::lwork,info
        !get the current time
        character(len=10):: time
        
        write(*,*) 'Start reading Force Snapshots using ',ncpus,'cpus.'
        call date_and_time(time=time)
        write(*,*) 'Time: '//time
        
        !read the force snapshots in parallel
        n=0
        call omp_set_num_threads(ncpus)
        !$omp parallel do &
        !$omp private(i,training_number,error,F_Snaps_i) &
        !$omp shared(n_RVEs,n,n_snaps)
        do i=1,n_RVEs
            write(training_number,*) i !internal write
            open(unit=20+i,file='training_data-f-'//TRIM(ADJUSTL(training_number))//'.txt',action='READ',status='old',iostat=error)
            !$omp critical
               F_Snaps_i=>F_Snaps(:,n+1:n+n_snaps(i))
               n=n+n_snaps(i)
            !$omp end critical
            read(20+i,*,iostat=error) F_Snaps_i
            close(unit=20+i)
        end do
        !$omp end parallel do
                 
         write(*,*) 'Start computing SVD of Force Snapshots using ',ncpus,'cpus.'
         call date_and_time(time=time)
         write(*,*) 'Time: '//time
         
         !set up the parallel environment
         call omp_set_num_threads(ncpus)
         call mkl_set_num_threads(ncpus)
         call mkl_set_num_threads_local(ncpus)
         call mkl_set_dynamic(.True.)
         
         !calculate the singular value decomposition of the force snapshots 
         allocate(work(1))
         lwork=-1
         call dgesvd ('O','N',size(F_Snaps,1),size(F_Snaps,2),F_Snaps,size(F_Snaps,1),SV_F_Snaps,U,size(F_Snaps,1),VT,size(F_Snaps,2),work,lwork,info)
         lwork=work(1)
         deallocate(work); allocate(work(lwork))
         call dgesvd ('O','N',size(F_Snaps,1),size(F_Snaps,2),F_Snaps,size(F_Snaps,1),SV_F_Snaps,U,size(F_Snaps,1),VT,size(F_Snaps,2),work,lwork,info)
         deallocate(work)
         
         !get the number of relevant singular values (and therefore corresponding singular vectors)
         n_F_SV=0
         do i=1,size(SV_F_Snaps)
             if (SV_F_Snaps(i)/SV_F_Snaps(1)>0.000001_C_DOUBLE) then
                n_F_SV=n_F_SV+1
             else
                exit
             end if
         end do
         
         write(*,*) 'End computing SVD of Force Snapshots using ',ncpus,'cpus.'
         call date_and_time(time=time)
         write(*,*) 'Time: '//time
    
    end subroutine get_f_snapshots
    
    subroutine multiplication(A,m,n,x,y,ncpus) bind(C,name='multiplication')
        !this routine performs a simple multiplication y=A^T*x
        
        !DEC$ ATTRIBUTES DLLEXPORT :: multiplication
        
        use omp_lib
        use, intrinsic :: iso_c_binding
        
        implicit none
        
        integer(C_INT),intent(in),value:: m,n
        real(C_DOUBLE),dimension(m,n),intent(in):: A
        real(C_DOUBLE),dimension(m),intent(in):: x
        real(C_DOUBLE),dimension(m),intent(out):: y
        integer(C_INT),intent(in),value:: ncpus
        
        call omp_set_num_threads(ncpus)
        call mkl_set_num_threads(ncpus)
        call mkl_set_num_threads_local(ncpus)
        call mkl_set_dynamic(.True.)
        
        call dgemv ('T',m,n,1.0_C_DOUBLE,A,m,x,1,0.0_C_DOUBLE,y,1)
        
    end subroutine multiplication
    
    subroutine least_squares(A,m,n,x,y,ncpus) bind(C,name='least_squares')
        !this routine performs a simple least square problem min(b-A*x)
        
        !DEC$ ATTRIBUTES DLLEXPORT :: least_squares
        
        use omp_lib
        use, intrinsic :: iso_c_binding
        
        implicit none
        
        integer(C_INT),intent(in),value:: m,n
        real(C_DOUBLE),dimension(m,n),intent(in):: A
        real(C_DOUBLE),dimension(m,n):: A_copy
        real(C_DOUBLE),dimension(m),intent(in):: x
        real(C_DOUBLE),dimension(n),intent(out):: y
        real(C_DOUBLE),dimension(m):: sol
        integer(C_INT),intent(in),value:: ncpus
        real(C_DOUBLE),dimension(:),allocatable::work,s
        integer(C_INT)::lwork,info
        
        call omp_set_num_threads(ncpus)
        call mkl_set_num_threads(ncpus)
        call mkl_set_num_threads_local(ncpus)
        call mkl_set_dynamic(.True.)
        
        A_copy=A
        sol=x
        allocate(s(min(m,n)))
        
        lwork=-1
        allocate(work(1))
        call dgels('N',m,n,1,A_copy,m,sol,m,work,lwork,info)
        lwork=work(1)
        deallocate(work); allocate(work(lwork))
        call dgels('N',m,n,1,A_copy,m,sol,m,work,lwork,info)
        deallocate(work)
        
        y=sol(1:n)
        
    end subroutine least_squares
