      INTEGER :: INODE, IFATH, IGRANDFATH, SPECIAL_ROOT,
     &           nfront, npiv, leaf, varnum, ierr
      LOGICAL :: INODE_IS_A_LEAF
      INTEGER(8) :: NFRONT8, NPIV8
      INTEGER(8) :: SUM_CB, MAX_MEM
      DOUBLE PRECISION :: COST_NODE
      LOGICAL :: IN_L0INIT, SKIP_ABOVE
      LOGICAL, EXTERNAL :: MUMPS_ROOTSSARBR, MUMPS_IN_OR_ROOT_SSARBR
      INTEGER, EXTERNAL :: MUMPS_GET_POOL_LENGTH, MUMPS_TYPENODE
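C     Consistency checks: the L0-OMP pointer arrays are expected to
C     arrive unassociated; any already associated pointer is reported
C     as an internal error.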
      IF (associated(ipool_b_l0_omp)) THEN
        WRITE(*,*) " Internal error 1 MUMPS_ANA_INITIALIZE_L0_OMP"
      IF (associated(ipool_a_l0_omp)) THEN
        WRITE(*,*) " Internal error 2 MUMPS_ANA_INITIALIZE_L0_OMP"
      IF (associated(virt_l0_omp)) THEN
        WRITE(*,*) " Internal error 3 MUMPS_ANA_INITIALIZE_L0_OMP"
      IF (associated(virt_l0_omp_mapping)) THEN
        WRITE(*,*) " Internal error 4 MUMPS_ANA_INITIALIZE_L0_OMP"
      IF (associated(perm_l0_omp)) THEN
        WRITE(*,*) " Internal error 5 MUMPS_ANA_INITIALIZE_L0_OMP"
      IF (associated(ptr_leafs_l0_omp)) THEN
        WRITE(*,*) " Internal error 6 MUMPS_ANA_INITIALIZE_L0_OMP"
      ALLOCATE( threads_charge( nb_threads ), stat=ierr )
      IF (ierr .GT. 0) THEN
        IF (lpok) WRITE(lp,150) 'THREADS_CHARGE'
      ALLOCATE( costs_mono_thread( nsteps ), stat=ierr )
        IF (lpok) WRITE(lp, 150) ' COSTS_MONO_THREAD'
      ALLOCATE( costs_multi_thread( nsteps ), stat=ierr )
        IF (lpok) WRITE(lp, 150) ' COSTS_MULTI_THREAD'
      ALLOCATE( schur_memory( nsteps ), stat=ierr )
        IF (lpok) WRITE(lp, 150) ' SCHUR_MEMORY'
      ALLOCATE( subtree_factor_memory( nsteps ), stat=ierr )
        IF (lpok) WRITE(lp, 150) ' SCHUR_FACTOR_MEMORY'
      ALLOCATE( subtree_memory( nsteps ), stat=ierr )
        IF (lpok) WRITE(lp, 150) ' SUBTREE_MEMORY'
      ALLOCATE( cp_nstk_steps( nsteps ), stat=ierr )
        IF (lpok) WRITE(lp, 150) ' CP_NSTK_STEPS'
      lpool_b_l0_omp = mumps_get_pool_length(na(1),keep(1),keep8(1))
      ALLOCATE( ipool_b_l0_omp( lpool_b_l0_omp ), stat=ierr )
        IF (lpok) WRITE(lp, 150) ' id%IPOOL_B_L0_OMP'
      costs_mono_thread = 0.0d0
      costs_multi_thread = 0.0d0
      cost_total_best = huge(cost_total_best)
      subtree_factor_memory = 0_8
      factor_size_under_l0 = 0_8
      cp_nstk_steps(:) = nstk_steps(:)
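C     Build the initial pool of leaves of the local assembly tree in
C     ipool_b_l0_omp; nbleaf_myid presumably holds the number of
C     leaves owned by this MPI process.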
      IF (keep(403).NE.0) THEN
     &            keep(199), na(1), lna,
     &            keep(1), keep8(1), step(1),
     &            ipool_b_l0_omp(1), lpool_b_l0_omp)
      IF (nbleaf_myid .EQ. 0) THEN
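C     Bottom-up traversal of the pooled nodes: nfront is the front
C     size of inode and npiv is presumably accumulated by walking the
C     fils list of its fully summed variables.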
      inode = ipool_b_l0_omp( leaf )
      inode_is_a_leaf = .true.
      nfront = nd( step( inode ) )
      nfront8 = int(nfront,8)
      DO WHILE (varnum .GT. 0 )
        varnum = fils( varnum )
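C     Per-node memory estimates: schur_memory is the contribution
C     block size, square (nfront-npiv)^2 in the unsymmetric case
C     (keep(50)=0) and triangular otherwise; when keep(251) is
C     nonzero the factors are not kept, so subtree_factor_memory is 0.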
      IF (keep(50) .EQ. 0) THEN
        schur_memory( step( inode ) ) =
     &      (nfront8 - npiv8)*(nfront8 - npiv8)
        IF (keep(251) .EQ. 0) THEN
          subtree_factor_memory( step( inode ) ) = nfront8 * nfront8
     &      - schur_memory( step( inode ) )
          subtree_factor_memory( step( inode ) ) = 0_8
        schur_memory( step( inode ) ) =
     &      (nfront8 - npiv8)*(nfront8 + 1_8 - npiv8)/2_8
        IF (keep(251) .EQ. 0) THEN
          subtree_factor_memory( step( inode ) ) = nfront8 * npiv8
          subtree_factor_memory( step( inode ) ) = 0_8
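C     Per-node cost: when keep(403)=0 the previously computed
C     cost_node is used directly; otherwise cost_bench provides a
C     measured cost for 1 thread and for nb_threads threads.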
      IF (keep(403) .EQ. 0) THEN
        costs_mono_thread( step( inode ) ) = cost_node
        CALL cost_bench (npiv, nfront-npiv, 1, keep(50), cost_node)
        costs_mono_thread( step( inode ) ) = cost_node
        CALL cost_bench (npiv, nfront-npiv, nb_threads, keep(50),
     &                   cost_node)
        costs_multi_thread( step( inode ) ) = cost_node
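C     Accumulate over the children of inode (siblings chained through
C     frere): mono-thread cost, factor memory of the subtree and the
C     stacked contribution blocks (sum_cb); max_mem tracks the peak
C     working memory of the subtree rooted at inode.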
      DO WHILE (varnum .GT. 0 )
        costs_mono_thread( step( inode ) ) =
     &      costs_mono_thread( step( inode ) )
     &      costs_mono_thread( step( varnum ) )
        max_mem = max(max_mem,
     &      subtree_memory( step( varnum ) ) + sum_cb )
        sum_cb = sum_cb + schur_memory( step( varnum ) ) +
     &      subtree_factor_memory( step( varnum ) )
        subtree_factor_memory( step( inode ) ) =
     &      subtree_factor_memory( step( inode ) )
     &      + subtree_factor_memory( step( varnum ) )
        varnum = frere( step( varnum ) )
      subtree_memory( step( inode ) ) =
     &      max( max_mem, nfront8*nfront8 + sum_cb )
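C     Locate the father and grandfather of inode; special_root is the
C     root-front node stored in keep(38) or keep(20), and a pooled
C     leaf equal to it is flagged as an internal error.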
      ifath = dad( step( inode ) )
      IF (ifath .NE. 0) THEN
        igrandfath = dad( step( ifath ) )
      special_root = max(keep(38), keep(20))
      IF ( inode .EQ. special_root ) THEN
        IF (inode_is_a_leaf) THEN
        WRITE(*,*) " Internal error 1 in MUMPS_ANA_INITIALIZE_L0_OMP",
     &      inode, special_root
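C     Additional checks that the pooled leaf lies strictly inside a
C     sequential (SSARBR) subtree of this process: its father may not
C     be the root front keep(38), the node may not sit at or above
C     the SSARBR roots, and neither the node nor its father may be a
C     type 2 (parallel) node.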
      IF ( ifath .NE. 0 .AND. ifath .EQ. keep(38) ) THEN
        IF (inode_is_a_leaf) THEN
        WRITE(*,*) " Internal error 2 in MUMPS_ANA_INITIALIZE_L0_OMP",
     &      inode, ifath, keep(38)
      IF ( slavef_during_mapping > 1 ) THEN
        IF (mumps_rootssarbr(
     &      procnode_steps( step( inode ) ), keep(199) )
     &      .OR. .NOT. mumps_in_or_root_ssarbr(
     &      procnode_steps( step ( inode ) ), keep(199) )
          IF (inode_is_a_leaf) THEN
     &      " Internal error 3 in MUMPS_ANA_INITIALIZE_L0_OMP",
      IF ( mumps_typenode(procnode_steps(step(ifath)), keep(199))
     &     .EQ. 2 ) THEN
        IF (inode_is_a_leaf) THEN
     &      " Internal error 5 in MUMPS_ANA_INITIALIZE_L0_OMP",
      IF ( mumps_typenode(procnode_steps(step(inode)), keep(199))
     &     .EQ. 2 ) THEN
        IF (inode_is_a_leaf) THEN
     &      " Internal error 6 in MUMPS_ANA_INITIALIZE_L0_OMP",
      IF ( ifath .EQ. 0 ) THEN
      IF ( ifath .EQ. keep(20) ) THEN
      IF ( igrandfath .EQ. keep(38) .AND. keep(38) .NE. 0 ) THEN
      IF ( slavef_during_mapping > 1 ) THEN
        IF (mumps_rootssarbr(
     &      procnode_steps( step( ifath ) ), keep(199) )) THEN
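C     Accumulate the factor size produced below L0:
C     npiv8*(2*nfront8-npiv8) entries in the unsymmetric case
C     (keep(50)=0); a smaller, presumably triangular, count otherwise.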
      IF (.NOT. skip_above) THEN
        IF (keep(50).EQ.0) THEN
          factor_size_under_l0 = factor_size_under_l0 +
     &        npiv8 * ( nfront8 + nfront8 - npiv8 )
          factor_size_under_l0 = factor_size_under_l0 +
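C     Propagate to the father: decrement its pending-children counter
C     cp_nstk_steps and, once it reaches zero, process the father in
C     turn (it is then not a leaf); nodes whose climb stops are
C     recorded in ipool_b_l0_omp with a negative sign.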
      IF ( in_l0init ) THEN
      ELSE IF ( skip_above ) THEN
        IF ( .NOT. inode_is_a_leaf ) THEN
     &      " Internal error 7 in MUMPS_ANA_INITIALIZE_L0_OMP",
        ipool_b_l0_omp(leaf+1) = -inode
      cp_nstk_steps( step( ifath ) ) =
     &      cp_nstk_steps( step( ifath ) ) - 1
      IF ( cp_nstk_steps( step( ifath ) ) .EQ. 0 ) THEN
        inode_is_a_leaf = .false.
      IF ( leaf .GT. 0 ) THEN
     & / ' ** ALLOC FAILURE IN MUMPS_ANA_INITIALIZE_L0_OMP FOR ',
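C     Evaluation of the current L0 candidate set: the subtrees stored
C     in the linked list (idll_node entries, presumably l0_omp_dll)
C     are assigned greedily to the least loaded thread to estimate
C     the resulting per-thread charge.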
      INTEGER :: i, i_less_charged, nb_in_l0
      DOUBLE PRECISION :: lightest_charge, heaviest_charge
      threads_charge = 0.0d0
      DO WHILE ( associated ( idll_node ) )
        nb_in_l0 = nb_in_l0 + 1
        lightest_charge = threads_charge( 1 )
        IF ( threads_charge( i ) .LT. lightest_charge ) THEN
          lightest_charge = threads_charge( i )
        threads_charge( i_less_charged ) =
     &      threads_charge( i_less_charged )
     &      costs_mono_thread( step( idll_node%ELMT ) )
        idll_node => idll_node%NEXT
      nb_max_in_l0_acceptl0 = max(nb_max_in_l0_acceptl0, nb_in_l0)
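C     Balance of the simulated distribution: the heaviest thread
C     charge defines cost_under; the candidate layer is acceptable
C     when lightest/heaviest exceeds thresh_equilib and the factor
C     memory below L0 stays within thresh_mem percent of the per-MPI
C     factor size, with special cases for idle threads or an empty L0.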
      lightest_charge = threads_charge( 1 )
      heaviest_charge = threads_charge( 1 )
      IF ( threads_charge( i ) .LT. lightest_charge ) THEN
        lightest_charge = threads_charge( i )
      ELSEIF ( threads_charge( i ) .GT. heaviest_charge ) THEN
        heaviest_charge = threads_charge( i )
      cost_under = heaviest_charge
      IF (keep(403) .EQ. 0) THEN
     &    dble(lightest_charge)/(dble(heaviest_charge)+1.d-12)
     &    .GT. thresh_equilib .AND.
     &    factor_size_under_l0 .LE.
     &    factor_size_per_mpi * int(thresh_mem,8) / 100_8
     &    ( nb_in_l0 .LT. nb_max_in_l0_acceptl0 .AND.
     &      lightest_charge .EQ. 0.0d0 )
     &    .OR. ( nb_in_l0 .EQ. 0 )
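C     Keep a snapshot of the best candidate so far: when
C     cost_under + cost_above improves on cost_total_best, the list
C     is copied into phys_l0_omp via idll_2_array and the repeat
C     counter nb_repeat_acceptl0 is reset.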
      IF (associated(phys_l0_omp)) THEN
        DEALLOCATE(phys_l0_omp)
      ierr = idll_2_array( l0_omp_dll, phys_l0_omp, l_phys_l0_omp )
      IF (ierr .EQ. -2) THEN
        info(2) = l_phys_l0_omp
      IF (cost_under + cost_above .LT. cost_total_best) THEN
        IF (associated(phys_l0_omp)) THEN
          DEALLOCATE(phys_l0_omp)
        ierr = idll_2_array( l0_omp_dll, phys_l0_omp, l_phys_l0_omp )
        cost_total_best = cost_under + cost_above
        nb_repeat_acceptl0 = 100
        nb_repeat_acceptl0 = nb_repeat_acceptl0 - 1
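C     Finalization of the L0-OMP layer: build virt_l0_omp, the thread
C     mapping, perm_l0_omp, the per-subtree leaf pointers
C     ptr_leafs_l0_omp and the pool of nodes above L0, and estimate
C     the per-thread workspace size thread_la.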
      INTEGER :: INODE, OLD_INODE, I, J, K, LEAF, IERR
      DOUBLE PRECISION :: LIGHTEST_CHARGE
      INTEGER :: I_LESS_CHARGED
      INTEGER(8) :: SUM_CB, MAX_MEM, MAX_MEM_ALL_THREADS
      INTEGER :: MAX_TASK_PER_THREAD
      TYPE ( IDLL_NODE_T ), POINTER :: IDLL_NODE
      INTEGER, DIMENSION(:,:), ALLOCATABLE :: THREADS_TASKS
      INTEGER, DIMENSION(:), ALLOCATABLE :: NB_TASK_PER_THREAD
      INTEGER, DIMENSION(:), ALLOCATABLE :: INV_PERM_L0_OMP
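C     Allocate the output structures; virt_l0_omp has nb_threads+1
C     entries when keep(402)=0 and l_phys_l0_omp+1 otherwise, and
C     allocation failures are reported through format 150.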
      IF (keep(402) .EQ. 0) THEN
        l_virt_l0_omp = nb_threads + 1
        l_virt_l0_omp = l_phys_l0_omp + 1
      ALLOCATE ( virt_l0_omp( max(l_virt_l0_omp,1) ),
     &           virt_l0_omp_mapping( max(l_virt_l0_omp,1) ),
        info(2)=2*max(l_virt_l0_omp,1)
        IF (lpok) WRITE(lp,150) 'id%VIRT_L0_OMP[_MAPPING]'
      ALLOCATE ( perm_l0_omp( max(l_phys_l0_omp,1) ), stat=ierr )
        info(2)=max(l_phys_l0_omp,1)
        IF (lpok) WRITE(lp,150) 'id%PERM_L0_OMP'
      ALLOCATE ( ptr_leafs_l0_omp( l_phys_l0_omp + 1 ), stat=ierr )
        info(2)=max(l_phys_l0_omp,1)
        IF (lpok) WRITE(lp,150) 'id%PTR_LEAFS_L0_OMP'
      ALLOCATE ( ipool_a_l0_omp( lpool_a_l0_omp ), stat=ierr )
        info(2)=lpool_a_l0_omp
        IF (lpok) WRITE(lp,150) 'id%IPOOL_A_L0_OMP'
      ALLOCATE ( nb_task_per_thread( nb_threads ), stat=ierr )
        IF (lpok) WRITE(lp,150) 'NB_TASK_PER_THREAD'
      ALLOCATE ( inv_perm_l0_omp( l_phys_l0_omp ), stat=ierr )
        WRITE(*,*) "Allocation Error in MUMPS_ANA_FINALIZE_L0_OMP"
      nb_task_per_thread = 0
      threads_charge = 0.0d0
      DO i = 1, l_phys_l0_omp
        lightest_charge = threads_charge( 1 )
        IF ( threads_charge( j ) .LT. lightest_charge ) THEN
          lightest_charge = threads_charge( j )
        IF (threads_charge( j ) .EQ. 0) THEN
        nb_task_per_thread( i_less_charged ) =
     &      nb_task_per_thread( i_less_charged ) + 1
        IF (keep(402) .NE. 0) THEN
          virt_l0_omp_mapping(i) = i_less_charged
        threads_charge( i_less_charged ) =
     &      threads_charge( i_less_charged )
     &      costs_mono_thread( step( phys_l0_omp( i ) ) )
      IF (keep(402) .EQ. 0) THEN
        virt_l0_omp_mapping(i) = i
      virt_l0_omp_mapping(l_virt_l0_omp) = -999999
      max_task_per_thread = 0
      max_task_per_thread = max(max_task_per_thread,
     &      nb_task_per_thread( i ) )
      ALLOCATE ( threads_tasks( nb_threads, max_task_per_thread ),
        info(2)=nb_threads*max_task_per_thread
        IF (lpok) WRITE(lp,150) 'THREADS_TASK'
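C     Second greedy pass, identical charge-wise to the first, this
C     time recording in threads_tasks which subtree goes to which
C     thread.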
      nb_task_per_thread = 0
      threads_charge = 0.0d0
      DO i = 1, l_phys_l0_omp
        lightest_charge = threads_charge( 1 )
        IF ( threads_charge( j ) .LT. lightest_charge ) THEN
          lightest_charge = threads_charge( j )
        nb_task_per_thread( i_less_charged ) =
     &      nb_task_per_thread( i_less_charged ) + 1
        threads_tasks( i_less_charged, nb_task_per_thread
     &      ( i_less_charged ) ) = phys_l0_omp( i )
        threads_charge( i_less_charged ) =
     &      threads_charge( i_less_charged )
     &      costs_mono_thread( step( phys_l0_omp( i ) ) )
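C     Estimate each thread's peak working memory over its list of
C     tasks (subtree peak plus contribution blocks and factors
C     already stacked); the maximum over threads, relaxed by keep(12)
C     percent, gives the per-thread workspace size thread_la.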
      max_mem_all_threads = 0_8
      DO j = 1, nb_task_per_thread( i )
        max_mem = max( max_mem, subtree_memory( step(
     &      threads_tasks(i,j) ) ) + sum_cb )
     &      +schur_memory(step(threads_tasks(i,j)))
     &      +subtree_factor_memory(
     &      step(threads_tasks(i,j)))
      max_mem = max( max_mem, sum_cb )
      IF (keep(402) .EQ. 0) THEN
        threads_charge( i ) = dble(max_mem)
      max_mem_all_threads = max( max_mem_all_threads, max_mem )
      max_mem_all_threads = ( max_mem_all_threads
     &      * int(100 + keep(12),8) ) / 100_8
      thread_la = max(max_mem_all_threads,6_8)
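C     When keep(402)=0, phys_l0_omp is rewritten thread by thread and
C     virt_l0_omp marks where each thread's chunk starts; perm_l0_omp
C     is then built as the inverse of inv_perm_l0_omp, which the sort
C     call (for l_phys_l0_omp > 1) presumably reorders.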
      IF (keep(402) .EQ. 0) THEN
        DO j = 1, nb_task_per_thread( i )
          phys_l0_omp(k) = threads_tasks(i,j)
        virt_l0_omp(nb_threads+1) = k
      DO i = 1, l_virt_l0_omp
      DO i = 1, l_phys_l0_omp
        inv_perm_l0_omp( i ) = i
      IF ( l_phys_l0_omp .GT. 1 ) THEN
     &      inv_perm_l0_omp, l_phys_l0_omp, 1, l_phys_l0_omp )
      DO i = 1, l_phys_l0_omp
        perm_l0_omp( inv_perm_l0_omp( i ) ) = i
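C     Build ptr_leafs_l0_omp: for each L0 subtree root, the code
C     appears to descend the fils lists to reach the subtree's leaves
C     and to advance j in ipool_b_l0_omp until old_inode is found, so
C     that entry i+1 points past the leaves of subtree i.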
      ptr_leafs_l0_omp( 1 ) = j
      DO i = 1, l_phys_l0_omp
        inode = phys_l0_omp( i )
        DO WHILE ( inode .NE. 0 )
          DO WHILE ( inode .GT. 0 )
            inode = fils( inode )
        DO WHILE ( ipool_b_l0_omp( j ) .NE. old_inode )
        ptr_leafs_l0_omp( i + 1 ) = j
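C     Build the pool of nodes above L0 (ipool_a_l0_omp): first the
C     nodes collected in the linked list, then fathers of L0 subtrees
C     whose pending-children count cp_nstk_steps drops to zero; the
C     trailing entries hold the pool bookkeeping and the ready nodes
C     are sorted when leaf > 1.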
      cp_nstk_steps(:) = nstk_steps(:)
      DO WHILE ( associated( idll_node ) )
        ipool_a_l0_omp( leaf ) = idll_node%ELMT
        idll_node => idll_node%NEXT
      DO i = 1 , l_phys_l0_omp
        IF ( dad( step( phys_l0_omp(i) ) ) .NE. 0 ) THEN
          cp_nstk_steps( step( dad( step( phys_l0_omp(i) ) ) ) ) =
     &        cp_nstk_steps( step( dad( step( phys_l0_omp(i) ) ) ) )-1
          IF (cp_nstk_steps(step(dad(step(phys_l0_omp(i))))) .EQ. 0)
     &        THEN
            ipool_a_l0_omp( leaf ) = dad(step(phys_l0_omp( i )))
      ipool_a_l0_omp(lpool_a_l0_omp) = leaf
      ipool_a_l0_omp(lpool_a_l0_omp-1) = 0
      ipool_a_l0_omp(lpool_a_l0_omp-2) = 0
      IF (leaf .GT. 1) THEN
     &      ipool_a_l0_omp(1), leaf, 1, leaf )
      IF (allocated(nb_task_per_thread)) DEALLOCATE(nb_task_per_thread)
      IF (allocated(inv_perm_l0_omp))    DEALLOCATE(inv_perm_l0_omp)
      IF (allocated(threads_tasks))      DEALLOCATE(threads_tasks)
     & / ' ** ALLOC FAILURE IN MUMPS_ANA_FINALIZE_L0_OMP FOR ',