Merge branch 'qe_dav' into 'develop'

Qe turbo_dav See merge request QEF/q-e!2633
2025-06-15 08:11:32 +00:00 · 2025-06-15 08:11:32 +00:00 · 5172d43d7d
parent afacb1cee7 772490ec7f
commit 5172d43d7d
29 changed files with 415 additions and 651 deletions
--- a/FFTXlib/src/fft_interpolate.f90
+++ b/FFTXlib/src/fft_interpolate.f90
@ -26,9 +26,11 @@ subroutine fft_interpolate_real (dfft_in, v_in, dfft_out, v_out )
  call start_clock ('interpolate')

  IF (dfft_out%grid_id == dfft_in%grid_id) THEN
-
+     !$acc data present_or_copyin(v_in) present_or_copyout(v_out)     
+     !$acc kernels
     v_out (1:dfft_in%nnr) = v_in (1:dfft_in%nnr)
-
+     !$acc end kernels
+     !$acc end data
  ELSE

     if (dfft_in%lgamma .neqv. dfft_out%lgamma) &
@ -37,7 +39,6 @@ subroutine fft_interpolate_real (dfft_in, v_in, dfft_out, v_out )
     ALLOCATE (aux_in( dfft_in%nnr), aux_out(dfft_out%nnr))

     aux_in (1:dfft_in%nnr) = v_in(1:dfft_in%nnr)
-
     CALL fwfft ('Rho', aux_in, dfft_in)

     aux_out(1:dfft_out%nnr) = (0.d0, 0.d0)
@ -45,7 +46,9 @@ subroutine fft_interpolate_real (dfft_in, v_in, dfft_out, v_out )
     ngm = min(dfft_in%ngm, dfft_out%ngm)

     aux_out (dfft_out%nl (1:ngm) ) = aux_in (dfft_in%nl (1:ngm) )
-     IF (dfft_in%lgamma) aux_out (dfft_out%nlm (1:ngm) ) = aux_in (dfft_in%nlm (1:ngm) )
+     IF (dfft_in%lgamma) THEN
+        aux_out (dfft_out%nlm (1:ngm) ) = aux_in (dfft_in%nlm (1:ngm) )
+     ENDIF

     CALL invfft ('Rho', aux_out, dfft_out)

@ -83,13 +86,14 @@ subroutine fft_interpolate_complex (dfft_in, v_in, dfft_out, v_out )
  call start_clock ('interpolate')

  IF (dfft_out%grid_id == dfft_in%grid_id) THEN
-
+     !$acc data present_or_copyin(v_in) present_or_copyout(v_out)
+     !$acc kernels
     v_out (1:dfft_in%nnr) = v_in (1:dfft_in%nnr)
-
+     !$acc end kernels
+     !$acc end data
  ELSE

     ALLOCATE (aux_in( dfft_in%nnr))
-
     aux_in (1:dfft_in%nnr) = v_in(1:dfft_in%nnr)

     CALL fwfft ('Rho', aux_in, dfft_in)
--- a/LR_Modules/dv_of_drho.f90
+++ b/LR_Modules/dv_of_drho.f90
@ -91,32 +91,32 @@ subroutine dv_of_drho (dvscf, drhoc)
  CALL fwfft ('Rho', dvscf(:,1), dfftp)
  !
  IF (do_comp_mt) THEN
-      !
-      ! Response Hartree potential with the Martyna-Tuckerman correction
-      !
-      allocate(dvhart(dfftp%nnr,nspin_mag))
-      dvhart(:,:) = (0.d0,0.d0)
-      !
-      do is = 1, nspin_lsda
+     !
+     ! Response Hartree potential with the Martyna-Tuckerman correction
+     !
+     allocate(dvhart(dfftp%nnr,nspin_mag))
+     dvhart(:,:) = (0.d0,0.d0)
+     !
+     do is = 1, nspin_lsda
        do ig = gstart, ngm
-          qg2 = (g(1,ig)+xq(1))**2 + (g(2,ig)+xq(2))**2 + (g(3,ig)+xq(3))**2
-          dvhart(dfftp%nl(ig),is) = e2 * fpi * dvscf(dfftp%nl(ig),1) / (tpiba2 * qg2)
+           qg2 = (g(1,ig)+xq(1))**2 + (g(2,ig)+xq(2))**2 + (g(3,ig)+xq(3))**2
+           dvhart(dfftp%nl(ig),is) = e2 * fpi * dvscf(dfftp%nl(ig),1) / (tpiba2 * qg2)
        enddo
-      enddo 
-      !
-      ! Add Martyna-Tuckerman correction to response Hartree potential
-      !
-      allocate( dvaux_mt( ngm ), rgtot(ngm) )
-      !
-      ! Total response density
-      !
-      do ig = 1, ngm
-         rgtot(ig) = dvscf(dfftp%nl(ig),1)
-      enddo
-      !
-      CALL wg_corr_h (omega, ngm, rgtot, dvaux_mt, eh_corr)
-      !
-      do is = 1, nspin_lsda
+     enddo 
+     !
+     ! Add Martyna-Tuckerman correction to response Hartree potential
+     !
+     allocate( dvaux_mt( ngm ), rgtot(ngm) )
+     !
+     ! Total response density
+     !
+     do ig = 1, ngm
+        rgtot(ig) = dvscf(dfftp%nl(ig),1)
+     enddo
+     !
+     CALL wg_corr_h (omega, ngm, rgtot, dvaux_mt, eh_corr)
+     !
+     do is = 1, nspin_lsda
        !
        do ig = 1, ngm
           dvhart(dfftp%nl(ig),is)  = dvhart(dfftp%nl(ig),is)  + dvaux_mt(ig)
@ -131,87 +131,77 @@ subroutine dv_of_drho (dvscf, drhoc)
        !
        CALL invfft ('Rho', dvhart (:,is), dfftp)
        !
-      enddo
-      !
-      ! At the end the two contributions (XC+Hartree) are added
-      ! 
-      dvscf = dvaux + dvhart
-      !
-      deallocate( dvaux_mt, rgtot ) 
-      deallocate(dvhart)
-      !
+     enddo
+     !
+     ! At the end the two contributions (XC+Hartree) are added
+     ! 
+     dvscf = dvaux + dvhart
+     !
+     deallocate( dvaux_mt, rgtot ) 
+     deallocate(dvhart)
+     !
  ELSE
-   !
-   ! Response Hartree potential (without Martyna-Tuckerman correction)
-   !
-   If (gamma_only) then
-      !
-      ! Gamma_only case
-      !
-      allocate(dvhart(dfftp%nnr,nspin_mag))
-      dvhart(:,:) = (0.d0,0.d0)
-      !
-      do is = 1, nspin_lsda
-        do ig = 1, ngm
-           qg2 = (g(1,ig)+xq(1))**2 + (g(2,ig)+xq(2))**2 + (g(3,ig)+xq(3))**2
-           if (qg2 > 1.d-8) then
-              dvhart(dfftp%nl(ig),is) = e2 * fpi * dvscf(dfftp%nl(ig),1) / (tpiba2 * qg2)
-              dvhart(dfftp%nlm(ig),is)=conjg(dvhart(dfftp%nl(ig),is))
-           endif
+     !
+     ! Response Hartree potential (without Martyna-Tuckerman correction)
+     !
+     If (gamma_only) then
+        !
+        ! Gamma_only case
+        !
+        allocate(dvhart(dfftp%nnr,nspin_mag))
+        dvhart(:,:) = (0.d0,0.d0)
+        !
+        do is = 1, nspin_lsda
+           do ig = 1, ngm
+              qg2 = (g(1,ig)+xq(1))**2 + (g(2,ig)+xq(2))**2 + (g(3,ig)+xq(3))**2
+              if (qg2 > 1.d-8) then
+                 dvhart(dfftp%nl(ig),is) = e2 * fpi * dvscf(dfftp%nl(ig),1) / (tpiba2 * qg2)
+                 dvhart(dfftp%nlm(ig),is)=conjg(dvhart(dfftp%nl(ig),is))
+              endif
+           enddo
+           !
+           ! Transformed back to real space
+           !
+           CALL invfft ('Rho', dvhart (:, is), dfftp)
+           !
        enddo
        !
-        ! Transformed back to real space
+        ! At the end the two contributes are added
        !
-        CALL invfft ('Rho', dvhart (:, is), dfftp)
+        dvscf = dvaux + dvhart
        !
-      enddo
-      !
-      ! At the end the two contributes are added
-      !
-      dvscf = dvaux + dvhart
-      !
-      deallocate(dvhart)
-      !
-   else
-      !
-      ! General k points implementation
-      !
-      do is = 1, nspin_lsda
-         CALL fwfft ('Rho', dvaux (:, is), dfftp)
-         IF (do_cutoff_2D) THEN 
-            call cutoff_dv_of_drho(dvaux, is, dvscf)
-         ELSE
-            do ig = 1, ngm
-               qg2 = (g(1,ig)+xq(1))**2 + (g(2,ig)+xq(2))**2 + (g(3,ig)+xq(3))**2
-               g2  = g(1,ig)**2 + g(2,ig)**2 + g(3,ig)**2
-               IF (lnolr) THEN
-                 !
-                 IF (g2 > 1d-8 .AND. qg2 > 1.d-8) THEN
-                    dvaux(dfftp%nl(ig),is) = dvaux(dfftp%nl(ig),is) + &
-                                   & e2 * fpi * dvscf(dfftp%nl(ig),1) / (tpiba2 * qg2)
-                 ENDIF
-                 ! 
-               ELSE
+        deallocate(dvhart)
+        !
+     else
+        !
+        ! General k points implementation
+        !
+        do is = 1, nspin_lsda
+           CALL fwfft ('Rho', dvaux (:, is), dfftp)
+           IF (do_cutoff_2D) THEN 
+              call cutoff_dv_of_drho(dvaux, is, dvscf)
+           ELSE
+              do ig = 1, ngm
+                 qg2 = (g(1,ig)+xq(1))**2 + (g(2,ig)+xq(2))**2 + (g(3,ig)+xq(3))**2
                 if (qg2 > 1.d-8) then
                    dvaux(dfftp%nl(ig),is) = dvaux(dfftp%nl(ig),is) + &
                                   & e2 * fpi * dvscf(dfftp%nl(ig),1) / (tpiba2 * qg2)
                 endif
-               ENDIF
-            enddo
-         ENDIF
-         !
-         ! Transformed back to real space
-         !
-         CALL invfft ('Rho', dvaux (:, is), dfftp)
-         !
-      enddo
-      !
-      ! At the end the two contributes are added
-      !
-      dvscf (:,:) = dvaux (:,:)
-      !
-   endif
-   !
+              enddo
+           ENDIF
+           !
+           ! Transformed back to real space
+           !
+           CALL invfft ('Rho', dvaux (:, is), dfftp)
+           !
+        enddo
+        !
+        ! At the end the two contributes are added
+        !
+        dvscf (:,:) = dvaux (:,:)
+        !
+     endif
+     !
  ENDIF
  !
  deallocate (dvaux)
--- a/LR_Modules/incdrhoscf.f90
+++ b/LR_Modules/incdrhoscf.f90
@ -51,7 +51,7 @@ subroutine incdrhoscf (drhoscf, weight, ik, dbecsum, dpsi)
  COMPLEX(DP), ALLOCATABLE :: tg_psi(:), tg_dpsi(:), tg_drho(:)

  INTEGER :: npw, npwq, ikk, ikq, itmp
-  INTEGER :: ibnd, ir, ir3, ig, incr, v_siz, idx, ioff, ioff_tg, nxyp
+  INTEGER :: ibnd, ir, ir3, ig, incr, v_siz, idx, ioff, ioff_tg, nxyp, sum_siz
  INTEGER :: right_inc, ntgrp
  ! counters

@ -60,7 +60,6 @@ subroutine incdrhoscf (drhoscf, weight, ik, dbecsum, dpsi)
  INTEGER, POINTER, DEVICE :: nl_d(:)
  !
  nl_d  => dffts%nl_d
-  !$acc update device(evc) 
 #else
  INTEGER, ALLOCATABLE :: nl_d(:)
  !
@ -93,12 +92,13 @@ subroutine incdrhoscf (drhoscf, weight, ik, dbecsum, dpsi)
     !
  ELSE
     v_siz = dffts%nnr
+     sum_siz = nhm*(nhm+1)/2
  ENDIF
  !
  ! dpsi contains the   perturbed wavefunctions of this k point
  ! evc  contains the unperturbed wavefunctions of this k point
  !
-  !$acc data copyin(dpsi(1:npwx,1:nbnd)) copy(drhoscf(1:v_siz)) create(psi(1:v_siz),dpsic(1:v_siz)) present(igk_k) deviceptr(nl_d) 
+  !$acc data present_or_copyin(dpsi(1:npwx,1:nbnd)) present_or_copy(drhoscf(1:v_siz)) create(psi(1:v_siz),dpsic(1:v_siz)) present(igk_k) deviceptr(nl_d)
  do ibnd = 1, nbnd_occ(ikk), incr
     !
     IF ( dffts%has_task_groups ) THEN
@ -183,11 +183,11 @@ subroutine incdrhoscf (drhoscf, weight, ik, dbecsum, dpsi)
     ENDIF
     !
  enddo ! loop on bands
-  !$acc end data
  !
  ! Ultrasoft contribution
  ! Calculate dbecsum = <evc|vkb><vkb|dpsi>
  ! 
+  !$acc end data
  CALL addusdbec (ik, weight, dpsi, dbecsum)
  !
  DEALLOCATE(psi)
--- a/LR_Modules/lr_sm1_psi.f90
+++ b/LR_Modules/lr_sm1_psi.f90
@ -45,15 +45,15 @@ SUBROUTINE lr_sm1_psi (ik, lda, n, m, psi, spsi)
  !
  CALL start_clock( 'lr_sm1_psi' )
  !
+  !$acc data present_or_copyin(psi) present_or_copyout(spsi)
  IF ( gamma_only ) THEN
-     !$acc data present_or_copyin(psi) present_or_copyout(spsi)     
     CALL sm1_psi_gamma()
-     !$acc end data
  ELSEIF (noncolin) THEN
     CALL sm1_psi_nc()
  ELSE
     CALL sm1_psi_k()
  ENDIF
+  !$acc end data
  !
  CALL stop_clock( 'lr_sm1_psi' )
  !
@ -90,8 +90,6 @@ CONTAINS
    !
    ! Initialize spsi : spsi = psi
    !
-!    !$acc data present(psi,spsi)
-    !
    !$acc kernels
    spsi(:,:) = psi(:,:)
    !$acc end kernels 
@ -129,7 +127,6 @@ CONTAINS
    !$acc end host_data
    !
    !$acc exit data delete(ps)
-!    !$acc end data
    DEALLOCATE(ps)
    !
    RETURN
@ -159,7 +156,9 @@ CONTAINS
    !
    ! Initialize spsi : spsi = psi
    !
-    CALL ZCOPY( lda*m, psi, 1, spsi, 1 )
+    !$acc kernels
+    spsi(:,:) = psi(:,:)
+    !$acc end kernels
    !
    IF ( nkb == 0 .OR. .NOT. okvan ) RETURN
    !
@ -175,7 +174,7 @@ CONTAINS
    !
    ! Calculate beta-functions vkb for a given k+q point.
    !
-    CALL init_us_2 (n, igk_k(1,ikq), xk(1,ikq), vkb)
+    CALL init_us_2 (n, igk_k(1,ikq), xk(1,ikq), vkb, .true.)
    !
    ! Compute the product of the beta-functions vkb with the functions psi
    ! at point k+q, and put the result in becp%k.
@ -230,7 +229,9 @@ SUBROUTINE sm1_psi_nc()
    !
    ! Initialize spsi : spsi = psi
    !
-    CALL ZCOPY( lda*npol*m, psi, 1, spsi, 1 )
+    !$acc kernels
+    spsi(:,:) = psi(:,:)
+    !$acc end kernels
    !
    IF ( nkb == 0 .OR. .NOT. okvan ) RETURN
    !
--- a/PHonon/PH/solve_e2.f90
+++ b/PHonon/PH/solve_e2.f90
@ -123,6 +123,7 @@ subroutine solve_e2
        ! reads unperturbed wavefunctions psi_k in G_space, for all bands
        !
        if (nksq.gt.1) call get_buffer(evc, lrwfc, iuwfc, ikk)
+        !$acc update device(evc)
        npw = ngk(ikk)
        npwq= ngk(ikq)
        !
--- a/PHonon/PH/solve_e_fpol.f90
+++ b/PHonon/PH/solve_e_fpol.f90
@ -168,6 +168,7 @@ subroutine solve_e_fpol( iw )
        ! read unperturbed wavefunctions psi_k in G_space, for all bands
        !
        if (nksq.gt.1) call get_buffer(evc, lrwfc, iuwfc, ik)
+        !$acc update device(evc)
        !
        ! compute beta functions and kinetic energy for k-point ik
        ! needed by h_psi, called by cch_psi_all, called by gmressolve_all
--- a/PHonon/PH/zstar_eu_us.f90
+++ b/PHonon/PH/zstar_eu_us.f90
@ -92,6 +92,7 @@ subroutine zstar_eu_us
     ikk = ikks(ik)
     npw = ngk(ikk)
     if (nksq.gt.1) call get_buffer (evc, lrwfc, iuwfc, ikk)
+     !$acc update device(evc)
     if (lsda) current_spin = isk (ikk)
     call init_us_2 (npw, igk_k(1,ikk), xk(1,ikk), vkb)
     weight = wk (ikk)
--- a/TDDFPT/CMakeLists.txt
+++ b/TDDFPT/CMakeLists.txt
@ -43,7 +43,6 @@ set(src_tddfpt
    src/lr_dav_routines.f90
    src/lr_us.f90
    src/lr_test_restart.f90
-    src/lr_dav_debug.f90
    src/plugin_tddfpt_potential.f90
    src/lr_sternheimer.f90
    src/linear_solvers.f90
--- a/TDDFPT/src/Makefile
+++ b/TDDFPT/src/Makefile
@ -62,7 +62,6 @@ lr_dav_variables.o\
 lr_dav_routines.o\
 lr_us.o\
 lr_test_restart.o\
-lr_dav_debug.o\
 plugin_tddfpt_potential.o\
 linear_solvers.o\
 orthogonalize_omega.o\
--- a/TDDFPT/src/bcast_lr_input.f90
+++ b/TDDFPT/src/bcast_lr_input.f90
@ -81,7 +81,6 @@ SUBROUTINE bcast_lr_input
  CALL mp_bcast (num_basis_max, ionode_id, world_comm )
  CALL mp_bcast (residue_conv_thr, ionode_id, world_comm )
  CALL mp_bcast (precondition, ionode_id, world_comm )
-  CALL mp_bcast (dav_debug, ionode_id, world_comm )
  CALL mp_bcast (reference, ionode_id, world_comm )
  CALL mp_bcast (vccouple_shift, ionode_id, world_comm )
  CALL mp_bcast (single_pole, ionode_id, world_comm )
@ -93,9 +92,7 @@ SUBROUTINE bcast_lr_input
  CALL mp_bcast (start, ionode_id, world_comm )
  CALL mp_bcast (finish, ionode_id, world_comm )
  CALL mp_bcast (step, ionode_id, world_comm )
-  CALL mp_bcast (if_check_orth, ionode_id, world_comm )
  CALL mp_bcast (if_random_init, ionode_id, world_comm )
-  CALL mp_bcast (if_check_her, ionode_id, world_comm )
  CALL mp_bcast (p_nbnd_occ, ionode_id, world_comm )
  CALL mp_bcast (p_nbnd_virt, ionode_id, world_comm )
  CALL mp_bcast (poor_of_ram, ionode_id, world_comm )
--- a/TDDFPT/src/lr_alloc_init.f90
+++ b/TDDFPT/src/lr_alloc_init.f90
@ -183,14 +183,8 @@ SUBROUTINE lr_alloc_init()
  ! Allocate the R-space unperturbed orbitals
  !
  IF (dffts%has_task_groups) THEN
-     ALLOCATE(tg_revc0(dffts%nnr_tg,nbnd,nksq))
     IF (.NOT. ALLOCATED(tg_psic)) &
          & ALLOCATE( tg_psic(dffts%nnr_tg) )
-  ELSE
-     IF (.NOT.eels) THEN
-        ALLOCATE(revc0(dffts%nnr,nbnd,nksq))
-        revc0(:,:,:) = (0.0d0,0.0d0)
-     ENDIF
  ENDIF
  !
  ! Optical case: allocate the response charge-density
--- a/TDDFPT/src/lr_apply_liouvillian.f90
+++ b/TDDFPT/src/lr_apply_liouvillian.f90
@ -38,7 +38,7 @@ SUBROUTINE lr_apply_liouvillian( evc1, evc1_new, interaction )
  USE gvect,                ONLY : ngm, gstart, g, gg
  USE io_global,            ONLY : stdout
  USE klist,                ONLY : nks, xk, ngk, igk_k
-  USE lr_variables,         ONLY : evc0, sevc0, revc0, rho_1, rho_1c, &
+  USE lr_variables,         ONLY : evc0, sevc0, rho_1, rho_1c, &
                                 & ltammd, size_evc, no_hxc, lr_exx, &
                                 & scissor, davidson, lr_verbosity
  USE lsda_mod,             ONLY : nspin
@ -102,7 +102,7 @@ SUBROUTINE lr_apply_liouvillian( evc1, evc1_new, interaction )
  ALLOCATE( sevc1_new(npwx*npol,nbnd,nks))
  !
  nnr_siz= dffts%nnr
-!$acc data present_or_copyin(evc1(1:npwx*npol,1:nbnd,1:nks)) present_or_copyout(evc1_new(1:npwx*npol,1:nbnd,1:nks)) create(sevc1_new(1:npwx*npol,1:nbnd,1:nks), spsi1(1:npwx, 1:nbnd)) present_or_copyin(revc0(1:nnr_siz,1:nbnd,1))
+!$acc data present_or_copyin(evc1(1:npwx*npol,1:nbnd,1:nks), evc0(1:npwx*npol,1:nbnd,1:nks)) present_or_copyout(evc1_new(1:npwx*npol,1:nbnd,1:nks)) create(sevc1_new(1:npwx*npol,1:nbnd,1:nks), spsi1(1:npwx, 1:nbnd)) 
  !
  d_deeq(:,:,:,:)=0.0d0
  !$acc kernels
@ -303,7 +303,6 @@ SUBROUTINE lr_apply_liouvillian( evc1, evc1_new, interaction )
  ! [H(k) - E(k)] * evc1(k). Keep this in mind if you want to
  ! generalize this subroutine to metals. 
  !
-  !$acc data copyin(evc0)
  DO ik = 1, nks
     !

@ -314,7 +313,6 @@ SUBROUTINE lr_apply_liouvillian( evc1, evc1_new, interaction )
     !$acc end kernels
     !
  ENDDO 
-  !$acc end data
  !
  !
  ! Here we apply the S^{-1} operator.
@ -349,7 +347,7 @@ CONTAINS
    !
    ! Gamma only case 
    !
-    USE lr_variables,             ONLY : becp_1, tg_revc0
+    USE lr_variables,             ONLY : becp_1
    USE realus,                   ONLY : tg_psic
    USE mp_global,                ONLY : me_bgrp
    USE mp_global,                ONLY : ibnd_start, ibnd_end, inter_bgrp_comm
@ -458,21 +456,22 @@ CONTAINS
          ! Product with the potential vrs = (vltot+vr)
          ! revc0 is on smooth grid. psic is used up to smooth grid
          !
+          CALL invfft_orbital_gamma(evc0(:,:,1),ibnd,nbnd)
+          !
          IF (dffts%has_task_groups) THEN
             !
             DO ir=1, dffts%nr1x*dffts%nr2x*dffts%my_nr3p
                !
-                tg_psic(ir) = tg_revc0(ir,ibnd,1)*CMPLX(tg_dvrss(ir),0.0d0,DP)
+                tg_psic(ir) = tg_psic(ir)*CMPLX(tg_dvrss(ir),0.0d0,DP)
                !
             ENDDO
             !
          ELSE
             !
-             !DO ir = 1,dffts%nnr
             !$acc parallel loop
             DO ir = 1, nnr_siz
                !
-                psic(ir) = revc0(ir,ibnd,1)*CMPLX(dvrss(ir),0.0d0,DP)
+                psic(ir) = psic(ir)*CMPLX(dvrss(ir),0.0d0,DP)
                !
             ENDDO
             !
@ -661,9 +660,16 @@ SUBROUTINE lr_apply_liouvillian_k()
             !
             !   Product with the potential vrs = (vltot+vr)
             !
+             psic(:) = (0.0d0,0.0d0)
+             !
+             DO ig = 1, ngk(ik)
+                psic(dffts%nl(igk_k(ig,ik)))=evc0(ig,ibnd,ik)
+             ENDDO
+             CALL invfft ('Wave', psic, dffts)
+             !
             DO ir = 1,dffts%nnr
                !
-                psic(ir) = revc0(ir,ibnd,ik)*dvrssc(ir)
+                psic(ir) = psic(ir)*dvrssc(ir)
                !
             ENDDO
             !
--- a/TDDFPT/src/lr_apply_liouvillian_eels.f90
+++ b/TDDFPT/src/lr_apply_liouvillian_eels.f90
@ -56,6 +56,7 @@ SUBROUTINE lr_apply_liouvillian_eels ( evc1, evc1_new, interaction )
  COMPLEX(DP), ALLOCATABLE :: hpsi(:,:), spsi(:,:), &
                            & dvrsc(:,:), dvrssc(:,:), &
                            & sevc1_new(:,:,:)
+  INTEGER :: npwx_siz
  ! Change of the Hartree and exchange-correlation (HXC) potential
  !
  CALL start_clock('lr_apply')
@ -73,6 +74,8 @@ SUBROUTINE lr_apply_liouvillian_eels ( evc1, evc1_new, interaction )
  dvrssc(:,:) = (0.0d0,0.0d0)
  sevc1_new(:,:,:) = (0.0d0,0.0d0)
  !
+  npwx_siz = npwx*npol
+  !
  IF (no_hxc) THEN
     interaction1 = .false.
  ELSE
@ -127,6 +130,7 @@ SUBROUTINE lr_apply_liouvillian_eels ( evc1, evc1_new, interaction )
  ! 1) HXC term : P_c^+(k+q) V_HXC(q)*psi0(k) 
  ! 2) (H - E)*psi(k+q)
  !
+!$acc data copyin(dvrssc)  
  DO ik = 1, nksq
     !
     ikk  = ikks(ik)
@ -164,7 +168,7 @@ SUBROUTINE lr_apply_liouvillian_eels ( evc1, evc1_new, interaction )
        ! We need to redistribute it so that it is completely contained in the
        ! processors of an orbital TASK-GROUP.
        !
-!$acc data copyin(dvrssc) copy(dvpsi)
+!$acc data copy(dvpsi)
        CALL apply_dpot_bands(ik, nbnd_occ(ikk), dvrssc, evc, dvpsi)
 !$acc end data        
        !
@ -196,7 +200,7 @@ SUBROUTINE lr_apply_liouvillian_eels ( evc1, evc1_new, interaction )
     ! Apply the operator ( H - \epsilon S + alpha_pv P_v) to evc1
     ! where alpha_pv = 0
     !
-     !$acc data copyin(evq) copy(evc1(1:npwx*npol,1:nbnd,ik),sevc1_new(1:npwx*npol,1:nbnd,ik), et)
+     !$acc data copyin(evq) copy(evc1(1:npwx_siz,1:nbnd,ik),sevc1_new(1:npwx_siz,1:nbnd,ik), et)
     CALL ch_psi_all (npwq, evc1(:,:,ik), sevc1_new(:,:,ik), et(:,ikk), ik, nbnd_occ(ikk)) 
     !$acc end data
     IF (noncolin) THEN
@ -234,6 +238,7 @@ SUBROUTINE lr_apply_liouvillian_eels ( evc1, evc1_new, interaction )
                          & sevc1_new(1,1,ik), evc1_new(1,1,ik))
     !
  ENDDO ! loop on ik
+!$acc end data  
  !
  CALL apply_dpot_deallocate()
  DEALLOCATE (dvrsc)
--- a/TDDFPT/src/lr_calc_dens.f90
+++ b/TDDFPT/src/lr_calc_dens.f90
@ -31,7 +31,7 @@ SUBROUTINE lr_calc_dens( evc1, response_calc )
  USE io_global,              ONLY : stdout
  USE kinds,                  ONLY : dp
  USE klist,                  ONLY : nks, xk, wk, ngk, igk_k
-  USE lr_variables,           ONLY : evc0,revc0,rho_1,lr_verbosity,&
+  USE lr_variables,           ONLY : evc0,rho_1,lr_verbosity,&
                                     & charge_response, itermax,&
                                     & cube_save, LR_iteration,&
                                     & LR_polarization, project,&
@ -39,7 +39,7 @@ SUBROUTINE lr_calc_dens( evc1, response_calc )
                                     & n_ipol, becp1_virt, rho_1c,&
                                     & lr_exx
  USE lsda_mod,               ONLY : current_spin, isk
-  USE wavefunctions,   ONLY : psic
+  USE wavefunctions,          ONLY : psic
  USE wvfct,                  ONLY : nbnd, et, wg, npwx
  USE control_flags,          ONLY : gamma_only
  USE uspp,                   ONLY : vkb, nkb, okvan, qq_nt, becsum
@ -60,7 +60,7 @@ SUBROUTINE lr_calc_dens( evc1, response_calc )
  USE lr_exx_kernel,          ONLY : lr_exx_kernel_int, revc_int,&
                                     & revc_int_c
  USE constants,              ONLY : eps12
-  USE qpoint,                ONLY : nksq
+  USE qpoint,                 ONLY : nksq
  USE fft_helper_subroutines
  !
  IMPLICIT NONE
@ -81,7 +81,7 @@ SUBROUTINE lr_calc_dens( evc1, response_calc )
  ! These are temporary buffers for the response 
  REAL(kind=dp), ALLOCATABLE :: rho_sum_resp_x(:), rho_sum_resp_y(:),&
                              & rho_sum_resp_z(:)  
-  COMPLEX(kind=dp), ALLOCATABLE :: rhoaux(:,:)
+  COMPLEX(kind=dp), ALLOCATABLE :: rhoaux(:,:), revc0(:), tg_revc0(:)
  CHARACTER(len=256) :: tempfile, filename
  !
  IF (lr_verbosity > 5) THEN
@ -90,15 +90,24 @@ SUBROUTINE lr_calc_dens( evc1, response_calc )
  !
  CALL start_clock_gpu('lr_calc_dens')
  !
+  IF ( dffts%has_task_groups ) THEN
+     !
+     ALLOCATE(tg_revc0(dfftp%nnr_tg))
+     !
+  ENDIF
+
+
  ALLOCATE( psic(dfftp%nnr) )
+  ALLOCATE( revc0(dfftp%nnr) )

  v_siz = dfftp%nnr
  nnr_siz = dffts%nnr
  !
-  !$acc data create(psic(1:v_siz)) present_or_copyin(evc1(1:npwx*npol,1:nbnd,1:nks)) present_or_copyin(revc0(1:nnr_siz,1:nbnd,1:nksq)) copyout( rho_1(1:v_siz,1:nspin_mag)) 
+  !$acc data create(psic(1:v_siz), revc0(1:v_siz)) present_or_copyin(evc1(1:npwx*npol,1:nbnd,1:nks), evc0(1:npwx*npol,1:nbnd,1:nks)) present_or_copyout( rho_1(1:v_siz,1:nspin_mag)) 
  !
  !$acc kernels
  psic(:) = (0.0d0, 0.0d0)
+  revc0(:) = (0.0d0, 0.0d0)
  !$acc end kernels
  !
  IF (gamma_only) THEN
@ -119,9 +128,7 @@ SUBROUTINE lr_calc_dens( evc1, response_calc )
     !
     IF ( doublegrid ) THEN
        print *, 'doublegrid', doublegrid
-        !$acc host_data use_device(rho_1(:,1))     
        CALL fft_interpolate(dffts, rho_1(:,1), dfftp, rho_1(:,1))
-        !$acc end host_data
     ENDIF
     !
 #if defined(__MPI)
@ -238,7 +245,13 @@ SUBROUTINE lr_calc_dens( evc1, response_calc )
  !
  !$acc end data
  !
-  DEALLOCATE ( psic )
+  DEALLOCATE ( psic, revc0 )
+  !
+  IF ( dffts%has_task_groups ) THEN
+     !
+     DEALLOCATE(tg_revc0)
+     !
+  ENDIF
  !
  IF (charge_response == 2 .AND. LR_iteration /=0) THEN
     !
@ -373,7 +386,7 @@ CONTAINS
    ! Gamma_only case
    !
    USE becmod,              ONLY : bec_type, becp, calbec
-    USE lr_variables,        ONLY : becp_1, tg_revc0
+    USE lr_variables,        ONLY : becp_1
    USE io_global,           ONLY : stdout
    USE realus,              ONLY : real_space, invfft_orbital_gamma,&
                                    initialisation_level,&
@ -412,6 +425,17 @@ CONTAINS
    ENDIF
    !
    DO ibnd = ibnd_start_gamma, ibnd_end_gamma, incr
+       !
+       ! FFT: evc0 -> psic -> revc0
+       !
+       CALL invfft_orbital_gamma(evc0(:,:,1),ibnd,nbnd)
+       IF (dffts%has_task_groups) THEN
+          tg_revc0(:) = tg_psic(:)     
+       ELSE
+          !$acc kernels     
+          revc0(:) = psic(:)
+          !$acc end kernels
+       ENDIF
       !
       ! FFT: evc1 -> psic
       !
@ -450,8 +474,8 @@ CONTAINS
          !
          DO ir = 1, dffts%nr1x * dffts%nr2x * dffts%my_nr3p
             tg_rho(ir) = tg_rho(ir) &
-                  + 2.0d0*(w1*real(tg_revc0(ir,ibnd,1),dp)*real(tg_psic(ir),dp)&
-                  +       w2*aimag(tg_revc0(ir,ibnd,1))*aimag(tg_psic(ir)))
+                  + 2.0d0*(w1*real(tg_revc0(ir),dp)*real(tg_psic(ir),dp)&
+                  +       w2*aimag(tg_revc0(ir))*aimag(tg_psic(ir)))  
          ENDDO
          !
       ELSE
@ -479,8 +503,8 @@ CONTAINS
          !$acc parallel loop
          DO ir = 1, nnr_siz
             rho_1(ir,1) = rho_1(ir,1) &
-                  + 2.0d0*(w1*dble(revc0(ir,ibnd,1))*dble(psic(ir))&
-                  +       w2*aimag(revc0(ir,ibnd,1))*aimag(psic(ir)))
+                  + 2.0d0*(w1*dble(revc0(ir))*dble(psic(ir))&
+                  +       w2*aimag(revc0(ir))*aimag(psic(ir)))
          ENDDO
          !
          ! OBM - psic now contains the response functions in real space.
@ -631,12 +655,15 @@ CONTAINS
          !
          ! FFT: evc1(:,:,ik) -> psic(:)
          !
+          revc0(:) = (0.0d0,0.0d0)
          psic(:) = (0.0d0,0.0d0)
          !
          DO ig = 1, ngk(ik)
+             revc0(dffts%nl(igk_k(ig,ik))) = evc0(ig,ibnd,ik)
             psic(dffts%nl(igk_k(ig,ik)))=evc1(ig,ibnd,ik)
          ENDDO
          !
+          CALL invfft ('Wave', revc0, dffts)
          CALL invfft ('Wave', psic, dffts)
          !
          ! I. Timrov: Try to use invfft_orbital_k
@ -648,7 +675,7 @@ CONTAINS
          !
          DO ir=1,dffts%nnr
             rho_1c(ir,:) = rho_1c(ir,:) &
-                  + 2.0d0*w1*conjg(revc0(ir,ibnd,ik))*psic(ir)
+                  + 2.0d0*w1*conjg(revc0(ir))*psic(ir)
          ENDDO
          !
          IF (lr_exx) CALL lr_exx_kernel_int (evc1(:,:,ik), ibnd, nbnd, ik )
--- a/TDDFPT/src/lr_calc_dens_eels.f90
+++ b/TDDFPT/src/lr_calc_dens_eels.f90
@ -29,7 +29,7 @@ SUBROUTINE lr_calc_dens_eels (drhoscf, dpsi)
  USE wvfct,                 ONLY : nbnd, npwx
  USE gvecw,                 ONLY : gcutw
  USE qpoint,                ONLY : nksq, ikks, ikqs
-  USE wavefunctions,  ONLY : evc
+  USE wavefunctions,         ONLY : evc
  USE uspp_param,            ONLY : nhm 
  USE uspp,                  ONLY : okvan, vkb
  USE mp_global,             ONLY : inter_pool_comm, intra_bgrp_comm
@ -54,11 +54,18 @@ SUBROUTINE lr_calc_dens_eels (drhoscf, dpsi)
              ikq, & ! index of the point k+q
              npwq   ! number of the plane-waves at point k+q
  REAL(DP) :: weight ! weight of the k point
+  INTEGER :: nnr_siz, nnrs_siz
  !
-  CALL start_clock('lr_calc_dens')
+  CALL start_clock_gpu('lr_calc_dens')
  !
+  nnr_siz = dfftp%nnr
+  nnrs_siz = dffts%nnr
  ALLOCATE ( drhoscfh(dffts%nnr) )
+!$acc data present_or_copyin(dpsi(1:npwx,1:nbnd,1:nksq)) present_or_copyout(drhoscf(1:nnr_siz)) create(drhoscfh(1:nnrs_siz))
+  !
+  !$acc kernels
  drhoscfh(:) = (0.0d0, 0.0d0)
+  !$acc end kernels
  !
  IF (okvan) THEN
     ALLOCATE (dbecsum(nhm*(nhm+1)/2,nat)) 
@ -74,6 +81,7 @@ SUBROUTINE lr_calc_dens_eels (drhoscf, dpsi)
     ! Read the unperturbed wavefuctions evc at k
     !
     IF (nksq > 1) CALL get_buffer (evc, nwordwfc, iunwfc, ikk)
+     !$acc update device(evc)
     !
     ! The weight of the k point
     !
@ -81,7 +89,8 @@ SUBROUTINE lr_calc_dens_eels (drhoscf, dpsi)
     !
     ! Calculate beta-functions vkb at point k+q
     !
-     IF (okvan) CALL init_us_2 (npwq, igk_k(1,ikq), xk(1,ikq), vkb)
+     IF (okvan) CALL init_us_2 (npwq, igk_k(1,ikq), xk(1,ikq), vkb, .true.)
+     !$acc update host(vkb)
     !
     ! Calculation of the response charge density
     !
@ -114,12 +123,16 @@ SUBROUTINE lr_calc_dens_eels (drhoscf, dpsi)
  ! Reduce the response charge density across pools.
  !
 #if defined(__MPI)
+  !$acc host_data use_device(drhoscf)
  CALL mp_sum (drhoscf, inter_pool_comm)
+  !$acc end host_data
 #endif
  !
  ! Symmetrization of the charge density response
  ! wrt the small group of q.
  !
+!$acc end data
+  !  
 #if defined(__MPI)
  CALL lr_psym_eels(drhoscf)
 #else
@ -129,7 +142,7 @@ SUBROUTINE lr_calc_dens_eels (drhoscf, dpsi)
  DEALLOCATE (drhoscfh)
  IF (okvan) DEALLOCATE (dbecsum)
  !
-  CALL stop_clock('lr_calc_dens')
+  CALL stop_clock_gpu('lr_calc_dens')
  !
  RETURN
  !
--- a/TDDFPT/src/lr_dav_debug.f90
+++ b/TDDFPT/src/lr_dav_debug.f90
@ -1,274 +0,0 @@
-!
-! Copyright (C) 2001-2015 Quantum ESPRESSO group
-! This file is distributed under the terms of the
-! GNU General Public License. See the file `License'
-! in the root directory of the present distribution,
-! or http://www.gnu.org/copyleft/gpl.txt .
-!
-!--------------------------------------------------------------------
-module lr_dav_debug
-!----------------------------------------------------------------------------
-! Created by Xiaochuan Ge (Aprile, 2013)
-!-----------------------------------------------------------------------
-contains
-
-  ! subroutine for debug
-  subroutine check_orth()
-    !-------------------------------------------------------------------------------
-    ! Created by X.Ge in Jan. 2013
-    !-------------------------------------------------------------------------------
-    use lr_dav_variables
-    use io_global,   only : stdout
-    use lr_us,   only : lr_dot_us
-    use uspp,    only : okvan
-    
-    implicit none
-    complex(kind=dp),external :: lr_dot
-    integer :: ia, ib
-    real(dp) :: inner_err,temp
-    
-    inner_err=0.0d0
-    do ia = 1, num_basis
-      do ib = 1, num_basis 
-        if( poor_of_ram .or. .not. okvan) then
-          inner_matrix(ia,ib)=dble(lr_dot_us(vec_b(:,:,1,ia),vec_b(:,:,1,ib)))
-        else
-          inner_matrix(ia,ib)=dble(lr_dot(svec_b(:,:,1,ia),vec_b(:,:,1,ib)))
-        endif
-        if(ia==ib) then
-          temp=(inner_matrix(ia,ib)-1)**2
-        else
-          temp=inner_matrix(ia,ib)**2
-        endif
-        inner_err=inner_err+temp
-        if(temp .gt. 10E-10) &
-          &write(stdout,*) "Warning, the inner product between ",ia," and ",ib," is : ",temp
-      enddo
-    enddo
-    
-    inner_err=inner_err/(num_basis*num_basis)
-    write(stdout,'(/5x,"The error of the orthonalization of the basis is:",&
-            & 5x,E20.12)') inner_err
-    call check("inner_matrix")
-    return
-  end subroutine check_orth
-  !-------------------------------------------------------------------------------
-
-  subroutine check(flag_check)
-    !-------------------------------------------------------------------------------
-    ! Created by X.Ge in Jan. 2013
-    !-------------------------------------------------------------------------------
-    ! For debuging
-    use lr_dav_variables
-    use kinds,     only : dp
-    use io_global, only : stdout
-    USE wvfct,         ONLY : nbnd,npwx
-    USE klist,             ONLY : nks
-    use lr_variables,    only : evc0, sevc0
-
-    implicit none
-    integer :: ia,ib
-    character(len=*) :: flag_check
-   
-    if(.not. dav_debug) return
-
-    write(stdout,'(/5x,"You are checking some information for debugging")')
-    if(flag_check == "M_C") then
-      write(stdout, '(7x,"Matrix C is:")')
-      do ia=1,num_basis
-        write(stdout,'(7x,10F15.8)') (dble(M_C(ia,ib)),ib=1,min(10,num_basis))
-      enddo
-    endif
-
-    if(flag_check == "M_D") then
-      write(*, '(7x,"Matrix D is:")')
-      do ia=1,num_basis
-        write(stdout,'(7x,10F15.8)') (dble(M_D(ia,ib)),ib=1,min(10,num_basis))
-      enddo
-    endif
-
-    if(flag_check == "M") then
-      write(*, '(7x,"Matrix DC is:")')
-      do ia=1,num_basis
-        write(stdout,'(7x,10F15.8)') (dble(M(ia,ib)),ib=1,min(10,num_basis))
-      enddo
-    endif
-
-    if(flag_check == "right_M") then
-      write(*, '(7x,"Matrix right_M is:")')
-      do ia=1,num_basis
-        write(stdout,'(7x,10F15.8)') (dble(right_M(ia,ib)),ib=1,min(10,num_basis))
-      enddo
-    endif
-
-    if(flag_check == "left_M") then
-      write(*, '(7x,"Matrix left_M is:")')
-      do ia=1,num_basis
-        write(stdout,'(7x,10F15.8)') (dble(left_M(ia,ib)),ib=1,min(10,num_basis))
-      enddo
-    endif
-
-    if(flag_check == "inner_matrix") then
-      write(*, '(7x,"Inner_matrix is:")')
-      do ia=1,num_basis
-        write(stdout,'(7x,10F15.8)') (dble(inner_matrix(ia,ib)),ib=1,min(10,num_basis))
-      enddo
-    endif
-
-    return
-  end subroutine check
-  !-------------------------------------------------------------------------------
-
-  subroutine check_overlap(vec)
-    !-------------------------------------------------------------------------------
-    ! Created by X.Ge in Apr. 2013
-    !-------------------------------------------------------------------------------
-    ! To see if the vector is othogonal to occupied space
-    
-    use lr_variables,         only : evc0
-    use wvfct,                only : nbnd,npwx
-    use io_global,            only : stdout
-    use kinds,                 only : dp
-    use lr_us,     only : lr_dot_us
-
-    implicit none
-    complex(dp) :: vec(npwx,nbnd),occupy(npwx,nbnd), overlap
-    integer :: i,j
-
-    overlap=0.0d0
-    do i = 1, nbnd
-      do j = 1, nbnd
-        occupy(:,j)=evc0(:,i,1)
-      enddo
-      overlap=overlap+lr_dot_us(vec(:,:),occupy(:,:))
-    enddo
-    write(stdout,'("!!!! the tot overlap with the occupied space is:",5x,E20.12)') dble(overlap)
-
-    return
-  end subroutine check_overlap
-  !-------------------------------------------------------------------------------
-
-  subroutine check_overlap_basis(vec)
-    !-------------------------------------------------------------------------------
-    ! Created by X.Ge in Apr. 2013
-    !-------------------------------------------------------------------------------
-    ! Check the overlap between residue and basis
-    use lr_variables,         only : evc0
-    use wvfct,                only : nbnd,npwx
-    use kinds,                 only : dp
-    use io_global,            only : stdout
-    use lr_dav_variables,      only : vec_b,num_basis
-
-    implicit none
-
-    complex(kind=dp),external :: lr_dot
-    complex(dp) :: vec(npwx,nbnd),overlap
-    integer :: i
-
-    overlap=0.0d0
-    do i = 1, num_basis
-      overlap = overlap + lr_dot(vec(:,:),vec_b(:,:,1,i))**2
-    enddo
-
-    write(stdout,'("!!!! the tot overlap of the residue with the basis space is:",5x,E20.12)') dble(overlap)
-   
-    return
-  end subroutine check_overlap_basis
-  !-------------------------------------------------------------------------------
-
-  subroutine check_revc0()
-    !-------------------------------------------------------------------------------
-    ! Created by X.Ge in Apr. 2013
-    !-------------------------------------------------------------------------------
-    ! Due to the bug of virt_read, this is to check if revc0 is correct
-    use lr_variables,    only : evc0, revc0
-    use wvfct,           only : nbnd, npwx
-    use kinds,           only : dp
-    use fft_base,             only : dffts
-    use mp_global,            only : intra_bgrp_comm
-    use mp_world,             only : world_comm
-    use mp,                   only : mp_sum, mp_barrier
-    use lr_dav_variables
-    USE cell_base,              ONLY : omega
-    USE wavefunctions, ONLY : psic
-    USE realus,              ONLY : invfft_orbital_gamma, fwfft_orbital_gamma
-      USE gvect,                ONLY : gstart
-
-    implicit none
-    integer :: i,tot_nnr
-    real(dp) :: norm,banda(dffts%nnr),bandb(dffts%nnr)
-    real(kind=dp), external    :: ddot
-    complex(kind=dp),external :: lr_dot   
-    complex(dp) :: wfck(npwx,1)
-
-    ALLOCATE( psic(dffts%nnr) )
-    
-    tot_nnr=dffts%nr1x*dffts%nr2x*dffts%nr3x
-    
-    do i = 1, nbnd,2
-      banda(:)=dble(revc0(:,i,1))     
-      bandb(:)=aimag(revc0(:,i,1))     
-      wfck(:,1)=evc0(:,i,1)
-      call invfft_orbital_gamma(wfck(:,:),1,1)  ! FFT: v  -> psic
-      norm=DDOT(dffts%nnr,psic(:),2,banda,1)/tot_nnr
-#if defined(__MPI)
-      call mp_barrier( world_comm )
-      call mp_sum(norm,intra_bgrp_comm)
-#endif
-      print *, norm
-
-      wfck(:,1)=evc0(:,i+1,1)
-      call invfft_orbital_gamma(wfck(:,:),1,1)  ! FFT: v  -> psic
-      norm=DDOT(dffts%nnr,psic(:),2,bandb,1)/tot_nnr
-#if defined(__MPI)
-      call mp_barrier( world_comm )
-      call mp_sum(norm,intra_bgrp_comm)
-#endif
-      print *, norm
-    enddo
-!call mp_stop(100)
-    return
-  end subroutine check_revc0
-  !-------------------------------------------------------------------------------
-
-  subroutine check_hermitian()
-    !-------------------------------------------------------------------------------
-    ! Created by X.Ge in Apr. 2013
-    !-------------------------------------------------------------------------------
-    use lr_dav_variables
-    use lr_variables, only : evc0,sevc0
-    use kinds,     only : dp
-    use lr_us,  only : lr_dot_us
-
-    implicit none
-    integer :: ibr,ibl,ibr0,ibl0
-    real(dp) :: diff,diffmax
-
-    do ibr = 1, num_basis
-      call lr_apply_liouvillian(vec_b(:,:,:,ibr),vecwork(:,:,:),.true.)
-      do ibl = 1, num_basis
-        M_C(ibl,ibr)=lr_dot_us(vec_b(1,1,1,ibl),vecwork(1,1,1))
-      enddo
-    enddo
-
-    diffmax=0.0d0
-    ibr0=1;ibl0=1
-    do ibr = 1, num_basis-1
-      do ibl = ibr+1, num_basis
-        diff = abs(dble(M_C(ibl,ibr))-dble(M_C(ibr,ibl)))
-        print *, diff,dble(M_C(ibl,ibr)),dble(M_C(ibr,ibl))
-        if( diff .gt. diffmax ) then
-          diffmax=diff
-          ibr0=ibr;ibl0=ibl
-        endif
-      enddo
-    enddo
-
-    print *, "Max|C(i,j)-C(j,i)|=",diffmax
-    print *, "i,j=",ibl0,ibr0
-    print *, "C(i,j);C(j,i)",dble(M_C(ibl0,ibr0)),dble(M_C(ibr0,ibl0))
-
-  end subroutine check_hermitian
-  !-------------------------------------------------------------------------------
-
-END MODULE lr_dav_debug
--- a/TDDFPT/src/lr_dav_main.f90
+++ b/TDDFPT/src/lr_dav_main.f90
@ -20,7 +20,7 @@ PROGRAM lr_dav_main
  USE lr_variables,          ONLY : restart, restart_step,&
       evc1,n_ipol, d0psi, &
       no_hxc, nbnd_total, &
-       revc0, lr_io_level, code1,davidson
+       lr_io_level, code1,davidson
  USE ions_base,             ONLY : tau,nat,atm,ityp
  USE environment,           ONLY : environment_start
  USE mp_global,             ONLY : nimage, mp_startup, inter_bgrp_comm, &
@ -32,7 +32,6 @@ PROGRAM lr_dav_main
  USE xc_lib,                ONLY : xclib_dft_is
  use lr_dav_routines
  use lr_dav_variables
-  use lr_dav_debug
  !
 #if defined (__ENVIRON)
  USE plugin_flags,          ONLY : use_environ
@ -96,11 +95,9 @@ PROGRAM lr_dav_main
  CALL lr_dv_setup()

  !   Davidson loop
-  !$acc data copyin(revc0(:,:,:))
  if (precondition) write(stdout,'(/5x,"Precondition is used in the algorithm,")')
  do while (.not. dav_conv .and. dav_iter .lt. max_iter)
    dav_iter=dav_iter+1
-      if(if_check_orth) call check_orth()
      ! In one david step, M_C,M_D and M_CD are first constructed;then will be
        ! solved rigorously; then the solution in the subspace left_sub() will
        ! be transformed into full space left_full()
@ -116,15 +113,9 @@ PROGRAM lr_dav_main
      endif
      !
  enddo
-  !$acc end data
-  ! call check_hermitian()
  ! Extract physical meaning from the solution
  if ( check_stop_now() ) goto 100
  call interpret_eign('END')
-  ! The check_orth at the end may take quite a lot of time in the case of 
-  ! USPP because we didn't store the S* vector basis. Turn this step on only
-  ! in cases of debugging
-  ! call check_orth() 
  if(lplot_drho) call plot_drho()

 100 continue
--- a/TDDFPT/src/lr_dav_routines.f90
+++ b/TDDFPT/src/lr_dav_routines.f90
@ -26,7 +26,6 @@ contains
                                & if_dft_spectrum, reference
    use lr_variables, only : nbnd_total
    use io_global,    only : stdout
-    use lr_dav_debug
 
    implicit none
    integer :: ib,ic,iv
@ -186,14 +185,13 @@ contains
    use kinds,         only : dp
    use wvfct,         only : nbnd, npwx, et
    use lr_dav_variables
-    use lr_variables,         only : evc0, sevc0 ,revc0, evc0_virt,&
+    use lr_variables,         only : evc0, sevc0 , evc0_virt,&
                                   & sevc0_virt, nbnd_total, davidson, restart
    use io_global,    only : stdout
    use wvfct,       only : npwx,nbnd,et
    use gvect,                only : gstart
    use uspp,           only : okvan
    use lr_us
-    use lr_dav_debug

    implicit none
    integer :: ib,ia,ipw,ibnd
@ -231,7 +229,6 @@ contains
    dav_iter=0
 101 continue
    dav_conv=.false.
-    ! call check_orth()
    write(stdout,'(5x,"Finished initiating.")')
    !
    RETURN
@ -437,7 +434,6 @@ contains
    use lr_us
    use uspp,           only : okvan
    use lr_dav_variables
-    use lr_dav_debug
    use lr_us

    implicit none
@ -605,7 +601,6 @@ contains
    use lr_us
    use uspp,           only : okvan
    use lr_dav_variables
-    use lr_dav_debug
    use lr_us

    implicit none
@ -629,9 +624,6 @@ contains
    call start_clock("matrix")
    call ZGEMM('N', 'N', num_basis,num_basis,num_basis,(1.0D0,0.0D0),M_D,&
               num_basis_max,M_C,num_basis_max,(0.0D0,0.0D0), M,num_basis_max)
-    call check("M_C")
-    call check("M_D")
-    call check("M")

    ! Solve M_DC
    ! It is dangerous to be "solved", use its shadow avatar in order to protect the original one
@ -672,8 +664,8 @@ contains
 #endif

    ! Recover eigenvectors in the whole space
-    left_full(:,:,:,:)=0.0d0
-    right_full(:,:,:,:)=0.0d0
+    left_full(:,:,:,:)=(0.0d0, 0.0d0)
+    right_full(:,:,:,:)=(0.0d0, 0.0d0)
    do ieign = 1, num_eign
      do ibr = 1, num_basis
        left_full(:,:,1,ieign)=left_full(:,:,1,ieign)+left_M(ibr,eign_value_order(ieign))*vec_b(:,:,1,ibr)
@ -733,7 +725,6 @@ contains
    use io_global, only : stdout
    use wvfct,                only : nbnd, npwx
    use uspp,           only : okvan
-    use lr_dav_debug
    use lr_us
    
    implicit none
@ -842,7 +833,6 @@ contains
    use io_global,       only : stdout
    use uspp,           only : okvan
    use lr_us
-    use lr_dav_debug
    use wvfct,                only : npwx
    
    implicit none
@ -1542,7 +1532,7 @@ contains
    use kinds,         only : DP
    use wvfct,         only : nbnd,et
    use lr_dav_variables
-    use lr_variables,         only : evc0, sevc0 ,revc0, evc0_virt
+    use lr_variables,         only : evc0, sevc0 , evc0_virt
    use wvfct,                only : npwx,wg
    use fft_base,             only : dffts,dfftp
    use uspp,           only : okvan
@ -1983,7 +1973,6 @@ contains
    use charg_resp,           only : lr_calc_R
    use io_global,    only : stdout,ionode
    use io_files,      only : prefix
-    use lr_dav_debug

    implicit none
    integer :: ib,ic,iv,i
--- a/TDDFPT/src/lr_dav_variables.f90
+++ b/TDDFPT/src/lr_dav_variables.f90
@ -29,9 +29,9 @@ MODULE lr_dav_variables
             p_nbnd_occ,p_nbnd_virt ! number of occ and virt bands for projection
  REAL(kind=dp) :: residue_conv_thr, reference, close_pre, broadening,start,&
                   finish,step,turn2planb, vccouple_shift
-  logical :: precondition,dav_debug, single_pole,&
-             sort_contr,diag_of_h,print_spectrum,if_check_orth,&
-             if_random_init,if_check_her,poor_of_ram,poor_of_ram2,&
+  logical :: precondition, single_pole,&
+             sort_contr,diag_of_h,print_spectrum,&
+             if_random_init,poor_of_ram,poor_of_ram2,&
             conv_assistant,if_dft_spectrum,lplot_drho

  !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
--- a/TDDFPT/src/lr_dealloc.f90
+++ b/TDDFPT/src/lr_dealloc.f90
@ -46,9 +46,7 @@ SUBROUTINE lr_dealloc()
  IF (allocated(sevc1))     DEALLOCATE(sevc1)
  IF (allocated(d0psi))     DEALLOCATE(d0psi)
  IF (allocated(d0psi2))    DEALLOCATE(d0psi2)
-  IF (allocated(tg_revc0))  DEALLOCATE(tg_revc0)
  IF (allocated(tg_psic))   DEALLOCATE(tg_psic)
-  IF (allocated(revc0))     DEALLOCATE(revc0)
  IF (allocated(bbg))       DEALLOCATE(bbg)
  IF (allocated(bbk))       DEALLOCATE(bbk)
  IF (allocated(bbnc))      DEALLOCATE(bbnc)
--- a/TDDFPT/src/lr_eels_main.f90
+++ b/TDDFPT/src/lr_eels_main.f90
@ -23,7 +23,7 @@ PROGRAM lr_eels_main
                                  & evc1, evc1_old, norm0, n_ipol, &
                                  & d0psi, d0psi2, LR_iteration, LR_polarization, &
                                  & plot_type, nbnd_total, pseudo_hermitian, &
-                                  & itermax_int, revc0, lr_io_level, code2, &
+                                  & itermax_int, lr_io_level, code2, &
                                  & eels, approximation, calculator, fru, fiu, &
                                  & current_w, nfs, start_freq, last_freq, chirr, &
                                  & chirz, chizz, chizr, epsm1 
--- a/TDDFPT/src/lr_exx_kernel.f90
+++ b/TDDFPT/src/lr_exx_kernel.f90
@ -26,7 +26,7 @@ MODULE lr_exx_kernel
  !

  USE kinds,                  ONLY : DP
-  USE lr_variables,           ONLY : evc0, revc0
+  USE lr_variables,           ONLY : evc0
  USE fft_base,               ONLY : dffts
  USE fft_interfaces,         ONLY : invfft, fwfft
  USE lsda_mod,               ONLY : nspin
--- a/TDDFPT/src/lr_magnons_main.f90
+++ b/TDDFPT/src/lr_magnons_main.f90
@ -22,7 +22,7 @@ PROGRAM lr_magnons_main
                                  & evc1, evc1_old, norm0, n_ipol, &
                                  & d0psi, d0psi2, LR_iteration, LR_polarization, &
                                  & plot_type, nbnd_total, pseudo_hermitian, &
-                                  & itermax_int, revc0, lr_io_level, code3, &
+                                  & itermax_int, lr_io_level, code3, &
                                  & magnons, approximation, V0psi, ipol, n_op, &
                                  & evc1_rgt, evc1_lft, evc1_rgt_old, evc1_lft_old
  USE io_files,              ONLY : nd_nmbr
--- a/TDDFPT/src/lr_main.f90
+++ b/TDDFPT/src/lr_main.f90
@ -28,7 +28,7 @@ PROGRAM lr_main
                                  & evc1, evc1_old,norm0, charge_response, n_ipol, &
                                  & d0psi, LR_iteration, LR_polarization, &
                                  & plot_type, no_hxc, nbnd_total, project, F,R, &
-                                  & itermax_int, revc0, lr_io_level, code1
+                                  & itermax_int, lr_io_level, code1
  USE charg_resp,            ONLY : lr_calc_w_T, read_wT_beta_gamma_z, lr_project_init,&
                                  & lr_dump_rho_tot_compat1, lr_dump_rho_tot_cube,&
                                  & lr_dump_rho_tot_xyzd,lr_dump_rho_tot_xcrys,&
@ -221,7 +221,9 @@ PROGRAM lr_main
     IF (okvan) CALL sd0psi() 
     !
     ! Loop on the Lanczos iterations
-     ! 
+     !
+!     !$acc data copyin(evc0, evc1, evc1_old) create(evc1_new, sevc1_new) 
+     !
     lancz_loop1 : DO iteration = iter_restart, itermax
        !
        LR_iteration = iteration
@ -240,6 +242,8 @@ PROGRAM lr_main
        !
        IF ( check_stop_now() ) THEN
           !
+!           !$acc update host(evc1, evc1_old)
+           !
           CALL lr_write_restart()
           !
           ! Deallocate PW variables.
@ -253,6 +257,8 @@ PROGRAM lr_main
        !
     ENDDO lancz_loop1
     !
+!     !$acc end data
+     !
     IF (charge_response == 1) THEN
        !
        ! Write the apropriate charge density plot.
--- a/TDDFPT/src/lr_ortho.f90
+++ b/TDDFPT/src/lr_ortho.f90
@ -50,6 +50,9 @@ SUBROUTINE lr_ortho(dvpsi, evq, ikk, ikq, sevc, inverse)
  use lr_variables,      ONLY : lr_verbosity
  use control_flags,     only : gamma_only
  USE io_global,         ONLY : stdout
+#if defined(__CUDA)
+  USE cublas
+#endif  
  !
  IMPLICIT NONE
  INTEGER, INTENT(in) :: ikk, ikq   
@ -71,6 +74,7 @@ SUBROUTINE lr_ortho(dvpsi, evq, ikk, ikq, sevc, inverse)
  !else
    inverse_mode = inverse
  !endif
+  !$acc data present_or_copyin(evq(1:npwx*npol,1:nbnd), sevc(1:npwx*npol,1:nbnd)) present_or_copy(dvpsi(1:npwx*npol,1:nbnd))
  !
  IF (gamma_only) THEN
     !
@ -86,6 +90,8 @@ SUBROUTINE lr_ortho(dvpsi, evq, ikk, ikq, sevc, inverse)
     !
  ENDIF
  !
+  !$acc end data
+  !
  CALL stop_clock ('lr_ortho')
  !
  RETURN
@ -100,121 +106,149 @@ SUBROUTINE lr_ortho_k()

  COMPLEX(DP), ALLOCATABLE :: ps(:,:)
  INTEGER :: ibnd, jbnd, nbnd_eff
-  REAL(DP) :: wg1, w0g, wgp, wwg, deltae, theta
+  REAL(DP) :: wg1, w0g, wgp, wwg(nbnd), deltae, theta
  REAL(DP), EXTERNAL :: w0gauss, wgauss

  ALLOCATE(ps(nbnd,nbnd))
  !
+  !$acc enter data create(ps(:,:))
+  !
  IF (lgauss) THEN
-       !
-       !  metallic case
-       !
-       ps = (0.d0, 0.d0)
-       IF (inverse_mode) THEN
-          CALL ZGEMM( 'C', 'N', nbnd, nbnd_occ(ikk), ngk(ikk), (1.d0,0.d0), &
-                      sevc, npwx, dvpsi, npwx, (0.d0,0.d0), ps, nbnd )
-       ELSE
-          CALL ZGEMM( 'C', 'N', nbnd, nbnd_occ(ikk), ngk(ikk), (1.d0,0.d0), &
-                      evq, npwx, dvpsi, npwx, (0.d0,0.d0), ps, nbnd )
-       ENDIF
-       !
-       DO ibnd = 1, nbnd_occ (ikk)
-          wg1 = wgauss ((ef-et(ibnd,ikk)) / degauss, ngauss)
-          w0g = w0gauss((ef-et(ibnd,ikk)) / degauss, ngauss) / degauss
-          DO jbnd = 1, nbnd
-             wgp = wgauss ( (ef - et (jbnd, ikq) ) / degauss, ngauss)
-             deltae = et (jbnd, ikq) - et (ibnd, ikk)
-             theta = wgauss (deltae / degauss, 0)
-             wwg = wg1 * (1.d0 - theta) + wgp * theta
-             IF (jbnd <= nbnd_occ (ikq) ) THEN
-                IF (abs (deltae) > 1.0d-5) THEN
-                    wwg = wwg + alpha_pv * theta * (wgp - wg1) / deltae
-                ELSE
-                   !
-                   !  if the two energies are too close takes the limit
-                   !  of the 0/0 ratio
-                   !
-                   wwg = wwg - alpha_pv * theta * w0g
-                ENDIF
-             ENDIF
-             !
-             ps(jbnd,ibnd) = wwg * ps(jbnd,ibnd)
-             !
-          ENDDO
-          CALL DSCAL (2*ngk(ikk), wg1, dvpsi(1,ibnd), 1)
-       ENDDO
-       !
-       nbnd_eff = nbnd
-       !
+     !
+     !  metallic case
+     !
+     !$acc kernels
+     ps(:,:) = (0.d0, 0.d0)
+     !$acc end kernels
+     !
+     !$acc host_data use_device(sevc, evq, dvpsi, ps)
+     !
+     IF (inverse_mode) THEN
+        CALL ZGEMM( 'C', 'N', nbnd, nbnd_occ(ikk), ngk(ikk), (1.d0,0.d0), &
+                    sevc, npwx, dvpsi, npwx, (0.d0,0.d0), ps, nbnd )
+     ELSE
+        CALL ZGEMM( 'C', 'N', nbnd, nbnd_occ(ikk), ngk(ikk), (1.d0,0.d0), &
+                    evq, npwx, dvpsi, npwx, (0.d0,0.d0), ps, nbnd )
+     ENDIF
+     !
+     !$acc end host_data
+     !
+     DO ibnd = 1, nbnd_occ (ikk)
+        wg1 = wgauss ((ef-et(ibnd,ikk)) / degauss, ngauss)
+        w0g = w0gauss((ef-et(ibnd,ikk)) / degauss, ngauss) / degauss
+        DO jbnd = 1, nbnd
+           wgp = wgauss ( (ef - et (jbnd, ikq) ) / degauss, ngauss)
+           deltae = et (jbnd, ikq) - et (ibnd, ikk)
+           theta = wgauss (deltae / degauss, 0)
+           wwg(jbnd) = wg1 * (1.d0 - theta) + wgp * theta
+           IF (jbnd <= nbnd_occ (ikq) ) THEN
+              IF (abs (deltae) > 1.0d-5) THEN
+                 wwg(jbnd) = wwg(jbnd) + alpha_pv * theta * (wgp - wg1) / deltae
+              ELSE
+                 !
+                 !  if the two energies are too close takes the limit
+                 !  of the 0/0 ratio
+                 !
+                 wwg(jbnd) = wwg(jbnd) - alpha_pv * theta * w0g
+              ENDIF
+           ENDIF
+           !
+        ENDDO
+        !
+        !$acc parallel loop copyin(wwg)
+        DO jbnd = 1, nbnd
+           ps(jbnd,ibnd) = wwg(jbnd) * ps(jbnd,ibnd)
+        ENDDO
+        !$acc kernels
+        dvpsi(1:ngk(ikk), ibnd) = wg1 * dvpsi(1:ngk(ikk), ibnd)
+        !$acc end kernels
+     ENDDO
+     !
+     nbnd_eff = nbnd
+     !
  ELSE
-      !
-      !  insulators
-      !
-      ps = (0.d0, 0.d0)
-      !
-      IF (inverse_mode) THEN
-         !
-         ! ps = <sevc|dvpsi>
-         !
-         CALL ZGEMM( 'C', 'N', nbnd_occ(ikq), nbnd_occ (ikk), ngk(ikk), &
-                   (1.d0,0.d0), sevc, npwx, dvpsi, npwx, &
-                   (0.d0,0.d0), ps, nbnd )
-         !
-      ELSE
-         !
-         ! ps = <evq|dvpsi>
-         !
-         CALL ZGEMM( 'C', 'N', nbnd_occ(ikq), nbnd_occ (ikk), ngk(ikk), &
-                   (1.d0,0.d0), evq, npwx, dvpsi, npwx, &
-                   (0.d0,0.d0), ps, nbnd )
-      ENDIF
-      !
-      nbnd_eff = nbnd_occ(ikk)
-      !
+     !
+     !  insulators
+     !
+     !$acc kernels
+     ps(:,:) = (0.d0, 0.d0)
+     !$acc end kernels
+     !
+     !$acc host_data use_device(sevc, evq, dvpsi, ps)
+     !
+     IF (inverse_mode) THEN
+        !
+        ! ps = <sevc|dvpsi>
+        !
+        CALL ZGEMM( 'C', 'N', nbnd_occ(ikq), nbnd_occ (ikk), ngk(ikk), &
+                  (1.d0,0.d0), sevc, npwx, dvpsi, npwx, &
+                  (0.d0,0.d0), ps, nbnd )
+        !
+     ELSE
+        !
+        ! ps = <evq|dvpsi>
+        !
+        CALL ZGEMM( 'C', 'N', nbnd_occ(ikq), nbnd_occ (ikk), ngk(ikk), &
+                  (1.d0,0.d0), evq, npwx, dvpsi, npwx, &
+                  (0.d0,0.d0), ps, nbnd )
+     ENDIF
+     !
+     !$acc end host_data
+     !
+     nbnd_eff = nbnd_occ(ikk)
+     !
  ENDIF
  !
 #if defined(__MPI)
-   CALL mp_sum(ps(:,1:nbnd_eff),intra_bgrp_comm)
+  !$acc host_data use_device(ps) 
+  CALL mp_sum(ps(:,1:nbnd_eff),intra_bgrp_comm)
+  !$acc end host_data
 #endif
+  !
+  !$acc host_data use_device(sevc, evq, dvpsi, ps)
  !
  IF (lgauss) THEN
-      !
-      !  Metallic case
-      !
-      if (inverse_mode) then
-         CALL ZGEMM( 'N', 'N', ngk(ikk), nbnd_occ(ikk), nbnd, &
-                   (-1.d0,0.d0), evq, npwx, ps, nbnd, (1.0d0,0.d0), &
-                   dvpsi, npwx )
-      else
-         CALL ZGEMM( 'N', 'N', ngk(ikk), nbnd_occ(ikk), nbnd, &
-                   (-1.d0,0.d0), sevc, npwx, ps, nbnd, (1.0d0,0.d0), &
-                   dvpsi, npwx )
-      endif
-      !
+     !
+     !  Metallic case
+     !
+     if (inverse_mode) then
+        CALL ZGEMM( 'N', 'N', ngk(ikk), nbnd_occ(ikk), nbnd, &
+                  (-1.d0,0.d0), evq, npwx, ps, nbnd, (1.0d0,0.d0), &
+                  dvpsi, npwx )
+     else
+        CALL ZGEMM( 'N', 'N', ngk(ikk), nbnd_occ(ikk), nbnd, &
+                  (-1.d0,0.d0), sevc, npwx, ps, nbnd, (1.0d0,0.d0), &
+                  dvpsi, npwx )
+     endif
+     !
  ELSE
-      !
-      !  Insulators: note that nbnd_occ(ikk)=nbnd_occ(ikq) in an insulator
-      !
-      if (inverse_mode) then
-         !
-         ! |dvspi> =  |dvpsi> - |evq><sevc|dvpsi>
-         !
-         CALL ZGEMM( 'N', 'N', ngk(ikk), nbnd_occ(ikk), nbnd_occ(ikk), &
-                   (-1.d0,0.d0), evq, npwx, ps, nbnd, (1.0d0,0.d0), &
-                   dvpsi, npwx )
-         !
-      else
-         !
-         ! |dvspi> =  |dvpsi> - |sevc><evq|dvpsi>
-         !
-         CALL ZGEMM( 'N', 'N', ngk(ikk), nbnd_occ(ikk), nbnd_occ(ikk), &
-                   (-1.d0,0.d0), sevc, npwx, ps, nbnd, (1.0d0,0.d0), &
-                   dvpsi, npwx )
-         !
-      endif
-      !
+     !
+     !  Insulators: note that nbnd_occ(ikk)=nbnd_occ(ikq) in an insulator
+     !
+     if (inverse_mode) then
+        !
+        ! |dvspi> =  |dvpsi> - |evq><sevc|dvpsi>
+        !
+        CALL ZGEMM( 'N', 'N', ngk(ikk), nbnd_occ(ikk), nbnd_occ(ikk), &
+                  (-1.d0,0.d0), evq, npwx, ps, nbnd, (1.0d0,0.d0), &
+                  dvpsi, npwx )
+        !
+     else
+        !
+        ! |dvspi> =  |dvpsi> - |sevc><evq|dvpsi>
+        !
+        CALL ZGEMM( 'N', 'N', ngk(ikk), nbnd_occ(ikk), nbnd_occ(ikk), &
+                  (-1.d0,0.d0), sevc, npwx, ps, nbnd, (1.0d0,0.d0), &
+                  dvpsi, npwx )
+        !
+     endif
+     !
  ENDIF
  !
+  !$acc end host_data
+  !
+  !$acc exit data delete(ps)
+  !
  DEALLOCATE(ps)
  !
 END SUBROUTINE lr_ortho_k
@ -227,13 +261,13 @@ SUBROUTINE lr_ortho_gamma()

  COMPLEX(DP), ALLOCATABLE :: ps_c(:,:)
  REAL(DP), ALLOCATABLE :: ps(:,:)
-  INTEGER :: ibnd, jbnd, nbnd_eff
-  REAL(DP) :: wg1, w0g, wgp, wwg, deltae, theta
-  REAL(DP), EXTERNAL :: w0gauss, wgauss
+  INTEGER :: nbnd_eff

  ALLOCATE(ps(nbnd,nbnd))
  ALLOCATE(ps_c(nbnd,nbnd))
  !
+  !$acc enter data create(ps(:,:), ps_c(:,:))
+  !
  IF (lgauss) THEN
      !
      ! Metals
@ -244,7 +278,11 @@ SUBROUTINE lr_ortho_gamma()
      !
      ! Insulators
      !
-      ps = 0.d0
+      !$acc kernels
+      ps(:,:) = 0.d0
+      !$acc end kernels
+      !
+      !$acc host_data use_device(sevc,evq,dvpsi,ps)
      !
      IF (inverse_mode) THEN
         ! 
@ -262,38 +300,48 @@ SUBROUTINE lr_ortho_gamma()
         !
      ENDIF
      !
+      !$acc end host_data
+      !
      nbnd_eff = nbnd
      !
      ! G=0 has been accounted twice, therefore we subtruct one contribution.
      !
+      !$acc host_data use_device(sevc,evq,dvpsi,ps)
+      !
      IF (gstart == 2) THEN
         !
         IF (inverse_mode) THEN
            !
            ! ps = ps - <sevc|dvpsi>
            !
-            CALL DGER( nbnd, nbnd, -1.D0, sevc, 2*npwx, dvpsi, 2*npwx, ps, nbnd)
+            CALL MYDGER( nbnd, nbnd, -1.D0, sevc, 2*npwx, dvpsi, 2*npwx, ps, nbnd)
            !
         ELSE
            !
            ! ps = ps - <evq|dvpsi>
            !
-            CALL DGER( nbnd, nbnd, -1.D0, evq, 2*npwx, dvpsi, 2*npwx, ps, nbnd )
+            CALL MYDGER( nbnd, nbnd, -1.D0, evq, 2*npwx, dvpsi, 2*npwx, ps, nbnd )
            !
         ENDIF
         !
      ENDIF
      !
+      !$acc end host_data
+      !
  ENDIF
  !
 #if defined(__MPI)
-   CALL mp_sum(ps(:,:),intra_bgrp_comm)
+  !$acc host_data use_device(ps)
+  CALL mp_sum(ps(:,:),intra_bgrp_comm)
+  !$acc end host_data
 #endif
  !
  ! In the original code dpsi was used as a storage for sevc, since in
  ! tddfpt we have it stored in memory as sevc0 this part is obsolote
  !
-  ps_c = cmplx(ps, 0.d0, dp)
+  !$acc kernels
+  ps_c(:,:) = cmplx(ps(:,:), 0.d0, dp)
+  !$acc end kernels
  !
  IF (lgauss) THEN
      !
@ -305,6 +353,8 @@ SUBROUTINE lr_ortho_gamma()
     !
     !  Insulators: note that nbnd_occ(ikk) = nbnd_occ(ikq)
     !
+     !$acc host_data use_device(sevc,evq,dvpsi,ps_c)
+     !
     IF (inverse_mode) THEN
        !
        ! |dvpsi> = |dvpsi> - |evq><sevc|dvpsi>
@ -321,8 +371,12 @@ SUBROUTINE lr_ortho_gamma()
        !
     ENDIF
     !
+     !$acc end host_data
+     !
  ENDIF
  !
+  !$acc exit data delete(ps, ps_c)
+  !
  DEALLOCATE(ps)
  DEALLOCATE(ps_c)
  !
@ -339,21 +393,28 @@ SUBROUTINE lr_ortho_noncolin()

  COMPLEX(DP), ALLOCATABLE :: ps(:,:)
  INTEGER :: ibnd, jbnd, nbnd_eff
-  REAL(DP) :: wg1, w0g, wgp, wwg, deltae, theta
+  REAL(DP) :: wg1, w0g, wgp, wwg(nbnd), deltae, theta
  REAL(DP), EXTERNAL :: w0gauss, wgauss
  
  IF (inverse_mode) CALL errore ('lr_ortho', "The inverse mode is not implemented!",1)
  !
  ALLOCATE(ps(nbnd,nbnd))
+  !
+  !$acc enter data create(ps(:,:))
+             
  !
  IF (lgauss) THEN
       !
       !  metallic case
       !
-       ps = (0.d0, 0.d0)
+       !$acc kernels
+       ps(:,:) = (0.d0, 0.d0)
+       !$acc end kernels
       !
+       !$acc host_data use_device(evq,dvpsi,ps)
       CALL ZGEMM( 'C', 'N', nbnd, nbnd_occ (ikk), npwx*npol, (1.d0,0.d0), &
                    evq, npwx*npol, dvpsi, npwx*npol, (0.d0,0.d0), ps, nbnd )
+       !$acc end host_data
       !
       DO ibnd = 1, nbnd_occ (ikk)
          !
@ -369,20 +430,26 @@ SUBROUTINE lr_ortho_noncolin()
             !
             IF (jbnd <= nbnd_occ (ikq) ) THEN
                IF (abs (deltae) > 1.0d-5) THEN
-                    wwg = wwg + alpha_pv * theta * (wgp - wg1) / deltae
+                    wwg(jbnd) = wwg(jbnd) + alpha_pv * theta * (wgp - wg1) / deltae
                ELSE
                   !
                   !  if the two energies are too close takes the limit
                   !  of the 0/0 ratio
                   !
-                   wwg = wwg - alpha_pv * theta * w0g
+                   wwg(jbnd) = wwg(jbnd) - alpha_pv * theta * w0g
                ENDIF
             ENDIF
             !
-             ps(jbnd,ibnd) = wwg * ps(jbnd,ibnd)
-             !
          ENDDO
-             CALL DSCAL (2*npwx*npol, wg1, dvpsi(1,ibnd), 1)
+          !
+          !$acc parallel loop copyin(wwg)
+          DO jbnd = 1, nbnd
+             ps(jbnd,ibnd) = wwg(jbnd) * ps(jbnd,ibnd)
+          ENDDO
+          !
+          !$acc kernels
+          dvpsi(1:npwx*npol, ibnd) = wg1 * dvpsi(1:npwx*npol, ibnd)
+          !$acc end kernels
       ENDDO
       !
       nbnd_eff = nbnd
@ -391,24 +458,32 @@ SUBROUTINE lr_ortho_noncolin()
      !
      !  insulators
      !
-      ps = (0.d0, 0.d0)
+      !$acc kernels
+      ps(:,:) = (0.d0, 0.d0)
+      !$acc end kernels
      !
      ! ps = <evq|dvpsi>
      !
+      !
+      !$acc host_data use_device(evq,dvpsi,ps)
      CALL ZGEMM( 'C', 'N',nbnd_occ(ikq), nbnd_occ(ikk), npwx*npol, &
                (1.d0,0.d0), evq, npwx*npol, dvpsi, npwx*npol, (0.d0,0.d0), ps, nbnd )
+      !$acc end host_data  
      !
      nbnd_eff = nbnd_occ(ikk)
      !
   ENDIF
   !
 #if defined(__MPI)
+   !$acc host_data use_device(ps)
   CALL mp_sum(ps(:,1:nbnd_eff),intra_bgrp_comm)
+   !$acc end host_data
 #endif
   !
   ! In the original dpsi was used as a storage for sevc, since in
   ! tddfpt we have it stored in memory as sevc0 this part is obsolote.
   !
+   !$acc host_data use_device(sevc,dvpsi,ps)
   IF (lgauss) THEN
      !
      !  metallic case
@ -428,6 +503,9 @@ SUBROUTINE lr_ortho_noncolin()
                (-1.d0,0.d0),sevc,npwx*npol,ps,nbnd,(1.0d0,0.d0), dvpsi,npwx*npol )
      !
   ENDIF
+   !$acc end host_data
+   !
+   !$acc exit data delete(ps)
   !
   DEALLOCATE(ps)
   !
--- a/TDDFPT/src/lr_read_wf.f90
+++ b/TDDFPT/src/lr_read_wf.f90
@ -22,7 +22,7 @@ SUBROUTINE lr_read_wf()
  USE gvect,                ONLY : ngm, g
  USE io_files,             ONLY : nwordwfc, iunwfc, prefix, diropn,&
                                 & tmp_dir, wfc_dir 
-  USE lr_variables,         ONLY : evc0, sevc0 ,revc0, evc0_virt,        &
+  USE lr_variables,         ONLY : evc0, sevc0, evc0_virt,        &
                                 & sevc0_virt, nbnd_total, becp1_virt,   &
                                 & becp1_c_virt, no_hxc, becp_1, becp1_c, &
                                 & test_case_no, size_evc, project,       &
@ -120,7 +120,6 @@ SUBROUTINE normal_read()
  !
  ! The usual way of reading wavefunctions
  !
-  USE lr_variables,             ONLY : tg_revc0
  USE wavefunctions,     ONLY : psic
  USE realus,                   ONLY : tg_psic
  USE mp_global,                ONLY : me_bgrp
@ -233,63 +232,6 @@ SUBROUTINE normal_read()
     !
  ENDIF
  !
-  ! Calculation of the unperturbed wavefunctions in R-space revc0.
-  ! Inverse Fourier transform of evc0.
-  !
-  IF ( dffts%has_task_groups ) THEN
-       !
-       v_siz =  dffts%nnr_tg
-       incr = 2 * fftx_ntgrp(dffts)
-       tg_revc0 = (0.0d0,0.0d0)
-       !
-  ELSE
-       !
-       revc0 = (0.0d0,0.0d0)
-       !
-  ENDIF
-  !
-  IF ( gamma_only ) THEN
-     !
-     DO ibnd = 1, nbnd, incr
-        !
-        CALL invfft_orbital_gamma ( evc0(:,:,1), ibnd, nbnd)
-        !
-        IF (dffts%has_task_groups) THEN               
-           !
-           DO j = 1, dffts%nr1x*dffts%nr2x*dffts%my_nr3p
-               !
-               tg_revc0(j,ibnd,1) = tg_psic(j)
-               !  
-           ENDDO
-           !
-        ELSE
-           !
-           revc0(1:dffts%nnr,ibnd,1) = psic(1:dffts%nnr)
-           !
-        ENDIF
-        !
-     ENDDO
-     !
-  ELSE
-     !
-     ! The FFT is done in the same way as in invfft_orbital_k 
-     ! (where also the task groups is implemented but must be checked).
-     !
-     DO ik = 1, nks
-        DO ibnd = 1, nbnd
-           DO ig = 1, ngk(ik)
-               !
-               revc0(dffts%nl(igk_k(ig,ik)),ibnd,ik) = evc0(ig,ibnd,ik)
-               !
-           ENDDO
-           !
-           CALL invfft ('Wave', revc0(:,ibnd,ik), dffts)
-           !
-        ENDDO
-     ENDDO
-     !
-  ENDIF
-  !
  IF ( real_space .AND. .NOT. gamma_only ) &
           CALL errore( ' iosys ', ' Linear response calculation ' // &
           & 'real space algorithms with k-points not implemented', 1 )
@ -512,9 +454,6 @@ SUBROUTINE virt_read()
  sevc0 = (0.0d0,0.0d0)
  sevc0(:,:,:) = sevc_all(:,1:nbnd,:)
  !
-  revc0 = (0.0d0,0.0d0)
-  revc0(:,:,:) = revc_all(:,1:nbnd,:)
-  !
  IF (nkb>0) THEN
     !
     IF (gamma_only) THEN
--- a/TDDFPT/src/lr_readin.f90
+++ b/TDDFPT/src/lr_readin.f90
@ -89,9 +89,9 @@ SUBROUTINE lr_readin
                        & force_real_alpha, force_zero_alpha, lan_precondition 
  NAMELIST / lr_post /    omeg, beta_gamma_z_prefix, w_T_npol, plot_type, epsil, itermax_int,sum_rule
  namelist / lr_dav /     num_eign, num_init, num_basis_max, residue_conv_thr, precondition,         &
-                        & dav_debug, reference,single_pole, sort_contr, diag_of_h, close_pre,        &
-                        & broadening,print_spectrum,start,finish,step,if_check_orth, if_random_init, &
-                        & if_check_her,p_nbnd_occ,p_nbnd_virt,poor_of_ram,poor_of_ram2,max_iter,     &
+                        & reference,single_pole, sort_contr, diag_of_h, close_pre,        &
+                        & broadening,print_spectrum,start,finish,step, if_random_init, &
+                        & p_nbnd_occ,p_nbnd_virt,poor_of_ram,poor_of_ram2,max_iter,     &
                        & conv_assistant,if_dft_spectrum,no_hxc,d0psi_rs,lshift_d0psi,               &
                        & lplot_drho, vccouple_shift, ltammd
  !
@ -174,7 +174,6 @@ SUBROUTINE lr_readin
     close_pre=1.0E-5
     turn2planb=1.0E-3
     precondition=.true.
-     dav_debug=.false.
     reference=0.0d0
     vccouple_shift=0.0d0
     single_pole=.false.
@ -183,8 +182,6 @@ SUBROUTINE lr_readin
     start=0.0d0
     finish=1.0d0
     step=0.001d0
-     if_check_orth=.false.
-     if_check_her=.false.
     if_random_init=.false.
     p_nbnd_occ=10
     p_nbnd_virt=10
--- a/TDDFPT/src/lr_us.f90
+++ b/TDDFPT/src/lr_us.f90
@ -42,12 +42,16 @@ SUBROUTINE lr_apply_s(vect, svect)
       WRITE(stdout,'("<lr_apply_s>")')
    ENDIF
    !
+    !$acc data present_or_copyin(vect(1:npwx*npol,1:nbnd,1:nksq)) present_or_copyout(svect(1:npwx*npol,1:nbnd,1:nksq))
    IF ( nkb==0 .or. (.not.okvan) ) THEN
       !
+       !$acc kernels
       svect(:,:,:) = vect(:,:,:) 
+       !$acc end kernels
       RETURN
       !
    ENDIF
+    !$acc end data
    !
    CALL start_clock('lr_apply_s')
    !
--- a/TDDFPT/src/lr_variables.f90
+++ b/TDDFPT/src/lr_variables.f90
@ -89,9 +89,7 @@ MODULE lr_variables
       sevc1(:,:,:),      &    ! S * "    "
       sevc1_new(:,:,:),  &    ! S * "    "
       d0psi(:,:,:,:),    &    ! for saving the original starting vectors
-       d0psi2(:,:,:,:),   &    ! for saving the original starting vectors (without P^+_c)
-       revc0(:,:,:),      &    ! ground state wavefunctions in real space
-       tg_revc0(:,:,:)         ! ground state wavefunctions in real space
+       d0psi2(:,:,:,:)         ! for saving the original starting vectors (without P^+_c)
  REAL(kind=dp), ALLOCATABLE ::    &
       rho_1(:,:)              ! response charge density in real space
  COMPLEX(kind=dp), ALLOCATABLE :: &