diff --git a/.flake8 b/.flake8
index 0916bd4d3b..ae3cf77257 100644
--- a/.flake8
+++ b/.flake8
@@ -1,19 +1,18 @@
 [flake8]
 # https://flake8.pycqa.org/en/2.5.5/warnings.html#error-codes
-ignore = \
+ignore =
         # Indentation:
-        E126, E127, E128, E129, \
+        E126, E127, E128, E129,
         # Whitespaces:
-        E201, E202, E203, E211, E221, E222, E225, E226, E228, E231, E241, \
-        E251, \
+        E201, E202, E203, E211, E221, E222, E225, E226, E228, E231, E241, E251,
         # Comments:
-        E261, E262, E265, E266, \
+        E261, E262, E265, E266,
         # Blank lines:
-        E301, E302, E303, E305, E306, \
+        E301, E302, E303, E305, E306,
         # Imports:
-        E401, E402, \
+        E401, E402,
         # Other:
-        E701, E731, E741, E275, \
+        E701, E731, E741, E275,
         F401, C901, W391, W503, W504
 
 exclude = test, .git, __pycache__, build, dist, __init__.py .eggs, *.egg
diff --git a/.github/FUNDING.yml b/.github/FUNDING.yml
new file mode 100644
index 0000000000..a1e642f85a
--- /dev/null
+++ b/.github/FUNDING.yml
@@ -0,0 +1,13 @@
+# These are supported funding model platforms
+
+github: [sunqm]
+patreon: # Replace with a single Patreon username
+open_collective: # Replace with a single Open Collective username
+ko_fi: # Replace with a single Ko-fi username
+tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel
+community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry
+liberapay: # Replace with a single Liberapay username
+issuehunt: # Replace with a single IssueHunt username
+otechie: # Replace with a single Otechie username
+lfx_crowdfunding: # Replace with a single LFX Crowdfunding project-name e.g., cloud-foundry
+custom: # Replace with up to 4 custom sponsorship URLs e.g., ['link1', 'link2']
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index ea723f130b..d07c6579f7 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -26,11 +26,11 @@ jobs:
       fail-fast: false
       matrix:
         os: [ubuntu-20.04]
-        python-version: ["3.6", "3.7", "3.8", "3.9", "3.10"]
+        python-version: ["3.6", "3.7", "3.8", "3.9", "3.10", "3.11"]
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v3
       - name: Set up Python ${{ matrix.python-version }}
-        uses: actions/setup-python@v2
+        uses: actions/setup-python@v4
         with:
           python-version: ${{ matrix.python-version }}
       - name: Install and Test
@@ -62,7 +62,7 @@ jobs:
             yum install -y epel-release && \
             yum-config-manager --enable epel && \
             yum install -y openblas-devel gcc cmake curl && \
-            cd ./pyscf/lib && curl -o deps.tar.gz -L "https://github.com/pyscf/pyscf-build-deps/blob/master/pyscf-2.2a-aarch64-deps.tar.gz?raw=true"  && \
+            cd ./pyscf/lib && curl -o deps.tar.gz -L "https://github.com/pyscf/pyscf-build-deps/blob/master/pyscf-2.2-aarch64-deps.tar.gz?raw=true"  && \
             tar xzf deps.tar.gz && \
             mkdir build && cd build && \
             cmake -DBUILD_LIBXC=OFF -DBUILD_XCFUN=OFF -DBUILD_LIBCINT=OFF .. && \
@@ -80,9 +80,9 @@ jobs:
       matrix:
         python-version: ["3.7"]
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v3
       - name: Set up Python ${{ matrix.python-version }}
-        uses: actions/setup-python@v2
+        uses: actions/setup-python@v4
         with:
           python-version: ${{ matrix.python-version }}
       - name: Install and Test
diff --git a/.github/workflows/ci_linux/build_pyscf.sh b/.github/workflows/ci_linux/build_pyscf.sh
index 28bc882324..125257eb16 100755
--- a/.github/workflows/ci_linux/build_pyscf.sh
+++ b/.github/workflows/ci_linux/build_pyscf.sh
@@ -3,7 +3,7 @@
 set -e
 
 cd ./pyscf/lib
-curl -L "https://github.com/pyscf/pyscf-build-deps/blob/master/pyscf-2.2a-deps.tar.gz?raw=true" | tar xzf -
+curl -L "https://github.com/pyscf/pyscf-build-deps/blob/master/pyscf-2.2.1-deps.tar.gz?raw=true" | tar xzf -
 mkdir build; cd build
 cmake -DBUILD_LIBXC=OFF -DBUILD_XCFUN=OFF -DBUILD_LIBCINT=OFF ..
 make -j4
diff --git a/.github/workflows/ci_linux/python_deps.sh b/.github/workflows/ci_linux/python_deps.sh
index 22d1eb10e3..1511ded008 100755
--- a/.github/workflows/ci_linux/python_deps.sh
+++ b/.github/workflows/ci_linux/python_deps.sh
@@ -1,6 +1,6 @@
 #!/usr/bin/env bash
 python -m pip install --upgrade pip
-pip install "numpy!=1.16,!=1.17" "scipy!=1.5" h5py pytest pytest-cov pytest-timer codecov
+pip install "numpy!=1.16,!=1.17" "scipy!=1.5" h5py pytest pytest-cov pytest-timer
 pip install pyberny geometric
 pip install spglib
 
diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml
index 60efec82c6..6c168c5ea2 100644
--- a/.github/workflows/publish.yml
+++ b/.github/workflows/publish.yml
@@ -13,7 +13,7 @@ jobs:
   release-pypi-linux:
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v3
       - name: Build wheels
         uses: docker://pyscf/pyscf-pypa-env:latest
         with:
@@ -30,23 +30,23 @@ jobs:
           #password: ${{ secrets.PYPI_TEST_API_TOKEN }}
           #repository_url: https://test.pypi.org/legacy/
           password: ${{ secrets.PYPI_API_TOKEN }}
-          packages_dir: ${{ github.workspace }}/linux-wheels
+          packages-dir: ${{ github.workspace }}/linux-wheels
           verbose: true
 
   release-pypi-aarch64:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        pyver: [cp36-cp36m, cp37-cp37m, cp38-cp38, cp39-cp39, cp310-cp310]
+        pyver: [cp36-cp36m, cp37-cp37m, cp38-cp38, cp39-cp39, cp310-cp310, cp311-cp311]
       fail-fast: false
     env:
-      img: quay.io/pypa/manylinux2014_aarch64
+      img: quay.io/pypa/manylinux2014_aarch64:2023-03-12-25fd859
     steps:
     - name: Checkout
-      uses: actions/checkout@v2
+      uses: actions/checkout@v3
     - name: Set up QEMU
       id: qemu
-      uses: docker/setup-qemu-action@v1
+      uses: docker/setup-qemu-action@v2
     - name: Build Wheel
       run: |
             docker run --rm -v ${{ github.workspace }}:/src/pyscf:rw --workdir=/src/pyscf \
@@ -59,7 +59,7 @@ jobs:
             export dst=${GITHUB_WORKSPACE:-/src/pyscf}/linux-wheels && \
             export CMAKE_CONFIGURE_ARGS="-DWITH_F12=OFF" && \
             mkdir -p /root/wheelhouse $src/linux-wheels && \
-            sed -i "/            if basename(fn) not in needed_libs:/s/basename.*libs/1/" /opt/_internal/pipx/venvs/auditwheel/lib/python3.9/site-packages/auditwheel/wheel_abi.py && \
+            sed -i "/            if basename(fn) not in needed_libs:/s/basename.*libs/1/" /opt/_internal/pipx/venvs/auditwheel/lib/python*/site-packages/auditwheel/wheel_abi.py && \
             /opt/python/${{ matrix.pyver }}/bin/pip wheel -v --no-deps --no-clean -w /root/wheelhouse $src && \
             export whl=`ls /root/wheelhouse/pyscf-*-linux_*.whl` && \
             auditwheel -v repair "$whl" --lib-sdir /lib -w $dst'
@@ -74,13 +74,13 @@ jobs:
         #password: ${{ secrets.PYPI_TEST_API_TOKEN }}
         #repository_url: https://test.pypi.org/legacy/
         password: ${{ secrets.PYPI_API_TOKEN }}
-        packages_dir: ${{ github.workspace }}/linux-wheels
+        packages-dir: ${{ github.workspace }}/linux-wheels
         verbose: true
 
   release-pypi-sdist:
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v3
       - name: Build sdist
         run: |
           python3 setup.py sdist
@@ -95,7 +95,7 @@ jobs:
           #password: ${{ secrets.PYPI_TEST_API_TOKEN }}
           #repository_url: https://test.pypi.org/legacy/
           password: ${{ secrets.PYPI_API_TOKEN }}
-          packages_dir: ${{ github.workspace }}/dist
+          packages-dir: ${{ github.workspace }}/dist
           verbose: true
 
   release-pypi-macos:
@@ -103,11 +103,11 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: ["3.6", "3.7", "3.8", "3.9", "3.10"]
+        python-version: ["3.6", "3.7", "3.8", "3.9", "3.10", "3.11"]
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v3
       - name: Set up Python ${{ matrix.python-version }}
-        uses: actions/setup-python@v2
+        uses: actions/setup-python@v4
         with:
           python-version: ${{ matrix.python-version }}
       - name: Build wheels
@@ -128,22 +128,22 @@ jobs:
   release-conda-linux:
     runs-on: ubuntu-latest
     strategy:
-      matrix:
-        python-version: ["3.6", "3.7", "3.8", "3.9", "3.10"]
       fail-fast: false
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v3
       - name: Setup conda
-        uses: actions/setup-python@v2
+        uses: s-weigand/setup-conda@v1
         with:
-          python-version: ${{ matrix.python-version }}
+          update-conda: true
+          conda-channels: anaconda, conda-forge
+      - run: conda --version
+      - run: which python
       - name: Publish to conda
         run: |
-          $CONDA/bin/conda install -y anaconda-client conda-build
-          $CONDA/bin/conda build --output-folder . conda
           export ANACONDA_API_TOKEN=${{ secrets.ANACONDA_TOKEN }}
-          $CONDA/bin/conda config --set anaconda_upload yes
-          $CONDA/bin/anaconda upload linux-64/*.tar.bz2
+          conda install -y anaconda-client conda-build
+          conda config --set anaconda_upload yes
+          conda build --output-folder . conda
 
   # Disable macos conda release for Error during compling:
   # MACOSX_DEPLOYMENT_TARGET mismatch: now "10.9" but "10.15" during configure
@@ -154,9 +154,9 @@ jobs:
   #      python-version: ["3.6", "3.7", "3.8", "3.9", "3.10"]
   #    fail-fast: false
   #  steps:
-  #    - uses: actions/checkout@v2
+  #    - uses: actions/checkout@v3
   #    - name: Setup conda
-  #      uses: actions/setup-python@v2
+  #      uses: actions/setup-python@v4
   #      with:
   #        python-version: ${{ matrix.python-version }}
   #    - name: Publish to conda
@@ -168,14 +168,11 @@ jobs:
   #        $CONDA/bin/anaconda upload linux-64/*.tar.bz2
 
   release-docker:
-    runs-on: ${{ matrix.os }}
+    runs-on: ubuntu-latest
     needs: release-pypi-linux
-    strategy:
-      matrix:
-        os: [ubuntu-18.04]
     steps:
       - name: Login to Docker Hub
-        uses: docker/login-action@v1
+        uses: docker/login-action@v2
         with:
           username: pyscf
           password: ${{ secrets.DOCKERHUB_TOKEN }}
diff --git a/.gitmodules b/.gitmodules
deleted file mode 100644
index 6841500a83..0000000000
--- a/.gitmodules
+++ /dev/null
@@ -1,3 +0,0 @@
-[submodule "doc"]
-	path = doc
-	url = git://github.com/pyscf/pyscf-doc.git
diff --git a/CHANGELOG b/CHANGELOG
index 1a4a7d8b97..dc615e0ee0 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,3 +1,60 @@
+PySCF 2.2.1 (2023-03-29)
+------------------------
+* Added
+  - Density fitting gradients and hessian for RSH functionals
+  - SCS-MP2 and SCS-KMP2
+  - Configurations of f-in-core ECPs
+  - ao2mo integral transformation for GHF orbitals with scalar ERIs
+  - Interface to access basis Set Exchange DB
+  - DF-UCISD
+  - Moment resolved GF-CCSD
+  - MOM (maximum overlap method) for GHF and DHF
+* Improved
+  - PBC GDF K-build performance
+  - Read cubegen for crystalline systems
+* Fixes
+  - Outdated examples
+  - Update to libcint 5.3.0 to filter warning messages produced by 5.2.0
+  - PBC density fitting CDERIArray object backward compatibility
+  - DIIS numerical instability
+  - C2v molecule orientation when symmetry is enabled
+  - The missed HF exchange in RSH functional wB97
+  - Gauge origin of Boys localization
+  - Check the root numbers during Davidson diagonalization iterations
+  - Removed hard-coded environment variable MKL_NUM_THREADS=1
+  - Conflicts in GDF cderi tensor for RSH functionals
+
+
+PySCF 2.2.0 (2023-03-09)
+------------------------
+* Added
+  - Molecular nuc grad with DF ERIs for ROHF, ROKS, CASSCF, SA-CASSCF
+  - K-point symmetry adapted crystalline orbital basis
+  - K-point symmetry for GDF, MDF, KCCSD
+  - VV10 gradients
+  - Spin density population analysis for MCSCF
+  - BCCD
+  - Cylindrical symmetry for FCI wavefunction
+  - Use a linearly independent basis in DIIS
+  - Full CI for complex integrals
+  - Maxwell-Boltzmann Velocity Distribution in BOMD
+* Improved
+  - PBC integral accuracy for GDF, MDF, FFT, AFT
+  - PBC integral accuracy of 2D system for insufficient integral grids
+  - Update Libxc to Libxc-6
+  - get_monkhorst_pack_size for numerical noise
+  - Unit recognization unified
+* Fixed
+  - MCSCF without initializing SCF
+  - Memory leaks in C kernels for CCSD(T)
+  - Fully spin-polarized UMP2, DF-UMP2, UCCSD, UCCSD(T) bugfixes
+  - numpy_helper.hermi_sum for non-contiguous array
+  - Padding and frozen orbitals bugs in KMP2
+  - PolEmbed interface and CASCI with PE
+  - Molecular density fitting hessian
+  - K-point symmetry adapted KS-DFT with SOSCF solvers
+
+
 PySCF 2.1.1 (2022-09-21)
 ------------------------
 * Added
diff --git a/MANIFEST.in b/MANIFEST.in
index aaeb31aa13..b81d383473 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -1,9 +1,6 @@
 include MANIFEST.in
 include README.md setup.py CHANGELOG AUTHORS LICENSE NOTICE
 
-global-exclude *.py[cod]
-prune pyscf/lib/build
-
 recursive-include pyscf *.dat
 recursive-include pyscf/lib/deps *.so
 include pyscf/lib/*.so pyscf/lib/config.h.in
@@ -11,12 +8,14 @@ include pyscf/lib/*.so pyscf/lib/config.h.in
 # macos dynamic libraries
 include pyscf/lib/*.dylib
 include pyscf/lib/deps/lib*/libcint.[45].dylib
-include pyscf/lib/deps/lib*/libxc.dylib
+include pyscf/lib/deps/lib*/libxc.*.dylib
 include pyscf/lib/deps/lib*/libxcfun.[23].dylib
 
 include pyscf/geomopt/log.ini
 
 # source code
-prune pyscf/lib/build
 recursive-include pyscf/lib *.c *.h CMakeLists.txt
 recursive-exclude pyscf/lib *.cl
+
+global-exclude *.py[cod]
+prune pyscf/lib/build
diff --git a/NOTICE b/NOTICE
index c69b211d77..a34398669b 100644
--- a/NOTICE
+++ b/NOTICE
@@ -92,6 +92,12 @@ Seunghoon Lee
 Chia-Nan Yeh
 Chun-Yu Chow
 odidev (from puresoftware)
+Nike Dattani
+Egor Trushin
+Wanja Schulze
+Till Hanke
+Kevin J. Sung
+Jonathan Edward Moussa
 
 
 
diff --git a/README.md b/README.md
index 98a2c2cbb7..fe2bd62951 100644
--- a/README.md
+++ b/README.md
@@ -7,9 +7,9 @@ Python-based Simulations of Chemistry Framework
 [![Build Status](https://github.com/pyscf/pyscf/workflows/CI/badge.svg)](https://github.com/pyscf/pyscf/actions?query=workflow%3ACI)
 [![codecov](https://codecov.io/gh/pyscf/pyscf/branch/master/graph/badge.svg)](https://codecov.io/gh/pyscf/pyscf)
 
-2022-09-21
+2023-03-29
 
-* [Stable release 2.1.1](https://github.com/pyscf/pyscf/releases/tag/v2.1.1)
+* [Stable release 2.2.1](https://github.com/pyscf/pyscf/releases/tag/v2.2.1)
 * [Changelog](../master/CHANGELOG)
 * [Documentation](http://www.pyscf.org)
 * [Installation](#installation)
diff --git a/conda/build.sh b/conda/build.sh
index 8eb8f3cce9..4fe6afe44b 100755
--- a/conda/build.sh
+++ b/conda/build.sh
@@ -11,7 +11,10 @@ set -x -e
 # find pyscf/lib/deps -name "*cint*" -exec rm {} \+
 # rm pyscf-2.0-depsa-openblas.tar.gz
 
+# C extensions must be installed with sequential BLAS library
+# https://pyscf.org/install.html#using-optimized-blas
+export CMAKE_CONFIGURE_ARGS="-DWITH_F12=OFF -DBLA_VENDOR=Intel10_64lp_seq"
+
 # env PYTHON not defined in certain conda-build version
 # $PYTHON -m pip install . -vv
-export CMAKE_CONFIGURE_ARGS="-DWITH_F12=OFF"
 pip install -v --prefix=$PREFIX .
diff --git a/conda/conda_build_config.yaml b/conda/conda_build_config.yaml
index 4c25caf9a7..8d657fe804 100644
--- a/conda/conda_build_config.yaml
+++ b/conda/conda_build_config.yaml
@@ -1,2 +1,7 @@
 python:
+  - 3.6
   - 3.7
+  - 3.8
+  - 3.9
+  - 3.10
+  - 3.11
diff --git a/conda/meta.yaml b/conda/meta.yaml
index fd41a3201e..c1fb4c51ff 100644
--- a/conda/meta.yaml
+++ b/conda/meta.yaml
@@ -13,23 +13,24 @@ build:
 
 requirements:
   build:
+    - python {{ python }}
     - cmake
     - make
     - mkl
     - {{ compiler('c') }}
     - {{ compiler('cxx') }}
+  host:
+    - python
   run:
+    - python
     - mkl
     - numpy>=1.13
     - scipy!=1.5
     - h5py>=2.7
 
 test:
-  commands:
-    - test -f ${PREFIX}/lib/python*/site-packages/pyscf/lib/libcvhf.so
-    # python version in testing environment is often changed to a different
-    # version than the building
-    # - python -c "import pyscf.lib"
+  imports:
+    - pyscf.lib
 
 about:
   home: "{{ data['url'] }}"
diff --git a/docker/pypa-env/Dockerfile b/docker/pypa-env/Dockerfile
deleted file mode 120000
index ad782bcc96..0000000000
--- a/docker/pypa-env/Dockerfile
+++ /dev/null
@@ -1 +0,0 @@
-Dockerfile-2.1-openblas
\ No newline at end of file
diff --git a/docker/pypa-env/Dockerfile b/docker/pypa-env/Dockerfile
new file mode 100644
index 0000000000..2d3b079f3a
--- /dev/null
+++ b/docker/pypa-env/Dockerfile
@@ -0,0 +1,26 @@
+#FROM quay.io/pypa/manylinux2014_x86_64:latest
+#RUN yum install -y openblas-devel.x86_64
+#
+FROM quay.io/pypa/manylinux2010_x86_64:2022-08-05-4535177
+
+# libquadmath from devtoolset-8-libquadmath-devel.x86_64 was not built with -fPIC.
+# Remove libquadmath-devel and restore the system default libquadmath.so
+RUN yum remove -y devtoolset-8-libquadmath-devel.x86_64 && \
+    yum install -y libquadmath.x86_64 && \
+    ln -fs libquadmath.so.0 /usr/lib64/libquadmath.so
+
+RUN yum install -y openblas-devel.x86_64 gcc && \
+    yum clean all && \
+    rm -rf /var/cache/yum
+
+RUN /opt/_internal/tools/bin/pip3 install --no-cache-dir cmake
+
+COPY build-wheels.sh /build-wheels.sh
+CMD ['/build-wheels.sh']
+
+# # openblas in quay.io/pypa/manylinux1_x86_64 has a bug that causes segfault
+# # (issue https://github.com/pyscf/pyscf/issues/1095). openblas r0-3.3 fixed
+# # the bug
+# COPY --from 0 /usr/lib64/libopenblas.so /usr/lib64/libopenblas.so.0
+# RUN rm -f libopenblas-r0.2.18.so && \
+#     ln -fs /usr/lib64/libopenblas.so.0 /usr/lib64/libopenblas.so
diff --git a/docker/pypa-env/build-wheels.sh b/docker/pypa-env/build-wheels.sh
index 869777cab2..784b1f7e0d 100755
--- a/docker/pypa-env/build-wheels.sh
+++ b/docker/pypa-env/build-wheels.sh
@@ -14,14 +14,14 @@ else
 fi
 
 # In certain versions of auditwheel, some .so files was excluded.
-sed -i '/            if basename(fn) not in needed_libs:/s/basename.*libs/1/' /opt/_internal/pipx/venvs/auditwheel/lib/python3.9/site-packages/auditwheel/wheel_abi.py
+sed -i '/            if basename(fn) not in needed_libs:/s/basename.*libs/1/' /opt/_internal/pipx/venvs/auditwheel/lib/python*/site-packages/auditwheel/wheel_abi.py
 
 # Compile wheels
-for PYVERSION in cp36-cp36m cp37-cp37m cp38-cp38 cp39-cp39 cp310-cp310; do
+for PYVERSION in cp36-cp36m cp37-cp37m cp38-cp38 cp39-cp39 cp310-cp310 cp311-cp311; do
     PYBIN=/opt/python/$PYVERSION/bin
     "${PYBIN}/pip" wheel -v --no-deps --no-clean -w /root/wheelhouse $src
 
     # Bundle external shared libraries into the wheels
-    whl=`ls /root/wheelhouse/pyscf-*-$PYVERSION-linux*_x86_64.whl`
+    whl=`ls /root/wheelhouse/pyscf-*-$PYVERSION-*linux*_x86_64.whl`
     auditwheel -v repair "$whl" --lib-sdir /lib -w $dst
 done
diff --git a/examples/1-advanced/031-MEP.py b/examples/1-advanced/031-MEP.py
index 13931bef2f..566a2c0d20 100644
--- a/examples/1-advanced/031-MEP.py
+++ b/examples/1-advanced/031-MEP.py
@@ -64,6 +64,10 @@
 fakemol = gto.fakemol_for_charges(points)
 Vele = numpy.einsum('ijp,ij->p', df.incore.aux_e2(mol, fakemol), mf.make_rdm1())
 
+# Method 3 (most efficient): int1e_grids computes the integrals <i| 1/|r-r_p| |j>
+# for a group of grids r_p
+Vele = numpy.einsum('pij,ij->p', mol.intor('int1e_grids', grids=points), mf.make_rdm1())
+
 #
 # 4. MEP at each point
 #
diff --git a/examples/cc/11-frozen_core.py b/examples/cc/11-frozen_core.py
index 690e9d6d5c..d45b0291bf 100644
--- a/examples/cc/11-frozen_core.py
+++ b/examples/cc/11-frozen_core.py
@@ -69,13 +69,13 @@
 # number of elec screened by ECP > number of chemical core electrons
 #
 mol = gto.M(
-    atom = 'Mg 0 0 0',
-    basis = 'def2-svp')
+    atom = 'Xe 0 0 0',
+    basis = 'cc-pvtz-dk')
 mf = scf.RHF(mol).run()
 mycc = cc.CCSD(mf)
 mycc.set_frozen()
 print('Number of core orbital frozen: %d' % mycc.frozen)
-mol.set(basis='lanl2dz', ecp='lanl2dz').build()
+mol.set(basis='def2-svp', ecp='def2-svp').build()
 mf = scf.RHF(mol).run()
 mycc = cc.CCSD(mf)
 mycc.set_frozen()
diff --git a/examples/cc/31-remove_linear_dep.py b/examples/cc/31-remove_linear_dep.py
index 665913c621..00d3b5e33f 100644
--- a/examples/cc/31-remove_linear_dep.py
+++ b/examples/cc/31-remove_linear_dep.py
@@ -12,9 +12,11 @@
 mol.basis = 'ccpvdz'
 mol.verbose = 4
 mol.build()
+# Without handling the linear dependency in basis, HF and CCSD can produce
+# incorrect results
 mf = scf.RHF(mol).run()
 mycc = cc.CCSD(mf).run()
 
-mf = scf.addons.remove_linear_dep_(mf).run()
+mf = scf.addons.remove_linear_dep_(mol.RHF()).run()
 mycc = cc.CCSD(mf).run()
 
diff --git a/examples/cc/50-simple_momgfccsd.py b/examples/cc/50-simple_momgfccsd.py
new file mode 100644
index 0000000000..d250915d40
--- /dev/null
+++ b/examples/cc/50-simple_momgfccsd.py
@@ -0,0 +1,65 @@
+# Author: Oliver Backhouse <olbackhouse@gmail.com>
+
+"""
+Construct a Green's function at the CCSD level via a
+number of spectral moment constraints
+
+Ref: Backhouse, Booth, arXiv:2206.13198 (2022).
+"""
+
+import numpy
+from pyscf import gto, scf, cc, lib
+
+# Define system
+mol = gto.Mole()
+mol.atom = "O 0 0 0; O 0 0 1.2"
+mol.unit = "A"
+mol.basis = "cc-pvdz"
+mol.verbose = 4
+mol.build()
+
+# Run mean-field
+mf = scf.RHF(mol)
+mf.conv_tol_grad = 1e-10
+mf.kernel()
+assert mf.converged
+
+# Run CCSD
+ccsd = cc.CCSD(mf)
+ccsd.kernel()
+assert ccsd.converged
+
+# Solve lambda equations
+ccsd.solve_lambda()
+assert ccsd.converged_lambda
+
+# Run moment-constrained GF-CCSD
+#
+# Here we use 4 cycles in both the occupied (hole) and virtual
+# (particle) sector, which ensures conservation of the first
+# 2 * niter + 2 = 10 spectral moments (0th through 9th) of the separate
+# occupied (hole) and virtual (particle) Green's functions.
+# These can be increased for more accuracy but will eventually
+# lose numerical precision.
+#
+# The gfcc object will store information on the resulting
+# pole energies and residues of the Green's function.
+gfcc = cc.MomGFCCSD(ccsd, niter=(4, 4))
+gfcc.kernel()
+
+# Compare IPs and EAs to IP/EA-EOM-CCSD
+eip,cip = ccsd.ipccsd(nroots=6)
+eea,cea = ccsd.eaccsd(nroots=6)
+
+# The poles of the full-frequency Green's function can then be 
+# accessed and very cheaply expressed on a real or Matsubara 
+# axis to give access to the full Green's function and photoemission 
+# spectrum at (an approximation to) the EOM-CCSD level of theory, 
+# with broadening eta.
+e = numpy.concatenate([gfcc.eh, gfcc.ep], axis=0)
+v = numpy.concatenate([gfcc.vh[0], gfcc.vp[0]], axis=1)
+u = numpy.concatenate([gfcc.vh[1], gfcc.vp[1]], axis=1)
+grid = numpy.linspace(-5.0, 5.0, 100)
+eta = 1e-2
+denom = grid[:, None] - (e + numpy.sign(e) * eta * 1.0j)[None]
+gf = lib.einsum("pk,qk,wk->wpq", v, u.conj(), 1.0/denom)
diff --git a/examples/cc/51-momgfccsd_hermiticity.py b/examples/cc/51-momgfccsd_hermiticity.py
new file mode 100644
index 0000000000..6092d8c05c
--- /dev/null
+++ b/examples/cc/51-momgfccsd_hermiticity.py
@@ -0,0 +1,65 @@
+# Author: Oliver Backhouse <olbackhouse@gmail.com>
+
+"""
+Moment constrained GF-CCSD with different hermiticity options.
+
+Ref: Backhouse, Booth, arXiv:2206.13198 (2022).
+"""
+
+from pyscf import gto, scf, cc
+
+# Define system
+mol = gto.Mole()
+mol.atom = "C 0 0 0; O 0 0 1.13"
+mol.basis = "cc-pvdz"
+mol.verbose = 5
+mol.build()
+
+# Run mean-field
+mf = scf.RHF(mol)
+mf.conv_tol_grad = 1e-10
+mf.kernel()
+assert mf.converged
+
+# Run CCSD
+ccsd = cc.CCSD(mf)
+ccsd.kernel()
+assert ccsd.converged
+
+# Solve lambda equations
+ccsd.solve_lambda()
+assert ccsd.converged_lambda
+
+# Run moment-constrained GF-CCSD
+#
+# Default mode: GF Moments are non-hermitian, and 
+# full Hamiltonian/Green's function is non-hermitian
+gfcc1 = cc.MomGFCCSD(ccsd, niter=(4, 4))
+gfcc1.hermi_moments = False
+gfcc1.hermi_solver = False
+gfcc1.kernel()
+ip1 = gfcc1.ipgfccsd(nroots=1)[0]
+
+# We can force the CCSD GF moments to be hermitian
+gfcc2 = cc.MomGFCCSD(ccsd, niter=(4, 4))
+gfcc2.hermi_moments = True
+gfcc2.hermi_solver = False
+gfcc2.kernel()
+ip2 = gfcc2.ipgfccsd(nroots=1)[0]
+
+# We can constrain the GF moments and full GF / 
+# hamiltonian to be hermitian
+gfcc3 = cc.MomGFCCSD(ccsd, niter=(4, 4))
+gfcc3.hermi_moments = True
+gfcc3.hermi_solver = True
+gfcc3.kernel()
+ip3 = gfcc3.ipgfccsd(nroots=1)[0]
+
+# Compare to EOM-CCSD-IP first ionization potential
+eip = ccsd.ipccsd(nroots=1)[0]
+
+print("Ionisation potentials:")
+print("non-hermitian solver, non-hermitian moments", ip1)
+print("non-hermitian solver, hermitian moments    ", ip2)
+print("hermitian solver,     hermitian moments    ", ip3)
+print("IP-EOM-CCSD solver                         ", eip)
diff --git a/examples/cc/52-momgfccsd_moment_input.py b/examples/cc/52-momgfccsd_moment_input.py
new file mode 100644
index 0000000000..6fbef56be6
--- /dev/null
+++ b/examples/cc/52-momgfccsd_moment_input.py
@@ -0,0 +1,79 @@
+# Author: Oliver Backhouse <olbackhouse@gmail.com>
+
+"""
+Moment-constrained GF-CCSD with reuse of moments or custom moment input
+from other level of theory.
+
+Ref: Backhouse, Booth, arXiv:2206.13198 (2022).
+"""
+
+from pyscf import gto, scf, cc
+import numpy as np
+
+# Define system
+mol = gto.Mole()
+mol.atom = "Li 0 0 0; H 0 0 1.64"
+mol.basis = "cc-pvdz"
+mol.verbose = 5
+mol.build()
+
+# Run mean-field
+mf = scf.RHF(mol)
+mf.conv_tol_grad = 1e-10
+mf.kernel()
+assert mf.converged
+
+# Run CCSD
+ccsd = cc.CCSD(mf)
+ccsd.kernel()
+assert ccsd.converged
+
+# Solve lambda equations
+ccsd.solve_lambda()
+assert ccsd.converged_lambda
+
+# Run a moment-constrained GF-CCSD calculation
+# Note: 5 cycles of moment constraint in the EA
+# sector compared to 3 in the IP sector.
+gfcc = cc.MomGFCCSD(ccsd, niter=(3, 5))
+gfcc.kernel()
+ip = gfcc.ipgfccsd(nroots=1)[0]
+
+# We can also build the moments ahead of time, and
+# pass them in as the moment constraints, with the
+# subsequent GF construction agnostic to the 
+# provenance of these moments.
+th = gfcc.build_hole_moments()
+tp = gfcc.build_part_moments()
+gfcc = cc.MomGFCCSD(ccsd, niter=(3, 5))
+gfcc.kernel(hole_moments=th, part_moments=tp)
+assert np.allclose(ip, gfcc.ipgfccsd(nroots=1)[0])
+
+# Or use custom moments of the Green's function, via ndarrays which
+# must enumerate at least the moments of order 0 through 2n+1 where
+# n is the `niter` parameter in each sector. Note that physical
+# moments must be in an orthogonal basis.
+
+# For example, moments of the Hartree--Fock Green's function (powers
+# of the Fock matrix), which should give exactly the MO energies
+# (the other states will be linearly dependent):
+f = np.diag(mf.mo_energy)
+t = np.array([np.linalg.matrix_power(f, n) for n in range(5*2+2)])
+th, tp = t.copy(), t.copy()
+th[:, ccsd.nocc:, ccsd.nocc:] = 0.0
+tp[:, :ccsd.nocc, :ccsd.nocc] = 0.0
+gfcc = cc.MomGFCCSD(ccsd, niter=(3, 5))
+gfcc.kernel(hole_moments=th, part_moments=tp)
+
+# Or, moments from another post-HF Green's function method to
+# approximate its spectrum, i.e. AGF2:
+from pyscf.agf2 import AGF2
+agf2 = AGF2(mf)
+agf2.kernel()
+gf = agf2.gf
+th = gf.get_occupied().moment(np.arange(3*2+2))
+tp = gf.get_virtual().moment(np.arange(5*2+2))
+gfcc = cc.MomGFCCSD(ccsd, niter=(3, 5))
+gfcc.hermi_moments = True
+gfcc.hermi_solver = True
+gfcc.kernel(hole_moments=th, part_moments=tp)
diff --git a/examples/cc/53-momgfccsd_weight_threshold.py b/examples/cc/53-momgfccsd_weight_threshold.py
new file mode 100644
index 0000000000..36c27834dc
--- /dev/null
+++ b/examples/cc/53-momgfccsd_weight_threshold.py
@@ -0,0 +1,57 @@
+# Author: Oliver Backhouse <olbackhouse@gmail.com>
+
+"""
+Moment-constrained GF-CCSD with custom physical weight threshold
+for excitations.
+
+Ref: Backhouse, Booth, arXiv:2206.13198 (2022).
+"""
+
+from pyscf import gto, scf, cc
+import numpy as np
+
+# Define system
+mol = gto.Mole()
+mol.atom = "Li 0 0 0; Li 0 0 1.7"
+mol.basis = "cc-pvdz"
+mol.verbose = 4
+mol.build()
+
+# Run mean-field
+mf = scf.RHF(mol)
+mf.conv_tol_grad = 1e-10
+mf.kernel()
+assert mf.converged
+
+# Run CCSD
+ccsd = cc.CCSD(mf)
+ccsd.kernel()
+assert ccsd.converged
+
+# Solve lambda equations
+ccsd.solve_lambda()
+assert ccsd.converged_lambda
+
+# The parameter weight_tol controls which excitations are
+# reported as IPs or EAs in the output. Due to the nature
+# of the method, at some set of constraints, the approximate
+# GFCCSD solver may give non-physical excitations near the
+# Fermi energy with non-zero (though generally small) weight.
+# This weight_tol parameter will screen these unphysical 
+# excitations.
+
+# Run a GF-CCSD calculation with the default weight_tol
+gfcc = cc.MomGFCCSD(ccsd, niter=(3, 3))
+gfcc.weight_tol = 1e-1
+gfcc.kernel()
+
+# Run a GF-CCSD calculation with a much lower weight_tol -
+# one observes additional low-weighted IPs in the output
+gfcc = cc.MomGFCCSD(ccsd, niter=(3, 3))
+gfcc.weight_tol = 1e-5
+gfcc.kernel()
+
+# Note that these low-weighted poles can also be found in
+# EOM methods, though their low-weight means they can be
+# discarded for state-specific properties.
+eip = ccsd.ipccsd(nroots=12)[0]
diff --git a/examples/cc/54-momgfccsd_self_energy.py b/examples/cc/54-momgfccsd_self_energy.py
new file mode 100644
index 0000000000..058a074ca7
--- /dev/null
+++ b/examples/cc/54-momgfccsd_self_energy.py
@@ -0,0 +1,101 @@
+# Author: Oliver Backhouse <olbackhouse@gmail.com>
+
+"""
+Directly construct a pole representation of the 
+self-energy via an implicit Dyson equation
+for a Green's function computed at the CCSD level
+via moment-constrained GFCCSD.
+
+Ref: Backhouse, Booth, arXiv:2206.13198 (2022).
+"""
+
+import numpy as np
+from pyscf import gto, scf, cc, lib
+import scipy.linalg
+
+# Define system
+mol = gto.Mole()
+mol.atom = "O 0 0 0; O 0 0 1.2"
+mol.unit = "A"
+mol.basis = "cc-pvdz"
+mol.verbose = 4
+mol.build()
+
+# Run mean-field
+mf = scf.RHF(mol)
+mf.conv_tol_grad = 1e-10
+mf.kernel()
+assert mf.converged
+
+# Run CCSD
+ccsd = cc.CCSD(mf)
+ccsd.kernel()
+assert ccsd.converged
+
+# Solve lambda equations
+ccsd.solve_lambda()
+assert ccsd.converged_lambda
+
+# Run GF-CCSD:
+gfcc = cc.MomGFCCSD(ccsd, niter=(4, 4))
+gfcc.kernel()
+ip = gfcc.ipgfccsd(nroots=1)[0]
+ea = gfcc.eagfccsd(nroots=1)[0]
+
+# Transform the Green's function poles from the GFCCSD calculation
+# to poles of the self-energy. With the moment-conserving
+# GFCCSD solver, this can be done statically without the need to
+# numerically solve the Dyson equation. This procedure is described
+# in arXiv:2206.13198 (2022).
+
+# Combine hole and particle excitations:
+e = np.concatenate([gfcc.eh, gfcc.ep], axis=0)
+v = np.concatenate([gfcc.vh[0], gfcc.vp[0]], axis=1).T.conj()
+u = np.concatenate([gfcc.vh[1], gfcc.vp[1]], axis=1).T.conj()
+
+# Biorthogonalise physical vectors:
+m = np.dot(v.T.conj(), u)
+mv, mu = scipy.linalg.lu(m, permute_l=True)
+v = np.dot(np.linalg.inv(mv), v.T.conj()).T.conj()
+u = np.dot(u, np.linalg.inv(mu))
+
+# Find a basis for the null space:
+i = np.eye(u.shape[0]) - np.dot(v, u.T.conj())
+w, v_rest = np.linalg.eig(i)
+u_rest = np.linalg.inv(v_rest).T.conj()
+u_rest = u_rest[:, np.abs(w) > 0.5] * w[np.abs(w) > 0.5][None]
+v_rest = v_rest[:, np.abs(w) > 0.5] * w[np.abs(w) > 0.5][None]
+
+# Biorthogonalise external vectors:
+i = np.eye(u.shape[0]) - np.dot(v, u.T.conj())
+w, v_rest = np.linalg.eig(i)
+u_rest = np.linalg.inv(v_rest).T.conj()
+u_rest = u_rest[:, np.abs(w) > 0.5] * w[np.abs(w) > 0.5][None]
+v_rest = v_rest[:, np.abs(w) > 0.5] * w[np.abs(w) > 0.5][None]
+
+# Combine physical and external vectors:
+u = np.block([u, u_rest])
+v = np.block([v, v_rest])
+
+# Construct Hamiltonian, and rotate into arrowhead form:
+h = np.dot(v.T.conj() * e[None], u)
+w, v = np.linalg.eig(h[gfcc.nmo:, gfcc.nmo:])
+v = np.block([
+    [np.eye(gfcc.nmo), np.zeros((gfcc.nmo, w.size))],
+    [np.zeros((w.size, gfcc.nmo)), v],
+])
+h = np.linalg.multi_dot((np.linalg.inv(v), h, v))
+
+# Extract blocks:
+phys = h[:gfcc.nmo, :gfcc.nmo]            # Static part of the self-energy
+e_aux = np.diag(h[gfcc.nmo:, gfcc.nmo:])  # Energies of the self-energy
+v_aux = h[:gfcc.nmo, gfcc.nmo:]           # Left couplings of the self-energy
+u_aux = h[gfcc.nmo:, :gfcc.nmo].T.conj()  # Right couplings of the self-energy
+
+# Diagonalise the self-energy to check the energies match:
+e, v = np.linalg.eig(h)
+e = e[np.einsum("xi,ix->i", v[:gfcc.nmo], np.linalg.inv(v)[:, :gfcc.nmo]).real > gfcc.weight_tol]
+e = np.sort(e.real)
+
+print("IP directly from GFCCSD:", ip)
+print("IP recovered from self-energy:", -np.max(e[e < 0.5*(ea-ip)]))
diff --git a/examples/df/00-with_df.py b/examples/df/00-with_df.py
index 3ba31d9031..7379477045 100644
--- a/examples/df/00-with_df.py
+++ b/examples/df/00-with_df.py
@@ -39,8 +39,7 @@
 
 
 #
-# In PBC calculations, DF/MDF method should be used for all-electron
-# calculation.  There are various ways to initialize the DF methods.
+# In PBC calculations, DF method can be used for all-electron calculation.
 #
 cell = pgto.Cell()
 cell.atom='''
@@ -53,17 +52,9 @@
 3.370137329, 0.000000000, 3.370137329
 3.370137329, 3.370137329, 0.000000000'''
 cell.unit = 'B'
-cell.mesh = [10]*3
-#cell.verbose = 4
+cell.verbose = 4
 cell.build()
 kpts = cell.make_kpts([2,2,2])
 
-# Method 1: Calling .density_fit
 mf = pdft.KRKS(cell, kpts=kpts).density_fit(auxbasis='ahlrichs')
 mf.kernel()
-
-# Method 2: Overwriting the existed .with_df attribute.  All PBC SCF object
-# has the .with_df attribute.  Don't forget to pass kpts to DF object.
-mf.with_df = pdft.MDF(cell, kpts=kpts)
-mf.kernel()
-
diff --git a/examples/dft/24-define_xc_functional.py b/examples/dft/24-define_xc_functional.py
index d199febfdd..e4efae3089 100644
--- a/examples/dft/24-define_xc_functional.py
+++ b/examples/dft/24-define_xc_functional.py
@@ -35,7 +35,18 @@ def eval_xc(xc_code, rho, spin=0, relativity=0, deriv=1, omega=None, verbose=Non
     vlapl = None
     vtau = None
     vxc = (vrho, vgamma, vlapl, vtau)
-    fxc = None  # 2nd order functional derivative
+    v2rho2 = 0.01 * 6 * rho0
+    v2rhosigma = .02 * .5 * (gamma+.001)**(-.5)
+    v2sigma2 = 0.02 * .5 * -.5 * (gamma+.001)**(-1.5)
+    v2lapl2 = None
+    vtau2 = None
+    v2rholapl = None
+    v2rhotau = None
+    v2lapltau = None
+    v2sigmalapl = None
+    v2sigmatau = None
+    # 2nd order functional derivative
+    fxc = (v2rho2, v2rhosigma, v2sigma2, v2lapl2, vtau2, v2rholapl, v2rhotau, v2lapltau, v2sigmalapl, v2sigmatau)
     kxc = None  # 3rd order functional derivative
 
     # Mix with existing functionals
diff --git a/examples/dft/33-nlc_functionals.py b/examples/dft/33-nlc_functionals.py
index f100fa1833..71fe5cb3cb 100644
--- a/examples/dft/33-nlc_functionals.py
+++ b/examples/dft/33-nlc_functionals.py
@@ -5,13 +5,12 @@
 
 '''
 A simple example to run density functional with non-local correlation calculation.
-
-Available NLC functionals: wB97M_V, wB97X_V, B97M_V
 '''
 
 from pyscf import gto, dft
 
-mol = gto.M(atom='H    0.000000000  -0.120407870  -0.490828400; F    0.000000000   0.009769450  -1.404249780', basis='6-31G', symmetry=False, verbose=10, unit='Angstrom', spin=0)
+mol = gto.M(atom='H    0.000000000  -0.120407870  -0.490828400; F    0.000000000   0.009769450  -1.404249780',
+            basis='6-31G')
 mf = dft.RKS(mol)
 mf.xc='wB97M_V'
 mf.nlc='VV10'
diff --git a/examples/fci/03-cylindrical_symmetry.py b/examples/fci/03-cylindrical_symmetry.py
new file mode 100644
index 0000000000..78ad41eca7
--- /dev/null
+++ b/examples/fci/03-cylindrical_symmetry.py
@@ -0,0 +1,30 @@
+#!/usr/bin/env python
+
+'''
+FCI solver direct_spin1_cyl_sym is designed for systems with cylindrical
+symmetry. It is not compatible with the wavefunction obtained from
+direct_spin1_symm. The wave function of direct_spin1_cyl_sym is computed with
+the cylindrical symmetry adapted complex orbitals.
+'''
+
+from pyscf import gto, ao2mo
+from pyscf.fci import direct_spin1_cyl_sym
+
+mol = gto.Mole()
+mol.build(
+    atom = 'Li 0 0 0; Li 0 0 1.5',
+    basis = 'cc-pVDZ',
+    symmetry = True,
+)
+mf = mol.RHF().run()
+# To reduce the cost, this example takes just the first 19 orbitals
+c = mf.mo_coeff[:,:19]
+orbsym = mf.mo_coeff.orbsym[:19]
+
+h1e = c.T.dot(mf.get_hcore()).dot(c)
+eri = ao2mo.kernel(mol, c)
+solver = direct_spin1_cyl_sym.FCI(mol)
+for sym in ['A1g', 'A1u', 'E1ux', 'E1uy', 'E1gx', 'E1gy', 'E2ux', 'E2uy', 'E2gx', 'E2gy', 'E3ux', 'E3uy', 'E3gx', 'E3gy']:
+    e, v = solver.kernel(h1e, eri, c.shape[1], mol.nelec, orbsym=orbsym,
+                         wfnsym=sym, nroots=5, verbose=0)
+    print(f'{sym}: {e}')
diff --git a/examples/fci/11-large_ci.py b/examples/fci/11-large_ci.py
index 92edc0c722..06d901aee2 100644
--- a/examples/fci/11-large_ci.py
+++ b/examples/fci/11-large_ci.py
@@ -26,7 +26,7 @@
 
 # Output all determinants coefficients
 print('   det-alpha,    det-beta,    CI coefficients')
-occslst = fci.cistring._gen_occslst(range(ncas), nelec//2)
+occslst = fci.cistring.gen_occslst(range(ncas), nelec//2)
 for i,occsa in enumerate(occslst):
     for j,occsb in enumerate(occslst):
         print('   %s       %s      %.12f' % (occsa, occsb, mc.ci[i,j]))
diff --git a/examples/fci/15-FCI_hamiltonian.py b/examples/fci/15-FCI_hamiltonian.py
index f322b62449..5cecf7376b 100644
--- a/examples/fci/15-FCI_hamiltonian.py
+++ b/examples/fci/15-FCI_hamiltonian.py
@@ -5,6 +5,8 @@
 
 '''
 Generate the entire FCI Hamiltonian for small system
+
+See also 36-determinants_basis_matrix.py
 '''
 
 import numpy
diff --git a/examples/fci/35-transition_density_matrix.py b/examples/fci/35-transition_density_matrix.py
new file mode 100644
index 0000000000..1dec4c94ae
--- /dev/null
+++ b/examples/fci/35-transition_density_matrix.py
@@ -0,0 +1,87 @@
+#!/usr/bin/env python
+#
+# Author: Qiming Sun <osirpt.sun@gmail.com>
+#
+
+'''
+Transition density matrix between singlet and triplet states
+'''
+
+import numpy as np
+from pyscf import gto
+from pyscf import fci
+from pyscf.fci import cistring
+
+# <T|i_alpha^+ j_beta|S>
+def make_rdm1_t2s(bra, ket, norb, nelec_ket):
+    neleca, nelecb = nelec = nelec_ket
+    ades_index = cistring.gen_des_str_index(range(norb), neleca+1)
+    bdes_index = cistring.gen_des_str_index(range(norb), nelecb)
+    na_bra = cistring.num_strings(norb, neleca+1)
+    nb_bra = cistring.num_strings(norb, nelecb-1)
+    na_ket = cistring.num_strings(norb, neleca)
+    nb_ket = cistring.num_strings(norb, nelecb)
+    assert bra.shape == (na_bra, nb_bra)
+    assert ket.shape == (na_ket, nb_ket)
+
+    t1bra = np.zeros((na_ket,nb_bra,norb))
+    t1ket = np.zeros((na_ket,nb_bra,norb))
+    for str0, tab in enumerate(bdes_index):
+        for _, i, str1, sign in tab:
+            t1ket[:,str1,i] += sign * ket[:,str0]
+    for str0, tab in enumerate(ades_index):
+        for _, i, str1, sign in tab:
+            t1bra[str1,:,i] += sign * bra[str0,:]
+    dm1 = np.einsum('abp,abq->pq', t1bra, t1ket)
+    return dm1
+
+# <S|i_beta^+ j_alpha|T>
+def make_rdm1_s2t(bra, ket, norb, nelec_ket):
+    '''Inefficient version. A check for make_rdm1_t2s'''
+    neleca, nelecb = nelec = nelec_ket
+    ades_index = cistring.gen_des_str_index(range(norb), neleca)
+    bcre_index = cistring.gen_cre_str_index(range(norb), nelecb)
+    na_bra = cistring.num_strings(norb, neleca-1)
+    nb_bra = cistring.num_strings(norb, nelecb+1)
+    na_ket = cistring.num_strings(norb, neleca)
+    nb_ket = cistring.num_strings(norb, nelecb)
+    assert bra.shape == (na_bra, nb_bra)
+    assert ket.shape == (na_ket, nb_ket)
+
+    t1ket = np.zeros((na_bra,nb_ket,norb))
+    for str0, tab in enumerate(ades_index):
+        for _, i, str1, sign in tab:
+            t1ket[str1,:,i] += sign * ket[str0]
+
+    t1bra = np.zeros((na_bra,nb_bra,norb,norb))
+    for str0, tab in enumerate(bcre_index):
+        for a, _, str1, sign in tab:
+            t1bra[:,str1,a] += sign * t1ket[:,str0]
+    dm1 = np.einsum('ab,abpq->pq', bra, t1bra)
+    return dm1
+
+
+if __name__ == '__main__':
+    mol = gto.M(
+        atom = '''
+        Be 0   0  0
+        H  0 -.9 .3
+        H  0  .9 .3
+        ''',
+        basis = 'sto-3g'
+    )
+    mf = mol.RHF().run()
+    neleca, nelecb = mol.nelec
+    norb = mf.mo_coeff.shape[1]
+
+    np.set_printoptions(4, linewidth=150)
+    cisolver = fci.FCI(mf)
+    e_s, wfn_s = cisolver.kernel()
+
+    cisolver.spin = 2
+    e_t, wfn_t = cisolver.kernel()
+    print(f'Singlet state energy = {e_s}, Triplet state energy = {e_t}')
+
+    dm_st = make_rdm1_s2t(wfn_s, wfn_t, norb, (neleca+1, nelecb-1))
+    dm_ts = make_rdm1_t2s(wfn_t, wfn_s, norb, (neleca, nelecb))
+    print(abs(dm_st - dm_ts.T).max())
diff --git a/examples/fci/36-determinants_basis_matrix.py b/examples/fci/36-determinants_basis_matrix.py
new file mode 100644
index 0000000000..7817334e3b
--- /dev/null
+++ b/examples/fci/36-determinants_basis_matrix.py
@@ -0,0 +1,70 @@
+#!/usr/bin/env python
+#
+# Author: Qiming Sun <osirpt.sun@gmail.com>
+#
+
+'''
+One-particle and two-particle operators represented in Slater determinant basis.
+'''
+
+import numpy as np
+from pyscf import lib
+from pyscf import ao2mo
+from pyscf import fci
+from pyscf.fci import cistring
+from pyscf.fci import fci_slow
+
+np.random.seed(1)
+norb = 7
+nelec = (4,4)
+h1 = np.random.random((norb,norb))
+eri = np.random.random((norb,norb,norb,norb))
+# Restore permutation symmetry
+h1 = h1 + h1.T
+eri = eri + eri.transpose(1,0,2,3)
+eri = eri + eri.transpose(0,1,3,2)
+eri = eri + eri.transpose(2,3,0,1)
+
+link_indexa = link_indexb = cistring.gen_linkstr_index(range(norb), nelec[0])
+na = nb = cistring.num_strings(norb, nelec[0])
+idx_a = idx_b = np.arange(na)
+
+# One-particle operator is straightforward. Using the link_index we can
+# propagate determinants in ket to determinants in bra for operator
+#    O_{pq} * a_p^+ a_q
+mat1 = np.zeros((na,nb,na,nb))
+for str0, tab in enumerate(link_indexa):
+    for p, q, str1, sign in tab:
+        # alpha spin
+        mat1[str1,idx_b,str0,idx_b] += sign * h1[p,q]
+        # beta spin
+        mat1[idx_a,str1,idx_a,str0] += sign * h1[p,q]
+mat1 = mat1.reshape(na*nb,na*nb)
+
+# Two-particle operator is a little bit complicated. The code below requires a
+# representation of the operator in this form
+#    O_{pqrs} * a_p^+ a_q a_r^+ a_s
+# However, the regular ERI tensor is
+#    eri_{pqrs} * a_p^+ a_r^+ a_s a_q
+# We need to use the absorb_h1e function to transform the ERI tensor first
+h2 = fci_slow.absorb_h1e(h1*0, eri, norb, nelec)
+t1 = np.zeros((norb,norb,na,nb,na,nb))
+for str0, tab in enumerate(link_indexa):
+    for a, i, str1, sign in tab:
+        # alpha spin
+        t1[a,i,str1,idx_b,str0,idx_b] += sign
+        # beta spin
+        t1[a,i,idx_a,str1,idx_a,str0] += sign
+t1 = lib.einsum('psqr,qrABab->psABab', h2, t1)
+mat2 = np.zeros((na,nb,na,nb))
+for str0, tab in enumerate(link_indexa):
+    for a, i, str1, sign in tab:
+        # alpha spin
+        mat2[str1] += sign * t1[a,i,str0]
+        # beta spin
+        mat2[:,str1] += sign * t1[a,i,:,str0]
+mat2 = mat2.reshape(na*nb,na*nb)
+
+H_fci = mat1 + mat2 * .5
+H_ref = fci.direct_spin1.pspace(h1, eri, norb, nelec, np=1225)[1]
+print('Check', abs(H_fci - H_ref).max())
diff --git a/examples/geomopt/01-geomeTRIC.py b/examples/geomopt/01-geomeTRIC.py
index f69bcb04dd..ef5f12e7c5 100644
--- a/examples/geomopt/01-geomeTRIC.py
+++ b/examples/geomopt/01-geomeTRIC.py
@@ -39,3 +39,28 @@
 
 # method 2
 mol_eq = mc.Gradients().optimizer(solver='geomeTRIC').kernel(conv_params)
+
+
+#
+# geometry optimization for DFT, MP2, CCSD
+#
+mol = gto.M(atom='''
+C       1.1879  -0.3829 0.0000
+C       0.0000  0.5526  0.0000
+O       -1.1867 -0.2472 0.0000
+H       -1.9237 0.3850  0.0000
+H       2.0985  0.2306  0.0000
+H       1.1184  -1.0093 0.8869
+H       1.1184  -1.0093 -0.8869
+H       -0.0227 1.1812  0.8852
+H       -0.0227 1.1812  -0.8852
+            ''', basis='3-21g')
+
+mf = mol.RKS(xc='pbe,')
+mol1 = optimize(mf)
+
+mymp2 = mol.MP2()
+mol1 = optimize(mymp2)
+
+mycc = mol.CCSD()
+mol1 = optimize(mycc)
diff --git a/examples/pbc/09-talk_to_ase.py b/examples/pbc/09-talk_to_ase.py
index c68eb8add9..dbbcdfc14f 100644
--- a/examples/pbc/09-talk_to_ase.py
+++ b/examples/pbc/09-talk_to_ase.py
@@ -48,7 +48,7 @@
 energies = []
 for x in np.linspace(0.95, 1.2, 5):
     ase_atom.set_cell(ase_cell * x, scale_atoms = True)
-    print "[x: %f, E: %f]" % (x, ase_atom.get_potential_energy())
+    print("[x: %f, E: %f]" % (x, ase_atom.get_potential_energy()))
     volumes.append(ase_atom.get_volume())
     energies.append(ase_atom.get_potential_energy())
 
diff --git a/examples/pbc/22-k_points_tddft.py b/examples/pbc/22-k_points_tddft.py
index a6178e1926..f7d3799ab5 100644
--- a/examples/pbc/22-k_points_tddft.py
+++ b/examples/pbc/22-k_points_tddft.py
@@ -55,7 +55,6 @@
 td.nstates = 5
 td.verbose = 5
 print(td.kernel()[0] * 27.2114)
-print(td.oscillator_strength())
 
 
 # TODO:
diff --git a/examples/pbc/26-linear_dep.py b/examples/pbc/26-linear_dep.py
index 44767ee4fa..29f9724545 100644
--- a/examples/pbc/26-linear_dep.py
+++ b/examples/pbc/26-linear_dep.py
@@ -9,12 +9,14 @@
 from pyscf import scf as mol_scf
 from pyscf.pbc import gto, dft
 
+aug_basis = [[0, [0.08, 1]], [0, [0.12, 1]]]
+
 cell = gto.Cell()
 cell.atom='''
 C 0.000000000000   0.000000000000   0.000000000000
 C 1.685068664391   1.685068664391   1.685068664391
 '''
-cell.basis = 'gth-qzvp'
+cell.basis = ('gth-dzvp', aug_basis)
 cell.pseudo = 'gth-pade'
 cell.a = '''
 0.000000000, 3.370137329, 3.370137329
diff --git a/examples/pbc/40-custom_gdf.py b/examples/pbc/40-custom_gdf.py
new file mode 100644
index 0000000000..3bf8c9fe2c
--- /dev/null
+++ b/examples/pbc/40-custom_gdf.py
@@ -0,0 +1,45 @@
+#!/usr/bin/env python
+
+'''
+This example shows how to use a pre-computed GDF tensor in a PBC calculation
+'''
+
+from pyscf.pbc import gto
+
+cell = gto.Cell()
+cell.atom='''
+C 0.000000000000   0.000000000000   0.000000000000
+C 1.685068664391   1.685068664391   1.685068664391
+'''
+cell.a = '''
+0.000000000, 3.370137329, 3.370137329
+3.370137329, 0.000000000, 3.370137329
+3.370137329, 3.370137329, 0.000000000'''
+cell.basis = 'sto3g'
+cell.unit = 'B'
+cell.verbose = 5
+cell.build()
+
+kpts = cell.make_kpts([2,2,2])
+
+# First, generate GDF tensor and saved in a file. Filename needs to be assigned
+# to the attribute _cderi_to_save
+mf = cell.KRHF(kpts=kpts).density_fit()
+mf.with_df._cderi_to_save = 'gdf-sample.h5'
+mf.with_df.build()
+
+# To reuse the pre-computed GDF CDERI tensor, assign the GDF CDERI file to
+# the attribute _cderi
+mf = cell.KRHF(kpts=kpts).density_fit()
+mf.with_df._cderi = 'gdf-sample.h5'
+mf.run()
+
+# For DFT hybrid functionals, flag _j_only needs to be set manually.
+# DFT program assumes that the GDF CDERI file does not contain integral data for
+# HF exchange matrix. Without specifying this flag, GDF cderi tensor may be
+# regenerated for hybrid functionals.
+mf = cell.KRKS(kpts=kpts).density_fit()
+mf.xc = 'pbe'
+mf.with_df._cderi = 'gdf-sample.h5'
+mf.with_df._j_only = False
+mf.run()
diff --git a/examples/scf/04-dirac_hf.py b/examples/scf/04-dirac_hf.py
index 8914c307ff..96b1ce8bc3 100644
--- a/examples/scf/04-dirac_hf.py
+++ b/examples/scf/04-dirac_hf.py
@@ -25,7 +25,7 @@
     atom = '''
 Cl 0  0     0
 H  0  1.9   0''',
-    basis = {'Cl': gto.uncontract_basis(gto.basis.load('ccpvdz', 'Cl')),
+    basis = {'Cl': 'unc-ccpvdz',
              'H' : 'ccpvdz'},
 )
 lib.param.LIGHT_SPEED = 90.  # Change light speed globally
diff --git a/examples/scf/05-breit_gaunt.py b/examples/scf/05-breit_gaunt.py
index 09dbe85f79..dab63bae8a 100644
--- a/examples/scf/05-breit_gaunt.py
+++ b/examples/scf/05-breit_gaunt.py
@@ -14,7 +14,8 @@
     atom = '''
 Cl 0  0     0
 H  0  1.9   0''',
-    basis = {'Cl': gto.uncontract_basis(gto.basis.load('ccpvdz', 'Cl')),
+    # The prefix 'unc-" decontracts basis to primitive Gaussian basis
+    basis = {'Cl': 'unc-ccpvdz',
              'H' : 'ccpvdz'},
 )
 mf = scf.DHF(mol)
diff --git a/examples/scf/41-hf_with_given_densityfit_ints.py b/examples/scf/41-hf_with_given_densityfit_ints.py
index bff0df7820..364b4804f3 100644
--- a/examples/scf/41-hf_with_given_densityfit_ints.py
+++ b/examples/scf/41-hf_with_given_densityfit_ints.py
@@ -48,7 +48,7 @@
 #
 with h5py.File(ftmp.name, 'r') as file1:
     mf = scf.density_fit(scf.RHF(fake_mol))
-    mf._cderi = file1['j3c']
+    mf._cderi = file1
     mf.get_hcore = lambda *args: (mol.intor('cint1e_kin_sph') +
                                   mol.intor('cint1e_nuc_sph'))
     mf.get_ovlp = lambda *args: mol.intor('cint1e_ovlp_sph')
diff --git a/examples/scf/42-remove_linear_dep.py b/examples/scf/42-remove_linear_dep.py
index 5c3cb89b23..d64bfe4a30 100644
--- a/examples/scf/42-remove_linear_dep.py
+++ b/examples/scf/42-remove_linear_dep.py
@@ -48,22 +48,28 @@
 
 def eig(h, s):
     d, t = numpy.linalg.eigh(s)
-# Removing the eigenvectors assoicated to the smallest eigenvalue, the new
-# basis defined by x matrix has 139 vectors.
-    x = t[:,d>1e-8] / numpy.sqrt(d[d>1e-8])
+    # Removing the eigenvectors assoicated to the smallest eigenvalue.
+    x = t[:,d>1e-7] / numpy.sqrt(d[d>1e-7])
     xhx = reduce(numpy.dot, (x.T, h, x))
     e, c = numpy.linalg.eigh(xhx)
     c = numpy.dot(x, c)
-# Return 139 eigenvalues and 139 eigenvectors.
     return e, c
 #
 # Replacing the default eig function with the above one,  the HF solver
-# generate only 139 canonical orbitals
+# generate only 138 canonical orbitals
 #
 mf = scf.RHF(mol)
 mf.eig = eig
 mf.verbose = 4
 mf.kernel()
+#
+# Note: The default settings in scf.addons.remove_linear_dep_ may lead to
+# convergence issue in the following CASSCF calculations due to numerical
+# noises. We need to drop more basis by overwriting the keyword arguments
+# threshold and lindep
+#
+# mf = scf.addons.remove_linear_dep_(mf, threshold=1e-7, lindep=1e-7)
+#
 
 #
 # The CASSCF solver takes the HF orbital as initial guess.  The MCSCF problem
@@ -116,7 +122,7 @@ def eig(h, s):
 #
     for ir in range(nirrep):
         d, t = numpy.linalg.eigh(s[ir])
-        x = t[:,d>1e-8] / numpy.sqrt(d[d>1e-8])
+        x = t[:,d>1e-7] / numpy.sqrt(d[d>1e-7])
         xhx = reduce(numpy.dot, (x.T, h[ir], x))
         e, c = numpy.linalg.eigh(xhx)
         cs.append(reduce(numpy.dot, (mol.symm_orb[ir], x, c)))
diff --git a/pyscf/__init__.py b/pyscf/__init__.py
index 3c05feee2e..3c79b02ad1 100644
--- a/pyscf/__init__.py
+++ b/pyscf/__init__.py
@@ -27,24 +27,18 @@
 and the web-based Python IDE `Ipython notebook <http://ipython.org/notebook.html>`_
 to try out the package::
 
-    >>> from pyscf import gto, scf
-    >>> mol = gto.M(atom='H 0 0 0; H 0 0 1.2', basis='cc-pvdz')
-    >>> mol.apply(scf.RHF).run()
+    >>> import pyscf
+    >>> mol = pyscf.M(atom='H 0 0 0; H 0 0 1.2', basis='cc-pvdz')
+    >>> mol.RHF().run()
     converged SCF energy = -1.06111199785749
     -1.06111199786
 
 '''
 
-__version__ = '2.1.1'
+__version__ = '2.2.1'
 
 import os
 import sys
-# Avoid too many threads being created in OMP loops.
-# See issue https://github.com/pyscf/pyscf/issues/317
-if 'OPENBLAS_NUM_THREADS' not in os.environ:
-    os.environ['OPENBLAS_NUM_THREADS'] = '1'
-if 'MKL_NUM_THREADS' not in os.environ:
-    os.environ['MKL_NUM_THREADS'] = '1'
 
 # Load modules which are developed as plugins of the namespace package
 PYSCF_EXT_PATH = os.getenv('PYSCF_EXT_PATH')
@@ -88,25 +82,15 @@
                          'these plugins through the environment variable '
                          'PYSCF_EXT_PATH\n' % '\n'.join(__path__[1:]))
 
-from distutils.version import LooseVersion
 import numpy
-if LooseVersion(numpy.__version__) <= '1.8.0':
-    raise SystemError("You're using an old version of Numpy (%s). "
-                      "It is recommended to upgrade numpy to 1.8.0 or newer. \n"
-                      "You still can use all features of PySCF with the old numpy by removing this warning msg. "
-                      "Some modules (DFT, CC, MRPT) might be affected because of the bug in old numpy." %
-                      numpy.__version__)
-elif '1.16.2' <= LooseVersion(numpy.__version__) < '1.18':
-    #sys.stderr.write('Numpy 1.16 has memory leak bug  '
-    #                 'https://github.com/numpy/numpy/issues/13808\n'
-    #                 'It is recommended to downgrade to numpy 1.15 or older\n')
+if numpy.__version__[:5] in ('1.16.', '1.17.'):
+    # Numpy memory leak bug https://github.com/numpy/numpy/issues/13808
     import ctypes
     from numpy.core import _internal
     def _get_void_ptr(arr):
         simple_arr = numpy.asarray(_internal._unsafe_first_element_pointer(arr))
         c_arr = (ctypes.c_char * 0).from_buffer(simple_arr)
         return ctypes.cast(ctypes.byref(c_arr), ctypes.c_void_p)
-    # patch _get_void_ptr as a workaround to numpy issue #13808
     _internal._get_void_ptr = _get_void_ptr
 
 from pyscf import __config__
@@ -127,4 +111,4 @@ def M(**kwargs):
     else:  # Molecule
         return gto.M(**kwargs)
 
-del os, sys, LooseVersion
+del os, sys
diff --git a/pyscf/ao2mo/__init__.py b/pyscf/ao2mo/__init__.py
index 24cdf195ff..1039da73cf 100644
--- a/pyscf/ao2mo/__init__.py
+++ b/pyscf/ao2mo/__init__.py
@@ -477,30 +477,3 @@ def get_ao_eri(mol):
     return mol.intor('int2e', aosym='s4')
 
 get_mo_eri = kernel
-
-
-if __name__ == '__main__':
-    from pyscf import scf
-    from pyscf import gto
-    from pyscf.ao2mo import addons
-    mol = gto.M(
-        verbose = 0,
-        atom = [
-            ["O" , (0. , 0.     , 0.)],
-            [1   , (0. , -0.757 , 0.587)],
-            [1   , (0. , 0.757  , 0.587)]],
-        basis = 'ccpvdz')
-
-    mf = scf.RHF(mol)
-    mf.scf()
-
-    eri0 = full(mf._eri, mf.mo_coeff)
-    mos = (mf.mo_coeff,)*4
-    print(numpy.allclose(eri0, full(mol, mf.mo_coeff)))
-    print(numpy.allclose(eri0, general(mf._eri, mos)))
-    print(numpy.allclose(eri0, general(mol, mos)))
-    with load(full(mol, mf.mo_coeff, 'h2oeri.h5', dataname='dat1'), 'dat1') as eri1:
-        print(numpy.allclose(eri0, eri1))
-    with load(general(mol, mos, 'h2oeri.h5', dataname='dat1'), 'dat1') as eri1:
-        print(numpy.allclose(eri0, eri1))
-
diff --git a/pyscf/ao2mo/_ao2mo.py b/pyscf/ao2mo/_ao2mo.py
index 18589bb7f1..2fde69a48a 100644
--- a/pyscf/ao2mo/_ao2mo.py
+++ b/pyscf/ao2mo/_ao2mo.py
@@ -210,7 +210,7 @@ def r_e1(intor, mo_coeff, orbs_slice, sh_range, atm, bas, env,
     assert (aosym in ('s4', 's2ij', 's2kl', 's1', 'a2ij', 'a2kl', 'a4ij',
                      'a4kl', 'a4'))
     intor = ascint3(intor)
-    mo_coeff = numpy.asfortranarray(mo_coeff)
+    mo_coeff = numpy.asarray(mo_coeff, dtype=numpy.complex128, order='F')
     i0, i1, j0, j1 = orbs_slice
     icount = i1 - i0
     jcount = j1 - j0
diff --git a/pyscf/ao2mo/addons.py b/pyscf/ao2mo/addons.py
index d20d6ae88a..097fc8eb79 100644
--- a/pyscf/ao2mo/addons.py
+++ b/pyscf/ao2mo/addons.py
@@ -40,10 +40,10 @@ def __enter__(self):
             feri = self.feri = h5py.File(self.eri, 'r')
         elif isinstance(self.eri, h5py.Group):
             feri = self.eri
+        elif isinstance(self.eri, (numpy.ndarray, h5py.Dataset)):
+            return self.eri
         elif isinstance(getattr(self.eri, 'name', None), str):
             feri = self.feri = h5py.File(self.eri.name, 'r')
-        elif isinstance(self.eri, numpy.ndarray):
-            return self.eri
         else:
             raise RuntimeError('Unknown eri type %s', type(self.eri))
 
diff --git a/pyscf/ao2mo/nrr_outcore.py b/pyscf/ao2mo/nrr_outcore.py
new file mode 100644
index 0000000000..a3f8a40d7c
--- /dev/null
+++ b/pyscf/ao2mo/nrr_outcore.py
@@ -0,0 +1,389 @@
+#!/usr/bin/env python
+# Copyright 2014-2018 The PySCF Developers. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+'''
+ao2mo of scalar integrals to complex MO integrals for GHF orbitals
+'''
+
+import time
+import tempfile
+import numpy
+import h5py
+import ctypes
+import _ctypes
+from pyscf import lib
+from pyscf import gto
+from pyscf.lib import logger
+from pyscf.ao2mo import _ao2mo
+from pyscf.ao2mo import outcore
+from pyscf import __config__
+from pyscf.gto.moleintor import make_cintopt, make_loc, ascint3
+
+libao2mo = lib.load_library('libao2mo')
+def _fpointer(name):
+    return ctypes.c_void_p(_ctypes.dlsym(libao2mo._handle, name))
+
+IOBLK_SIZE = getattr(__config__, 'ao2mo_outcore_ioblk_size', 256)  # 256 MB
+IOBUF_WORDS = getattr(__config__, 'ao2mo_outcore_iobuf_words', 1e8)  # 1.6 GB
+IOBUF_ROW_MIN = getattr(__config__, 'ao2mo_outcore_row_min', 160)
+MAX_MEMORY = getattr(__config__, 'ao2mo_outcore_max_memory', 4000)  # 4GB
+
+def full(mol, mo_coeff, erifile, dataname='eri_mo',
+         intor='int2e_sph', motype='ghf', aosym='s1', comp=None,
+         max_memory=MAX_MEMORY, ioblk_size=IOBLK_SIZE, verbose=logger.WARN):
+    general(mol, (mo_coeff,)*4, erifile, dataname,
+            intor, motype, aosym, comp, max_memory, ioblk_size, verbose)
+    return erifile
+
+def general(mol, mo_coeffs, erifile, dataname='eri_mo',
+            intor='int2e_sph', motype='ghf', aosym='s1', comp=None,
+            max_memory=MAX_MEMORY, ioblk_size=IOBLK_SIZE, verbose=logger.debug):
+    time_0pass = (logger.process_clock(), logger.perf_counter())
+    log = logger.new_logger(mol, verbose)
+    intor, comp = gto.moleintor._get_intor_and_comp(mol._add_suffix(intor), comp)
+    klsame = iden_coeffs(mo_coeffs[2], mo_coeffs[3])
+
+    nmoi = mo_coeffs[0].shape[1]
+    nmoj = mo_coeffs[1].shape[1]
+    nmok = mo_coeffs[2].shape[1]
+    nmol = mo_coeffs[3].shape[1]
+    nao = mo_coeffs[0].shape[0]//2
+
+    if motype == 'j-spinor':
+        ca, cb = mol.sph2spinor_coeff()
+        mo_alph = [numpy.dot(ca, moi) for moi in mo_coeffs]
+        mo_beta = [numpy.dot(cb, moi) for moi in mo_coeffs]
+    elif motype == 'ghf':
+        nao = mo_coeffs[0].shape[0] // 2
+        mo_alph = [moi[:nao,:] for moi in mo_coeffs]
+        mo_beta = [moi[nao:,:] for moi in mo_coeffs]
+    else:
+        raise AssertionError(f'Unknown motype {motype}. should be one of "j-spinor" or "ghf".')
+
+    aosym = outcore._stand_sym_code(aosym)
+    if aosym in ('s1', 's2ij', 'a2ij'):
+        nao_pair = nao * nao
+    else:
+        nao_pair = _count_naopair(mol, nao)
+
+    nij_pair = nmoi*nmoj
+    nkl_pair = nmok*nmol
+
+    if klsame and aosym in ('s4', 's2kl', 'a2kl', 'a4ij', 'a4kl', 'a4'):
+        log.debug('k-mo == l-mo')
+        mokla = numpy.asarray(mo_alph[2], dtype=numpy.complex128, order='F')
+        moklb = numpy.asarray(mo_beta[2], dtype=numpy.complex128, order='F')
+        klshape = (0, nmok, 0, nmok)
+    else:
+        mokla = numpy.asarray(numpy.hstack((mo_alph[2],mo_alph[3])),
+                             dtype=numpy.complex128, order='F')
+        moklb = numpy.asarray(numpy.hstack((mo_beta[2],mo_beta[3])),
+                             dtype=numpy.complex128, order='F')
+        klshape = (0, nmok, nmok, nmok+nmol)
+
+    if isinstance(erifile, str):
+        if h5py.is_hdf5(erifile):
+            feri = h5py.File(erifile, 'a')
+            if dataname in feri:
+                del(feri[dataname])
+        else:
+            feri = h5py.File(erifile, 'w')
+    else:
+        assert(isinstance(erifile, h5py.Group))
+        feri = erifile
+
+    if comp == 1:
+        chunks = (nmoj,nmol)
+        shape = (nij_pair, nkl_pair)
+    else:
+        chunks = (1,nmoj,nmol)
+        shape = (comp, nij_pair, nkl_pair)
+
+    if nij_pair == 0 or nkl_pair == 0:
+        feri.create_dataset(dataname, shape, 'c16')
+        if isinstance(erifile, str):
+            feri.close()
+        return erifile
+    else:
+        h5d_eri = feri.create_dataset(dataname, shape, 'c16', chunks=chunks)
+
+    log.debug('MO integrals %s are saved in %s/%s', intor, erifile, dataname)
+    log.debug('num. MO ints = %.8g, required disk %.8g MB',
+              float(nij_pair)*nkl_pair*comp, nij_pair*nkl_pair*comp*16/1e6)
+
+# transform e1
+    swapfile = tempfile.NamedTemporaryFile(dir=lib.param.TMPDIR)
+    half_e1(mol, (mo_alph, mo_beta), swapfile.name, intor, aosym, comp,
+            max_memory, ioblk_size, log)
+    time_1pass = log.timer('AO->MO transformation for %s 1 pass'%intor,
+                           *time_0pass)
+
+    e2buflen = guess_e2bufsize(ioblk_size, nij_pair, nao_pair)[0]
+
+    log.debug('step2: kl-pair (ao %d, mo %d), mem %.8g MB, '
+              'ioblock (r/w) %.8g/%.8g MB',
+              nao_pair, nkl_pair, e2buflen*nao_pair*16/1e6,
+              e2buflen*nij_pair*16/1e6, e2buflen*nkl_pair*16/1e6)
+
+    fswap = h5py.File(swapfile.name, 'r')
+    klaoblks = len(fswap['0'])
+    ijmoblks = int(numpy.ceil(float(nij_pair)/e2buflen)) * comp
+    #ao_loc = numpy.asarray(mol.ao_loc_2c(), dtype=numpy.int32)
+    ao_loc = numpy.asarray(mol.ao_loc, dtype=numpy.int32)
+    tao = numpy.asarray(mol.tmap(), dtype=numpy.int32)
+    ti0 = time_1pass
+    buf = numpy.empty((e2buflen, nao_pair), dtype=numpy.complex128)
+    istep = 0
+    for row0, row1 in prange(0, nij_pair, e2buflen):
+        nrow = row1 - row0
+
+        for icomp in range(comp):
+            istep += 1
+            tioi = 0
+            log.debug('step 2 [%d/%d], [%d,%d:%d], row = %d',
+                      istep, ijmoblks, icomp, row0, row1, nrow)
+
+            col0 = 0
+            for ic in range(klaoblks):
+                dat = fswap['%d/%d'%(icomp,ic)]
+                col1 = col0 + dat.shape[1]
+                buf[:nrow,col0:col1] = dat[row0:row1]
+                col0 = col1
+            ti2 = log.timer('step 2 [%d/%d], load buf'%(istep,ijmoblks), *ti0)
+            tioi += ti2[1]-ti0[1]
+            pbuf = _ao2mo.r_e2(buf[:nrow], mokla, klshape, tao, ao_loc, aosym)
+            pbuf2 = _ao2mo.r_e2(buf[:nrow], moklb, klshape, tao, ao_loc, aosym)
+
+            tw1 = logger.perf_counter()
+            if comp == 1:
+                h5d_eri[row0:row1] = pbuf+pbuf2
+            else:
+                h5d_eri[icomp,row0:row1] = pbuf+pbuf2
+            tioi += logger.perf_counter()-tw1
+
+            ti1 = (logger.process_clock(), logger.perf_counter())
+            log.debug('step 2 [%d/%d] CPU time: %9.2f, Wall time: %9.2f, I/O time: %9.2f',
+                      istep, ijmoblks, ti1[0]-ti0[0], ti1[1]-ti0[1], tioi)
+            ti0 = ti1
+    buf = pbuf = None
+    fswap.close()
+    if isinstance(erifile, str):
+        feri.close()
+
+    log.timer('AO->MO transformation for %s 2 pass'%intor, *time_1pass)
+    log.timer('AO->MO transformation for %s '%intor, *time_0pass)
+    return erifile
+
+def full_iofree(mol, mo_coeff, dataname='eri_mo', intor='int2e_sph',
+                motype='ghf', aosym='s1', comp=None, verbose=logger.debug,
+                **kwargs):
+    erifile = tempfile.NamedTemporaryFile(dir=lib.param.TMPDIR)
+    general(mol, (mo_coeff,)*4, erifile.name, dataname='eri_mo',
+            intor=intor, motype=motype, aosym=aosym, comp=comp,
+            verbose=verbose)
+    with h5py.File(erifile.name, 'r') as feri:
+        return numpy.asarray(feri['eri_mo'])
+
+def general_iofree(mol, mo_coeffs, dataname='eri_mo', intor='int2e_sph',
+                   motype='ghf', aosym='s1', comp=None, verbose=logger.debug,
+                   **kwargs):
+    erifile = tempfile.NamedTemporaryFile(dir=lib.param.TMPDIR)
+    general(mol, mo_coeffs, erifile.name, dataname='eri_mo',
+            intor=intor, motype=motype, aosym=aosym, comp=comp,
+            verbose=verbose)
+    with h5py.File(erifile.name, 'r') as feri:
+        return numpy.asarray(feri['eri_mo'])
+
+# swapfile will be overwritten if exists.
+# mo_coeffs contains two sets of mos, one is alpha back transformed, one is beta back transformed.
+def half_e1(mol, mo_coeffs, swapfile,
+            intor='int2e_sph', aosym='s1', comp=None,
+            max_memory=MAX_MEMORY, ioblk_size=IOBLK_SIZE, verbose=logger.WARN,
+            ao2mopt=None):
+    time0 = (logger.process_clock(), logger.perf_counter())
+    log = logger.new_logger(mol, verbose)
+
+    mo_alph = mo_coeffs[0]
+    mo_beta = mo_coeffs[1]
+    ijsame = iden_coeffs(mo_alph[0], mo_alph[1])
+
+    nmoi = mo_alph[0].shape[1]
+    nmoj = mo_alph[1].shape[1]
+    nao = mo_alph[0].shape[0]
+    aosym = outcore._stand_sym_code(aosym)
+    if aosym in ('s1', 's2kl', 'a2kl'):
+        nao_pair = nao * nao
+    else:
+        nao_pair = _count_naopair(mol, nao)
+    nij_pair = nmoi * nmoj
+
+    if ijsame and aosym in ('s4', 's2ij', 'a2ij', 'a4ij', 'a4kl', 'a4'):
+        log.debug('i-mo == j-mo')
+        moija = numpy.asarray(mo_alph[0], order='F')
+        moijb = numpy.asarray(mo_beta[0], order='F')
+        ijshape = (0, nmoi, 0, nmoi)
+    else:
+        moija = numpy.asarray(numpy.hstack((mo_alph[0],mo_alph[1])), order='F')
+        moijb = numpy.asarray(numpy.hstack((mo_beta[0],mo_beta[1])), order='F')
+        ijshape = (0, nmoi, nmoi, nmoi+nmoj)
+
+    e1buflen, mem_words, iobuf_words, ioblk_words = \
+            guess_e1bufsize(max_memory, ioblk_size, nij_pair, nao_pair, comp)
+# The buffer to hold AO integrals in C code
+    aobuflen = int((mem_words - iobuf_words) // (nao*nao*comp))
+    shranges = outcore.guess_shell_ranges(mol, (aosym not in ('s1', 's2ij', 'a2ij')),
+                                          aobuflen, e1buflen, mol.ao_loc, False)
+
+    log.debug('step1: tmpfile %.8g MB', nij_pair*nao_pair*16/1e6)
+    log.debug('step1: (ij,kl) = (%d,%d), mem cache %.8g MB, iobuf %.8g MB',
+              nij_pair, nao_pair, mem_words*16/1e6, iobuf_words*16/1e6)
+
+    fswap = h5py.File(swapfile, 'w')
+    for icomp in range(comp):
+        fswap.create_group(str(icomp))  # for h5py old version
+
+    tao = numpy.asarray(mol.tmap(), dtype=numpy.int32)
+
+    # transform e1
+    ti0 = log.timer('Initializing ao2mo.outcore.half_e1', *time0)
+    nstep = len(shranges)
+    for istep,sh_range in enumerate(shranges):
+        log.debug('step 1 [%d/%d], AO [%d:%d], len(buf) = %d',
+                  istep+1, nstep, *(sh_range[:3]))
+        buflen = sh_range[2]
+        iobuf = numpy.empty((comp,buflen,nij_pair), dtype=numpy.complex128)
+        nmic = len(sh_range[3])
+        p0 = 0
+        for imic, aoshs in enumerate(sh_range[3]):
+            log.debug1('      fill iobuf micro [%d/%d], AO [%d:%d], len(aobuf) = %d',
+                       imic+1, nmic, *aoshs)
+            buf = r_e1(intor, moija, moijb, ijshape, aoshs,
+                       mol._atm, mol._bas, mol._env,
+                       tao, aosym, comp, ao2mopt)
+            iobuf[:,p0:p0+aoshs[2]] = buf
+            p0 += aoshs[2]
+        ti2 = log.timer('gen AO/transform MO [%d/%d]'%(istep+1,nstep), *ti0)
+
+        e2buflen, chunks = guess_e2bufsize(ioblk_size, nij_pair, buflen)
+        for icomp in range(comp):
+            dset = fswap.create_dataset('%d/%d'%(icomp,istep),
+                                        (nij_pair,iobuf.shape[1]), 'c16',
+                                        chunks=None)
+            for col0, col1 in prange(0, nij_pair, e2buflen):
+                dset[col0:col1] = lib.transpose(iobuf[icomp,:,col0:col1])
+        ti0 = log.timer('transposing to disk', *ti2)
+    fswap.close()
+    return swapfile
+
+# if out is not None, transform AO to MO in-place
+def r_e1(intor, mo_a, mo_b, orbs_slice, sh_range, atm, bas, env,
+         tao, aosym='s1', comp=1, ao2mopt=None, out=None):
+    assert(aosym in ('s4', 's2ij', 's2kl', 's1', 'a2ij', 'a2kl', 'a4ij',
+                     'a4kl', 'a4'))
+    intor = ascint3(intor)
+    mo_a = numpy.asarray(mo_a, dtype=numpy.complex128, order='F')
+    mo_b = numpy.asarray(mo_b, dtype=numpy.complex128, order='F')
+    i0, i1, j0, j1 = orbs_slice
+    icount = i1 - i0
+    jcount = j1 - j0
+    ij_count = icount * jcount
+
+    c_atm = numpy.asarray(atm, dtype=numpy.int32)
+    c_bas = numpy.asarray(bas, dtype=numpy.int32)
+    c_env = numpy.asarray(env)
+    natm = ctypes.c_int(c_atm.shape[0])
+    nbas = ctypes.c_int(c_bas.shape[0])
+
+    klsh0, klsh1, nkl = sh_range
+
+    #if icount <= jcount:
+    fmmm = _fpointer('AO2MOmmm_nrr_iltj')
+    #else:
+    #   fmmm = _fpointer('AO2MOmmm_nrr_igtj')
+
+    out = numpy.ndarray((2*comp,nkl,ij_count), dtype=numpy.complex128,
+                        buffer=out)
+    if out.size == 0:
+        return out
+
+    if ao2mopt is not None:
+        cao2mopt = ao2mopt._this
+        cintopt = ao2mopt._cintopt
+        intor = ao2mopt._intor
+    else:
+        cao2mopt = lib.c_null_ptr()
+        cintopt = make_cintopt(c_atm, c_bas, c_env, intor)
+    cintor = _fpointer(intor)
+
+    tao = numpy.asarray(tao, dtype=numpy.int32)
+    ao_loc = make_loc(c_bas, 'int2e_sph')
+
+    fdrv = getattr(libao2mo, 'AO2MOnrr_e1_drv')
+    fill = _fpointer('AO2MOfill_nrr_' + aosym)
+    ftrans = _fpointer('AO2MOtranse1_nrr_' + aosym)
+    fdrv(cintor, fill, ftrans, fmmm,
+         out.ctypes.data_as(ctypes.c_void_p),
+         mo_a.ctypes.data_as(ctypes.c_void_p),
+         mo_b.ctypes.data_as(ctypes.c_void_p),
+         ctypes.c_int(klsh0), ctypes.c_int(klsh1-klsh0),
+         ctypes.c_int(nkl), ctypes.c_int(comp),
+         (ctypes.c_int*4)(*orbs_slice), tao.ctypes.data_as(ctypes.c_void_p),
+         ao_loc.ctypes.data_as(ctypes.c_void_p), cintopt, cao2mopt,
+         c_atm.ctypes.data_as(ctypes.c_void_p), natm,
+         c_bas.ctypes.data_as(ctypes.c_void_p), nbas,
+         c_env.ctypes.data_as(ctypes.c_void_p))
+    for i in range(comp):
+        out[i,:,:] += out[2*i+1,:,:]
+    return out[:comp,:,:]
+
+def iden_coeffs(mo1, mo2):
+    return (id(mo1) == id(mo2)) \
+            or (mo1.shape==mo2.shape and numpy.allclose(mo1,mo2))
+
+def prange(start, end, step):
+    for i in range(start, end, step):
+        yield i, min(i+step, end)
+
+def guess_e1bufsize(max_memory, ioblk_size, nij_pair, nao_pair, comp):
+    mem_words = max_memory * 1e6 / 16
+# part of the max_memory is used to hold the AO integrals.  The iobuf is the
+# buffer to temporary hold the transformed integrals before streaming to disk.
+# iobuf is then divided to small blocks (ioblk_words) and streamed to disk.
+    if mem_words > IOBUF_WORDS * 2:
+        iobuf_words = int(IOBUF_WORDS)
+    else:
+        iobuf_words = int(mem_words // 2)
+    ioblk_words = int(min(ioblk_size*1e6/16, iobuf_words))
+
+    e1buflen = int(min(iobuf_words//(comp*nij_pair), nao_pair))
+    return e1buflen, mem_words, iobuf_words, ioblk_words
+
+def guess_e2bufsize(ioblk_size, nrows, ncols):
+    e2buflen = int(min(ioblk_size*1e6/16/ncols, nrows))
+    e2buflen = max(e2buflen//IOBUF_ROW_MIN, 1) * IOBUF_ROW_MIN
+    chunks = (IOBUF_ROW_MIN, ncols)
+    return e2buflen, chunks
+
+def _count_naopair(mol, nao):
+    ao_loc = mol.ao_loc_2c()
+    nao_pair = 0
+    for i in range(mol.nbas):
+        di = ao_loc[i+1] - ao_loc[i]
+        for j in range(i+1):
+            dj = ao_loc[j+1] - ao_loc[j]
+            nao_pair += di * dj
+    return nao_pair
+
+del(MAX_MEMORY)
diff --git a/pyscf/ao2mo/test/test_nrr_outcore.py b/pyscf/ao2mo/test/test_nrr_outcore.py
new file mode 100644
index 0000000000..a81e275b34
--- /dev/null
+++ b/pyscf/ao2mo/test/test_nrr_outcore.py
@@ -0,0 +1,77 @@
+#!/usr/bin/env python
+# Copyright 2023 The PySCF Developers. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import tempfile
+import numpy as np
+import h5py
+from pyscf import lib
+from pyscf import gto
+from pyscf.ao2mo import nrr_outcore
+
+class KnownValues(unittest.TestCase):
+    def test_nrr_ghf(self):
+        mol = gto.Mole()
+        mol.verbose = 5
+        mol.output = '/dev/null'
+        mol.atom = '''
+            O     0.   0.       0.
+            H     0.   -0.757   0.587
+            H     0.   0.757    0.587'''
+        mol.basis = '631g'
+        mol.build()
+        mf = mol.GHF().run()
+
+        nao = mol.nao
+        nmo = mf.mo_coeff.shape[1]
+        mo_a = mf.mo_coeff[:nao]
+        mo_b = mf.mo_coeff[nao:]
+        eri0 = mol.intor('int2e_sph')
+        eri1  = lib.einsum('pqrs,pi,qj->ijrs', eri0, mo_a.conj(), mo_a)
+        eri1 += lib.einsum('pqrs,pi,qj->ijrs', eri0, mo_b.conj(), mo_b)
+        ref  = lib.einsum('ijrs,rk,sl->ijkl', eri1, mo_a.conj(), mo_a)
+        ref += lib.einsum('ijrs,rk,sl->ijkl', eri1, mo_b.conj(), mo_b)
+
+        eri1 = nrr_outcore.full_iofree(mol, mf.mo_coeff, 'h2oeri.h5')
+        eri1 = eri1.reshape([nmo]*4)
+        self.assertAlmostEqual(abs(ref - eri1).max(), 0, 11)
+
+    def test_nrr_spinor(self):
+        mol = gto.Mole()
+        mol.verbose = 5
+        mol.output = '/dev/null'
+        mol.atom = '''
+            O     0.   0.       0.
+            H     0.   -0.757   0.587
+            H     0.   0.757    0.587'''
+        mol.basis = '631g'
+        mol.build()
+        mf = mol.GHF().x2c().run()
+        mo = mf.mo_coeff
+
+        nao = mol.nao
+        nmo = mf.mo_coeff.shape[1]
+        mo_a = mf.mo_coeff[:nao]
+        mo_b = mf.mo_coeff[nao:]
+        eri0 = mol.intor('int2e_spinor')
+        ref = lib.einsum('pqrs,pi,qj,rk,sl->ijkl', eri0, mo.conj(), mo, mo.conj(), mo)
+        eri1 = nrr_outcore.full_iofree(mol, mo, 'h2oeri.h5', motype='j-spinor')
+        eri1 = eri1.reshape([nmo]*4)
+        self.assertAlmostEqual(abs(ref - eri1).max(), 0, 11)
+
+
+if __name__ == '__main__':
+    print('Full Tests for ao2mo.nrr_outcore')
+    unittest.main()
diff --git a/pyscf/ao2mo/test/test_r_outcore.py b/pyscf/ao2mo/test/test_r_outcore.py
index 8866ac8e18..271968f950 100644
--- a/pyscf/ao2mo/test/test_r_outcore.py
+++ b/pyscf/ao2mo/test/test_r_outcore.py
@@ -113,4 +113,29 @@ def test_ao2mo_r_e2(self):
 if __name__ == '__main__':
     print('Full Tests for ao2mo.r_outcore')
     unittest.main()
-
+#
+#if __name__ == '__main__':
+#    from pyscf import scf
+#    from pyscf import gto
+#    from pyscf.ao2mo import addons
+#    mol = gto.M(
+#        verbose = 0,
+#        atom = [
+#            ["O" , (0. , 0.     , 0.)],
+#            [1   , (0. , -0.757 , 0.587)],
+#            [1   , (0. , 0.757  , 0.587)]],
+#        basis = 'ccpvdz')
+#
+#    mf = scf.RHF(mol)
+#    mf.scf()
+#
+#    eri0 = full(mf._eri, mf.mo_coeff)
+#    mos = (mf.mo_coeff,)*4
+#    print(numpy.allclose(eri0, full(mol, mf.mo_coeff)))
+#    print(numpy.allclose(eri0, general(mf._eri, mos)))
+#    print(numpy.allclose(eri0, general(mol, mos)))
+#    with load(full(mol, mf.mo_coeff, 'h2oeri.h5', dataname='dat1'), 'dat1') as eri1:
+#        print(numpy.allclose(eri0, eri1))
+#    with load(general(mol, mos, 'h2oeri.h5', dataname='dat1'), 'dat1') as eri1:
+#        print(numpy.allclose(eri0, eri1))
+#
diff --git a/pyscf/cc/__init__.py b/pyscf/cc/__init__.py
index 64f3a5c969..15780fdf60 100644
--- a/pyscf/cc/__init__.py
+++ b/pyscf/cc/__init__.py
@@ -76,6 +76,7 @@
 from pyscf.cc import eom_gccsd
 from pyscf.cc import qcisd
 from pyscf.cc import gfccsd
+from pyscf.cc import momgfccsd
 from pyscf import scf
 
 def CCSD(mf, frozen=None, mo_coeff=None, mo_occ=None):
@@ -213,3 +214,5 @@ def _finalize(self):
         return self
     mycc._finalize = _finalize.__get__(mycc, mycc.__class__)
     return mycc
+
+MomGFCCSD = momgfccsd.MomGFCCSD
diff --git a/pyscf/cc/eom_rccsd.py b/pyscf/cc/eom_rccsd.py
index b0cb3f2c1f..8c66c37a88 100644
--- a/pyscf/cc/eom_rccsd.py
+++ b/pyscf/cc/eom_rccsd.py
@@ -528,15 +528,18 @@ def contract_pr2p(r1, r2, a0, a1, b0, b1, cache_vvop_a, cache_vvop_b):
 class EOMIP(EOM):
     def get_init_guess(self, nroots=1, koopmans=True, diag=None):
         size = self.vector_size()
-        dtype = getattr(diag, 'dtype', np.double)
         nroots = min(nroots, size)
         guess = []
         if koopmans:
+            dtype = getattr(diag, 'dtype', np.double)
             for n in range(nroots):
                 g = np.zeros(int(size), dtype)
                 g[self.nocc-n-1] = 1.0
                 guess.append(g)
         else:
+            if diag is None:
+                diag = self.get_diag()
+            dtype = getattr(diag, 'dtype', np.double)
             idx = diag.argsort()[:nroots]
             for i in idx:
                 g = np.zeros(int(size), dtype)
@@ -894,15 +897,18 @@ def contract_pr2p(r1, r2, i0, i1, j0, j1, cache_ovvv_i, cache_ovvv_j):
 class EOMEA(EOM):
     def get_init_guess(self, nroots=1, koopmans=True, diag=None):
         size = self.vector_size()
-        dtype = getattr(diag, 'dtype', np.double)
         nroots = min(nroots, size)
         guess = []
         if koopmans:
+            dtype = getattr(diag, 'dtype', np.double)
             for n in range(nroots):
                 g = np.zeros(size, dtype)
                 g[n] = 1.0
                 guess.append(g)
         else:
+            if diag is None:
+                diag = self.get_diag()
+            dtype = getattr(diag, 'dtype', np.double)
             idx = diag.argsort()[:nroots]
             for i in idx:
                 g = np.zeros(size, dtype)
@@ -1657,6 +1663,8 @@ def eeccsd_diag(eom, imds=None):
 
 class EOMEE(EOM):
     def get_init_guess(self, nroots=1, koopmans=True, diag=None):
+        if diag is None:
+            diag = self.get_diag()
         if koopmans:
             nocc = self.nocc
             nvir = self.nmo - nocc
@@ -1699,9 +1707,12 @@ class EOMEESinglet(EOMEE):
     eomee_ccsd_singlet = eomee_ccsd_singlet
     matvec = eeccsd_matvec_singlet
 
+    def get_diag(self, imds=None):
+        return eeccsd_diag(self, imds=None)[0]
+
     def gen_matvec(self, imds=None, diag=None, **kwargs):
         if imds is None: imds = self.make_imds()
-        if diag is None: diag = self.get_diag(imds)[0]
+        if diag is None: diag = self.get_diag(imds)
         matvec = lambda xs: [self.matvec(x, imds) for x in xs]
         return matvec, diag
 
@@ -1722,9 +1733,12 @@ class EOMEETriplet(EOMEE):
     eomee_ccsd_triplet = eomee_ccsd_triplet
     matvec = eeccsd_matvec_triplet
 
+    def get_diag(self, imds=None):
+        return eeccsd_diag(self, imds=None)[1]
+
     def gen_matvec(self, imds=None, diag=None, **kwargs):
         if imds is None: imds = self.make_imds()
-        if diag is None: diag = self.get_diag(imds)[1]
+        if diag is None: diag = self.get_diag(imds)
         matvec = lambda xs: [self.matvec(x, imds) for x in xs]
         return matvec, diag
 
@@ -1745,9 +1759,12 @@ class EOMEESpinFlip(EOMEE):
     eomsf_ccsd = eomsf_ccsd
     matvec = eeccsd_matvec_sf
 
+    def get_diag(self, imds=None):
+        return eeccsd_diag(self, imds=None)[2]
+
     def gen_matvec(self, imds=None, diag=None, **kwargs):
         if imds is None: imds = self.make_imds()
-        if diag is None: diag = self.get_diag(imds)[2]
+        if diag is None: diag = self.get_diag(imds)
         matvec = lambda xs: [self.matvec(x, imds) for x in xs]
         return matvec, diag
 
diff --git a/pyscf/cc/eom_uccsd.py b/pyscf/cc/eom_uccsd.py
index 46ccde6491..f9385b200e 100644
--- a/pyscf/cc/eom_uccsd.py
+++ b/pyscf/cc/eom_uccsd.py
@@ -328,6 +328,8 @@ def __init__(self, cc):
         self.nmo = cc.get_nmo()
 
     def get_init_guess(self, nroots=1, koopmans=True, diag=None):
+        if diag is None:
+            diag = self.get_diag()
         if koopmans:
             nocca, noccb = self.nocc
             idx = diag[:nocca+noccb].argsort()
@@ -853,6 +855,8 @@ def __init__(self, cc):
         self.nmo = cc.get_nmo()
 
     def get_init_guess(self, nroots=1, koopmans=True, diag=None):
+        if diag is None:
+            diag = self.get_diag()
         if koopmans:
             nocca, noccb = self.nocc
             nmoa, nmob = self.nmo
@@ -1850,6 +1854,8 @@ class EOMEESpinKeep(EOMEE):
     get_diag = eeccsd_diag
 
     def get_init_guess(self, nroots=1, koopmans=True, diag=None):
+        if diag is None:
+            diag = self.get_diag()
         if koopmans:
             nocca, noccb = self.nocc
             nmoa, nmob = self.nmo
@@ -1904,6 +1910,8 @@ class EOMEESpinFlip(EOMEE):
     matvec = eomsf_ccsd_matvec
 
     def get_init_guess(self, nroots=1, koopmans=True, diag=None):
+        if diag is None:
+            diag = self.get_diag()
         if koopmans:
             nocca, noccb = self.nocc
             nmoa, nmob = self.nmo
diff --git a/pyscf/cc/momgfccsd.py b/pyscf/cc/momgfccsd.py
new file mode 100644
index 0000000000..f8b139fc14
--- /dev/null
+++ b/pyscf/cc/momgfccsd.py
@@ -0,0 +1,913 @@
+#!/usr/bin/env python
+# Copyright 2014-2021 The PySCF Developers. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Author: Oliver Backhouse <olbackhouse@gmail.com>
+#
+
+"""
+GF-CCSD solver via moment constraints.
+
+See reference: Backhouse, Booth, arXiv:2206.13198 (2022).
+"""
+
+from collections import defaultdict
+
+import numpy as np
+import scipy.linalg
+
+from pyscf import lib, cc, ao2mo
+from pyscf.lib import logger
+from pyscf.agf2 import mpi_helper
+
+
+def kernel(
+        gfccsd,
+        hole_moments=None,
+        part_moments=None,
+        t1=None,
+        t2=None,
+        l1=None,
+        l2=None,
+        eris=None,
+        imds=None,
+        verbose=None,
+):
+    if gfccsd.verbose >= logger.WARN:
+        gfccsd.check_sanity()
+    gfccsd.dump_flags()
+
+    log = logger.new_logger(gfccsd, verbose)
+
+    if (l1 is None and gfccsd._cc.l1 is None) or (l2 is None and gfccsd._cc.l2 is None):
+        raise ValueError(
+                "Lambda amplitudes must be set for %s. This "
+                "can be done by calling solve_lambda on the "
+                "CC method or by setting l1, l2 attributes. "
+                % gfccsd.__class__.__name__
+        )
+
+    if (hole_moments is None or part_moments is None) and imds is None:
+        ip = hole_moments is None
+        ea = part_moments is None
+        imds = gfccsd.make_imds(eris=eris, ip=ip, ea=ea)
+
+    if hole_moments is None:
+        log.info("Building hole moments:")
+        hole_moments = gfccsd.build_hole_moments(t1=t1, t2=t2, l1=l1, l2=l2, imds=imds)
+    else:
+        log.info("Hole moments passed by argument.")
+
+    if part_moments is None:
+        log.info("Building particle moments:")
+        part_moments = gfccsd.build_part_moments(t1=t1, t2=t2, l1=l1, l2=l2, imds=imds)
+    else:
+        log.info("Particle moments passed by argument.")
+
+    if gfccsd.hermi_moments:
+        hole_moments = 0.5 * (hole_moments + hole_moments.swapaxes(1, 2).conj())
+        part_moments = 0.5 * (part_moments + part_moments.swapaxes(1, 2).conj())
+
+    if gfccsd.hermi_solver:
+        solver = block_lanczos_symm
+        eig = eigh_block_tridiagonal
+    else:
+        solver = block_lanczos_nosymm
+        eig = eig_block_tridiagonal
+
+    log.info("Solving for the hole moments.")
+    blocks = solver(gfccsd, hole_moments)
+    orth = mat_sqrt(hole_moments[0], hermi=gfccsd.hermi_solver)
+    eh, vh = eig(gfccsd, *blocks, orth=orth)
+
+    log.info("Solving for the particle moment.")
+    blocks = solver(gfccsd, part_moments)
+    orth = mat_sqrt(part_moments[0], hermi=gfccsd.hermi_solver)
+    ep, vp = eig(gfccsd, *blocks, orth=orth)
+
+    # Check the moments
+    if gfccsd.niter[0] is not None:
+        for n in range(2*gfccsd.niter[0]+2):
+            a = lib.einsum("xk,yk,k->xy", vh[0], vh[1].conj(), eh**n)
+            a /= np.max(np.abs(a))
+            b = hole_moments[n] / np.max(np.abs(hole_moments[n]))
+            err = np.max(np.abs(a - b))
+            (logger.debug1 if err < 1e-8 else logger.warn)(
+                    gfccsd, "Error in hole moment %d:  %10.6g", n, err)
+    if gfccsd.niter[0] is not None:
+        for n in range(2*gfccsd.niter[1]+2):
+            a = lib.einsum("xk,yk,k->xy", vp[0], vp[1].conj(), ep**n)
+            a /= np.max(np.abs(a))
+            b = part_moments[n] / np.max(np.abs(part_moments[n]))
+            err = np.max(np.abs(a - b))
+            (logger.debug1 if err < 1e-8 else logger.warn)(
+                    gfccsd, "Error in particle moment %d:  %10.6g", n, err)
+
+    mask = np.argsort(eh.real)
+    eh, vh = eh[mask], (vh[0][:, mask], vh[1][:, mask])
+    mask = np.argsort(ep.real)
+    ep, vp = ep[mask], (vp[0][:, mask], vp[1][:, mask])
+
+    return eh, vh, ep, vp
+
+
+def mat_sqrt(m, hermi=False):
+    """Return the square root of a matrix.
+    """
+
+    if hermi:
+        w, v = np.linalg.eigh(m)
+        mask = w >= 0
+        w, v = w[mask], v[:, mask]
+        out = np.dot(v * w[None]**0.5, v.T.conj())
+
+    else:
+        w, v = np.linalg.eig(m)
+        out = np.dot(v * w[None]**(0.5+0j), np.linalg.inv(v))
+
+    return out
+
+
+def mat_isqrt(m, tol=1e-16, hermi=False):
+    """Return the inverse square root of a matrix.
+    """
+
+    if hermi:
+        w, v = np.linalg.eigh(m)
+        mask = w > tol
+        w, v = w[mask], v[:, mask]
+        out = np.dot(v * w[None]**-0.5, v.T.conj())
+
+    else:
+        w, v = np.linalg.eig(m)
+        mask = np.abs(w) >= tol
+        vinv = np.linalg.inv(v)[mask]
+        w, v = w[mask], v[:, mask]
+        out = np.dot(v * w[None]**(-0.5+0j), vinv)
+
+    return out
+
+
+def build_block_tridiagonal(a, b, c=None):
+    """Construct a block tridiagonal matrix from a list of on-diagonal
+    and off-diagonal blocks.
+    """
+
+    z = np.zeros_like(a[0], dtype=a[0].dtype)
+
+    if c is None:
+        c = [x.T.conj() for x in b]
+
+    h = np.block([[
+        a[i] if i == j else
+        b[j] if j == i-1 else
+        c[i] if i == j-1 else z
+        for j in range(len(a))]
+        for i in range(len(a))]
+    )
+
+    return h
+
+
+def eig_block_tridiagonal(gfccsd, a, b, c, orth=None):
+    """Diagonalise a non-Hermitian block-tridiagonal Hamiltonian and
+    transform its eigenvectors appropriately.
+    """
+
+    h_tri = build_block_tridiagonal(a, b, c)
+
+    e, u = np.linalg.eig(h_tri)
+
+    if orth is not None:
+        vl = np.dot(orth, u[:gfccsd.nmo])
+        vr = np.dot(np.linalg.inv(u)[:, :gfccsd.nmo], orth).T.conj()
+    else:
+        vl = u[:gfccsd.nmo]
+        vr = np.linalg.inv(u)[:, :gfccsd.nmo].T.conj()
+
+    return e, (vl, vr)
+
+
+def eigh_block_tridiagonal(gfccsd, a, b, orth=None):
+    """Diagonalise a Hermitian block-tridiagonal Hamiltonian and
+    transform its eigenvectors appropriately.
+    """
+
+    h_tri = build_block_tridiagonal(a, b)
+
+    e, u = np.linalg.eigh(h_tri)
+
+    if orth is not None:
+        v = np.dot(orth, u[:gfccsd.nmo])
+    else:
+        v = u[:gfccsd.nmo]
+
+    return e, (v, v)
+
+
+def _matrix_info(x, hermi=False):
+    norm = np.abs(np.einsum("pq,qp->", x, x))
+    eigvals = np.linalg.eigvals(x)
+    mineig = np.min(np.abs(eigvals))
+    maxeig = np.max(np.abs(eigvals))
+    return norm, mineig, maxeig
+
+
+def block_lanczos_symm(gfccsd, moments, verbose=None):
+    """Hermitian block Lanczos solver, returns a set of poles that
+    best reproduce the inputted moments.
+
+    Args:
+        gfccsd : MomGFCCSD
+            GF-CCSD object
+        moments : ndarray (2*niter+2, n, n)
+            Array of moments with which the resulting poles should
+            be consistent with.
+
+    Kwargs:
+        verbose : int
+            Level of verbosity.
+
+    Returns:
+        a : ndarray (niter+1, n, n)
+            On-diagonal blocks of the block tridiagonal Hamiltonian.
+        b : ndarray (niter, n, n)
+            Off-diagonal blocks of the block tridiagonal Hamiltonian.
+    """
+
+    log = logger.new_logger(gfccsd, verbose)
+    log.debug1("block_lanczos_symm: %d moments", len(moments))
+
+    nmo = gfccsd.nmo
+    niter = (len(moments) - 2) // 2
+    dtype = np.complex128
+
+    a = np.zeros((niter+1, nmo, nmo), dtype=dtype)
+    b = np.zeros((niter, nmo, nmo), dtype=dtype)
+    t = np.zeros((len(moments), nmo, nmo), dtype=dtype)
+
+    v = defaultdict(lambda: np.zeros((nmo, nmo), dtype=dtype))
+    v[0, 0] = np.eye(nmo).astype(dtype)
+
+    orth = mat_isqrt(moments[0], hermi=True)
+    for i in range(len(moments)):
+        t[i] = np.linalg.multi_dot((orth, moments[i], orth))
+
+    a[0] = t[1]
+
+    log.debug1("Raw moments:")
+    log.debug1("  %4s %12s %12s %12s", "N", "norm", "min(|eig|)", "max(|eig|)")
+    for i in range(len(moments)):
+        log.debug1("  %4d %12.6g %12.6g %12.6g", i, *_matrix_info(moments[i], hermi=True))
+
+    log.debug1("Orthogonalised moments:")
+    log.debug1("  %4s %12s %12s %12s", "N", "norm", "min(|eig|)", "max(|eig|)")
+    for i in range(len(moments)):
+        log.debug1("  %4d %12.6g %12.6g %12.6g", i, *_matrix_info(t[i], hermi=True))
+
+    for i in range(niter):
+        log.info("Iteration %d", i)
+
+        b2 = np.zeros((nmo, nmo), dtype=dtype)
+        for j in range(i+2):
+            for l in range(i+1):
+                b2 += np.linalg.multi_dot((v[i, l].T.conj(), t[j+l+1], v[i, j-1]))
+
+        b2 -= np.dot(a[i], a[i])
+        if i:
+            b2 -= np.dot(b[i-1], b[i-1])
+
+        b[i] = mat_sqrt(b2, hermi=True)
+        binv = mat_isqrt(b2, hermi=True)
+
+        for j in range(i+2):
+            r = (
+                    + v[i, j-1]
+                    - np.dot(v[i, j], a[i])
+                    - np.dot(v[i-1, j], b[i-1])
+            )
+            v[i+1, j] = np.dot(r, binv)
+
+        for j in range(i+2):
+            for l in range(i+2):
+                a[i+1] += np.linalg.multi_dot((v[i+1, l].T.conj(), t[j+l+1], v[i+1, j]))
+
+        log.debug1("  %4s %12s %12s %12s", "mat", "norm", "min(|eig|)", "max(|eig|)")
+        log.debug1("  %4s %12.6g %12.6g %12.6g", "B^2", *_matrix_info(b2))
+        log.debug1("  %4s %12.6g %12.6g %12.6g", "B", *_matrix_info(b[i]))
+        log.debug1("  %4s %12.6g %12.6g %12.6g", "B^-1", *_matrix_info(binv))
+        log.debug1("  %4s %12.6g %12.6g %12.6g", "A", *_matrix_info(a[i+1]))
+
+        biorth_error = 0.0
+        for j in range(i+2):
+            x = np.zeros_like(v[0, 0])
+            for k in range(i+2):
+                for l in range(i+2):
+                    x += np.linalg.multi_dot((v[i+1, l].T.conj(), t[k+l], v[j, k]))
+            biorth_error = max(biorth_error, np.max(np.abs(x - np.eye(nmo)*((i+1)==j))))
+        log.info("  Error in biorthogonality:  %12.6g", biorth_error)
+
+    return a, b
+
+
+def block_lanczos_nosymm(gfccsd, moments, verbose=None):
+    """Non-Hermitian block Lanczos solver, returns a set of poles that
+    best reproduce the inputted moments.
+
+    Args:
+        gfccsd : MomGFCCSD
+            GF-CCSD object
+        moments : ndarray (2*niter+2, n, n)
+            Array of moments with which the resulting poles should
+            be consistent with.
+
+    Kwargs:
+        verbose : int
+            Level of verbosity.
+
+    Returns:
+        a : ndarray (niter+1, n, n)
+            On-diagonal blocks of the block tridiagonal Hamiltonian.
+        b : ndarray (niter, n, n)
+            Upper off-diagonal blocks of the block tridiagonal
+            Hamiltonian.
+        c : ndarray (niter, n, n)
+            Lower off-diagonal blocks of the block tridiagonal
+            Hamiltonian.
+    """
+
+    log = logger.new_logger(gfccsd, verbose)
+    log.debug1("block_lanczos_nosymm: %d moments", len(moments))
+
+    nmo = gfccsd.nmo
+    niter = (len(moments) - 2) // 2
+    dtype = np.complex128
+
+    a = np.zeros((niter+1, nmo, nmo), dtype=dtype)
+    b = np.zeros((niter, nmo, nmo), dtype=dtype)
+    c = np.zeros((niter, nmo, nmo), dtype=dtype)
+    t = np.zeros((len(moments), nmo, nmo), dtype=dtype)
+
+    v = defaultdict(lambda: np.zeros((nmo, nmo), dtype=dtype))
+    w = defaultdict(lambda: np.zeros((nmo, nmo), dtype=dtype))
+    v[0, 0] = np.eye(nmo).astype(dtype)
+    w[0, 0] = np.eye(nmo).astype(dtype)
+
+    orth = mat_isqrt(moments[0])
+    for i in range(len(moments)):
+        t[i] = np.linalg.multi_dot((orth, moments[i], orth))
+
+    a[0] = t[1]
+
+    log.debug1("Raw moments:")
+    log.debug1("  %4s %12s %12s %12s", "N", "norm", "min(|eig|)", "max(|eig|)")
+    for i in range(len(moments)):
+        log.debug1("  %4d %12.6g %12.6g %12.6g", i, *_matrix_info(moments[i]))
+
+    log.debug1("Orthogonalised moments:")
+    log.debug1("  %4s %12s %12s %12s", "N", "norm", "min(|eig|)", "max(|eig|)")
+    for i in range(len(moments)):
+        log.debug1("  %4d %12.6g %12.6g %12.6g", i, *_matrix_info(t[i]))
+
+    for i in range(niter):
+        log.info("Iteration %d", i)
+
+        b2 = np.zeros((nmo, nmo), dtype=dtype)
+        c2 = np.zeros((nmo, nmo), dtype=dtype)
+
+        for j in range(i+2):
+            for l in range(i+1):
+                b2 += np.linalg.multi_dot((w[i, l], t[j+l+1], v[i, j-1]))
+                c2 += np.linalg.multi_dot((w[i, j-1], t[j+l+1], v[i, l]))
+
+        b2 -= np.dot(a[i], a[i])
+        c2 -= np.dot(a[i], a[i])
+        if i:
+            b2 -= np.dot(c[i-1], c[i-1])
+            c2 -= np.dot(b[i-1], b[i-1])
+
+        b[i] = mat_sqrt(b2)
+        c[i] = mat_sqrt(c2)
+
+        binv = mat_isqrt(b2)
+        cinv = mat_isqrt(c2)
+
+        for j in range(i+2):
+            r = (
+                    + v[i, j-1]
+                    - np.dot(v[i, j], a[i])
+                    - np.dot(v[i-1, j], b[i-1])
+            )
+            v[i+1, j] = np.dot(r, cinv)
+
+            s = (
+                    + w[i, j-1]
+                    - np.dot(a[i], w[i, j])
+                    - np.dot(c[i-1], w[i-1, j])
+            )
+            w[i+1, j] = np.dot(binv, s)
+
+        for j in range(i+2):
+            for l in range(i+2):
+                a[i+1] += np.linalg.multi_dot((w[i+1, l], t[j+l+1], v[i+1, j]))
+
+        log.debug1("  %4s %12s %12s %12s", "mat", "norm", "min(|eig|)", "max(|eig|)")
+        log.debug1("  %4s %12.6g %12.6g %12.6g", "B^2", *_matrix_info(b2))
+        log.debug1("  %4s %12.6g %12.6g %12.6g", "B", *_matrix_info(b[i]))
+        log.debug1("  %4s %12.6g %12.6g %12.6g", "B^-1", *_matrix_info(binv))
+        log.debug1("  %4s %12.6g %12.6g %12.6g", "C^2", *_matrix_info(c2))
+        log.debug1("  %4s %12.6g %12.6g %12.6g", "C", *_matrix_info(c[i]))
+        log.debug1("  %4s %12.6g %12.6g %12.6g", "C^-1", *_matrix_info(cinv))
+        log.debug1("  %4s %12.6g %12.6g %12.6g", "A", *_matrix_info(a[i+1]))
+
+        biorth_error = 0.0
+        for j in range(i+2):
+            x = np.zeros_like(v[0, 0])
+            y = np.zeros_like(v[0, 0])
+            for k in range(i+2):
+                for l in range(i+2):
+                    x += np.linalg.multi_dot((w[i+1, l], t[k+l], v[j, k]))
+                    y += np.linalg.multi_dot((w[j, l], t[k+l], v[i+1, k]))
+            biorth_error = max(biorth_error, np.max(np.abs(x - np.eye(nmo)*((i+1)==j))))
+            biorth_error = max(biorth_error, np.max(np.abs(y - np.eye(nmo)*((i+1)==j))))
+        log.info("  Error in biorthogonality:  %12.6g", biorth_error)
+
+    return a, b, c
+
+
+def _kd(n, i):
+    v = np.zeros((n,))
+    v[i] = 1.0
+    return v
+
+
+def contract_ket_hole(gfccsd, eom, t1, t2, v, orb):
+    r"""Contract a vector with \bar{a}^\dagger_p |\Psi>.
+    """
+
+    nocc, nvir = t1.shape
+
+    if orb < nocc:
+        return v[orb]
+    else:
+        b1 = t1[:, orb-nocc]
+        b2 = t2[:, :, orb-nocc]
+        b = eom.amplitudes_to_vector(b1, b2)
+        return np.dot(v, b)
+
+
+def build_ket_hole(gfccsd, eom, t1, t2, orb):
+    r"""Build \bar{a}^\dagger_p |\Psi>.
+    """
+
+    nocc, nvir = t1.shape
+
+    if orb < nocc:
+        b1 = np.eye(nocc)[orb]
+        b2 = np.zeros((nocc, nocc, nvir))
+    else:
+        b1 = t1[:, orb-nocc]
+        b2 = t2[:, :, orb-nocc]
+
+    return eom.amplitudes_to_vector(b1, b2)
+
+
+def build_bra_hole(gfccsd, eom, t1, t2, l1, l2, orb):
+    """Get the first- and second-order contributions to the left-hand
+    transformed vector for a given orbital for the hole part of the
+    Green's function.
+    """
+
+    nocc, nvir = t1.shape
+
+    if orb < nocc:
+        e1 = _kd(nocc, orb)
+        e1 -= lib.einsum("ie,e->i", l1, t1[orb])
+        tmp = t2[orb] * 2.0
+        tmp -= t2[orb].swapaxes(1, 2)
+        e1 -= lib.einsum("imef,mef->i", l2, tmp)
+
+        tmp = -lib.einsum("ijea,e->ija", l2, t1[orb])
+        e2 = 2.0 * tmp
+        e2 -= tmp.swapaxes(0, 1)
+        tmp = lib.einsum("ja,i->ija", l1, _kd(nocc, orb))
+        e2 += tmp * 2.0
+        e2 -= tmp.swapaxes(0, 1)
+
+    else:
+        e1 = l1[:, orb-nocc].copy()
+        e2 = l2[:, :, orb-nocc] * 2.0
+        e2 -= l2[:, :, :, orb-nocc]
+
+    return eom.amplitudes_to_vector(e1, e2)
+
+
+def contract_ket_part(gfccsd, eom, t1, t2, v, orb):
+    r"""Contract a vector with \bar{a}_p |\Psi>.
+    """
+
+    nocc, nvir = t1.shape
+
+    if orb < nocc:
+        b1 = t1[orb]
+        b2 = t2[orb]
+        b = eom.amplitudes_to_vector(b1, b2)
+        return np.dot(v, b)
+    else:
+        return -v[orb-nocc]
+
+
+def build_ket_part(gfccsd, eom, t1, t2, orb):
+    r"""Build \bar{a}_p |\Psi>.
+    """
+
+    nocc, nvir = t1.shape
+
+    if orb < nocc:
+        b1 = t1[orb]
+        b2 = t2[orb]
+    else:
+        b1 = -np.eye(nvir)[orb-nocc]
+        b2 = np.zeros((nocc, nvir, nvir))
+
+    return eom.amplitudes_to_vector(b1, b2)
+
+
+def build_bra_part(gfccsd, eom, t1, t2, l1, l2, orb):
+    """Get the first- and second-order contributions to the left-hand
+    transformed vector for a given orbital for the particle part of the
+    Green's function.
+    """
+
+    nocc, nvir = t1.shape
+
+    if orb < nocc:
+        e1 = -l1[orb]
+        e2 = -l2[orb] * 2.0
+        e2 += l2[:, orb]
+
+    else:
+        e1 = _kd(nvir, orb-nocc)
+        e1 -= lib.einsum("mb,m->b", l1, t1[:, orb-nocc])
+        tmp = t2[:, :, :, orb-nocc] * 2.0
+        tmp -= t2[:, :, orb-nocc]
+        e1 -= lib.einsum("kmeb,kme->b", l2, tmp)
+
+        tmp = -lib.einsum("ikba,k->iab", l2, t1[:, orb-nocc])
+        e2 = tmp * 2.0
+        e2 -= tmp.swapaxes(1, 2)
+        tmp = lib.einsum("ib,a->iab", l1, _kd(nvir, orb-nocc))
+        e2 += tmp * 2.0
+        e2 -= tmp.swapaxes(1, 2)
+
+    return eom.amplitudes_to_vector(e1, e2)
+
+
+class MomGFCCSD(lib.StreamObject):
+    """Green's function coupled cluster singles and doubles using the
+    moment-resolved solver.
+
+    Attributes:
+        verbose : int
+            Print level. Default value equals to :class:`Mole.verbose`.
+        niter : tuple of (int, int)
+            Number of block Lanczos iterations for occupied and virtual
+            sectors. If either are `None` then said sector will not be
+            computed.
+        weight_tol : float
+            Threshold for weight in the physical space to consider a
+            pole an ionisation or removal event. Default value is 1e-1.
+        hermi_moments : bool
+            Whether to Hermitise the moments, default value is False.
+        hermi_solver : obol
+            Whether to use the real-valued, symmetric block Lanczos
+            solver, default value is False.
+
+    Results:
+        eh : ndarray
+            Energies of the compressed hole Green's function
+        vh : tuple of ndarray
+            Left- and right-hand transition amplitudes of the compressed
+            hole Green's function
+        ep : ndarray
+            Energies of the compressed particle Green's function
+        vp : tuple of ndarray
+            Left- and right-hand transition amplitudes of the compressed
+            particle Green's function
+    """
+
+    def __init__(self, mycc, niter=(2, 2)):
+        self._cc = mycc
+        self.verbose = mycc.verbose
+        self.stdout = mycc.stdout
+
+        if isinstance(mycc, cc.uccsd.UCCSD):
+            raise NotImplementedError("MomGFCCSD for unrestricted CCSD")
+
+        if isinstance(niter, int):
+            self.niter = (niter, niter)
+        else:
+            self.niter = niter
+        self.weight_tol = 1e-1
+        self.hermi_moments = False
+        self.hermi_solver = False
+        self.eh = None
+        self.ep = None
+        self.vh = None
+        self.vp = None
+        self._t1 = None
+        self._t2 = None
+        self._l1 = None
+        self._l2 = None
+        self.chkfile = self._cc.chkfile
+        self._keys = set(self.__dict__.keys())
+
+    def dump_flags(self, verbose=None):
+        log = logger.new_logger(self, verbose)
+        log.info("")
+        log.info("******** %s ********", self.__class__)
+        log.info("niter = %s", self.niter)
+        log.info("nmo = %s", self.nmo)
+        log.info("nocc = %s", self.nocc)
+        log.info("weight_tol = %s", self.weight_tol)
+        log.info("hermi_moments = %s", self.hermi_moments)
+        log.info("hermi_solver = %s", self.hermi_solver)
+        log.info("chkfile = %s", self.chkfile)
+
+    def _finalize(self):
+        self.ipgfccsd()
+        self.eagfccsd()
+        return self
+
+    def reset(self, mol=None):
+        self._cc.reset(mol)
+        return self
+
+    @property
+    def eomip_method(self):
+        return self._cc.eomip_method()
+
+    @property
+    def eomea_method(self):
+        return self._cc.eomea_method()
+
+    build_bra_hole = build_bra_hole
+    build_bra_part = build_bra_part
+    contract_ket_hole = contract_ket_hole
+    contract_ket_part = contract_ket_part
+
+    def make_imds(self, eris=None, ip=True, ea=True):
+        """Build EOM intermediates.
+        """
+
+        imds = cc.eom_rccsd._IMDS(self._cc, eris=eris)
+
+        if ip:
+            imds.make_ip()
+        if ea:
+            imds.make_ea()
+
+        return imds
+
+    def build_hole_moments(self, t1=None, t2=None, l1=None, l2=None, imds=None, niter=None):
+        """Build moments of the hole (IP-EOM-CCSD) Green's function.
+        """
+
+        if t1 is None:
+            t1 = self._cc.t1
+        if t2 is None:
+            t2 = self._cc.t2
+        if l1 is None:
+            l1 = self._cc.l1
+        if l2 is None:
+            l2 = self._cc.l2
+
+        if niter is None:
+            niter = self.niter[0]
+        nmom = 2 * niter + 2
+        moments = np.zeros((nmom, self.nmo, self.nmo))
+
+        cput0 = (logger.process_clock(), logger.perf_counter())
+
+        eom = self.eomip_method()
+        if imds is None:
+            imds = self.make_imds(ea=False)
+        diag = eom.get_diag(imds)
+
+        for p in mpi_helper.nrange(self.nmo):
+            ket = self.build_bra_hole(eom, t1, t2, l1, l2, p)
+            for n in range(nmom):
+                for q in range(self.nmo):
+                    moments[n, q, p] += self.contract_ket_hole(eom, t1, t2, ket, q)
+                if (n+1) != nmom:
+                    ket = -eom.l_matvec(ket, imds, diag)
+
+        mpi_helper.barrier()
+        moments = mpi_helper.allreduce(moments)
+
+        logger.timer(self, "IP-EOM-CCSD moments", *cput0)
+
+        return moments
+
+    def build_part_moments(self, t1=None, t2=None, l1=None, l2=None, imds=None, niter=None):
+        """Build moments of the particle (EA-EOM-CCSD) Green's function.
+        """
+
+        if t1 is None:
+            t1 = self._cc.t1
+        if t2 is None:
+            t2 = self._cc.t2
+        if l1 is None:
+            l1 = self._cc.l1
+        if l2 is None:
+            l2 = self._cc.l2
+
+        if niter is None:
+            niter = self.niter[1]
+        nmom = 2 * niter + 2
+        moments = np.zeros((nmom, self.nmo, self.nmo))
+
+        cput0 = (logger.process_clock(), logger.perf_counter())
+
+        eom = self.eomea_method()
+        if imds is None:
+            imds = self.make_imds(ip=False)
+        diag = eom.get_diag(imds)
+
+        for p in mpi_helper.nrange(self.nmo):
+            ket = self.build_bra_part(eom, t1, t2, l1, l2, p)
+            for n in range(nmom):
+                for q in range(self.nmo):
+                    moments[n, q, p] -= self.contract_ket_part(eom, t1, t2, ket, q)
+                if (n+1) != nmom:
+                    ket = eom.l_matvec(ket, imds, diag)
+
+        mpi_helper.barrier()
+        moments = mpi_helper.allreduce(moments)
+
+        logger.timer(self, "EA-EOM-CCSD moments", *cput0)
+
+        return moments
+
+    def make_rdm1(self, ao_repr=False, eris=None, imds=None):
+        """Build the first-order reduced density matrix at the CCSD
+        level using the zeroth-order moment of the hole part of the
+        CCSD Green's function.
+        """
+
+        if imds is None:
+            imds = self.make_imds(eris=eris, ea=False)
+
+        dm1 = self.build_hole_moments(imds=imds, niter=0)[0]
+        dm1 = dm1 + dm1.T.conj()
+
+        if ao_repr:
+            mo = self._cc.mo_coeff
+            dm1 = np.linalg.multi_dot((mo, dm1, mo.T.conj()))
+
+        return dm1
+
+    def kernel(self, **kwargs):
+        eh, vh, ep, vp = kernel(self, **kwargs)
+
+        self.eh = eh
+        self.vh = vh
+        self.ep = ep
+        self.vp = vp
+
+        self._finalize()
+
+        return eh, vh, ep, vp
+
+    def dump_chk(self, chkfile=None, key="gfccsd"):
+        if chkfile is None:
+            chkfile = self.chkfile
+
+        lib.chkfile.dump(chkfile, key+"/eh", self.eh)
+        lib.chkfile.dump(chkfile, key+"/vh_left", self.vh[0])
+        lib.chkfile.dump(chkfile, key+"/vh_right", self.vh[1])
+        lib.chkfile.dump(chkfile, key+"/ep", self.ep)
+        lib.chkfile.dump(chkfile, key+"/vp_left", self.vp[0])
+        lib.chkfile.dump(chkfile, key+"/vp_right", self.vp[1])
+        lib.chkfile.dump(chkfile, key+"/niter", np.array(self.niter))
+
+        return self
+
+    def update_from_chk_(self, chkfile=None, key="gfccsd"):
+        if chkfile is None:
+            chkfile = self.chkfile
+
+        self.eh = lib.chkfile.load(chkfile, key+"/eh")
+        self.vh = (
+                lib.chkfile.load(chkfile, key+"/vh_left"),
+                lib.chkfile.load(chkfile, key+"/vh_right"),
+        )
+        self.ep = lib.chkfile.load(chkfile, key+"/ep")
+        self.vp = (
+                lib.chkfile.load(chkfile, key+"/vp_left"),
+                lib.chkfile.load(chkfile, key+"/vp_right"),
+        )
+        self.niter = tuple(lib.chkfile.load(chkfile, key+"/niter"))
+
+    update = update_from_chk = update_from_chk_
+
+    def ipgfccsd(self, nroots=5):
+        """Print and return ionisation potentials.
+        """
+
+        eh, (vh, uh) = self.eh, self.vh
+
+        mask = np.abs(np.sum(vh * uh.conj(), axis=0)) > self.weight_tol
+        mask = np.arange(mask.size)[mask][::-1]
+        e_ip = -eh[mask]
+        v_ip, u_ip = vh[:, mask], uh[:, mask]
+
+        nroots = min(nroots, len(e_ip))
+        logger.note(self, "  %s %s %16s %10s", "", "", "Energy", "Weight")
+        for n in range(nroots):
+            qpwt = np.abs(np.sum(v_ip[:, n] * u_ip[:, n].conj())).real
+            warn = ""
+            if np.abs(e_ip[n].imag) > 1e-8:
+                warn += "(Warning: imag part: %.6g)" % e_ip[n].imag
+            logger.note(self, "  %2s %2d %16.10g %10.6g %s" % ("IP", n, e_ip[n].real, qpwt, warn))
+
+        if nroots == 1:
+            return e_ip[0].real, v_ip[:, 0], u_ip[:, 0]
+        else:
+            return e_ip.real, v_ip, u_ip
+
+    def eagfccsd(self, nroots=5):
+        """Print and return electron affinities.
+        """
+
+        ep, (vp, up) = self.ep, self.vp
+
+        mask = np.abs(np.sum(vp * up.conj(), axis=0)) > self.weight_tol
+        e_ea = ep[mask]
+        v_ea, u_ea = vp[:, mask], up[:, mask]
+
+        nroots = min(nroots, len(e_ea))
+        logger.note(self, "  %s %s %16s %10s", "", "", "Energy", "Weight")
+        for n in range(nroots):
+            qpwt = np.abs(np.sum(v_ea[:, n] * u_ea[:, n].conj())).real
+            warn = ""
+            if np.abs(e_ea[n].imag) > 1e-8:
+                warn += "(Warning: imag part: %.6g)" % e_ea[n].imag
+            logger.note(self, "  %2s %2d %16.10g %10.6g %s" % ("EA", n, e_ea[n].real, qpwt, warn))
+
+        if nroots == 1:
+            return e_ea[0].real, v_ea[:, 0], u_ea[:, 0]
+        else:
+            return e_ea.real, v_ea, u_ea
+
+    @property
+    def nmo(self):
+        return self._cc.nmo
+
+    @property
+    def nocc(self):
+        return self._cc.nocc
+
+
+if __name__ == "__main__":
+    from pyscf import gto, scf
+
+    mol = gto.M(
+            #atom="O 0 0 0; O 0 0 1",
+            atom="N 0 0 0; N 0 0 1",
+            basis="cc-pvdz",
+            verbose=0,
+    )
+    mf = scf.RHF(mol)
+    mf = mf.run()
+    ccsd = cc.CCSD(mf)
+    ccsd = ccsd.run()
+    ccsd.solve_lambda()
+
+    niter = 5
+
+    gfcc = MomGFCCSD(ccsd, (niter, niter))
+    gfcc.kernel()
+
+    ip1, vip1 = ccsd.ipccsd(nroots=8)
+    ip2, vip2, uip2 = gfcc.ipgfccsd(nroots=8)
+
+    ea1, vea1 = ccsd.eaccsd(nroots=8)
+    ea2, vea2, uea2 = gfcc.eagfccsd(nroots=8)
+
+    print("    %12s %12s %12s" % ("EOM", "GF", "Error"))
+    print("IP1 %12.8f %12.8f %12.8f" % (ip1[0],ip2[0],np.abs(ip1[0]-ip2[0])))
+    print("IP2 %12.8f %12.8f %12.8f" % (ip1[1],ip2[1],np.abs(ip1[1]-ip2[1])))
+    print("EA1 %12.8f %12.8f %12.8f" % (ea1[0],ea2[0],np.abs(ea1[0]-ea2[0])))
+    print("EA2 %12.8f %12.8f %12.8f" % (ea1[1],ea2[1],np.abs(ea1[1]-ea2[1])))
diff --git a/pyscf/cc/test/test_gccsd.py b/pyscf/cc/test/test_gccsd.py
index f0b8d1fd54..24e3be681f 100644
--- a/pyscf/cc/test/test_gccsd.py
+++ b/pyscf/cc/test/test_gccsd.py
@@ -181,6 +181,7 @@ def test_update_amps(self):
     def test_rdm_real(self):
         nocc = 6
         nvir = 10
+        mol = gto.M()
         mf = scf.GHF(mol)
         nmo = nocc + nvir
         npair = nmo*(nmo//2+1)//4
diff --git a/pyscf/cc/test/test_momgfccsd.py b/pyscf/cc/test/test_momgfccsd.py
new file mode 100644
index 0000000000..a2608d0bbd
--- /dev/null
+++ b/pyscf/cc/test/test_momgfccsd.py
@@ -0,0 +1,198 @@
+import os
+import unittest
+import numpy as np
+from pyscf import gto, scf, cc, lib
+
+
+class KnownValues(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.mol = gto.Mole()
+        cls.mol.atom = "O 0 0 0; H 0 -0.757 0.587; H 0 0.757 0.587"
+        cls.mol.basis = "6-31g"
+        cls.mol.verbose = 0
+        cls.mol.build()
+
+        cls.mf = scf.RHF(cls.mol)
+        cls.mf.conv_tol_grad = 1e-8
+        cls.mf.kernel()
+
+        cls.mycc = cc.ccsd.CCSD(cls.mf)
+        cls.mycc.conv_tol = 1e-10
+        cls.mycc.kernel()
+        cls.mycc.solve_lambda()
+
+        gfcc = cc.momgfccsd.MomGFCCSD(cls.mycc, niter=(5, 5))
+        imds = gfcc.make_imds()
+        cls.hole_moments = gfcc.build_hole_moments(imds=imds)
+        cls.part_moments = gfcc.build_part_moments(imds=imds)
+
+        cls.ips = {
+                0: 0.4390402520837295,
+                1: 0.43398194103807186,
+                2: 0.43139244825126516,
+                3: 0.42846325587576917,
+                4: 0.4282277692533328,
+                5: 0.42792429922566255,
+                (2, True): 0.43138084146173405,
+                (2, True, True): 0.43138084146173455,
+        }
+        cls.eas = {
+                0: 0.20957238161541483,
+                1: 0.19259609010353557,
+                2: 0.19169190195958974,
+                3: 0.19093540225391029,
+                4: 0.19072953794288366,
+                5: 0.19054512389397538,
+                (2, True): 0.19262006823074979,
+                (2, True, True): 0.19153041329652043,
+        }
+
+    @classmethod
+    def tearDownClass(cls):
+        del cls.mol, cls.mf, cls.mycc, cls.hole_moments, cls.part_moments, cls.ips, cls.eas
+
+    def test_lambda_assertion(self):
+        with lib.temporary_env(self.mycc, l1=None, l2=None):
+            gfcc = cc.momgfccsd.MomGFCCSD(self.mycc, niter=(0, 0))
+            self.assertRaises(ValueError, gfcc.kernel)
+
+    def _test_moments(self, e, v, nmax, ref):
+        m1 = ref[:nmax+1] / np.max(np.abs(ref[:nmax+1]), axis=(1, 2), keepdims=True)
+        m2 = lib.einsum("xk,yk,nk->nxy", v[0], v[1].conj(), e[None]**np.arange(nmax+1)[:, None])
+        m2 /= np.max(np.abs(m2), axis=(1, 2), keepdims=True)
+        self.assertAlmostEqual(np.max(np.abs(m1-m2)), 0.0, 8)
+
+    def _test_niter(self, niter):
+        gfcc = cc.momgfccsd.MomGFCCSD(self.mycc, niter=(niter, niter))
+        eh, vh, ep, vp = gfcc.kernel()
+        self.assertAlmostEqual(gfcc.ipgfccsd(nroots=1)[0], self.ips[niter])
+        self.assertAlmostEqual(gfcc.eagfccsd(nroots=1)[0], self.eas[niter])
+        self._test_moments(eh, vh, 2*niter+1, self.hole_moments)
+        self._test_moments(ep, vp, 2*niter+1, self.part_moments)
+
+    def test_0(self):
+        self._test_niter(0)
+
+    def test_1(self):
+        self._test_niter(1)
+
+    def test_2(self):
+        self._test_niter(2)
+
+    def test_3(self):
+        self._test_niter(3)
+
+    def test_4(self):
+        self._test_niter(4)
+
+    def test_5(self):
+        self._test_niter(5)
+
+    def test_amp_input(self):
+        niter = 2
+        imds = cc.eom_rccsd._IMDS(self.mycc)
+        imds.make_ip()
+        imds.make_ea()
+        t1, t2, l1, l2 = self.mycc.t1, self.mycc.t2, self.mycc.l1, self.mycc.l2
+        with lib.temporary_env(self.mycc, t1=None, t2=None, l1=None, l2=None):
+            gfcc = cc.momgfccsd.MomGFCCSD(self.mycc, niter=(niter, niter))
+            eh, vh, ep, vp = gfcc.kernel(t1=t1, t2=t2, l1=l1, l2=l2, imds=imds)
+            self.assertAlmostEqual(gfcc.ipgfccsd(nroots=1)[0], self.ips[niter])
+            self.assertAlmostEqual(gfcc.eagfccsd(nroots=1)[0], self.eas[niter])
+            self._test_moments(eh, vh, 2*niter+1, self.hole_moments)
+            self._test_moments(ep, vp, 2*niter+1, self.part_moments)
+
+    def test_mom_input(self):
+        niter = 2
+        gfcc = cc.momgfccsd.MomGFCCSD(self.mycc, niter=(niter, niter))
+        hole_moments = self.hole_moments[:2*niter+2]
+        part_moments = self.part_moments[:2*niter+2]
+        eh, vh, ep, vp = gfcc.kernel(hole_moments=hole_moments, part_moments=part_moments)
+        self.assertAlmostEqual(gfcc.ipgfccsd(nroots=1)[0], self.ips[niter])
+        self.assertAlmostEqual(gfcc.eagfccsd(nroots=1)[0], self.eas[niter])
+        self._test_moments(eh, vh, 2*niter+1, self.hole_moments)
+        self._test_moments(ep, vp, 2*niter+1, self.part_moments)
+
+    def test_hermi_moments(self):
+        niter = 2
+        gfcc = cc.momgfccsd.MomGFCCSD(self.mycc, niter=(niter, niter))
+        gfcc.hermi_moments = True
+        hole_moments = self.hole_moments[:2*niter+2]
+        part_moments = self.part_moments[:2*niter+2]
+        eh, vh, ep, vp = gfcc.kernel(hole_moments=hole_moments, part_moments=part_moments)
+        self.assertAlmostEqual(gfcc.ipgfccsd(nroots=1)[0], self.ips[(niter, True)])
+        self.assertAlmostEqual(gfcc.eagfccsd(nroots=1)[0], self.eas[(niter, True)])
+        self._test_moments(eh, vh, 2*niter+1, 0.5*(self.hole_moments+self.hole_moments.swapaxes(1,2).conj()))
+        self._test_moments(ep, vp, 2*niter+1, 0.5*(self.part_moments+self.part_moments.swapaxes(1,2).conj()))
+
+    def test_hermi_moments(self):
+        niter = 2
+        gfcc = cc.momgfccsd.MomGFCCSD(self.mycc, niter=(niter, niter))
+        gfcc.hermi_moments = True
+        gfcc.hermi_solver = True
+        hole_moments = self.hole_moments[:2*niter+2]
+        part_moments = self.part_moments[:2*niter+2]
+        eh, vh, ep, vp = gfcc.kernel(hole_moments=hole_moments, part_moments=part_moments)
+        self.assertAlmostEqual(gfcc.ipgfccsd(nroots=1)[0], self.ips[(niter, True, True)])
+        self.assertAlmostEqual(gfcc.eagfccsd(nroots=1)[0], self.eas[(niter, True, True)])
+        self._test_moments(eh, vh, 2*niter+1, 0.5*(self.hole_moments+self.hole_moments.swapaxes(1,2).conj()))
+        self._test_moments(ep, vp, 2*niter+1, 0.5*(self.part_moments+self.part_moments.swapaxes(1,2).conj()))
+
+    def test_misc(self):
+        niter = 2
+        gfcc = cc.momgfccsd.MomGFCCSD(self.mycc, niter=(niter, niter))
+        gfcc.reset()
+        eh, vh, ep, vp = gfcc.kernel()
+        self.assertAlmostEqual(gfcc.ipgfccsd(nroots=1)[0], self.ips[niter])
+        self.assertAlmostEqual(gfcc.eagfccsd(nroots=1)[0], self.eas[niter])
+        self._test_moments(eh, vh, 2*niter+1, self.hole_moments)
+        self._test_moments(ep, vp, 2*niter+1, self.part_moments)
+        dma = gfcc.make_rdm1()
+        dmb = self.mycc.make_rdm1()
+        self.assertAlmostEqual(np.max(np.abs(dma-dmb)), 0.0, 8)
+        dma = gfcc.make_rdm1(ao_repr=True)
+        dmb = self.mycc.make_rdm1(ao_repr=True)
+        self.assertAlmostEqual(np.max(np.abs(dma-dmb)), 0.0, 8)
+
+    def test_chkfile(self):
+        niter = 1
+        gfcc = cc.momgfccsd.MomGFCCSD(self.mycc, niter=(niter, niter))
+        eh, vh, ep, vp = gfcc.kernel()
+        self.assertAlmostEqual(gfcc.ipgfccsd(nroots=1)[0], self.ips[niter])
+        self.assertAlmostEqual(gfcc.eagfccsd(nroots=1)[0], self.eas[niter])
+        self._test_moments(eh, vh, 2*niter+1, self.hole_moments)
+        self._test_moments(ep, vp, 2*niter+1, self.part_moments)
+        gfcc.dump_chk(chkfile="tmp.chk")
+        gfcc = cc.momgfccsd.MomGFCCSD(self.mycc, niter=(niter, niter))
+        gfcc.update("tmp.chk")
+        self.assertAlmostEqual(gfcc.ipgfccsd(nroots=1)[0], self.ips[niter])
+        self.assertAlmostEqual(gfcc.eagfccsd(nroots=1)[0], self.eas[niter])
+        self._test_moments(eh, vh, 2*niter+1, self.hole_moments)
+        self._test_moments(ep, vp, 2*niter+1, self.part_moments)
+        os.remove("tmp.chk")
+
+    def test_density_fitting(self):
+        mf = scf.RHF(self.mol)
+        mf = mf.density_fit()
+        mf.conv_tol_grad = 1e-8
+        mf.kernel()
+
+        mycc = cc.CCSD(mf)
+        mycc.conv_tol = 1e-10
+        mycc.kernel()
+        mycc.solve_lambda()
+
+        niter = 3
+        gfcc = cc.momgfccsd.MomGFCCSD(self.mycc, niter=(niter, niter))
+        eh, vh, ep, vp = gfcc.kernel()
+        self.assertAlmostEqual(gfcc.ipgfccsd(nroots=1)[0], self.ips[niter])
+        self.assertAlmostEqual(gfcc.eagfccsd(nroots=1)[0], self.eas[niter])
+        self._test_moments(eh, vh, 2*niter+1, self.hole_moments)
+        self._test_moments(ep, vp, 2*niter+1, self.part_moments)
+
+
+
+if __name__ == "__main__":
+    print("Tests for MomGFCCSD")
+    unittest.main()
diff --git a/pyscf/ci/cisd.py b/pyscf/ci/cisd.py
index fcc4603c5b..11017e2563 100644
--- a/pyscf/ci/cisd.py
+++ b/pyscf/ci/cisd.py
@@ -237,7 +237,7 @@ def contract(myci, civec, eris):
     t0  = numpy.einsum('ia,ia->', fov, c1) * 2
     t0 += numpy.einsum('iabj,ijab->', eris.ovvo, c2) * 2
     t0 -= numpy.einsum('iabj,jiab->', eris.ovvo, c2)
-    cinew = numpy.hstack((t0, t1.ravel(), t2.ravel()))
+    cinew = myci.amplitudes_to_cisdvec(t0, t1, t2)
     return cinew
 
 def amplitudes_to_cisdvec(c0, c1, c2):
@@ -851,7 +851,9 @@ class CISD(lib.StreamObject):
     max_cycle = getattr(__config__, 'ci_cisd_CISD_max_cycle', 50)
     max_space = getattr(__config__, 'ci_cisd_CISD_max_space', 12)
     lindep = getattr(__config__, 'ci_cisd_CISD_lindep', 1e-14)
-    level_shift = getattr(__config__, 'ci_cisd_CISD_level_shift', 0)  # in preconditioner
+    # level shift in preconditioner is helpful to avoid singularity and linear
+    # dependence basis in davidson diagonalization solver
+    level_shift = getattr(__config__, 'ci_cisd_CISD_level_shift', 1e-3)
     direct = getattr(__config__, 'ci_cisd_CISD_direct', False)
     async_io = getattr(__config__, 'ci_cisd_CISD_async_io', True)
 
diff --git a/pyscf/ci/test/test_cisd.py b/pyscf/ci/test/test_cisd.py
index ce472da4fe..20f0cac12a 100644
--- a/pyscf/ci/test/test_cisd.py
+++ b/pyscf/ci/test/test_cisd.py
@@ -414,6 +414,25 @@ def test_cisdvec_to_amplitudes_overwritten(self):
         c2[:] = 1
         self.assertAlmostEqual(abs(vec - vec_orig).max(), 0, 15)
 
+    # issue 1362
+    def test_cisd_hubbard(self):
+        mol = gto.M(verbose=0)
+        n, u = 6, 0.0
+        mol.nelectron = n
+        h1 = numpy.zeros((n,n))
+        for i in range(n-1):
+            h1[i,i+1] = h1[i+1,i] = -1.0
+        eri = numpy.zeros((n,n,n,n))
+        for i in range(1):
+            eri[i,i,i,i] = u
+        mf = scf.RHF(mol)
+        mf.get_hcore = lambda *args: h1
+        mf.get_ovlp = lambda *args: numpy.eye(n)
+        mf._eri = ao2mo.restore(8, eri, n)
+        mf.kernel()
+        myci = ci.CISD(mf)
+        ecisd, civec = myci.kernel()
+        self.assertAlmostEqual(ecisd, 0, 9)
 
 def t1_strs_ref(norb, nelec):
     nocc = nelec
diff --git a/pyscf/data/elements.py b/pyscf/data/elements.py
index 1ebede3dfb..4070d0842b 100644
--- a/pyscf/data/elements.py
+++ b/pyscf/data/elements.py
@@ -1102,13 +1102,12 @@ def chemcore(mol, spinorb=False):
         atm_nelec = mol.atom_charge(a)
         atm_z = charge(mol.atom_symbol(a))
         ne_ecp = atm_z - atm_nelec
+        ncore_ecp = ne_ecp // 2
         atm_ncore = chemcore_atm[atm_z]
-        if ne_ecp == 0:
-            core += atm_ncore
-        elif ne_ecp > atm_ncore:
+        if ncore_ecp > atm_ncore:
             core += 0
         else:
-            core += atm_ncore - ne_ecp
+            core += atm_ncore - ncore_ecp
 
     if spinorb:
         core *= 2
diff --git a/pyscf/df/addons.py b/pyscf/df/addons.py
index 42374be21f..b042963b3a 100644
--- a/pyscf/df/addons.py
+++ b/pyscf/df/addons.py
@@ -23,6 +23,7 @@
 from pyscf import gto
 from pyscf import ao2mo
 from pyscf.data import elements
+from pyscf.lib.exceptions import BasisNotFoundError
 from pyscf import __config__
 
 DFBASIS = getattr(__config__, 'df_addons_aug_etb_beta', 'weigend')
@@ -169,10 +170,16 @@ def make_auxbasis(mol, mp2fit=False):
                     auxb = DEFAULT_AUXBASIS[balias][1]
                 else:
                     auxb = DEFAULT_AUXBASIS[balias][0]
-                if auxb is not None and gto.basis.load(auxb, k):
-                    auxbasis[k] = auxb
-                    logger.info(mol, 'Default auxbasis %s is used for %s %s',
-                                auxb, k, _basis[k])
+                if auxb is not None:
+                    try:
+                        # Test if basis auxb for element k is available
+                        gto.basis.load(auxb, k)
+                    except BasisNotFoundError:
+                        pass
+                    else:
+                        auxbasis[k] = auxb
+                        logger.info(mol, 'Default auxbasis %s is used for %s %s',
+                                    auxb, k, _basis[k])
 
     if len(auxbasis) != len(_basis):
         # Some AO basis not found in DEFAULT_AUXBASIS
@@ -218,6 +225,8 @@ def make_auxmol(mol, auxbasis=None):
         _basis = auxbasis
     pmol._basis = pmol.format_basis(_basis)
 
+    # Note: To pass parameters like gauge origin, rsh-omega to auxmol,
+    # mol._env[:PTR_ENV_START] must be copied to auxmol._env
     pmol._atm, pmol._bas, pmol._env = \
             pmol.make_env(mol._atom, pmol._basis, mol._env[:gto.PTR_ENV_START])
     pmol._built = True
diff --git a/pyscf/df/df.py b/pyscf/df/df.py
index 768c3d8804..38f46b969d 100644
--- a/pyscf/df/df.py
+++ b/pyscf/df/df.py
@@ -23,6 +23,7 @@
 
 import copy
 import tempfile
+import contextlib
 import numpy
 import h5py
 from pyscf import lib
@@ -58,8 +59,8 @@ class DF(lib.StreamObject):
         _cderi : str or numpy array
             If _cderi is specified, the DF integral tensor will be read from
             this HDF5 file (or numpy array). When the DF integral tensor is
-            provided from the HDF5 file, it has to be stored under the dataset
-            'j3c'.
+            provided from the HDF5 file, its dataset name should be consistent
+            with DF._dataname, which is 'j3c' by default.
             The DF integral tensor :math:`V_{x,ij}` should be a 2D array in C
             (row-major) convention, where x corresponds to index of auxiliary
             basis, and the composite index ij is the orbital pair index. The
@@ -77,6 +78,7 @@ class DF(lib.StreamObject):
 
     # Store DF tensor in a format compatible to pyscf-1.1 - pyscf-1.6
     _compatible_format = getattr(__config__, 'df_df_DF_compatible_format', False)
+    _dataname = 'j3c'
 
     def __init__(self, mol, auxbasis=None):
         self.mol = mol
@@ -135,16 +137,16 @@ def build(self):
         naux = auxmol.nao_nr()
         nao_pair = nao*(nao+1)//2
 
+        is_custom_storage = isinstance(self._cderi_to_save, str)
         max_memory = self.max_memory - lib.current_memory()[0]
         int3c = mol._add_suffix('int3c2e')
         int2c = mol._add_suffix('int2c2e')
-        if (nao_pair*naux*8/1e6 < .9*max_memory and
-            not isinstance(self._cderi_to_save, str)):
+        if (nao_pair*naux*8/1e6 < .9*max_memory and not is_custom_storage):
             self._cderi = incore.cholesky_eri(mol, int3c=int3c, int2c=int2c,
                                               auxmol=auxmol,
                                               max_memory=max_memory, verbose=log)
         else:
-            if isinstance(self._cderi_to_save, str):
+            if is_custom_storage:
                 cderi = self._cderi_to_save
             else:
                 cderi = self._cderi_to_save.name
@@ -154,14 +156,14 @@ def build(self):
                 log.warn('Value of _cderi is ignored. DF integrals will be '
                          'saved in file %s .', cderi)
 
-            if self._compatible_format or isinstance(self._cderi_to_save, str):
-                outcore.cholesky_eri(mol, cderi, dataname='j3c',
+            if self._compatible_format:
+                outcore.cholesky_eri(mol, cderi, dataname=self._dataname,
                                      int3c=int3c, int2c=int2c, auxmol=auxmol,
                                      max_memory=max_memory, verbose=log)
             else:
                 # Store DF tensor in blocks. This is to reduce the
                 # initiailzation overhead
-                outcore.cholesky_eri_b(mol, cderi, dataname='j3c',
+                outcore.cholesky_eri_b(mol, cderi, dataname=self._dataname,
                                        int3c=int3c, int2c=int2c, auxmol=auxmol,
                                        max_memory=max_memory, verbose=log)
             self._cderi = cderi
@@ -177,8 +179,6 @@ def reset(self, mol=None):
             self.mol = mol
         self.auxmol = None
         self._cderi = None
-        if not isinstance(self._cderi_to_save, str):
-            self._cderi_to_save = tempfile.NamedTemporaryFile(dir=lib.param.TMPDIR)
         self._vjopt = None
         self._rsh_df = {}
         return self
@@ -189,7 +189,7 @@ def loop(self, blksize=None):
         if blksize is None:
             blksize = self.blockdim
 
-        with addons.load(self._cderi, 'j3c') as feri:
+        with addons.load(self._cderi, self._dataname) as feri:
             if isinstance(feri, numpy.ndarray):
                 naoaux = feri.shape[0]
                 for b0, b1 in self.prange(0, naoaux, blksize):
@@ -222,7 +222,7 @@ def get_naoaux(self):
         # object when self._cderi is provided.
         if self._cderi is None:
             self.build()
-        with addons.load(self._cderi, 'j3c') as feri:
+        with addons.load(self._cderi, self._dataname) as feri:
             if isinstance(feri, h5py.Group):
                 return feri['0'].shape[0]
             else:
@@ -235,14 +235,7 @@ def get_jk(self, dm, hermi=1, with_j=True, with_k=True,
             return df_jk.get_jk(self, dm, hermi, with_j, with_k, direct_scf_tol)
 
         # A temporary treatment for RSH-DF integrals
-        key = '%.6f' % omega
-        if key in self._rsh_df:
-            rsh_df = self._rsh_df[key]
-        else:
-            rsh_df = self._rsh_df[key] = copy.copy(self).reset()
-            logger.info(self, 'Create RSH-DF object %s for omega=%s', rsh_df, omega)
-
-        with rsh_df.mol.with_range_coulomb(omega):
+        with self.range_coulomb(omega) as rsh_df:
             return df_jk.get_jk(rsh_df, dm, hermi, with_j, with_k, direct_scf_tol)
 
     def get_eri(self):
@@ -274,6 +267,40 @@ def ao2mo(self, mo_coeffs,
         return mo_eri
     get_mo_eri = ao2mo
 
+    @contextlib.contextmanager
+    def range_coulomb(self, omega):
+        '''Creates a temporary density fitting object for RSH-DF integrals.
+        In this context, only LR or SR integrals for mol and auxmol are computed.
+        '''
+        key = '%.6f' % omega
+        if key in self._rsh_df:
+            rsh_df = self._rsh_df[key]
+        else:
+            rsh_df = self._rsh_df[key] = copy.copy(self).reset()
+            rsh_df._dataname = f'j3c/lr/{key}'
+            logger.info(self, 'Create RSH-DF object %s for omega=%s', rsh_df, omega)
+
+        mol = self.mol
+        auxmol = self.auxmol
+
+        mol_omega = mol.omega
+        mol.omega = omega
+        auxmol_omega = None
+        if auxmol is not None:
+            auxmol_omega = auxmol.omega
+            auxmol.omega = omega
+
+        assert rsh_df.mol.omega == omega
+        if rsh_df.auxmol is not None:
+            assert rsh_df.auxmol.omega == omega
+
+        try:
+            yield rsh_df
+        finally:
+            mol.omega = mol_omega
+            if auxmol_omega is not None:
+                auxmol.omega = auxmol_omega
+
 GDF = DF
 
 
@@ -302,9 +329,9 @@ def loop(self, blksize=None):
             self.build()
         if blksize is None:
             blksize = self.blockdim
-        with addons.load(self._cderi[0], 'j3c') as ferill:
+        with addons.load(self._cderi[0], self._dataname) as ferill:
             naoaux = ferill.shape[0]
-            with addons.load(self._cderi[1], 'j3c') as feriss: # python2.6 not support multiple with
+            with addons.load(self._cderi[1], self._dataname) as feriss: # python2.6 not support multiple with
                 for b0, b1 in self.prange(0, naoaux, blksize):
                     erill = numpy.asarray(ferill[b0:b1], order='C')
                     eriss = numpy.asarray(feriss[b0:b1], order='C')
@@ -316,15 +343,7 @@ def get_jk(self, dm, hermi=1, with_j=True, with_k=True,
         if omega is None:
             return df_jk.r_get_jk(self, dm, hermi, with_j, with_k)
 
-        # A temporary treatment for RSH-DF integrals
-        key = '%.6f' % omega
-        if key in self._rsh_df:
-            rsh_df = self._rsh_df[key]
-        else:
-            rsh_df = self._rsh_df[key] = copy.copy(self).reset()
-            logger.info(self, 'Create RSH-DF object %s for omega=%s', rsh_df, omega)
-
-        with rsh_df.mol.with_range_coulomb(omega):
+        with self.range_coulomb(omega) as rsh_df:
             return df_jk.r_get_jk(rsh_df, dm, hermi, with_j, with_k)
 
     def ao2mo(self, mo_coeffs):
diff --git a/pyscf/df/df_jk.py b/pyscf/df/df_jk.py
index e9bc3f75ac..9abf6abf97 100644
--- a/pyscf/df/df_jk.py
+++ b/pyscf/df/df_jk.py
@@ -118,18 +118,29 @@ def reset(self, mol=None):
         def get_jk(self, mol=None, dm=None, hermi=1, with_j=True, with_k=True,
                    omega=None):
             if dm is None: dm = self.make_rdm1()
-            if self.with_df and self.only_dfj:
-                vj = vk = None
-                if with_j:
-                    vj, vk = self.with_df.get_jk(dm, hermi, True, False,
+            if not self.with_df:
+                return mf_class.get_jk(self, mol, dm, hermi, with_j, with_k, omega)
+
+            with_dfk = with_k and not self.only_dfj
+            if isinstance(self, scf.ghf.GHF):
+                def jkbuild(mol, dm, hermi, with_j, with_k, omega=None):
+                    vj, vk = self.with_df.get_jk(dm.real, hermi, with_j, with_k,
                                                  self.direct_scf_tol, omega)
-                if with_k:
-                    vk = mf_class.get_jk(self, mol, dm, hermi, False, True, omega)[1]
-            elif self.with_df:
-                vj, vk = self.with_df.get_jk(dm, hermi, with_j, with_k,
-                                             self.direct_scf_tol, omega)
+                    if dm.dtype == numpy.complex128:
+                        vjI, vkI = self.with_df.get_jk(dm.imag, hermi, with_j, with_k,
+                                                       self.direct_scf_tol, omega)
+                        if with_j:
+                            vj = vj + vjI * 1j
+                        if with_k:
+                            vk = vk + vkI * 1j
+                    return vj, vk
+                vj, vk = scf.ghf.get_jk(mol, dm, hermi, with_j, with_dfk,
+                                        jkbuild, omega)
             else:
-                vj, vk = mf_class.get_jk(self, mol, dm, hermi, with_j, with_k, omega)
+                vj, vk = self.with_df.get_jk(dm, hermi, with_j, with_dfk,
+                                             self.direct_scf_tol, omega)
+            if with_k and not with_dfk:
+                vk = mf_class.get_jk(self, mol, dm, hermi, False, True, omega)[1]
             return vj, vk
 
         # for pyscf 1.0, 1.1 compatibility
diff --git a/pyscf/df/grad/rhf.py b/pyscf/df/grad/rhf.py
index b4a5cc8149..fa189191e1 100644
--- a/pyscf/df/grad/rhf.py
+++ b/pyscf/df/grad/rhf.py
@@ -38,6 +38,8 @@
 from itertools import product
 from pyscf.ao2mo import _ao2mo
 
+LINEAR_DEP_THRESHOLD = 1e-9
+
 def get_jk(mf_grad, mol=None, dm=None, hermi=0, with_j=True, with_k=True):
     assert (with_j or with_k)
     if not with_k:
@@ -339,6 +341,24 @@ def _decompose_rdm1 (mf_grad, mol, dm):
 
     return orbol, orbor
 
+def _gen_metric_solver(int2c):
+    try:
+        j2c = scipy.linalg.cho_factor(int2c, lower=True)
+        j2c_solver = lambda v: scipy.linalg.cho_solve(j2c, v, overwrite_b=True)
+    except (numpy.linalg.LinAlgError, scipy.linalg.LinAlgError):
+        w, v = scipy.linalg.eigh(int2c)
+        mask = w > LINEAR_DEP_THRESHOLD
+        #logger.debug(mf_grad, 'int2c2e cond = %.4g, drop %d bfns',
+        #             w[-1]/w[0], w.size-numpy.count_nonzero(mask))
+        v1 = v[:,mask]
+        j2c = lib.dot(v1/w[mask], v1.conj().T)
+        def j2c_solver(v):
+            if v.ndim == 2:
+                return lib.dot(j2c, v)
+            else:
+                return j2c.dot(v)
+    return j2c_solver
+
 def _cho_solve_rhojk (mf_grad, mol, auxmol, orbol, orbor):
     ''' Solve
 
@@ -374,7 +394,9 @@ def _cho_solve_rhojk (mf_grad, mol, auxmol, orbol, orbor):
     ao_loc = mol.ao_loc
     nocc = [o.shape[-1] for o in orbor]
 
-    int2c = scipy.linalg.cho_factor(auxmol.intor('int2c2e', aosym='s1'))
+    int2c = auxmol.intor('int2c2e', aosym='s1')
+    solve_j2c = _gen_metric_solver(int2c)
+    int2c = None
     get_int3c_s1 = _int3c_wrapper(mol, auxmol, 'int3c2e', 's1')
     rhoj = numpy.zeros((nset,naux))
     f_rhok = lib.H5TmpFile()
@@ -394,12 +416,12 @@ def _cho_solve_rhojk (mf_grad, mol, auxmol, orbol, orbor):
             t2 = logger.timer_debug1 (mf_grad, 'df grad einsum (P|mn) u_ni N_i = v_Pmi', *t2)
             rhoj[i] += numpy.dot (v, orbol[i][p0:p1].ravel ())
             t2 = logger.timer_debug1 (mf_grad, 'df grad einsum v_Pmi u_mi = rho_P', *t2)
-            v = scipy.linalg.cho_solve(int2c, v)
+            v = solve_j2c(v)
             t2 = logger.timer_debug1 (mf_grad, 'df grad cho_solve (P|Q) D_Qmi = v_Pmi', *t2)
             f_rhok['%s/%s'%(i,istep)] = v.reshape(naux,p1-p0,-1)
             t2 = logger.timer_debug1 (mf_grad, 'df grad cache D_Pmi (m <-> i transpose upon retrieval)', *t2)
         int3c = v = None
-    rhoj = scipy.linalg.cho_solve(int2c, rhoj.T).T
+    rhoj = solve_j2c(rhoj.T).T
     int2c = None
     t1 = logger.timer_debug1 (mf_grad, 'df grad vj and vk AO (P|Q) D_Q = (P|mn) D_mn solve', *t1)
     class get_rhok_class (object):
@@ -425,13 +447,19 @@ def __init__(self, mf):
         self.auxbasis_response = True
         rhf_grad.Gradients.__init__(self, mf)
 
-    get_jk = get_jk
+    def get_jk(self, mol=None, dm=None, hermi=0, with_j=True, with_k=True,
+               omega=None):
+        if omega is None:
+            return get_jk(self, mol, dm, hermi, with_j, with_k)
+
+        with self.base.with_df.range_coulomb(omega):
+            return get_jk(self, mol, dm, hermi, with_j, with_k)
 
-    def get_j(self, mol=None, dm=None, hermi=0):
-        return self.get_jk(mol, dm, with_k=False)[0]
+    def get_j(self, mol=None, dm=None, hermi=0, omega=None):
+        return self.get_jk(mol, dm, with_k=False, omega=omega)[0]
 
-    def get_k(self, mol=None, dm=None, hermi=0):
-        return self.get_jk(mol, dm, with_j=False)[1]
+    def get_k(self, mol=None, dm=None, hermi=0, omega=None):
+        return self.get_jk(mol, dm, with_j=False, omega=omega)[1]
 
     def get_veff(self, mol=None, dm=None):
         vj, vk = self.get_jk(mol, dm)
@@ -449,30 +477,3 @@ def extra_force(self, atom_id, envs):
             return 0
 
 Grad = Gradients
-
-if __name__ == '__main__':
-    mol = gto.Mole()
-    mol.atom = [
-        ['O' , (0. , 0.     , 0.)],
-        [1   , (0. , -0.757 , 0.587)],
-        [1   , (0. , 0.757  , 0.587)] ]
-    mol.basis = '631g'
-    mol.build()
-    mf = scf.RHF(mol).density_fit(auxbasis='ccpvdz-jkfit').run()
-    g = Gradients(mf).set(auxbasis_response=not False).kernel()
-    print(lib.finger(g) - 0.0055166381900824879)
-    g = Gradients(mf).kernel()
-    print(lib.finger(g) - 0.005516638190173352)
-    print(abs(g-scf.RHF(mol).run().Gradients().kernel()).max())
-# -0.0000000000    -0.0000000000    -0.0241140368
-#  0.0000000000     0.0043935801     0.0120570184
-#  0.0000000000    -0.0043935801     0.0120570184
-
-    mfs = mf.as_scanner()
-    e1 = mfs([['O' , (0. , 0.     , 0.001)],
-              [1   , (0. , -0.757 , 0.587)],
-              [1   , (0. , 0.757  , 0.587)] ])
-    e2 = mfs([['O' , (0. , 0.     ,-0.001)],
-              [1   , (0. , -0.757 , 0.587)],
-              [1   , (0. , 0.757  , 0.587)] ])
-    print((e1-e2)/0.002*lib.param.BOHR)
diff --git a/pyscf/df/grad/rks.py b/pyscf/df/grad/rks.py
index 76e9648649..08f24e3035 100644
--- a/pyscf/df/grad/rks.py
+++ b/pyscf/df/grad/rks.py
@@ -38,47 +38,58 @@ def get_veff(ks_grad, mol=None, dm=None):
 
     mf = ks_grad.base
     ni = mf._numint
-    if ks_grad.grids is not None:
-        grids = ks_grad.grids
-    else:
-        grids = mf.grids
-    if grids.coords is None:
-        grids.build(with_non0tab=True)
-
-    if mf.nlc != '':
-        raise NotImplementedError
-    #enabling range-separated hybrids
-    omega, alpha, hyb = ni.rsh_and_hybrid_coeff(mf.xc, spin=mol.spin)
+    grids, nlcgrids = rks_grad._initialize_grids(ks_grad)
 
     mem_now = lib.current_memory()[0]
     max_memory = max(2000, ks_grad.max_memory*.9-mem_now)
     if ks_grad.grid_response:
         exc, vxc = rks_grad.get_vxc_full_response(
-                ni, mol, grids, mf.xc, dm,
+            ni, mol, grids, mf.xc, dm,
+            max_memory=max_memory, verbose=ks_grad.verbose)
+        if mf.nlc or ni.libxc.is_nlc(mf.xc):
+            if ni.libxc.is_nlc(mf.xc):
+                xc = mf.xc
+            else:
+                xc = mf.nlc
+            enlc, vnlc = rks_grad.get_nlc_vxc_full_response(
+                ni, mol, nlcgrids, xc, dm,
                 max_memory=max_memory, verbose=ks_grad.verbose)
+            exc += enlc
+            vxc += vnlc
         logger.debug1(ks_grad, 'sum(grids response) %s', exc.sum(axis=0))
     else:
         exc, vxc = rks_grad.get_vxc(
-                ni, mol, grids, mf.xc, dm,
+            ni, mol, grids, mf.xc, dm,
+            max_memory=max_memory, verbose=ks_grad.verbose)
+        if mf.nlc or ni.libxc.is_nlc(mf.xc):
+            if ni.libxc.is_nlc(mf.xc):
+                xc = mf.xc
+            else:
+                xc = mf.nlc
+            enlc, vnlc = rks_grad.get_nlc_vxc(
+                ni, mol, nlcgrids, xc, dm,
                 max_memory=max_memory, verbose=ks_grad.verbose)
+            vxc += vnlc
     t0 = logger.timer(ks_grad, 'vxc', *t0)
 
-    if abs(hyb) < 1e-10 and abs(alpha) < 1e-10:
+    if not ni.libxc.is_hybrid_xc(mf.xc):
         vj = ks_grad.get_j(mol, dm)
         vxc += vj
         if ks_grad.auxbasis_response:
             e1_aux = vj.aux.sum ((0,1))
     else:
+        omega, alpha, hyb = ni.rsh_and_hybrid_coeff(mf.xc, spin=mol.spin)
         vj, vk = ks_grad.get_jk(mol, dm)
         if ks_grad.auxbasis_response:
-            vk_aux = vk.aux * hyb
+            vk.aux *= hyb
         vk[:] *= hyb # Don't erase the .aux tags!
-        if abs(omega) > 1e-10:  # For range separated Coulomb operator
-            raise NotImplementedError
+        if omega != 0:  # For range separated Coulomb operator
+            # TODO: replaced with vk_sr which is numerically more stable for
+            # inv(int2c2e)
             vk_lr = ks_grad.get_k(mol, dm, omega=omega)
-            vk += vk_lr * (alpha - hyb)
+            vk[:] += vk_lr * (alpha - hyb)
             if ks_grad.auxbasis_response:
-                vk_aux += vk_lr.aux * (alpha - hyb)
+                vk.aux[:] += vk_lr.aux * (alpha - hyb)
         vxc += vj - vk * .5
         if ks_grad.auxbasis_response:
             e1_aux = (vj.aux - vk.aux * .5).sum ((0,1))
@@ -98,58 +109,15 @@ def __init__(self, mf):
         self.auxbasis_response = True
         rks_grad.Gradients.__init__(self, mf)
 
-    get_jk = df_rhf_grad.get_jk
-
-    def get_j(self, mol=None, dm=None, hermi=0):
-        return self.get_jk(mol, dm, with_k=False)[0]
-
-    def get_k(self, mol=None, dm=None, hermi=0):
-        return self.get_jk(mol, dm, with_j=False)[1]
-
+    get_jk = df_rhf_grad.Gradients.get_jk
+    get_j = df_rhf_grad.Gradients.get_j
+    get_k = df_rhf_grad.Gradients.get_k
     get_veff = get_veff
 
     def extra_force(self, atom_id, envs):
+        e1 = rks_grad.Gradients.extra_force(self, atom_id, envs)
         if self.auxbasis_response:
-            e1 = rks_grad.Gradients.extra_force(self, atom_id, envs)
-            return e1 + envs['vhf'].aux[atom_id]
-        else:
-            return 0
+            e1 += envs['vhf'].aux[atom_id]
+        return e1
 
 Grad = Gradients
-
-
-if __name__ == '__main__':
-    from pyscf import gto
-    from pyscf import dft
-
-    mol = gto.Mole()
-    mol.atom = [
-        ['O' , (0. , 0.     , 0.)],
-        [1   , (0. , -0.757 , 0.587)],
-        [1   , (0. ,  0.757 , 0.587)] ]
-    mol.basis = '631g'
-    mol.build()
-    mf = dft.RKS(mol).density_fit(auxbasis='ccpvdz-jkfit')
-    mf.conv_tol = 1e-14
-    e0 = mf.scf()
-    g = Gradients(mf).set(auxbasis_response=False)
-    print(lib.finger(g.kernel()) - -0.04993147565973481)
-    g = Gradients(mf)
-    print(lib.finger(g.kernel()) - -0.04990283616418435)
-# O     0.0000000000    -0.0000000000     0.0210278440
-# H    -0.0000000000     0.0282041778    -0.0105201841
-# H    -0.0000000000    -0.0282041778    -0.0105201841
-    g.grid_response = True
-    print(lib.finger(g.kernel()) - -0.04990623599165457)
-# O     0.0000000000    -0.0000000000     0.0210353722
-# H    -0.0000000000     0.0282046127    -0.0105176861
-# H    -0.0000000000    -0.0282046127    -0.0105176861
-
-    mf.xc = 'b3lypg'
-    e0 = mf.kernel()
-    g = Gradients(mf)
-    print(lib.finger(g.kernel()) - -0.03562514802969775)
-# O     0.0000000000    -0.0000000000     0.0121660845
-# H     0.0000000000     0.0211156739    -0.0060869839
-# H    -0.0000000000    -0.0211156739    -0.0060869839
-
diff --git a/pyscf/df/grad/uhf.py b/pyscf/df/grad/uhf.py
index 23cb68b479..2aa7e86e13 100644
--- a/pyscf/df/grad/uhf.py
+++ b/pyscf/df/grad/uhf.py
@@ -37,13 +37,9 @@ def __init__(self, mf):
         self.auxbasis_response = True
         uhf_grad.Gradients.__init__(self, mf)
 
-    get_jk = df_rhf_grad.get_jk
-
-    def get_j(self, mol=None, dm=None, hermi=0):
-        return self.get_jk(mol, dm, with_k=False)[0]
-
-    def get_k(self, mol=None, dm=None, hermi=0):
-        return self.get_jk(mol, dm, with_j=False)[1]
+    get_jk = df_rhf_grad.Gradients.get_jk
+    get_j = df_rhf_grad.Gradients.get_j
+    get_k = df_rhf_grad.Gradients.get_k
 
     def get_veff(self, mol=None, dm=None):
         vj, vk = self.get_jk(mol, dm)
@@ -62,35 +58,3 @@ def extra_force(self, atom_id, envs):
             return 0
 
 Grad = Gradients
-
-
-if __name__ == '__main__':
-    from pyscf import gto
-    from pyscf import scf
-    mol = gto.Mole()
-    mol.atom = [
-        ['O' , (0. , 0.     , 0.)],
-        [1   , (0. , -0.757 , 0.587)],
-        [1   , (0. , 0.757  , 0.587)] ]
-    mol.basis = '631g'
-    mol.spin = 2
-    mol.build()
-    mf = scf.UHF(mol).density_fit()
-    mf.conv_tol = 1e-14
-    e0 = mf.scf()
-    g = Gradients(mf).set(auxbasis_response=False).kernel()
-    print(lib.finger(g) - -0.19670644982746546)
-    g = Gradients(mf).kernel()
-    print(lib.finger(g) - -0.19660674423263175)
-# O     0.0000000000    -0.0000000000     0.1236878122
-# H    -0.0000000000     0.0970412174    -0.0618439061
-# H     0.0000000000    -0.0970412174    -0.0618439061
-
-    mfs = mf.as_scanner()
-    e1 = mfs([['O' , (0. , 0.     , 0.001)],
-              [1   , (0. , -0.757 , 0.587)],
-              [1   , (0. , 0.757  , 0.587)] ])
-    e2 = mfs([['O' , (0. , 0.     ,-0.001)],
-              [1   , (0. , -0.757 , 0.587)],
-              [1   , (0. , 0.757  , 0.587)] ])
-    print((e1-e2)/0.002*lib.param.BOHR)
diff --git a/pyscf/df/grad/uks.py b/pyscf/df/grad/uks.py
index 34152f8a25..b925de2285 100644
--- a/pyscf/df/grad/uks.py
+++ b/pyscf/df/grad/uks.py
@@ -26,6 +26,7 @@
 import numpy
 from pyscf import lib
 from pyscf.lib import logger
+from pyscf.grad import rks as rks_grad
 from pyscf.grad import uks as uks_grad
 from pyscf.df.grad import rhf as df_rhf_grad
 
@@ -39,17 +40,7 @@ def get_veff(ks_grad, mol=None, dm=None):
 
     mf = ks_grad.base
     ni = mf._numint
-    if ks_grad.grids is not None:
-        grids = ks_grad.grids
-    else:
-        grids = mf.grids
-    if grids.coords is None:
-        grids.build(with_non0tab=True)
-
-    if mf.nlc != '':
-        raise NotImplementedError
-    #enabling range-separated hybrids
-    omega, alpha, hyb = ni.rsh_and_hybrid_coeff(mf.xc, spin=mol.spin)
+    grids, nlcgrids = rks_grad._initialize_grids(ks_grad)
 
     mem_now = lib.current_memory()[0]
     max_memory = max(2000, ks_grad.max_memory*.9-mem_now)
@@ -57,33 +48,52 @@ def get_veff(ks_grad, mol=None, dm=None):
         exc, vxc = uks_grad.get_vxc_full_response(
                 ni, mol, grids, mf.xc, dm,
                 max_memory=max_memory, verbose=ks_grad.verbose)
+        if mf.nlc or ni.libxc.is_nlc(mf.xc):
+            if ni.libxc.is_nlc(mf.xc):
+                xc = mf.xc
+            else:
+                xc = mf.nlc
+            enlc, vnlc = rks_grad.get_nlc_vxc_full_response(
+                ni, mol, nlcgrids, xc, dm[0]+dm[1],
+                max_memory=max_memory, verbose=ks_grad.verbose)
+            exc += enlc
+            vxc += vnlc
         logger.debug1(ks_grad, 'sum(grids response) %s', exc.sum(axis=0))
     else:
         exc, vxc = uks_grad.get_vxc(
                 ni, mol, grids, mf.xc, dm,
                 max_memory=max_memory, verbose=ks_grad.verbose)
+        if mf.nlc or ni.libxc.is_nlc(mf.xc):
+            if ni.libxc.is_nlc(mf.xc):
+                xc = mf.xc
+            else:
+                xc = mf.nlc
+            enlc, vnlc = rks_grad.get_nlc_vxc(
+                ni, mol, nlcgrids, xc, dm[0]+dm[1],
+                max_memory=max_memory, verbose=ks_grad.verbose)
+            vxc += vnlc
     t0 = logger.timer(ks_grad, 'vxc', *t0)
 
-    if abs(hyb) < 1e-10:
+    if not ni.libxc.is_hybrid_xc(mf.xc):
         vj = ks_grad.get_j(mol, dm)
         vxc += vj[0] + vj[1]
         if ks_grad.auxbasis_response:
             e1_aux = vj.aux.sum ((0,1))
     else:
+        omega, alpha, hyb = ni.rsh_and_hybrid_coeff(mf.xc, spin=mol.spin)
         vj, vk = ks_grad.get_jk(mol, dm)
         if ks_grad.auxbasis_response:
-            vk_aux = vk.aux * hyb
-        vk *= hyb
-        if abs(omega) > 1e-10:  # For range separated Coulomb operator
-            raise NotImplementedError
+            vk.aux = vk.aux * hyb
+        vk[:] *= hyb # inplace * for vk[:] to keep the .aux tag
+        if omega != 0:  # For range separated Coulomb operator
             vk_lr = ks_grad.get_k(mol, dm, omega=omega)
-            vk += vk_lr * (alpha - hyb)
+            vk[:] += vk_lr * (alpha - hyb)
             if ks_grad.auxbasis_response:
-                vk_aux += vk_lr.aux * (alpha - hyb)
+                vk.aux[:] += vk_lr.aux * (alpha - hyb)
         vxc += vj[0] + vj[1] - vk
         if ks_grad.auxbasis_response:
             e1_aux = vj.aux.sum ((0,1))
-            e1_aux -= numpy.trace (vk_aux, axis1=0, axis2=1)
+            e1_aux -= numpy.trace (vk.aux, axis1=0, axis2=1)
 
     if ks_grad.auxbasis_response:
         logger.debug1(ks_grad, 'sum(auxbasis response) %s', e1_aux.sum(axis=0))
@@ -100,60 +110,15 @@ def __init__(self, mf):
         self.auxbasis_response = True
         uks_grad.Gradients.__init__(self, mf)
 
-    get_jk = df_rhf_grad.get_jk
-
-    def get_j(self, mol=None, dm=None, hermi=0):
-        return self.get_jk(mol, dm, with_k=False)[0]
-
-    def get_k(self, mol=None, dm=None, hermi=0):
-        return self.get_jk(mol, dm, with_j=False)[1]
-
+    get_jk = df_rhf_grad.Gradients.get_jk
+    get_j = df_rhf_grad.Gradients.get_j
+    get_k = df_rhf_grad.Gradients.get_k
     get_veff = get_veff
 
     def extra_force(self, atom_id, envs):
+        e1 = uks_grad.Gradients.extra_force(self, atom_id, envs)
         if self.auxbasis_response:
-            e1 = uks_grad.Gradients.extra_force(self, atom_id, envs)
-            return e1 + envs['vhf'].aux[atom_id]
-        else:
-            return 0
+            e1 += envs['vhf'].aux[atom_id]
+        return e1
 
 Grad = Gradients
-
-
-if __name__ == '__main__':
-    from pyscf import gto
-    from pyscf import dft
-
-    mol = gto.Mole()
-    mol.atom = [
-        ['O' , (0. , 0.     , 0.)],
-        [1   , (0. , -0.757 , 0.587)],
-        [1   , (0. ,  0.757 , 0.587)] ]
-    mol.basis = '631g'
-    mol.charge = 1
-    mol.spin = 1
-    mol.build()
-    mf = dft.UKS(mol).density_fit()
-    mf.conv_tol = 1e-12
-    e0 = mf.scf()
-    g = Gradients(mf).set(auxbasis_response=False)
-    print(lib.finger(g.kernel()) - -0.12092643506961044)
-    g = Gradients(mf)
-    print(lib.finger(g.kernel()) - -0.12092884149543644)
-# O    -0.0000000000     0.0000000000     0.0533109212
-# H    -0.0000000000     0.0675360271    -0.0266615265
-# H     0.0000000000    -0.0675360271    -0.0266615265
-    g.grid_response = True
-# O    -0.0000000000     0.0000000000     0.0533189584
-# H    -0.0000000000     0.0675362403    -0.0266594792
-# H     0.0000000000    -0.0675362403    -0.0266594792
-    print(lib.finger(g.kernel()) - -0.12093220332146028)
-
-    mf.xc = 'b3lypg'
-    e0 = mf.kernel()
-    g = Gradients(mf)
-    print(lib.finger(g.kernel()) - -0.1020433598546214)
-# O    -0.0000000000    -0.0000000000     0.0397385108
-# H    -0.0000000000     0.0587977564    -0.0198734952
-# H     0.0000000000    -0.0587977564    -0.0198734952
-
diff --git a/pyscf/df/hessian/rhf.py b/pyscf/df/hessian/rhf.py
index 1d6133e61d..788d10c9c2 100644
--- a/pyscf/df/hessian/rhf.py
+++ b/pyscf/df/hessian/rhf.py
@@ -40,8 +40,8 @@
 from pyscf.lib import logger
 from pyscf import ao2mo
 from pyscf.hessian import rhf as rhf_hess
-from pyscf.df.grad.rhf import _int3c_wrapper
-
+from pyscf.df.grad.rhf import (_int3c_wrapper, _gen_metric_solver,
+                               LINEAR_DEP_THRESHOLD)
 
 def partial_hess_elec(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
                       atmlst=None, max_memory=4000, verbose=None):
@@ -90,23 +90,21 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
     #    (11|0)(0|00)
     #    (10|0)(0|10)
     int2c = auxmol.intor('int2c2e', aosym='s1')
-    int2c_low = scipy.linalg.cho_factor(int2c, lower=True)
+    solve_j2c = _gen_metric_solver(int2c)
     int2c_ip1 = auxmol.intor('int2c2e_ip1', aosym='s1')
 
     rhoj0_P = 0
-    if with_k:
-        if hessobj.max_memory*.8e6/8 < naux*nocc*(nocc+nao):
-            raise RuntimeError('Memory not enough. You need to increase mol.max_memory')
-        rhok0_Pl_ = np.empty((naux,nao,nocc))
+    if hessobj.max_memory*.8e6/8 < naux*nocc*(nocc+nao):
+        raise RuntimeError('Memory not enough. You need to increase mol.max_memory')
+    rhok0_Pl_ = np.empty((naux,nao,nocc))
     for i, (shl0, shl1, p0, p1) in enumerate(aoslices):
         int3c = get_int3c((shl0, shl1, 0, nbas, 0, auxmol.nbas))
         rhoj0_P += np.einsum('klp,kl->p', int3c, dm0[p0:p1])
-        if with_k:
-            tmp = lib.einsum('ijp,jk->pik', int3c, mocc_2)
-            tmp = scipy.linalg.cho_solve(int2c_low, tmp.reshape(naux,-1), overwrite_b=True)
-            rhok0_Pl_[:,p0:p1] = tmp.reshape(naux,p1-p0,nocc)
+        tmp = lib.einsum('ijp,jk->pik', int3c, mocc_2)
+        tmp = solve_j2c(tmp.reshape(naux,-1))
+        rhok0_Pl_[:,p0:p1] = tmp.reshape(naux,p1-p0,nocc)
         int3c = tmp = None
-    rhoj0_P = scipy.linalg.cho_solve(int2c_low, rhoj0_P)
+    rhoj0_P = solve_j2c(rhoj0_P)
 
     get_int3c_ipip1 = _int3c_wrapper(mol, auxmol, 'int3c2e_ipip1', 's1')
     vj1_diag = 0
@@ -132,8 +130,7 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
         shl0, shl1, p0, p1 = aoslices[ia]
         shls_slice = (shl0, shl1, 0, nbas, 0, auxmol.nbas)
         int3c_ip1 = get_int3c_ip1(shls_slice)
-        tmp_ip1 = scipy.linalg.cho_solve(int2c_low, int3c_ip1.reshape(-1,naux).T,
-                                         overwrite_b=True).reshape(naux,3,p1-p0,nao)
+        tmp_ip1 = solve_j2c(int3c_ip1.reshape(-1,naux).T).reshape(naux,3,p1-p0,nao)
         rhoj1[i0] = np.einsum('pxij,ji->px', tmp_ip1, dm0[:,p0:p1])
         wj1[i0] = np.einsum('xijp,ji->px', int3c_ip1, dm0[:,p0:p1])
         rho_ip1[p0:p1] = tmp_ip1.transpose(2,3,0,1)
@@ -178,11 +175,18 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
         rhok0_P__ = lib.einsum('plj,li->pij', rhok0_Pl_, mocc_2)
         rho2c_0 = lib.einsum('pij,qji->pq', rhok0_P__, rhok0_P__)
 
-        int2c_inv = np.linalg.inv(int2c)
+        try:
+            int2c_inv = np.linalg.inv(int2c)
+        except scipy.linalg.LinAlgError:
+            w, v = scipy.linalg.eigh(int2c)
+            mask = w > LINEAR_DEP_THRESHOLD
+            v1 = v[:,mask]
+            int2c_inv = lib.dot(v1/w[mask], v1.conj().T)
+            v1 = None
         int2c_ipip1 = auxmol.intor('int2c2e_ipip1', aosym='s1')
         int2c_ip_ip  = lib.einsum('xpq,qr,ysr->xyps', int2c_ip1, int2c_inv, int2c_ip1)
         int2c_ip_ip -= auxmol.intor('int2c2e_ip1ip2', aosym='s1').reshape(3,3,naux,naux)
-    int2c = int2c_low = None
+    int2c = solve_j2c = None
 
     get_int3c_ipvip1 = _int3c_wrapper(mol, auxmol, 'int3c2e_ipvip1', 's1')
     get_int3c_ip1ip2 = _int3c_wrapper(mol, auxmol, 'int3c2e_ip1ip2', 's1')
@@ -401,21 +405,22 @@ def _gen_jk(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None,
     rho0_Pij = ftmp.create_group('rho0_Pij')
     wj_ip1_pij = ftmp.create_group('wj_ip1_pij')
     int2c = auxmol.intor('int2c2e', aosym='s1')
-    int2c_low = scipy.linalg.cho_factor(int2c, lower=True)
+    solve_j2c = _gen_metric_solver(int2c)
+    int2c = None
     int2c_ip1 = auxmol.intor('int2c2e_ip1', aosym='s1')
     rhoj0_P = 0
     if with_k:
         rhok0_Pl_ = np.empty((naux,nao,nocc))
     for i, (shl0, shl1, p0, p1) in enumerate(aoslices):
         int3c = get_int3c((shl0, shl1, 0, nbas, 0, auxmol.nbas))
-        coef3c = scipy.linalg.cho_solve(int2c_low, int3c.reshape(-1,naux).T, overwrite_b=True)
+        coef3c = solve_j2c(int3c.reshape(-1,naux).T)
         rho0_Pij['%.4d'%i] = coef3c = coef3c.reshape(naux,p1-p0,nao)
         rhoj0_P += np.einsum('pkl,kl->p', coef3c, dm0[p0:p1])
         if with_k:
             rhok0_Pl_[:,p0:p1] = lib.einsum('pij,jk->pik', coef3c, mocc_2)
         if hessobj.auxbasis_response:
             wj_ip1_pij['%.4d'%i] = lib.einsum('xqp,pij->qixj', int2c_ip1, coef3c)
-    int3c = coef3c = int2c_low = None
+    int3c = coef3c = None
 
     get_int3c_ip1 = _int3c_wrapper(mol, auxmol, 'int3c2e_ip1', 's1')
     get_int3c_ip2 = _int3c_wrapper(mol, auxmol, 'int3c2e_ip2', 's1')
@@ -430,25 +435,27 @@ def _gen_jk(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None,
         for i, (shl0, shl1, q0, q1) in enumerate(aoslices):
             wj1 = np.einsum('xijp,ji->xp', int3c_ip1[:,q0:q1], dm0[:,q0:q1])
             vj1_buf[i] += np.einsum('xp,pij->xij', wj1, coef3c)
-        rhok0_PlJ = lib.einsum('plj,Jj->plJ', rhok0_Pl_[p0:p1], mocc_2)
-        vk1_buf += lib.einsum('xijp,plj->xil', int3c_ip1, rhok0_PlJ)
+        if with_k:
+            rhok0_PlJ = lib.einsum('plj,Jj->plJ', rhok0_Pl_[p0:p1], mocc_2)
+            vk1_buf += lib.einsum('xijp,plj->xil', int3c_ip1, rhok0_PlJ)
         int3c_ip1 = None
     vj1_buf = ftmp['vj1_buf'] = vj1_buf
 
+    vk1 = np.zeros((3,nao,nao))
     for i0, ia in enumerate(atmlst):
         shl0, shl1, p0, p1 = aoslices[ia]
         shls_slice = (shl0, shl1, 0, nbas, 0, auxmol.nbas)
         int3c_ip1 = get_int3c_ip1(shls_slice)
         vj1 = -np.asarray(vj1_buf[ia])
-        rhok0_PlJ = lib.einsum('plj,Jj->plJ', rhok0_Pl_, mocc_2[p0:p1])
-        vk1 = -lib.einsum('xijp,pki->xkj', int3c_ip1, rhok0_PlJ)
         vj1[:,p0:p1] -= np.einsum('xijp,p->xij', int3c_ip1, rhoj0_P)
-        vk1[:,p0:p1] -= vk1_buf[:,p0:p1]
+        if with_k:
+            rhok0_PlJ = lib.einsum('plj,Jj->plJ', rhok0_Pl_, mocc_2[p0:p1])
+            vk1 = -lib.einsum('xijp,pki->xkj', int3c_ip1, rhok0_PlJ)
+            vk1[:,p0:p1] -= vk1_buf[:,p0:p1]
 
         if hessobj.auxbasis_response:
             shl0, shl1, q0, q1 = auxslices[ia]
             shls_slice = (0, nbas, 0, nbas, shl0, shl1)
-            rhok0_PlJ = lib.einsum('plj,Jj->plJ', rhok0_Pl_[q0:q1], mocc_2)
             int3c_ip2 = get_int3c_ip2(shls_slice)
             rhoj1 = np.einsum('xijp,ji->xp', int3c_ip2, dm0)
             coef3c = _load_dim0(rho0_Pij, q0, q1)
@@ -457,12 +464,15 @@ def _gen_jk(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None,
             vj1 += .5 * np.einsum('xijp,p->xij', int3c_ip2, -rhoj0_P[q0:q1])
             vj1 -= .5 * lib.einsum('xpq,q,pij->xij', int2c_ip1[:,q0:q1], -rhoj0_P, coef3c)
             vj1 -= .5 * lib.einsum('pixj,p->xij', pij, -rhoj0_P[q0:q1])
-            vk1 -= lib.einsum('plj,xijp->xil', rhok0_PlJ, int3c_ip2)
-            vk1 += lib.einsum('pjxi,plj->xil', pij, rhok0_PlJ)
+            if with_k:
+                rhok0_PlJ = lib.einsum('plj,Jj->plJ', rhok0_Pl_[q0:q1], mocc_2)
+                vk1 -= lib.einsum('plj,xijp->xil', rhok0_PlJ, int3c_ip2)
+                vk1 += lib.einsum('pjxi,plj->xil', pij, rhok0_PlJ)
         rhok0_PlJ = pij = coef3c = int3c_ip1 = None
 
         vj1 = vj1 + vj1.transpose(0,2,1)
-        vk1 = vk1 + vk1.transpose(0,2,1)
+        if with_k:
+            vk1 = vk1 + vk1.transpose(0,2,1)
         h1 = hcore_deriv(ia)
         yield ia, h1, vj1, vk1
 
diff --git a/pyscf/df/hessian/rks.py b/pyscf/df/hessian/rks.py
index a6c7f7c290..e391ebfc6d 100644
--- a/pyscf/df/hessian/rks.py
+++ b/pyscf/df/hessian/rks.py
@@ -41,6 +41,10 @@ def partial_hess_elec(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
 
     mol = hessobj.mol
     mf = hessobj.base
+    ni = mf._numint
+    if mf.nlc or ni.libxc.is_nlc(mf.xc):
+        raise NotImplementedError('RKS Hessian for NLC functional')
+
     if mo_energy is None: mo_energy = mf.mo_energy
     if mo_occ is None:    mo_occ = mf.mo_occ
     if mo_coeff is None:  mo_coeff = mf.mo_coeff
@@ -50,18 +54,17 @@ def partial_hess_elec(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
     mocc = mo_coeff[:,mo_occ>0]
     dm0 = numpy.dot(mocc, mocc.T) * 2
 
-    if mf.nlc != '':
-        raise NotImplementedError
-    #enabling range-separated hybrids
-    omega, alpha, beta = mf._numint.rsh_coeff(mf.xc)
-    if abs(omega) > 1e-10:
-        raise NotImplementedError
-    else:
-        hyb = mf._numint.hybrid_coeff(mf.xc, spin=mol.spin)
+    omega, alpha, hyb = ni.rsh_and_hybrid_coeff(mf.xc, spin=mol.spin)
+    hybrid = ni.libxc.is_hybrid_xc(mf.xc)
     de2, ej, ek = df_rhf_hess._partial_hess_ejk(hessobj, mo_energy, mo_coeff, mo_occ,
                                                 atmlst, max_memory, verbose,
-                                                abs(hyb) > 1e-10)
+                                                with_k=hybrid)
     de2 += ej - hyb * ek  # (A,B,dR_A,dR_B)
+    if hybrid and omega != 0:
+        with hessobj.base.with_df.range_coulomb(omega):
+            ek_lr = df_rhf_hess._partial_hess_ejk(
+                hessobj, mo_energy, mo_coeff, mo_occ, atmlst, max_memory, verbose)[2]
+        de2 -= ek_lr * (alpha - hyb)
 
     mem_now = lib.current_memory()[0]
     max_memory = max(2000, mf.max_memory*.9-mem_now)
@@ -90,16 +93,22 @@ def make_h1(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, verbose=None):
     ni = mf._numint
     ni.libxc.test_deriv_order(mf.xc, 2, raise_error=True)
     omega, alpha, hyb = ni.rsh_and_hybrid_coeff(mf.xc, spin=mol.spin)
+    hybrid = ni.libxc.is_hybrid_xc(mf.xc)
 
     mem_now = lib.current_memory()[0]
     max_memory = max(2000, mf.max_memory*.9-mem_now)
     h1ao = rks_hess._get_vxc_deriv1(hessobj, mo_coeff, mo_occ, max_memory)
-    for ia, h1, vj1, vk1 in df_rhf_hess._gen_jk(hessobj, mo_coeff, mo_occ, chkfile,
-                                                atmlst, verbose, abs(hyb) > 1e-10):
-        if abs(hyb) > 1e-10:
-            h1ao[ia] += h1 + vj1 - hyb*.5 * vk1
-        else:
-            h1ao[ia] += h1 + vj1
+    for ia, h1, vj1, vk1 in df_rhf_hess._gen_jk(
+            hessobj, mo_coeff, mo_occ, chkfile, atmlst, verbose, with_k=hybrid):
+        h1ao[ia] += h1 + vj1
+        if hybrid:
+            h1ao[ia] -= .5 * hyb * vk1
+
+    if hybrid and omega != 0:
+        with hessobj.base.with_df.range_coulomb(omega):
+            for ia, h1, vj1, vk1 in df_rhf_hess._gen_jk(
+                    hessobj, mo_coeff, mo_occ, chkfile, atmlst, verbose):
+                h1ao[ia] -= .5 * (alpha - hyb) * vk1
 
     if chkfile is None:
         return h1ao
diff --git a/pyscf/df/hessian/uhf.py b/pyscf/df/hessian/uhf.py
index 0e45c26dd1..7c0ca4b260 100644
--- a/pyscf/df/hessian/uhf.py
+++ b/pyscf/df/hessian/uhf.py
@@ -35,7 +35,9 @@
 from pyscf import ao2mo
 from pyscf.hessian import rhf as rhf_hess
 from pyscf.hessian import uhf as uhf_hess
-from pyscf.df.hessian.rhf import _int3c_wrapper, _load_dim0
+from pyscf.df.hessian.rhf import _load_dim0
+from pyscf.df.grad.rhf import (_int3c_wrapper, _gen_metric_solver,
+                               LINEAR_DEP_THRESHOLD)
 
 
 def partial_hess_elec(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
@@ -89,7 +91,7 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
     #    (11|0)(0|00)
     #    (10|0)(0|10)
     int2c = auxmol.intor('int2c2e', aosym='s1')
-    int2c_low = scipy.linalg.cho_factor(int2c, lower=True)
+    solve_j2c = _gen_metric_solver(int2c)
     int2c_ip1 = auxmol.intor('int2c2e_ip1', aosym='s1')
 
     rhoj0_P = 0
@@ -103,13 +105,13 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
         rhoj0_P += np.einsum('klp,kl->p', int3c, dm0[p0:p1])
         if with_k:
             tmp = lib.einsum('ijp,jk->pik', int3c, mocca)
-            tmp = scipy.linalg.cho_solve(int2c_low, tmp.reshape(naux,-1), overwrite_b=True)
+            tmp = solve_j2c(tmp.reshape(naux,-1))
             rhok0a_Pl_[:,p0:p1] = tmp.reshape(naux,p1-p0,nocca)
             tmp = lib.einsum('ijp,jk->pik', int3c, moccb)
-            tmp = scipy.linalg.cho_solve(int2c_low, tmp.reshape(naux,-1), overwrite_b=True)
+            tmp = solve_j2c(tmp.reshape(naux,-1))
             rhok0b_Pl_[:,p0:p1] = tmp.reshape(naux,p1-p0,noccb)
         int3c = tmp = None
-    rhoj0_P = scipy.linalg.cho_solve(int2c_low, rhoj0_P)
+    rhoj0_P = solve_j2c(rhoj0_P)
 
     get_int3c_ipip1 = _int3c_wrapper(mol, auxmol, 'int3c2e_ipip1', 's1')
     vj1_diag = 0
@@ -141,8 +143,7 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
         shl0, shl1, p0, p1 = aoslices[ia]
         shls_slice = (shl0, shl1, 0, nbas, 0, auxmol.nbas)
         int3c_ip1 = get_int3c_ip1(shls_slice)
-        tmp_ip1 = scipy.linalg.cho_solve(int2c_low, int3c_ip1.reshape(-1,naux).T,
-                                         overwrite_b=True).reshape(naux,3,p1-p0,nao)
+        tmp_ip1 = solve_j2c(int3c_ip1.reshape(-1,naux).T).reshape(naux,3,p1-p0,nao)
         rhoj1[i0] = np.einsum('pxij,ji->px', tmp_ip1, dm0[:,p0:p1])
         wj1[i0] = np.einsum('xijp,ji->px', int3c_ip1, dm0[:,p0:p1])
         rho_ip1[p0:p1] = tmp_ip1.transpose(2,3,0,1)
@@ -197,11 +198,18 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
         rho2c_0  = lib.einsum('pij,qij->pq', rhok0a_P__, rhok0a_P__)
         rho2c_0 += lib.einsum('pij,qij->pq', rhok0b_P__, rhok0b_P__)
 
-        int2c_inv = np.linalg.inv(int2c)
+        try:
+            int2c_inv = np.linalg.inv(int2c)
+        except scipy.linalg.LinAlgError:
+            w, v = scipy.linalg.eigh(int2c)
+            mask = w > LINEAR_DEP_THRESHOLD
+            v1 = v[:,mask]
+            int2c_inv = lib.dot(v1/w[mask], v1.conj().T)
+            v1 = None
         int2c_ipip1 = auxmol.intor('int2c2e_ipip1', aosym='s1')
         int2c_ip_ip  = lib.einsum('xpq,qr,ysr->xyps', int2c_ip1, int2c_inv, int2c_ip1)
         int2c_ip_ip -= auxmol.intor('int2c2e_ip1ip2', aosym='s1').reshape(3,3,naux,naux)
-    int2c = int2c_low = None
+    int2c = solve_j2c = None
 
     get_int3c_ipvip1 = _int3c_wrapper(mol, auxmol, 'int3c2e_ipvip1', 's1')
     get_int3c_ip1ip2 = _int3c_wrapper(mol, auxmol, 'int3c2e_ip1ip2', 's1')
@@ -395,8 +403,9 @@ def make_h1(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, verbose=None):
 
     h1aoa = [None] * mol.natm
     h1aob = [None] * mol.natm
-    for ia, h1, vj1, vk1a, vk1b in _gen_jk(hessobj, mo_coeff, mo_occ, chkfile,
-                                           atmlst, verbose, True):
+    for ia, h1, vj1, vk1 in _gen_jk(hessobj, mo_coeff, mo_occ, chkfile,
+                                    atmlst, verbose, True):
+        vk1a, vk1b = vk1
         h1a = h1 + vj1 - vk1a
         h1b = h1 + vj1 - vk1b
 
@@ -439,7 +448,8 @@ def _gen_jk(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None,
     rho0_Pij = ftmp.create_group('rho0_Pij')
     wj_Pij = ftmp.create_group('wj_Pij')
     int2c = auxmol.intor('int2c2e', aosym='s1')
-    int2c_low = scipy.linalg.cho_factor(int2c, lower=True)
+    solve_j2c = _gen_metric_solver(int2c)
+    int2c = None
     int2c_ip1 = auxmol.intor('int2c2e_ip1', aosym='s1')
     rhoj0_P = 0
     if with_k:
@@ -447,7 +457,7 @@ def _gen_jk(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None,
         rhok0b_Pl_ = np.empty((naux,nao,noccb))
     for i, (shl0, shl1, p0, p1) in enumerate(aoslices):
         int3c = get_int3c((shl0, shl1, 0, nbas, 0, auxmol.nbas))
-        coef3c = scipy.linalg.cho_solve(int2c_low, int3c.reshape(-1,naux).T, overwrite_b=True)
+        coef3c = solve_j2c(int3c.reshape(-1,naux).T)
         rho0_Pij['%.4d'%i] = coef3c = coef3c.reshape(naux,p1-p0,nao)
         rhoj0_P += np.einsum('pkl,kl->p', coef3c, dm0[p0:p1])
         if with_k:
@@ -471,25 +481,28 @@ def _gen_jk(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None,
         for i, (shl0, shl1, q0, q1) in enumerate(aoslices):
             wj1 = np.einsum('xijp,ji->xp', int3c_ip1[:,q0:q1], dm0[:,q0:q1])
             vj1_buf[i] += np.einsum('xp,pij->xij', wj1, coef3c)
-        rhok0_PlJ = lib.einsum('plj,Jj->plJ', rhok0a_Pl_[p0:p1], mocca)
-        vk1a_buf += lib.einsum('xijp,plj->xil', int3c_ip1, rhok0_PlJ[p0:p1])
-        rhok0_PlJ = lib.einsum('plj,Jj->plJ', rhok0b_Pl_[p0:p1], moccb)
-        vk1b_buf += lib.einsum('xijp,plj->xil', int3c_ip1, rhok0_PlJ[p0:p1])
+        if with_k:
+            rhok0_PlJ = lib.einsum('plj,Jj->plJ', rhok0a_Pl_[p0:p1], mocca)
+            vk1a_buf += lib.einsum('xijp,plj->xil', int3c_ip1, rhok0_PlJ[p0:p1])
+            rhok0_PlJ = lib.einsum('plj,Jj->plJ', rhok0b_Pl_[p0:p1], moccb)
+            vk1b_buf += lib.einsum('xijp,plj->xil', int3c_ip1, rhok0_PlJ[p0:p1])
         int3c_ip1 = None
     vj1_buf = ftmp['vj1_buf'] = vj1_buf
 
+    vk1a = vk1b = np.zeros((3,nao,nao))
     for i0, ia in enumerate(atmlst):
         shl0, shl1, p0, p1 = aoslices[ia]
         shls_slice = (shl0, shl1, 0, nbas, 0, auxmol.nbas)
         int3c_ip1 = get_int3c_ip1(shls_slice)
         vj1 = -np.asarray(vj1_buf[ia])
-        rhok0_PlJ = lib.einsum('plj,Jj->plJ', rhok0a_Pl_, mocca[p0:p1])
-        vk1a = -lib.einsum('xijp,pki->xkj', int3c_ip1, rhok0_PlJ)
-        rhok0_PlJ = lib.einsum('plj,Jj->plJ', rhok0b_Pl_, moccb[p0:p1])
-        vk1b = -lib.einsum('xijp,pki->xkj', int3c_ip1, rhok0_PlJ)
         vj1[:,p0:p1] -= np.einsum('xijp,p->xij', int3c_ip1, rhoj0_P)
-        vk1a[:,p0:p1] -= vk1a_buf[:,p0:p1]
-        vk1b[:,p0:p1] -= vk1b_buf[:,p0:p1]
+        if with_k:
+            rhok0_PlJ = lib.einsum('plj,Jj->plJ', rhok0a_Pl_, mocca[p0:p1])
+            vk1a = -lib.einsum('xijp,pki->xkj', int3c_ip1, rhok0_PlJ)
+            rhok0_PlJ = lib.einsum('plj,Jj->plJ', rhok0b_Pl_, moccb[p0:p1])
+            vk1b = -lib.einsum('xijp,pki->xkj', int3c_ip1, rhok0_PlJ)
+            vk1a[:,p0:p1] -= vk1a_buf[:,p0:p1]
+            vk1b[:,p0:p1] -= vk1b_buf[:,p0:p1]
 
         if hessobj.auxbasis_response:
             shl0, shl1, q0, q1 = auxslices[ia]
@@ -502,18 +515,20 @@ def _gen_jk(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None,
             vj1 += .5 * np.einsum('xijp,p->xij', int3c_ip2, -rhoj0_P[q0:q1])
             vj1 -= .5 * lib.einsum('xpq,q,pij->xij', int2c_ip1[:,q0:q1], -rhoj0_P, coef3c)
             vj1 -= .5 * lib.einsum('pixj,p->xij', Pij, -rhoj0_P[q0:q1])
-            rhok0_PlJ = lib.einsum('plj,Jj->plJ', rhok0a_Pl_[q0:q1], mocca)
-            vk1a -= lib.einsum('plj,xijp->xil', rhok0_PlJ, int3c_ip2)
-            vk1a += lib.einsum('pjxi,plj->xil', Pij, rhok0_PlJ)
-            rhok0_PlJ = lib.einsum('plj,Jj->plJ', rhok0b_Pl_[q0:q1], moccb)
-            vk1b -= lib.einsum('plj,xijp->xil', rhok0_PlJ, int3c_ip2)
-            vk1b += lib.einsum('pjxi,plj->xil', Pij, rhok0_PlJ)
+            if with_k:
+                rhok0_PlJ = lib.einsum('plj,Jj->plJ', rhok0a_Pl_[q0:q1], mocca)
+                vk1a -= lib.einsum('plj,xijp->xil', rhok0_PlJ, int3c_ip2)
+                vk1a += lib.einsum('pjxi,plj->xil', Pij, rhok0_PlJ)
+                rhok0_PlJ = lib.einsum('plj,Jj->plJ', rhok0b_Pl_[q0:q1], moccb)
+                vk1b -= lib.einsum('plj,xijp->xil', rhok0_PlJ, int3c_ip2)
+                vk1b += lib.einsum('pjxi,plj->xil', Pij, rhok0_PlJ)
 
         vj1 = vj1 + vj1.transpose(0,2,1)
-        vk1a = vk1a + vk1a.transpose(0,2,1)
-        vk1b = vk1b + vk1b.transpose(0,2,1)
+        if with_k:
+            vk1a = vk1a + vk1a.transpose(0,2,1)
+            vk1b = vk1b + vk1b.transpose(0,2,1)
         h1 = hcore_deriv(ia)
-        yield ia, h1, vj1, vk1a, vk1b
+        yield ia, h1, vj1, (vk1a, vk1b)
 
 
 class Hessian(uhf_hess.Hessian):
diff --git a/pyscf/df/hessian/uks.py b/pyscf/df/hessian/uks.py
index 55cba71003..ead7055096 100644
--- a/pyscf/df/hessian/uks.py
+++ b/pyscf/df/hessian/uks.py
@@ -41,6 +41,10 @@ def partial_hess_elec(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
 
     mol = hessobj.mol
     mf = hessobj.base
+    ni = mf._numint
+    if mf.nlc or ni.libxc.is_nlc(mf.xc):
+        raise NotImplementedError('RKS Hessian for NLC functional')
+
     if mo_energy is None: mo_energy = mf.mo_energy
     if mo_occ is None:    mo_occ = mf.mo_occ
     if mo_coeff is None:  mo_coeff = mf.mo_coeff
@@ -52,18 +56,17 @@ def partial_hess_elec(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
     dm0a = numpy.dot(mocca, mocca.T)
     dm0b = numpy.dot(moccb, moccb.T)
 
-    if mf.nlc != '':
-        raise NotImplementedError
-    #enabling range-separated hybrids
-    omega, alpha, beta = mf._numint.rsh_coeff(mf.xc)
-    if abs(omega) > 1e-10:
-        raise NotImplementedError
-    else:
-        hyb = mf._numint.hybrid_coeff(mf.xc, spin=mol.spin)
+    omega, alpha, hyb = ni.rsh_and_hybrid_coeff(mf.xc, spin=mol.spin)
+    hybrid = ni.libxc.is_hybrid_xc(mf.xc)
     de2, ej, ek = df_uhf_hess._partial_hess_ejk(hessobj, mo_energy, mo_coeff, mo_occ,
                                                 atmlst, max_memory, verbose,
-                                                abs(hyb) > 1e-10)
+                                                with_k=hybrid)
     de2 += ej - hyb * ek  # (A,B,dR_A,dR_B)
+    if hybrid and omega != 0:
+        with hessobj.base.with_df.range_coulomb(omega):
+            ek_lr = df_uhf_hess._partial_hess_ejk(
+                hessobj, mo_energy, mo_coeff, mo_occ, atmlst, max_memory, verbose)[2]
+        de2 -= ek_lr * (alpha - hyb)
 
     mem_now = lib.current_memory()[0]
     max_memory = max(2000, mf.max_memory*.9-mem_now)
@@ -96,19 +99,29 @@ def make_h1(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, verbose=None):
     ni = mf._numint
     ni.libxc.test_deriv_order(mf.xc, 2, raise_error=True)
     omega, alpha, hyb = ni.rsh_and_hybrid_coeff(mf.xc, spin=mol.spin)
+    hybrid = ni.libxc.is_hybrid_xc(mf.xc)
 
     mem_now = lib.current_memory()[0]
     max_memory = max(2000, mf.max_memory*.9-mem_now)
     h1aoa, h1aob = uks_hess._get_vxc_deriv1(hessobj, mo_coeff, mo_occ, max_memory)
-    for ia, h1, vj1, vk1a, vk1b in df_uhf_hess._gen_jk(hessobj, mo_coeff, mo_occ, chkfile,
-                                                       atmlst, verbose, abs(hyb) > 1e-10):
+    for ia, h1, vj1, vk1 in df_uhf_hess._gen_jk(
+            hessobj, mo_coeff, mo_occ, chkfile, atmlst, verbose, with_k=hybrid):
         f1 = h1 + vj1
         h1aoa[ia] += f1
         h1aob[ia] += f1
-        if abs(hyb) > 1e-10:
+        if hybrid:
+            vk1a, vk1b = vk1
             h1aoa[ia] -= hyb * vk1a
             h1aob[ia] -= hyb * vk1b
 
+    if hybrid and omega != 0:
+        with hessobj.base.with_df.range_coulomb(omega):
+            for ia, h1, vj1, vk1 in df_uhf_hess._gen_jk(
+                    hessobj, mo_coeff, mo_occ, chkfile, atmlst, verbose):
+                vk1a, vk1b = vk1
+                h1aoa[ia] -= (alpha - hyb) * vk1a
+                h1aob[ia] -= (alpha - hyb) * vk1b
+
     if chkfile is None:
         return h1aoa, h1aob
     else:
@@ -147,7 +160,7 @@ def __init__(self, mf):
     mol.unit = 'B'
     mol.spin = 2
     mol.build()
-    mf = dft.UKS(mol).density_fit()
+    mf = dft.RKS(mol).density_fit()
     mf.grids.level = 4
     mf.grids.prune = False
     mf.xc = xc_code
diff --git a/pyscf/df/test/test_addons.py b/pyscf/df/test/test_addons.py
index c8b0c30a84..35007a56b9 100644
--- a/pyscf/df/test/test_addons.py
+++ b/pyscf/df/test/test_addons.py
@@ -82,4 +82,3 @@ def test_default_auxbasis(self):
 if __name__ == "__main__":
     print("Full Tests for df.addons")
     unittest.main()
-
diff --git a/pyscf/df/test/test_df.py b/pyscf/df/test/test_df.py
index 3ace55c238..2f844d9ebf 100644
--- a/pyscf/df/test/test_df.py
+++ b/pyscf/df/test/test_df.py
@@ -124,6 +124,18 @@ def test_rsh_df4c_get_jk(self):
             self.assertAlmostEqual(abs(vj-vj1).max(), 0, 2)
             self.assertAlmostEqual(abs(vk-vk1).max(), 0, 2)
 
+    def test_rsh_df_custom_storage(self):
+        mol = gto.M(atom = 'H 0 0 0; F 0 0 1.1', verbose=0)
+        mf = mol.RKS().density_fit()
+        mf.xc = 'lda+0.5*SR_HF(0.3)'
+        mf.run()
+        self.assertAlmostEqual(mf.e_tot, -102.02277148333626, 8)
+
+        with tempfile.NamedTemporaryFile() as ftmp:
+            mf.with_df._cderi_to_save = ftmp.name
+            mf.run()
+        self.assertAlmostEqual(mf.e_tot, -102.02277148333626, 8)
+
 if __name__ == "__main__":
     print("Full Tests for df")
     unittest.main()
diff --git a/pyscf/df/test/test_df_grad.py b/pyscf/df/test/test_df_grad.py
index 487336d953..710af63fd6 100644
--- a/pyscf/df/test/test_df_grad.py
+++ b/pyscf/df/test/test_df_grad.py
@@ -47,33 +47,113 @@ class KnownValues(unittest.TestCase):
     def test_rhf_grad(self):
         gref = scf.RHF(mol).run().nuc_grad_method().kernel()
         g1 = scf.RHF(mol).density_fit().run().nuc_grad_method().kernel()
-        self.assertAlmostEqual(abs(gref - g1).max(), 0, 5)
+        self.assertAlmostEqual(abs(gref - g1).max(), 0, 4)
+
+        pmol = mol.copy()
+        mf = scf.RHF(pmol).density_fit(auxbasis='ccpvdz-jkfit').run()
+        g = mf.Gradients().set(auxbasis_response=False).kernel()
+        self.assertAlmostEqual(lib.fp(g), 0.005466630382488041, 7)
+        g = mf.Gradients().kernel()
+        self.assertAlmostEqual(lib.fp(g), 0.005516638190173352, 7)
+        mfs = mf.as_scanner()
+        e1 = mfs([['O' , (0. , 0.     , 0.001)],
+                  [1   , (0. , -0.757 , 0.587)],
+                  [1   , (0. , 0.757  , 0.587)] ])
+        e2 = mfs([['O' , (0. , 0.     ,-0.001)],
+                  [1   , (0. , -0.757 , 0.587)],
+                  [1   , (0. , 0.757  , 0.587)] ])
+        self.assertAlmostEqual((e1-e2)/0.002*lib.param.BOHR, g[0,2], 6)
 
     def test_rks_lda_grad(self):
         gref = mol.RKS(xc='lda,').run().nuc_grad_method().kernel()
         g1 = mol.RKS(xc='lda,').density_fit().run().nuc_grad_method().kernel()
         self.assertAlmostEqual(abs(gref - g1).max(), 0, 4)
 
-    def test_rks_grad(self):
+    def test_rks_gga_grad(self):
         gref = mol.RKS(xc='b3lyp').run().nuc_grad_method().kernel()
         g1 = mol.RKS(xc='b3lyp').density_fit().run().nuc_grad_method().kernel()
         self.assertAlmostEqual(abs(gref - g1).max(), 0, 4)
 
+    def test_rks_rsh_grad(self):
+        gref = mol.RKS(xc='wb97').run().nuc_grad_method().kernel()
+        g1 = mol.RKS(xc='wb97').density_fit().run().nuc_grad_method().kernel()
+        self.assertAlmostEqual(abs(gref - g1).max(), 0, 4)
+
+    def test_rks_mgga_grad(self):
+        gref = mol.RKS(xc='m06').run().nuc_grad_method().kernel()
+        g1 = mol.RKS(xc='m06').density_fit().run().nuc_grad_method().kernel()
+        self.assertAlmostEqual(abs(gref - g1).max(), 0, 4)
+
     def test_uhf_grad(self):
+        mol = gto.Mole()
+        mol.atom = [
+            ['O' , (0. , 0.     , 0.)],
+            [1   , (0. , -0.757 , 0.587)],
+            [1   , (0. , 0.757  , 0.587)] ]
+        mol.verbose = 0
+        mol.basis = '631g'
+        mol.spin = 2
+        mol.build()
+        mf = scf.UHF(mol).density_fit().run(conv_tol=1e-12)
+        g1 = mf.nuc_grad_method().kernel()
         gref = mol.UHF.run().nuc_grad_method().kernel()
-        g1 = mol.UHF.density_fit().run().nuc_grad_method().kernel()
-        self.assertAlmostEqual(abs(gref - g1).max(), 0, 5)
+        self.assertAlmostEqual(abs(gref - g1).max(), 0, 4)
+
+        g = mf.Gradients().set(auxbasis_response=False).kernel()
+        self.assertAlmostEqual(lib.fp(g), -0.19670644982746546, 7)
+        g = mf.Gradients().kernel()
+        self.assertAlmostEqual(lib.fp(g), -0.19660674423263175, 7)
+        mfs = mf.as_scanner()
+        e1 = mfs([['O' , (0. , 0.     , 0.001)],
+                  [1   , (0. , -0.757 , 0.587)],
+                  [1   , (0. , 0.757  , 0.587)] ])
+        e2 = mfs([['O' , (0. , 0.     ,-0.001)],
+                  [1   , (0. , -0.757 , 0.587)],
+                  [1   , (0. , 0.757  , 0.587)] ])
+        self.assertAlmostEqual((e1-e2)/0.002*lib.param.BOHR, g[0,2], 6)
 
     def test_uks_lda_grad(self):
-        gref = mol.UKS.run(xc='lda,').nuc_grad_method().kernel()
-        g1 = mol.UKS.density_fit().run(xc='lda,').nuc_grad_method().kernel()
+        mol = gto.Mole()
+        mol.atom = [
+            ['O' , (0. , 0.     , 0.)],
+            [1   , (0. , -0.757 , 0.587)],
+            [1   , (0. ,  0.757 , 0.587)] ]
+        mol.basis = '631g'
+        mol.charge = 1
+        mol.spin = 1
+        mol.build()
+        mf = mol.UKS().density_fit().run(conv_tol=1e-12)
+        gref = mol.UKS.run().nuc_grad_method().kernel()
+        g1 = mf.nuc_grad_method().kernel()
         self.assertAlmostEqual(abs(gref - g1).max(), 0, 4)
 
-    def test_uks_grad(self):
+        g = mf.Gradients().set(auxbasis_response=False)
+        self.assertAlmostEqual(lib.finger(g.kernel()), -0.12092643506961044, 7)
+        g = mf.Gradients()
+        self.assertAlmostEqual(lib.finger(g.kernel()), -0.12092884149543644, 7)
+        g.grid_response = True
+        self.assertAlmostEqual(lib.finger(g.kernel()), -0.12093220332146028, 7)
+
+    def test_uks_gga_grad(self):
         gref = mol.UKS.run(xc='b3lyp').nuc_grad_method().kernel()
         g1 = mol.UKS.density_fit().run(xc='b3lyp').nuc_grad_method().kernel()
         self.assertAlmostEqual(abs(gref - g1).max(), 0, 4)
 
+    def test_uks_rsh_grad(self):
+        mol = gto.Mole()
+        mol.atom = [
+            ['O' , (0. , 0.     , 0.)],
+            [1   , (0. , -0.757 , 0.587)],
+            [1   , (0. ,  0.757 , 0.587)] ]
+        mol.basis = '631g'
+        mol.charge = 1
+        mol.spin = 1
+        mol.verbose = 0
+        mol.build()
+        gref = mol.UKS(xc='camb3lyp').run().nuc_grad_method().kernel()
+        g1 = mol.UKS(xc='camb3lyp').density_fit().run().nuc_grad_method().kernel()
+        self.assertAlmostEqual(abs(gref - g1).max(), 0, 4)
+
     def test_casscf_grad(self):
         gref = mcscf.CASSCF (mol.RHF.run (), 8, 6).run ().nuc_grad_method().kernel()
         g1 = mcscf.CASSCF (mol.RHF.density_fit().run(), 8, 6).run ().nuc_grad_method().kernel()
@@ -90,4 +170,3 @@ def test_sacasscf_grad(self):
 if __name__ == "__main__":
     print("Full Tests for df.grad")
     unittest.main()
-
diff --git a/pyscf/df/test/test_df_hessian.py b/pyscf/df/test/test_df_hessian.py
index 835d37868c..0ce69cdf58 100644
--- a/pyscf/df/test/test_df_hessian.py
+++ b/pyscf/df/test/test_df_hessian.py
@@ -48,9 +48,23 @@ def test_rhf_hess(self):
         h1 = scf.RHF(mol).density_fit().run().Hessian().kernel()
         self.assertAlmostEqual(abs(href - h1).max(), 0, 3)
 
-    def test_rks_hess(self):
+    def test_rks_lda_hess(self):
+        href = mol.RKS.run(xc='lda,vwn').Hessian().kernel()
+        df_h = mol.RKS.density_fit().run(xc='lda,vwn').Hessian()
+        df_h.auxbasis_response = 2
+        h1 = df_h.kernel()
+        self.assertAlmostEqual(abs(href - h1).max(), 0, 4)
+
+    def test_rks_gga_hess(self):
         href = mol.RKS.run(xc='b3lyp').Hessian().kernel()
-        h1 = mol.RKS.density_fit().run(xc='b3lyp').Hessian().kernel()
+        df_h = mol.RKS.density_fit().run(xc='b3lyp').Hessian()
+        df_h.auxbasis_response = 2
+        h1 = df_h.kernel()
+        self.assertAlmostEqual(abs(href - h1).max(), 0, 4)
+
+    def test_rks_rsh_hess(self):
+        href = mol.RKS.run(xc='camb3lyp').Hessian().kernel()
+        h1 = mol.RKS.density_fit().run(xc='camb3lyp').Hessian().kernel()
         self.assertAlmostEqual(abs(href - h1).max(), 0, 3)
 
     def test_uhf_hess(self):
@@ -59,8 +73,8 @@ def test_uhf_hess(self):
         self.assertAlmostEqual(abs(href - h1).max(), 0, 3)
 
     def test_uks_hess(self):
-        href = mol.UKS.run(xc='b3lyp').Hessian().kernel()
-        h1 = mol.UKS.density_fit().run(xc='b3lyp').Hessian().kernel()
+        href = mol.UKS.run(xc='camb3lyp').Hessian().kernel()
+        h1 = mol.UKS.density_fit().run(xc='camb3lyp').Hessian().kernel()
         self.assertAlmostEqual(abs(href - h1).max(), 0, 3)
 
 if __name__ == "__main__":
diff --git a/pyscf/dft/dks.py b/pyscf/dft/dks.py
index 60641facc4..6e0de04adf 100644
--- a/pyscf/dft/dks.py
+++ b/pyscf/dft/dks.py
@@ -33,8 +33,8 @@
 
 @lib.with_doc(gks.get_veff.__doc__)
 def get_veff(ks, mol=None, dm=None, dm_last=0, vhf_last=0, hermi=1):
-    if ks.nlc != '':
-        raise NotImplementedError(ks.nlc)
+    if ks.nlc or ks._numint.libxc.is_nlc(ks.xc):
+        raise NotImplementedError(f'NLC functional {ks.xc} + {ks.nlc}')
     return gks.get_veff(ks, mol, dm, dm_last, vhf_last, hermi)
 
 
diff --git a/pyscf/dft/gen_grid.py b/pyscf/dft/gen_grid.py
index 8f29348d9d..7ffccc52cc 100644
--- a/pyscf/dft/gen_grid.py
+++ b/pyscf/dft/gen_grid.py
@@ -261,13 +261,6 @@ def gen_atomic_grids(mol, atom_grid={}, radi_method=radi.gauss_chebyshev,
             logger.debug(mol, 'atom %s rad-grids = %d, ang-grids = %s',
                          symb, n_rad, angs)
 
-            ang_grids = {}
-            for n in sorted(set(angs)):
-                grid = numpy.empty((n,4))
-                libdft.MakeAngularGrid(grid.ctypes.data_as(ctypes.c_void_p),
-                                       ctypes.c_int(n))
-                ang_grids[n] = grid
-
             angs = numpy.array(angs)
             coords = []
             vol = []
diff --git a/pyscf/dft/gks.py b/pyscf/dft/gks.py
index 9c701f3968..282c40f0bd 100644
--- a/pyscf/dft/gks.py
+++ b/pyscf/dft/gks.py
@@ -75,19 +75,21 @@ def get_veff(ks, mol=None, dm=None, dm_last=0, vhf_last=0, hermi=1):
         ni = ks._numint
         n, exc, vxc = ni.get_vxc(mol, ks.grids, ks.xc, dm,
                                  hermi=hermi, max_memory=max_memory)
-        if ks.nlc != '':
-            assert ('VV10' in ks.nlc.upper())
-            _, enlc, vnlc = ni.get_vxc(mol, ks.nlcgrids, ks.xc+'__'+ks.nlc, dm,
-                                       hermi=hermi, max_memory=max_memory)
+        logger.debug(ks, 'nelec by numeric integration = %s', n)
+        if ks.nlc or ni.libxc.is_nlc(ks.xc):
+            if ni.libxc.is_nlc(ks.xc):
+                xc = ks.xc
+            else:
+                assert ni.libxc.is_nlc(ks.nlc)
+                xc = ks.nlc
+            n, enlc, vnlc = ni.nr_nlc_vxc(mol, ks.nlcgrids, xc, dm,
+                                          hermi=hermi, max_memory=max_memory)
             exc += enlc
             vxc += vnlc
-        logger.debug(ks, 'nelec by numeric integration = %s', n)
+            logger.debug(ks, 'nelec with nlc grids = %s', n)
         t0 = logger.timer(ks, 'vxc', *t0)
 
-    #enabling range-separated hybrids
-    omega, alpha, hyb = ni.rsh_and_hybrid_coeff(ks.xc, spin=mol.spin)
-
-    if abs(hyb) < 1e-10 and abs(alpha) < 1e-10:
+    if not ni.libxc.is_hybrid_xc(ks.xc):
         vk = None
         if (ks._eri is None and ks.direct_scf and
             getattr(vhf_last, 'vj', None) is not None):
@@ -98,12 +100,13 @@ def get_veff(ks, mol=None, dm=None, dm_last=0, vhf_last=0, hermi=1):
             vj = ks.get_j(mol, dm, hermi)
         vxc += vj
     else:
+        omega, alpha, hyb = ni.rsh_and_hybrid_coeff(ks.xc, spin=mol.spin)
         if (ks._eri is None and ks.direct_scf and
             getattr(vhf_last, 'vk', None) is not None):
             ddm = numpy.asarray(dm) - numpy.asarray(dm_last)
             vj, vk = ks.get_jk(mol, ddm, hermi)
             vk *= hyb
-            if abs(omega) > 1e-10:
+            if omega != 0:
                 vklr = ks.get_k(mol, ddm, hermi, omega=omega)
                 vklr *= (alpha - hyb)
                 vk += vklr
@@ -112,7 +115,7 @@ def get_veff(ks, mol=None, dm=None, dm_last=0, vhf_last=0, hermi=1):
         else:
             vj, vk = ks.get_jk(mol, dm, hermi)
             vk *= hyb
-            if abs(omega) > 1e-10:
+            if omega != 0:
                 vklr = ks.get_k(mol, dm, hermi, omega=omega)
                 vklr *= (alpha - hyb)
                 vk += vklr
diff --git a/pyscf/dft/libxc.py b/pyscf/dft/libxc.py
index c17efddc63..0cab269734 100644
--- a/pyscf/dft/libxc.py
+++ b/pyscf/dft/libxc.py
@@ -28,6 +28,7 @@
 import ctypes
 import math
 import numpy
+from functools import lru_cache
 from pyscf import lib
 from pyscf.dft.xc.utils import remove_dup, format_xc_code
 from pyscf import __config__
@@ -39,6 +40,7 @@
 _itrf.LIBXC_needs_laplacian.restype = ctypes.c_int
 _itrf.LIBXC_needs_laplacian.argtypes = [ctypes.c_int]
 _itrf.LIBXC_is_hybrid.restype = ctypes.c_int
+_itrf.LIBXC_is_nlc.restype = ctypes.c_int
 _itrf.LIBXC_is_cam_rsh.restype = ctypes.c_int
 _itrf.LIBXC_max_deriv_order.restype = ctypes.c_int
 _itrf.LIBXC_number_of_functionals.restype = ctypes.c_int
@@ -55,6 +57,9 @@
 _itrf.LIBXC_reference_doi.restype = ctypes.c_char_p
 _itrf.LIBXC_xc_reference.argtypes = [ctypes.c_int, (ctypes.c_char_p * 8)]
 
+_itrf.xc_functional_get_number.argtypes = (ctypes.c_char_p, )
+_itrf.xc_functional_get_number.restype = ctypes.c_int
+
 def libxc_version():
     '''Returns the version of libxc'''
     return _itrf.LIBXC_version().decode("UTF-8")
@@ -851,11 +856,6 @@ def _xc_key_without_underscore(xc_keys):
 XC_ALIAS.update([(key.replace('-',''), XC_ALIAS[key])
                  for key in XC_ALIAS if '-' in key])
 
-VV10_XC = set(('B97M_V', 'WB97M_V', 'WB97X_V', 'VV10', 'LC_VV10',
-               'REVSCAN_VV10',
-               'SCAN_VV10', 'SCAN_RVV10', 'SCANL_VV10', 'SCANL_RVV10'))
-VV10_XC = VV10_XC.union(set([x.replace('_', '') for x in VV10_XC]))
-
 def xc_reference(xc_code):
     '''Returns the reference to the individual XC functional'''
     hyb, fn_facs = parse_xc(xc_code)
@@ -868,15 +868,18 @@ def xc_reference(xc_code):
                 refs.append(ref.decode("UTF-8"))
     return refs
 
+@lru_cache(100)
 def xc_type(xc_code):
     if xc_code is None:
         return None
     elif isinstance(xc_code, str):
-        if is_nlc(xc_code):
-            return 'NLC'
+        if '__VV10' in xc_code:
+            raise RuntimeError('Deprecated notation for NLC functional.')
         hyb, fn_facs = parse_xc(xc_code)
     else:
+        assert isinstance(xc_code, int)
         fn_facs = [(xc_code, 1)]  # mimic fn_facs
+
     if not fn_facs:
         return 'HF'
     elif all(_itrf.LIBXC_is_lda(ctypes.c_int(xid)) for xid, fac in fn_facs):
@@ -891,6 +894,7 @@ def xc_type(xc_code):
 def is_lda(xc_code):
     return xc_type(xc_code) == 'LDA'
 
+@lru_cache(100)
 def is_hybrid_xc(xc_code):
     if xc_code is None:
         return False
@@ -902,7 +906,7 @@ def is_hybrid_xc(xc_code):
                 return True
             if hybrid_coeff(xc_code) != 0:
                 return True
-            if rsh_coeff(xc_code) != [0, 0, 0]:
+            if rsh_coeff(xc_code) != (0, 0, 0):
                 return True
             return False
     elif isinstance(xc_code, int):
@@ -916,12 +920,22 @@ def is_meta_gga(xc_code):
 def is_gga(xc_code):
     return xc_type(xc_code) == 'GGA'
 
+@lru_cache(100)
+def is_nlc(xc_code):
+    if isinstance(xc_code, str):
+        if xc_code.isdigit():
+            return _itrf.LIBXC_is_nlc(ctypes.c_int(int(xc_code)))
+        else:
+            fn_facs = parse_xc(xc_code)[1]
+            return any(_itrf.LIBXC_is_nlc(ctypes.c_int(xid)) for xid, fac in fn_facs)
+    elif isinstance(xc_code, int):
+        return _itrf.LIBXC_is_nlc(ctypes.c_int(xc_code))
+    else:
+        return any((is_nlc(x) for x in xc_code))
+
 def needs_laplacian(xc_code):
     return _itrf.LIBXC_needs_laplacian(xc_code) != 0
 
-def is_nlc(xc_code):
-    return '__VV10' in xc_code.upper()
-
 def max_deriv_order(xc_code):
     hyb, fn_facs = parse_xc(xc_code)
     if fn_facs:
@@ -951,41 +965,28 @@ def test_deriv_order(xc_code, deriv, raise_error=False):
             raise e
     return support
 
+@lru_cache(100)
 def hybrid_coeff(xc_code, spin=0):
     '''Support recursively defining hybrid functional
     '''
     hyb, fn_facs = parse_xc(xc_code)
-    for xid, fac in fn_facs:
-        hyb[0] += fac * _itrf.LIBXC_hybrid_coeff(ctypes.c_int(xid))
-    return hyb[0]
+    hybs = [fac * _itrf.LIBXC_hybrid_coeff(ctypes.c_int(xid)) for xid, fac in fn_facs]
+    return hyb[0] + sum(hybs)
 
+@lru_cache(100)
 def nlc_coeff(xc_code):
     '''Get NLC coefficients
     '''
-    nlc_code = None
-    if isinstance(xc_code, str) and '__VV10' in xc_code.upper():
-        xc_code, nlc_code = xc_code.upper().split('__', 1)
-
     hyb, fn_facs = parse_xc(xc_code)
-    nlc_pars = [0, 0]
+    nlc_pars = []
     nlc_tmp = (ctypes.c_double*2)()
     for xid, fac in fn_facs:
-        _itrf.LIBXC_nlc_coeff(xid, nlc_tmp)
-        nlc_pars[0] += nlc_tmp[0]
-        nlc_pars[1] += nlc_tmp[1]
-
-    if nlc_pars[0] == 0 and nlc_pars[1] == 0:
-        if nlc_code is not None:
-            # Use VV10 NLC parameters by default for the general case
-            _itrf.LIBXC_nlc_coeff(XC_CODES['GGA_XC_' + nlc_code], nlc_tmp)
-            nlc_pars[0] += nlc_tmp[0]
-            nlc_pars[1] += nlc_tmp[1]
-        else:
-            raise NotImplementedError(
-                '%s does not have NLC part. Available functionals are %s' %
-                (xc_code, ', '.join(VV10_XC.keys())))
-    return nlc_pars
+        if _itrf.LIBXC_is_nlc(ctypes.c_int(xid)):
+            _itrf.LIBXC_nlc_coeff(xid, nlc_tmp)
+            nlc_pars.append((tuple(nlc_tmp), fac))
+    return tuple(nlc_pars)
 
+@lru_cache(100)
 def rsh_coeff(xc_code):
     '''Range-separated parameter and HF exchange components: omega, alpha, beta
 
@@ -993,8 +994,8 @@ def rsh_coeff(xc_code):
             = alpha * HFX   + beta * SR_HFX + (1-c_SR) * Ex_SR + (1-c_LR) * Ex_LR + Ec
             = alpha * LR_HFX + hyb * SR_HFX + (1-c_SR) * Ex_SR + (1-c_LR) * Ex_LR + Ec
 
-    SR_HFX = < pi | (1-e^{-omega r_{12}})/r_{12} | iq >
-    LR_HFX = < pi | e^{-omega r_{12}}/r_{12} | iq >
+    SR_HFX = < pi | (1-erf(-omega r_{12}))/r_{12} | iq >
+    LR_HFX = < pi | erf(-omega r_{12})/r_{12} | iq >
     alpha = c_LR
     beta = c_SR - c_LR = hyb - alpha
     '''
@@ -1011,12 +1012,10 @@ def rsh_coeff(xc_code):
             check_omega = False
 
     hyb, fn_facs = parse_xc(xc_code)
-
     hyb, alpha, omega = hyb
     beta = hyb - alpha
     rsh_pars = [omega, alpha, beta]
     rsh_tmp = (ctypes.c_double*3)()
-    _itrf.LIBXC_rsh_coeff(433, rsh_tmp)
     for xid, fac in fn_facs:
         _itrf.LIBXC_rsh_coeff(xid, rsh_tmp)
         if rsh_pars[0] == 0:
@@ -1031,7 +1030,7 @@ def rsh_coeff(xc_code):
                 raise ValueError('Different values of omega found for RSH functionals')
         rsh_pars[1] += rsh_tmp[1] * fac
         rsh_pars[2] += rsh_tmp[2] * fac
-    return rsh_pars
+    return tuple(rsh_pars)
 
 def parse_xc_name(xc_name='LDA,VWN'):
     '''Convert the XC functional name to libxc library internal ID.
@@ -1039,6 +1038,7 @@ def parse_xc_name(xc_name='LDA,VWN'):
     fn_facs = parse_xc(xc_name)[1]
     return fn_facs[0][0], fn_facs[1][0]
 
+@lru_cache(100)
 def parse_xc(description):
     r'''Rules to input functional description:
 
@@ -1077,96 +1077,20 @@ def parse_xc(description):
       contribution has been included.
 
     Args:
-        xc_code : str
+        description : str
             A string to describe the linear combination of different XC functionals.
             The X and C functional are separated by comma like '.8*LDA+.2*B86,VWN'.
             If "HF" was appeared in the string, it stands for the exact exchange.
-        rho : ndarray
-            Shape of ((*,N)) for electron density (and derivatives) if spin = 0;
-            Shape of ((*,N),(*,N)) for alpha/beta electron density (and derivatives) if spin > 0;
-            where N is number of grids.
-            rho (*,N) are ordered as (den,grad_x,grad_y,grad_z,laplacian,tau)
-            where grad_x = d/dx den, laplacian = \nabla^2 den, tau = 1/2(\nabla f)^2
-            In spin unrestricted case,
-            rho is ((den_u,grad_xu,grad_yu,grad_zu,laplacian_u,tau_u)
-                    (den_d,grad_xd,grad_yd,grad_zd,laplacian_d,tau_d))
-
-    Kwargs:
-        spin : int
-            spin polarized if spin > 0
-        relativity : int
-            No effects.
-        verbose : int or object of :class:`Logger`
-            No effects.
 
     Returns:
-        ex, vxc, fxc, kxc
-
-        where
-
-        * vxc = (vrho, vsigma, vlapl, vtau) for restricted case
-
-        * vxc for unrestricted case
-          | vrho[:,2]   = (u, d)
-          | vsigma[:,3] = (uu, ud, dd)
-          | vlapl[:,2]  = (u, d)
-          | vtau[:,2]   = (u, d)
-
-        * fxc for restricted case:
-          (v2rho2, v2rhosigma, v2sigma2, v2lapl2, vtau2, v2rholapl, v2rhotau, v2lapltau, v2sigmalapl, v2sigmatau)
-
-        * fxc for unrestricted case:
-          | v2rho2[:,3]     = (u_u, u_d, d_d)
-          | v2rhosigma[:,6] = (u_uu, u_ud, u_dd, d_uu, d_ud, d_dd)
-          | v2sigma2[:,6]   = (uu_uu, uu_ud, uu_dd, ud_ud, ud_dd, dd_dd)
-          | v2lapl2[:,3]
-          | v2tau2[:,3]     = (u_u, u_d, d_d)
-          | v2rholapl[:,4]
-          | v2rhotau[:,4]   = (u_u, u_d, d_u, d_d)
-          | v2lapltau[:,4]
-          | v2sigmalapl[:,6]
-          | v2sigmatau[:,6] = (uu_u, uu_d, ud_u, ud_d, dd_u, dd_d)
-
-        * kxc for restricted case:
-          (v3rho3, v3rho2sigma, v3rhosigma2, v3sigma3,
-           v3rho2lapl, v3rho2tau,
-           v3rhosigmalapl, v3rhosigmatau,
-           v3rholapl2, v3rholapltau, v3rhotau2,
-           v3sigma2lapl, v3sigma2tau,
-           v3sigmalapl2, v3sigmalapltau, v3sigmatau2,
-           v3lapl3, v3lapl2tau, v3lapltau2, v3tau3)
-
-        * kxc for unrestricted case:
-          | v3rho3[:,4]         = (u_u_u, u_u_d, u_d_d, d_d_d)
-          | v3rho2sigma[:,9]    = (u_u_uu, u_u_ud, u_u_dd, u_d_uu, u_d_ud, u_d_dd, d_d_uu, d_d_ud, d_d_dd)
-          | v3rhosigma2[:,12]   = (u_uu_uu, u_uu_ud, u_uu_dd, u_ud_ud, u_ud_dd, u_dd_dd, d_uu_uu, d_uu_ud, d_uu_dd, d_ud_ud, d_ud_dd, d_dd_dd)
-          | v3sigma3[:,10]      = (uu_uu_uu, uu_uu_ud, uu_uu_dd, uu_ud_ud, uu_ud_dd, uu_dd_dd, ud_ud_ud, ud_ud_dd, ud_dd_dd, dd_dd_dd)
-          | v3rho2lapl[:,6]
-          | v3rho2tau[:,6]      = (u_u_u, u_u_d, u_d_u, u_d_d, d_d_u, d_d_d)
-          | v3rhosigmalapl[:,12]
-          | v3rhosigmatau[:,12] = (u_uu_u, u_uu_d, u_ud_u, u_ud_d, u_dd_u, u_dd_d,
-                                   d_uu_u, d_uu_d, d_ud_u, d_ud_d, d_dd_u, d_dd_d)
-          | v3rholapl2[:,6]
-          | v3rholapltau[:,8]
-          | v3rhotau2[:,6]      = (u_u_u, u_u_d, u_d_d, d_u_u, d_u_d, d_d_d)
-          | v3sigma2lapl[:,12]
-          | v3sigma2tau[:,12]   = (uu_uu_u, uu_uu_d, uu_ud_u, uu_ud_d, uu_dd_u, uu_dd_d,
-                                   ud_ud_u, ud_ud_d, ud_dd_u, ud_dd_d, dd_dd_u, dd_dd_d)
-          | v3sigmalapl2[:,9]
-          | v3sigmalapltau[:,12]
-          | v3sigmatau2[:,9]    = (uu_u_u, uu_u_d, uu_d_d, ud_u_u, ud_u_d, ud_d_d, dd_u_u, dd_u_d, dd_d_d)
-          | v3lapl3[:,4]
-          | v3lapl2tau[:,6]
-          | v3lapltau2[:,6]
-          | v3tau3[:,4]         = (u_u_u, u_u_d, u_d_d, d_d_d)
-
-        see also libxc_itrf.c
+        decoded XC description, with the data structure
+        (hybrid, alpha, omega), ((libxc-Id, fac), (libxc-Id, fac), ...)
     '''  # noqa: E501
     hyb = [0, 0, 0]  # hybrid, alpha, omega (== SR_HF, LR_HF, omega)
     if description is None:
-        return hyb, []
+        return tuple(hyb), ()
     elif isinstance(description, int):
-        return hyb, [(description, 1.)]
+        return tuple(hyb), ((description, 1.),)
     elif not isinstance(description, str): #isinstance(description, (tuple,list)):
         return parse_xc('%s,%s' % tuple(description))
 
@@ -1245,10 +1169,14 @@ def parse_token(token, ftype, search_xc_alias=False):
                             x_id = possible_xc.pop()
                         x_id = XC_CODES[x_id]
                     else:
-                        raise KeyError('Unknown %s functional  %s' % (ftype, key))
+                        # Some libxc functionals may not be listed in the
+                        # XC_CODES table. Query libxc directly
+                        func_id = _itrf.xc_functional_get_number(ctypes.c_char_p(key.encode()))
+                        if func_id == -1:
+                            raise KeyError(f"LibXCFunctional: name '{key}' not found.")
                 if isinstance(x_id, str):
                     hyb1, fn_facs1 = parse_xc(x_id)
-# Recursively scale the composed functional, to support e.g. '0.5*b3lyp'
+                    # Recursively scale the composed functional, to support e.g. '0.5*b3lyp'
                     if hyb1[0] != 0 or hyb1[1] != 0:
                         assign_omega(hyb1[2], hyb1[0]*fac, hyb1[1]*fac)
                     fn_facs.extend([(xid, c*fac) for xid, c in fn_facs1])
@@ -1262,7 +1190,7 @@ def possible_x_for(key):
                     'HYB_GGA_X_'+key, 'HYB_MGGA_X_'+key))
     def possible_xc_for(key):
         return set((key, 'LDA_XC_'+key, 'GGA_XC_'+key, 'MGGA_XC_'+key,
-                    'HYB_GGA_XC_'+key, 'HYB_MGGA_XC_'+key))
+                    'HYB_LDA_XC_'+key, 'HYB_GGA_XC_'+key, 'HYB_MGGA_XC_'+key))
     def possible_k_for(key):
         return set((key,
                     'LDA_K_'+key, 'GGA_K_'+key,))
@@ -1295,7 +1223,7 @@ def possible_c_for(key):
             parse_token(token, 'compound XC', search_xc_alias=True)
     if hyb[2] == 0: # No omega is assigned. LR_HF is 0 for normal Coulomb operator
         hyb[1] = 0
-    return hyb, remove_dup(fn_facs)
+    return tuple(hyb), tuple(remove_dup(fn_facs))
 
 _NAME_WITH_DASH = {'SR-HF'    : 'SR_HF',
                    'LR-HF'    : 'LR_HF',
@@ -1473,7 +1401,7 @@ def eval_xc(xc_code, rho, spin=0, relativity=0, deriv=1, omega=None, verbose=Non
     '''  # noqa: E501
     hyb, fn_facs = parse_xc(xc_code)
     if omega is not None:
-        hyb[2] = float(omega)
+        hyb = hyb[:2] + (float(omega),)
     return _eval_xc(hyb, fn_facs, rho, spin, relativity, deriv, verbose)
 
 
diff --git a/pyscf/dft/numint.py b/pyscf/dft/numint.py
index 67d5e7cfcc..88b155f885 100644
--- a/pyscf/dft/numint.py
+++ b/pyscf/dft/numint.py
@@ -157,10 +157,7 @@ def eval_rho(mol, ao, dm, non0tab=None, xctype='LDA', hermi=0,
     >>> rho, dx_rho, dy_rho, dz_rho = eval_rho(mol, ao, dm, xctype='LDA')
     '''
     xctype = xctype.upper()
-    if xctype == 'LDA' or xctype == 'HF':
-        ngrids, nao = ao.shape
-    else:
-        ngrids, nao = ao[0].shape
+    ngrids, nao = ao.shape[-2:]
 
     shls_slice = (0, mol.nbas)
     ao_loc = mol.ao_loc_nr()
@@ -273,10 +270,7 @@ def eval_rho1(mol, ao, dm, screen_index=None, xctype='LDA', hermi=0,
         return eval_rho(mol, ao, dm, screen_index, xctype, hermi, with_lapl, verbose)
 
     xctype = xctype.upper()
-    if xctype == 'LDA' or xctype == 'HF':
-        ngrids = ao.shape[0]
-    else:
-        ngrids = ao.shape[1]
+    ngrids = ao.shape[-2]
 
     if cutoff is None:
         cutoff = CUTOFF
@@ -369,10 +363,7 @@ def eval_rho2(mol, ao, mo_coeff, mo_occ, non0tab=None, xctype='LDA',
         or (5,N) (with_lapl=False) where the last row is tau = 1/2(\nabla f)^2
     '''
     xctype = xctype.upper()
-    if xctype == 'LDA' or xctype == 'HF':
-        ngrids, nao = ao.shape
-    else:
-        ngrids, nao = ao[0].shape
+    ngrids, nao = ao.shape[-2:]
 
     shls_slice = (0, mol.nbas)
     ao_loc = mol.ao_loc_nr()
@@ -608,10 +599,7 @@ def eval_mat(mol, ao, weight, rho, vxc,
         number of AO functions.
     '''
     xctype = xctype.upper()
-    if xctype == 'LDA' or xctype == 'HF':
-        ngrids, nao = ao.shape
-    else:
-        ngrids, nao = ao[0].shape
+    ngrids, nao = ao.shape[-2:]
 
     if non0tab is None:
         non0tab = numpy.ones(((ngrids+BLKSIZE-1)//BLKSIZE,mol.nbas),
@@ -1174,36 +1162,6 @@ def block_loop(ao_deriv):
                               hermi=0, out=vmat[i])
         vmat = lib.hermi_sum(vmat, axes=(0,2,1))
 
-    elif xctype == 'NLC':
-        nlc_pars = ni.nlc_coeff(xc_code)
-        ao_deriv = 1
-        vvrho = []
-        for ao, mask, weight, coords \
-                in ni.block_loop(mol, grids, nao, ao_deriv, max_memory=max_memory):
-            vvrho.append([make_rho(idm, ao, mask, 'GGA') for idm in range(nset)])
-
-        vv_vxc = []
-        for i in range(nset):
-            rho = numpy.hstack([r[i] for r in vvrho])
-            exc, vxc = _vv10nlc(rho, grids.coords, rho, grids.weights,
-                                grids.coords, nlc_pars)
-            den = rho[0] * grids.weights
-            nelec[i] = den.sum()
-            excsum[i] = numpy.dot(den, exc)
-            vv_vxc.append(xc_deriv.transform_vxc(rho, vxc, 'GGA', spin=0))
-
-        p1 = 0
-        for ao, mask, weight, coords \
-                in ni.block_loop(mol, grids, nao, ao_deriv, max_memory=max_memory):
-            p0, p1 = p1, p1 + weight.size
-            for i in range(nset):
-                wv = vv_vxc[i][:,p0:p1] * weight
-                wv[0] *= .5  # *.5 because vmat + vmat.T at the end
-                aow = _scale_ao_sparse(ao[:4], wv[:4], mask, ao_loc, out=aow)
-                _dot_ao_ao_sparse(ao[0], aow, None, nbins, mask, pair_mask, ao_loc,
-                                  hermi=0, out=vmat[i])
-        vmat = lib.hermi_sum(vmat, axes=(0,2,1))
-
     elif xctype == 'MGGA':
         if (any(x in xc_code.upper() for x in ('CC06', 'CS', 'BR89', 'MK00'))):
             raise NotImplementedError('laplacian in meta-GGA method')
@@ -1279,12 +1237,6 @@ def nr_uks(ni, mol, grids, xc_code, dms, relativity=0, hermi=1,
     >>> nelec, exc, vxc = ni.nr_uks(mol, grids, 'lda,vwn', dm)
     '''
     xctype = ni._xc_type(xc_code)
-    if xctype == 'NLC':
-        dms_sf = dms[0] + dms[1]
-        nelec, excsum, vmat = nr_rks(ni, mol, grids, xc_code, dms_sf, relativity, hermi,
-                                     max_memory, verbose)
-        return [nelec,nelec], excsum, numpy.asarray([vmat,vmat])
-
     ao_loc = mol.ao_loc_nr()
     cutoff = grids.cutoff * 1e2
     nbins = NBINS * 2 - int(NBINS * numpy.log(cutoff) / numpy.log(grids.cutoff))
@@ -1398,6 +1350,77 @@ def _format_uks_dm(dms):
 nr_rks_vxc = nr_rks
 nr_uks_vxc = nr_uks
 
+def nr_nlc_vxc(ni, mol, grids, xc_code, dm, relativity=0, hermi=1,
+               max_memory=2000, verbose=None):
+    '''Calculate NLC functional and potential matrix on given grids
+
+    Args:
+        ni : an instance of :class:`NumInt`
+
+        mol : an instance of :class:`Mole`
+
+        grids : an instance of :class:`Grids`
+            grids.coords and grids.weights are needed for coordinates and weights of meshgrids.
+        xc_code : str
+            XC functional description.
+            See :func:`parse_xc` of pyscf/dft/libxc.py for more details.
+        dm : 2D array
+            Density matrix or multiple density matrices
+
+    Kwargs:
+        hermi : int
+            Input density matrices symmetric or not. It also indicates whether
+            the potential matrices in return are symmetric or not.
+        max_memory : int or float
+            The maximum size of cache to use (in MB).
+
+    Returns:
+        nelec, excsum, vmat.
+        nelec is the number of electrons generated by numerical integration.
+        excsum is the XC functional value.  vmat is the XC potential matrix in
+        2D array of shape (nao,nao) where nao is the number of AO functions.
+    '''
+    make_rho, nset, nao = ni._gen_rho_evaluator(mol, dm, hermi, False, grids)
+    assert nset == 1
+    ao_loc = mol.ao_loc_nr()
+    cutoff = grids.cutoff * 1e2
+    nbins = NBINS * 2 - int(NBINS * numpy.log(cutoff) / numpy.log(grids.cutoff))
+
+    ao_deriv = 1
+    vvrho = []
+    for ao, mask, weight, coords \
+            in ni.block_loop(mol, grids, nao, ao_deriv, max_memory=max_memory):
+        vvrho.append(make_rho(0, ao, mask, 'GGA'))
+    rho = numpy.hstack(vvrho)
+
+    exc = 0
+    vxc = 0
+    nlc_coefs = ni.nlc_coeff(xc_code)
+    for nlc_pars, fac in nlc_coefs:
+        e, v = _vv10nlc(rho, grids.coords, rho, grids.weights,
+                        grids.coords, nlc_pars)
+        exc += e * fac
+        vxc += v * fac
+    den = rho[0] * grids.weights
+    nelec = den.sum()
+    excsum = numpy.dot(den, exc)
+    vv_vxc = xc_deriv.transform_vxc(rho, vxc, 'GGA', spin=0)
+
+    pair_mask = mol.get_overlap_cond() < -numpy.log(ni.cutoff)
+    aow = None
+    vmat = numpy.zeros((nao,nao))
+    p1 = 0
+    for ao, mask, weight, coords \
+            in ni.block_loop(mol, grids, nao, ao_deriv, max_memory=max_memory):
+        p0, p1 = p1, p1 + weight.size
+        wv = vv_vxc[:,p0:p1] * weight
+        wv[0] *= .5
+        aow = _scale_ao_sparse(ao[:4], wv[:4], mask, ao_loc, out=aow)
+        _dot_ao_ao_sparse(ao[0], aow, None, nbins, mask, pair_mask, ao_loc,
+                          hermi=0, out=vmat)
+    vmat = vmat + vmat.T
+    return nelec, excsum, vmat
+
 def nr_rks_fxc(ni, mol, grids, xc_code, dm0, dms, relativity=0, hermi=0,
                rho0=None, vxc=None, fxc=None, max_memory=2000, verbose=None):
     '''Contract RKS XC (singlet hessian) kernel matrix with given density matrices
@@ -1477,7 +1500,7 @@ def block_loop(ao_deriv):
                 if xctype == 'LDA':
                     wv = weight * rho1 * _fxc[0]
                 else:
-                    wv = numpy.einsum('xg,xyg,g->yg', rho1, _fxc, weight)
+                    wv = numpy.einsum('yg,xyg,g->xg', rho1, _fxc, weight)
                 yield i, ao, mask, wv
 
     ao_loc = mol.ao_loc_nr()
@@ -1504,9 +1527,6 @@ def block_loop(ao_deriv):
         # [(\nabla mu) nu + mu (\nabla nu)] * fxc_jb = ((\nabla mu) nu f_jb) + h.c.
         vmat = lib.hermi_sum(vmat, axes=(0,2,1))
 
-    elif xctype == 'NLC':
-        raise NotImplementedError('NLC')
-
     elif xctype == 'MGGA':
         assert not MGGA_DENSITY_LAPL
         ao_deriv = 2 if MGGA_DENSITY_LAPL else 1
@@ -1537,8 +1557,6 @@ def nr_rks_fxc_st(ni, mol, grids, xc_code, dm0, dms_alpha, relativity=0, singlet
     Ref. CPL, 256, 454
     '''
     if fxc is None:
-        if dm0.ndim == 2:
-            dm0 = [dm0*.5] * 2
         fxc = ni.cache_xc_kernel1(mol, grids, xc_code, dm0, spin=1,
                                   max_memory=max_memory)[2]
     if singlet:
@@ -1808,9 +1826,6 @@ def block_loop(ao_deriv):
         # [(\nabla mu) nu + mu (\nabla nu)] * fxc_jb = ((\nabla mu) nu f_jb) + h.c.
         vmat = lib.hermi_sum(vmat.reshape(-1,nao,nao), axes=(0,2,1)).reshape(2,nset,nao,nao)
 
-    elif xctype == 'NLC':
-        raise NotImplementedError('NLC')
-
     elif xctype == 'MGGA':
         assert not MGGA_DENSITY_LAPL
         ao_deriv = 1
@@ -2565,19 +2580,22 @@ def cache_xc_kernel(ni, mol, grids, xc_code, mo_coeff, mo_occ, spin=0,
         ao_deriv = 1
     elif xctype == 'MGGA':
         ao_deriv = 2 if MGGA_DENSITY_LAPL else 1
-    elif xctype == 'NLC':
-        raise NotImplementedError('NLC')
     else:
         ao_deriv = 0
 
-    if spin == 0:
+    if mo_coeff[0].ndim == 1:  # RKS
         nao = mo_coeff.shape[0]
         rho = []
         for ao, mask, weight, coords \
                 in ni.block_loop(mol, grids, nao, ao_deriv, max_memory=max_memory):
             rho.append(ni.eval_rho2(mol, ao, mo_coeff, mo_occ, mask, xctype))
         rho = numpy.hstack(rho)
-    else:
+        if spin == 1:  # RKS with nr_rks_fxc_st
+            rho *= .5
+            rho = numpy.repeat(rho[numpy.newaxis], 2, axis=0)
+    else:  # UKS
+        assert mo_coeff[0].ndim == 2
+        assert spin == 1
         nao = mo_coeff[0].shape[0]
         rhoa = []
         rhob = []
@@ -2598,19 +2616,22 @@ def cache_xc_kernel1(ni, mol, grids, xc_code, dm, spin=0, max_memory=2000):
         ao_deriv = 1
     elif xctype == 'MGGA':
         ao_deriv = 2 if MGGA_DENSITY_LAPL else 1
-    elif xctype == 'NLC':
-        raise NotImplementedError('NLC')
     else:
         ao_deriv = 0
 
     make_rho, nset, nao = ni._gen_rho_evaluator(mol, dm, hermi=1)
-    if spin == 0:
+    if dm[0].ndim == 1:  # RKS
         rho = []
         for ao, mask, weight, coords \
                 in ni.block_loop(mol, grids, nao, ao_deriv, max_memory=max_memory):
             rho.append(make_rho(0, ao, mask, xctype))
         rho = numpy.hstack(rho)
-    else:
+        if spin == 1:  # RKS with nr_rks_fxc_st
+            rho *= .5
+            rho = numpy.repeat(rho[numpy.newaxis], 2, axis=0)
+    else:  # UKS
+        assert dm[0].ndim == 2
+        assert spin == 1
         rhoa = []
         rhob = []
         for ao, mask, weight, coords \
@@ -2778,8 +2799,8 @@ def rsh_and_hybrid_coeff(self, xc_code, spin=0):
                 = alpha * HFX + beta * SR_HFX + (1-c_SR) * Ex_SR + (1-c_LR) * Ex_LR + Ec
                 = alpha * LR_HFX + hyb * SR_HFX + (1-c_SR) * Ex_SR + (1-c_LR) * Ex_LR + Ec
 
-        SR_HFX = < pi | (1-e^{-omega r_{12}})/r_{12} | iq >
-        LR_HFX = < pi | e^{-omega r_{12}}/r_{12} | iq >
+        SR_HFX = < pi | (1-erf(-omega r_{12}))/r_{12} | iq >
+        LR_HFX = < pi | erf(-omega r_{12})/r_{12} | iq >
         alpha = c_LR
         beta = c_SR - c_LR
         '''
@@ -2823,6 +2844,7 @@ def nr_fxc(self, mol, grids, xc_code, dm0, dms, spin=0, relativity=0, hermi=0,
 
     nr_rks = nr_rks
     nr_uks = nr_uks
+    nr_nlc_vxc = nr_nlc_vxc
     nr_sap = nr_sap_vxc = nr_sap_vxc
     nr_rks_fxc = nr_rks_fxc
     nr_uks_fxc = nr_uks_fxc
diff --git a/pyscf/dft/numint2c.py b/pyscf/dft/numint2c.py
index 5c03ad7fdc..810937611b 100644
--- a/pyscf/dft/numint2c.py
+++ b/pyscf/dft/numint2c.py
@@ -653,6 +653,22 @@ def nr_vxc(self, mol, grids, xc_code, dms, spin=0, relativity=0, hermi=1,
         return n, exc, vmat
     get_vxc = nr_gks_vxc = nr_vxc
 
+    @lib.with_doc(numint.nr_nlc_vxc.__doc__)
+    def nr_nlc_vxc(self, mol, grids, xc_code, dm, spin=0, relativity=0, hermi=1,
+                   max_memory=2000, verbose=None):
+        assert dm.ndim == 2
+        nao = dm.shape[-1] // 2
+        # ground state density is always real
+        dm_a = dm[:nao,:nao].real
+        dm_b = dm[nao:,nao:].real
+        ni = self._to_numint1c()
+        n, exc, v = ni.nr_nlc_vxc(mol, grids, xc_code, dm_a+dm_b, relativity,
+                                  hermi, max_memory, verbose)
+        vmat = np.zeros_like(dm)
+        vmat[:nao,:nao] = v[0]
+        vmat[nao:,nao:] = v[1]
+        return n, exc, vmat
+
     @lib.with_doc(numint.nr_rks_fxc.__doc__)
     def nr_fxc(self, mol, grids, xc_code, dm0, dms, spin=0, relativity=0, hermi=0,
                rho0=None, vxc=None, fxc=None, max_memory=2000, verbose=None):
diff --git a/pyscf/dft/r_numint.py b/pyscf/dft/r_numint.py
index 45d1616803..4f45a71bda 100644
--- a/pyscf/dft/r_numint.py
+++ b/pyscf/dft/r_numint.py
@@ -116,7 +116,7 @@ def _eval_rho_2c(mol, ao, dm, non0tab=None, xctype='LDA', hermi=0, with_lapl=Fal
     shls_slice = (0, mol.nbas)
     ao_loc = mol.ao_loc_2c()
 
-    if xctype == 'LDA':
+    if xctype == 'LDA' or xctype == 'HF':
         c0 = _dot_spinor_dm(mol, ao, dm, non0tab, shls_slice, ao_loc)
         rho_m = _contract_rho_m(ao, c0, hermi, True)
     elif xctype == 'GGA':
diff --git a/pyscf/dft/rks.py b/pyscf/dft/rks.py
index 636c2d468e..c204386194 100644
--- a/pyscf/dft/rks.py
+++ b/pyscf/dft/rks.py
@@ -78,19 +78,21 @@ def get_veff(ks, mol=None, dm=None, dm_last=0, vhf_last=0, hermi=1):
     else:
         max_memory = ks.max_memory - lib.current_memory()[0]
         n, exc, vxc = ni.nr_rks(mol, ks.grids, ks.xc, dm, max_memory=max_memory)
-        if ks.nlc:
-            assert 'VV10' in ks.nlc.upper()
-            _, enlc, vnlc = ni.nr_rks(mol, ks.nlcgrids, ks.xc+'__'+ks.nlc, dm,
-                                      max_memory=max_memory)
+        logger.debug(ks, 'nelec by numeric integration = %s', n)
+        if ks.nlc or ni.libxc.is_nlc(ks.xc):
+            if ni.libxc.is_nlc(ks.xc):
+                xc = ks.xc
+            else:
+                assert ni.libxc.is_nlc(ks.nlc)
+                xc = ks.nlc
+            n, enlc, vnlc = ni.nr_nlc_vxc(mol, ks.nlcgrids, xc, dm,
+                                          max_memory=max_memory)
             exc += enlc
             vxc += vnlc
-        logger.debug(ks, 'nelec by numeric integration = %s', n)
+            logger.debug(ks, 'nelec with nlc grids = %s', n)
         t0 = logger.timer(ks, 'vxc', *t0)
 
-    #enabling range-separated hybrids
-    omega, alpha, hyb = ni.rsh_and_hybrid_coeff(ks.xc, spin=mol.spin)
-
-    if abs(hyb) < 1e-10 and abs(alpha) < 1e-10:
+    if not ni.libxc.is_hybrid_xc(ks.xc):
         vk = None
         if (ks._eri is None and ks.direct_scf and
             getattr(vhf_last, 'vj', None) is not None):
@@ -101,12 +103,13 @@ def get_veff(ks, mol=None, dm=None, dm_last=0, vhf_last=0, hermi=1):
             vj = ks.get_j(mol, dm, hermi)
         vxc += vj
     else:
+        omega, alpha, hyb = ni.rsh_and_hybrid_coeff(ks.xc, spin=mol.spin)
         if (ks._eri is None and ks.direct_scf and
             getattr(vhf_last, 'vk', None) is not None):
             ddm = numpy.asarray(dm) - numpy.asarray(dm_last)
             vj, vk = ks.get_jk(mol, ddm, hermi)
             vk *= hyb
-            if abs(omega) > 1e-10:  # For range separated Coulomb operator
+            if omega != 0:  # For range separated Coulomb
                 vklr = ks.get_k(mol, ddm, hermi, omega=omega)
                 vklr *= (alpha - hyb)
                 vk += vklr
@@ -115,7 +118,7 @@ def get_veff(ks, mol=None, dm=None, dm_last=0, vhf_last=0, hermi=1):
         else:
             vj, vk = ks.get_jk(mol, dm, hermi)
             vk *= hyb
-            if abs(omega) > 1e-10:
+            if omega != 0:
                 vklr = ks.get_k(mol, dm, hermi, omega=omega)
                 vklr *= (alpha - hyb)
                 vk += vklr
@@ -352,12 +355,14 @@ def dump_flags(self, verbose=None):
             if hasattr(self._numint.libxc, 'xc_reference'):
                 log.info(textwrap.indent('\n'.join(self._numint.libxc.xc_reference(self.xc)), '    '))
 
-        if self.nlc!='':
-            log.info('NLC functional = %s', self.nlc)
-
         self.grids.dump_flags(verbose)
-        if self.nlc!='':
-            log.info('** Following is NLC Grids **')
+
+        if self.nlc or self._numint.libxc.is_nlc(self.xc):
+            log.info('** Following is NLC and NLC Grids **')
+            if self.nlc:
+                log.info('NLC functional = %s', self.nlc)
+            else:
+                log.info('NLC functional = %s', self.xc)
             self.nlcgrids.dump_flags(verbose)
 
         log.info('small_rho_cutoff = %g', self.small_rho_cutoff)
@@ -480,17 +485,17 @@ def initialize_grids(self, mol=None, dm=None):
                                                     self.grids)
             t0 = logger.timer(self, 'setting up grids', *t0)
 
-        if self.nlc != '':
-            if self.nlcgrids.coords is None:
-                t0 = (logger.process_clock(), logger.perf_counter())
-                self.nlcgrids.build(with_non0tab=True)
-                if (self.small_rho_cutoff > 1e-20 and
-                    # dm.ndim == 2 indicates ground state
-                    isinstance(dm, numpy.ndarray) and dm.ndim == 2):
-                    # Filter grids the first time setup grids
-                    self.nlcgrids = prune_small_rho_grids_(self, self.mol, dm,
-                                                           self.nlcgrids)
-                t0 = logger.timer(self, 'setting up nlc grids', *t0)
+        is_nlc = self.nlc or self._numint.libxc.is_nlc(self.xc)
+        if is_nlc and self.nlcgrids.coords is None:
+            t0 = (logger.process_clock(), logger.perf_counter())
+            self.nlcgrids.build(with_non0tab=True)
+            if (self.small_rho_cutoff > 1e-20 and
+                # dm.ndim == 2 indicates ground state
+                isinstance(dm, numpy.ndarray) and dm.ndim == 2):
+                # Filter grids the first time setup grids
+                self.nlcgrids = prune_small_rho_grids_(self, self.mol, dm,
+                                                       self.nlcgrids)
+            t0 = logger.timer(self, 'setting up nlc grids', *t0)
         return self
 
 # Update the KohnShamDFT label in scf.hf module
diff --git a/pyscf/dft/test/test_h2o.py b/pyscf/dft/test/test_h2o.py
index 35a5b051a5..4e71c9bae5 100644
--- a/pyscf/dft/test/test_h2o.py
+++ b/pyscf/dft/test/test_h2o.py
@@ -330,7 +330,6 @@ def test_nr_uks_vv10(self):
         dm = method.get_init_guess()
         dm = (dm[0], dm[0])
         method.xc = 'wB97M_V'
-        method.nlc = 'vv10'
         method.grids.prune = None
         method.grids.atom_grid = {"H": (30, 86), "O": (30, 86),}
         method.nlcgrids.prune = None
@@ -347,29 +346,39 @@ def test_nr_uks_vv10(self):
         self.assertAlmostEqual(lib.fp(vxc[1]), 22.767504283729778, 8)
 
     def test_nr_rks_rsh(self):
+        method = dft.RKS(h2o)
+        dm = method.get_init_guess()
+        method.xc = 'wB97'
+        vxc = method.get_veff(h2o, dm)
+        self.assertAlmostEqual(lib.fp(vxc), 23.16975737295899, 8)
+
+    def test_nr_rks_nlc(self):
         method = dft.RKS(h2o)
         dm = method.get_init_guess()
         method.xc = 'wB97M_V'
         vxc = method.get_veff(h2o, dm)
-        self.assertAlmostEqual(lib.fp(vxc), 22.759558596896344, 8)
+        self.assertAlmostEqual(lib.fp(vxc), 22.767792068559917, 8)
 
+        method.xc = 'B97M_V'
+        vxc = method.get_veff(h2o, dm)
+        self.assertAlmostEqual(lib.fp(vxc), 23.067046560473408, 8)
+
+    def test_nr_rks_nlc_small_memory_high_cost(self):
+        method = dft.RKS(h2o)
+        dm = method.get_init_guess()
         method._eri = None
         method.max_memory = 0
         method.xc = 'wB97M_V'
         vxc = method.get_veff(h2o, dm, dm, vxc)
-        self.assertAlmostEqual(lib.fp(vxc), 22.759558596896344, 8)
-
-        method.xc = 'B97M_V'
-        vxc = method.get_veff(h2o, dm)
-        self.assertAlmostEqual(lib.fp(vxc), 23.058813088809824, 8)
+        self.assertAlmostEqual(lib.fp(vxc), 22.767792068559917, 8)
 
         method._eri = None
         method.max_memory = 0
         method.xc = 'B97M_V'
         vxc = method.get_veff(h2o, dm, dm, vxc)
-        self.assertAlmostEqual(lib.fp(vxc), 23.058813088809824, 8)
+        self.assertAlmostEqual(lib.fp(vxc), 23.067046560473408, 8)
 
-    def test_nr_rks_rsh_cart(self):
+    def test_nr_rks_rsh_cart_high_cost(self):
         mol1 = h2o.copy()
         mol1.basis = 'ccpvdz'
         mol1.cart = True
@@ -377,60 +386,87 @@ def test_nr_rks_rsh_cart(self):
         method = dft.RKS(mol1)
         method.xc = 'B97M_V'
         method.grids.atom_grid = {"H": (50, 194), "O": (50, 194),}
-        self.assertAlmostEqual(method.kernel(), -76.44022393692919, 8)
+        self.assertAlmostEqual(method.kernel(), -76.39753789383619, 8)
 
     def test_nr_uks_rsh(self):
+        method = dft.UKS(h2o)
+        dm = method.get_init_guess()
+        dm = (dm[0], dm[0])
+        method.xc = 'wB97'
+        vxc = method.get_veff(h2o, dm)
+        self.assertAlmostEqual(lib.fp(vxc[0]), 23.16975737295899, 8)
+        self.assertAlmostEqual(lib.fp(vxc[1]), 23.16975737295899, 8)
+
+    def test_nr_uks_nlc_high_cost(self):
         method = dft.UKS(h2o)
         dm = method.get_init_guess()
         dm = (dm[0], dm[0])
         method.xc = 'wB97M_V'
         vxc = method.get_veff(h2o, dm)
-        self.assertAlmostEqual(lib.fp(vxc[0]), 22.759558596896344, 8)
-        self.assertAlmostEqual(lib.fp(vxc[1]), 22.759558596896344, 8)
+        self.assertAlmostEqual(lib.fp(vxc[0]), 22.767792068559917, 8)
+        self.assertAlmostEqual(lib.fp(vxc[1]), 22.767792068559917, 8)
 
+        method.xc = 'B97M_V'
+        vxc = method.get_veff(h2o, dm)
+        self.assertAlmostEqual(lib.fp(vxc[0]), 23.067046560473408, 8)
+        self.assertAlmostEqual(lib.fp(vxc[1]), 23.067046560473408, 8)
+
+    def test_nr_uks_nlc_small_memory_high_cost(self):
+        method = dft.UKS(h2o)
+        dm = method.get_init_guess()
+        dm = (dm[0], dm[0])
         method._eri = None
         method.max_memory = 0
         method.xc = 'wB97M_V'
         vxc = method.get_veff(h2o, dm, dm, vxc)
-        self.assertAlmostEqual(lib.fp(vxc[0]), 22.759558596896344, 8)
-        self.assertAlmostEqual(lib.fp(vxc[1]), 22.759558596896344, 8)
-
-        method.xc = 'B97M_V'
-        vxc = method.get_veff(h2o, dm)
-        self.assertAlmostEqual(lib.fp(vxc[0]), 23.058813088809824, 8)
-        self.assertAlmostEqual(lib.fp(vxc[1]), 23.058813088809824, 8)
+        self.assertAlmostEqual(lib.fp(vxc[0]), 22.767792068559917, 8)
+        self.assertAlmostEqual(lib.fp(vxc[1]), 22.767792068559917, 8)
 
         method._eri = None
         method.max_memory = 0
         method.xc = 'B97M_V'
         vxc = method.get_veff(h2o, dm, dm, vxc)
-        self.assertAlmostEqual(lib.fp(vxc[0]), 23.058813088809824, 8)
-        self.assertAlmostEqual(lib.fp(vxc[1]), 23.058813088809824, 8)
+        self.assertAlmostEqual(lib.fp(vxc[0]), 23.067046560473408, 8)
+        self.assertAlmostEqual(lib.fp(vxc[1]), 23.067046560473408, 8)
 
     def test_nr_gks_rsh(self):
+        method = dft.GKS(h2o)
+        dm = method.get_init_guess()
+        dm = dm + numpy.sin(dm)*.02j
+        dm = dm + dm.conj().T
+        method.xc = 'wB97'
+        vxc = method.get_veff(h2o, dm)
+        self.assertAlmostEqual(lib.fp(vxc), 5.115622298912124+0j, 8)
+
+    def test_nr_gks_nlc_high_cost(self):
         method = dft.GKS(h2o)
         dm = method.get_init_guess()
         dm = dm + numpy.sin(dm)*.02j
         dm = dm + dm.conj().T
         method.xc = 'wB97M_V'
         vxc = method.get_veff(h2o, dm)
-        self.assertAlmostEqual(lib.fp(vxc), 3.1818982731583274+0j, 8)
+        self.assertAlmostEqual(lib.fp(vxc), 3.172920887028461+0j, 8)
 
+        method.xc = 'B97M_V'
+        vxc = method.get_veff(h2o, dm)
+        self.assertAlmostEqual(lib.fp(vxc), 2.0041673361905317+0j, 8)
+
+    def test_nr_gks_nlc_small_memory_high_cost(self):
+        method = dft.GKS(h2o)
+        dm = method.get_init_guess()
+        dm = dm + numpy.sin(dm)*.02j
+        dm = dm + dm.conj().T
         method._eri = None
         method.max_memory = 0
         method.xc = 'wB97M_V'
         vxc = method.get_veff(h2o, dm, dm, vxc)
-        self.assertAlmostEqual(lib.fp(vxc), 3.1818982731583274+0j, 8)
-
-        method.xc = 'B97M_V'
-        vxc = method.get_veff(h2o, dm)
-        self.assertAlmostEqual(lib.fp(vxc), 2.0131447223203565+0j, 8)
+        self.assertAlmostEqual(lib.fp(vxc), 3.172920887028461+0j, 8)
 
         method._eri = None
         method.max_memory = 0
         method.xc = 'B97M_V'
         vxc = method.get_veff(h2o, dm, dm, vxc)
-        self.assertAlmostEqual(lib.fp(vxc), 2.0131447223203565+0j, 8)
+        self.assertAlmostEqual(lib.fp(vxc), 2.0041673361905317+0j, 8)
 
     def test_nr_rks_vv10_high_cost(self):
         method = dft.RKS(h2o)
diff --git a/pyscf/dft/test/test_libxc.py b/pyscf/dft/test/test_libxc.py
index 4531d76bf0..b8ba89ac48 100644
--- a/pyscf/dft/test/test_libxc.py
+++ b/pyscf/dft/test/test_libxc.py
@@ -52,48 +52,48 @@ def test_parse_xc(self):
                                        (0.04, 0.36, 0.405, 0.595)))
         hyb, fn_facs = dft.libxc.parse_xc('HF,')
         self.assertEqual(hyb[0], 1)
-        self.assertEqual(fn_facs, [])
+        self.assertEqual(fn_facs, ())
 
         hyb, fn_facs = dft.libxc.parse_xc('B88 - SLATER')
-        self.assertEqual(fn_facs, [(106, 1), (1, -1)])
+        self.assertEqual(fn_facs, ((106, 1), (1, -1)))
         hyb, fn_facs = dft.libxc.parse_xc('B88 -SLATER*.5')
-        self.assertEqual(fn_facs, [(106, 1), (1, -0.5)])
+        self.assertEqual(fn_facs, ((106, 1), (1, -0.5)))
 
         hyb, fn_facs = dft.libxc.parse_xc('0.5*B3LYP\n+0.25*B3LYP')
-        self.assertTrue(numpy.allclose(hyb, [.15, 0, 0]))
+        self.assertTrue(numpy.allclose(hyb, (.15, 0, 0)))
         hyb = dft.libxc.hybrid_coeff('0.5*B3LYP+0.25*B3LYP')
         self.assertAlmostEqual(hyb, .15, 12)
 
         hyb, fn_facs = dft.libxc.parse_xc('0.6*CAM_B3LYP+0.4*B3P86')
-        self.assertTrue(numpy.allclose(hyb, [.08, 0, 0]))
+        self.assertTrue(numpy.allclose(hyb, (.08, 0, 0)))
         self.assertTrue(numpy.allclose(fn_facs,
-                                       [(433, 0.6), (1, 0.032), (106, 0.288), (132, 0.324), (7, 0.076)]))
+                                       ((433, 0.6), (1, 0.032), (106, 0.288), (132, 0.324), (7, 0.076))))
         rsh = dft.libxc.rsh_coeff('0.6*CAM_B3LYP+0.4*B3P86')
         self.assertTrue(numpy.allclose(rsh, (0.33, 0.39, -0.196)))
 
         hyb, fn_facs = dft.libxc.parse_xc('0.4*B3P86+0.6*CAM_B3LYP')
-        self.assertTrue(numpy.allclose(hyb, [.08, 0, 0]))
+        self.assertTrue(numpy.allclose(hyb, (.08, 0, 0)))
         self.assertTrue(numpy.allclose(fn_facs,
-                                       [(1, 0.032), (106, 0.288), (132, 0.324), (7, 0.076), (433, 0.6)]))
+                                       ((1, 0.032), (106, 0.288), (132, 0.324), (7, 0.076), (433, 0.6))))
         rsh = dft.libxc.rsh_coeff('0.4*B3P86+0.6*CAM_B3LYP')
         self.assertTrue(numpy.allclose(rsh, (0.33, 0.39, -0.196)))
 
         hyb, fn_facs = dft.libxc.parse_xc('0.5*SR-HF(0.3) + .8*HF + .22*LR_HF')
-        self.assertEqual(hyb, [1.3, 1.02, 0.3])
+        self.assertEqual(hyb, (1.3, 1.02, 0.3))
 
         hyb, fn_facs = dft.libxc.parse_xc('0.5*SR-HF + .22*LR_HF(0.3) + .8*HF')
-        self.assertEqual(hyb, [1.3, 1.02, 0.3])
+        self.assertEqual(hyb, (1.3, 1.02, 0.3))
 
         hyb, fn_facs = dft.libxc.parse_xc('0.5*SR-HF + .8*HF + .22*LR_HF(0.3)')
-        self.assertEqual(hyb, [1.3, 1.02, 0.3])
+        self.assertEqual(hyb, (1.3, 1.02, 0.3))
 
         hyb, fn_facs = dft.libxc.parse_xc('0.5*RSH(2.04;0.56;0.3) + 0.5*BP86')
-        self.assertEqual(hyb, [1.3, 1.02, 0.3])
-        self.assertEqual(fn_facs, [(106, 0.5), (132, 0.5)])
+        self.assertEqual(hyb, (1.3, 1.02, 0.3))
+        self.assertEqual(fn_facs, ((106, 0.5), (132, 0.5)))
 
         hyb, fn_facs = dft.libxc.parse_xc('0.5*RSH(.3, 2.04, 0.56) + 0.5*BP86')
-        self.assertEqual(hyb, [1.3, 1.02, 0.3])
-        self.assertEqual(fn_facs, [(106, 0.5), (132, 0.5)])
+        self.assertEqual(hyb, (1.3, 1.02, 0.3))
+        self.assertEqual(fn_facs, ((106, 0.5), (132, 0.5)))
 
         self.assertRaises(ValueError, dft.libxc.parse_xc, 'SR_HF(0.3) + LR_HF(.5)')
         self.assertRaises(ValueError, dft.libxc.parse_xc, 'LR-HF(0.3) + SR-HF(.5)')
@@ -102,28 +102,31 @@ def test_parse_xc(self):
         self.assertAlmostEqual(hyb, 0.28, 9)
 
         hyb, fn_facs = dft.libxc.parse_xc('APBE,')
-        self.assertEqual(fn_facs, [(184, 1)])
+        self.assertEqual(fn_facs, ((184, 1),))
+
+        hyb, fn_facs = dft.libxc.parse_xc('LDA0')
+        self.assertEqual(fn_facs, ((177, 1),))
 
         #hyb, fn_facs = dft.libxc.parse_xc('TF,')
         #self.assertEqual(fn_facs, [(50, 1)])
 
-        ref = [(1, 1), (7, 1)]
+        ref = ((1, 1), (7, 1))
         self.assertEqual(dft.libxc.parse_xc_name('LDA,VWN'), (1,7))
         self.assertEqual(dft.libxc.parse_xc(('LDA','VWN'))[1], ref)
         self.assertEqual(dft.libxc.parse_xc((1, 7))[1], ref)
         self.assertEqual(dft.libxc.parse_xc('1, 7')[1], ref)
-        self.assertEqual(dft.libxc.parse_xc(7)[1], [(7,1)])
+        self.assertEqual(dft.libxc.parse_xc(7)[1], ((7,1),))
 
-        self.assertEqual(dft.libxc.parse_xc('M11-L')[1], [(226,1),(75,1)])
-        self.assertEqual(dft.libxc.parse_xc('M11L' )[1], [(226,1),(75,1)])
-        self.assertEqual(dft.libxc.parse_xc('M11-L,M11L' )[1], [(226,1),(75,1)])
-        self.assertEqual(dft.libxc.parse_xc('M11_L,M11-L')[1], [(226,1),(75,1)])
-        self.assertEqual(dft.libxc.parse_xc('M11L,M11_L' )[1], [(226,1),(75,1)])
+        self.assertEqual(dft.libxc.parse_xc('M11-L')[1], ((226,1),(75,1)))
+        self.assertEqual(dft.libxc.parse_xc('M11L' )[1], ((226,1),(75,1)))
+        self.assertEqual(dft.libxc.parse_xc('M11-L,M11L' )[1], ((226,1),(75,1)))
+        self.assertEqual(dft.libxc.parse_xc('M11_L,M11-L')[1], ((226,1),(75,1)))
+        self.assertEqual(dft.libxc.parse_xc('M11L,M11_L' )[1], ((226,1),(75,1)))
 
-        self.assertEqual(dft.libxc.parse_xc('Xpbe,')[1], [(123,1)])
-        self.assertEqual(dft.libxc.parse_xc('pbe,' )[1], [(101,1)])
+        self.assertEqual(dft.libxc.parse_xc('Xpbe,')[1], ((123,1),))
+        self.assertEqual(dft.libxc.parse_xc('pbe,' )[1], ((101,1),))
         hyb, fn_facs = dft.libxc.parse_xc('PBE*.4+LDA')
-        self.assertEqual(fn_facs, [(101, 0.4), (130, 0.4), (1, 1)])
+        self.assertEqual(fn_facs, ((101, 0.4), (130, 0.4), (1, 1)))
         self.assertRaises(KeyError, dft.libxc.parse_xc, 'PBE+VWN')
 
         self.assertTrue (dft.libxc.is_meta_gga('m05'))
@@ -141,6 +144,12 @@ def test_parse_xc(self):
         self.assertFalse(dft.libxc.is_hybrid_xc('vv10'))
         self.assertTrue (dft.libxc.is_hybrid_xc((402,'vv10')))
         self.assertTrue (dft.libxc.is_hybrid_xc(('402','vv10')))
+        self.assertTrue (dft.libxc.is_nlc('b97mv'))
+        self.assertTrue (dft.libxc.is_nlc('lc-vv10'))
+        self.assertTrue (dft.libxc.is_nlc('scanl-vv10'))
+        self.assertTrue (dft.libxc.is_nlc('b97mv+pbe'))
+        self.assertTrue (dft.libxc.is_nlc((402, 'b97mv')))
+        self.assertTrue (dft.libxc.is_nlc(('402', 'b97mv')))
 
     def test_libxc_cam_beta(self):
         rsh_tmp = (ctypes.c_double*3)()
@@ -159,8 +168,8 @@ def test_libxc_cam_beta(self):
         self.assertEqual(beta, 0)
 
     def test_nlc_coeff(self):
-        self.assertEqual(dft.libxc.nlc_coeff('0.5*vv10'), [5.9, 0.0093])
-        self.assertEqual(dft.libxc.nlc_coeff('pbe__vv10'), [5.9, 0.0093])
+        self.assertEqual(dft.libxc.nlc_coeff('0.5*vv10'), (((5.9, 0.0093), .5),))
+        self.assertEqual(dft.libxc.nlc_coeff('pbe+vv10'), (((5.9, 0.0093), 1),))
 
     def test_lda(self):
         e,v,f,k = dft.libxc.eval_xc('lda,', rho[0], deriv=3)
@@ -261,7 +270,7 @@ def test_camb3lyp(self):
         self.assertAlmostEqual(float(vxc[0]), -0.7709812578936763, 7)
         self.assertAlmostEqual(float(vxc[1]), -0.0029862221286189846, 7)
 
-        self.assertEqual(dft.libxc.rsh_coeff('camb3lyp'), [0.33, 0.65, -0.46])
+        self.assertEqual(dft.libxc.rsh_coeff('camb3lyp'), (0.33, 0.65, -0.46))
 
         rho = numpy.array([1., 1., 0.1, 0.1]).reshape(-1,1)
         exc, vxc, fxc, kxc = dft.libxc.eval_xc('RSH(0.5,0.65,-0.46) + 0.46*ITYH + .35*B88,', rho, 0, deriv=1)
@@ -275,7 +284,7 @@ def test_ityh(self):
         self.assertAlmostEqual(float(exc), -0.6359945579326314, 7)
         self.assertAlmostEqual(float(vxc[0]), -0.8712041561251518, 7)
         self.assertAlmostEqual(float(vxc[1]), -0.003911167644579979, 7)
-        self.assertEqual(dft.libxc.rsh_coeff('ityh,'), [0.2, 0.0, 0.0])
+        self.assertEqual(dft.libxc.rsh_coeff('ityh,'), (0.2, 0.0, 0.0))
 
     def test_deriv_order(self):
         self.assertTrue(dft.libxc.test_deriv_order('lda', 3, raise_error=False))
@@ -289,6 +298,7 @@ def test_xc_type(self):
         self.assertEqual(dft.libxc.xc_type('hf'), 'HF')
         self.assertEqual(dft.libxc.xc_type(',vwn'), 'LDA')
         self.assertEqual(dft.libxc.xc_type('lda+b3lyp'), 'GGA')
+        self.assertEqual(dft.libxc.xc_type('wb97x_v'), 'GGA')
         self.assertEqual(dft.libxc.xc_type('wb97m_v'), 'MGGA')
         self.assertEqual(dft.libxc.xc_type('bp86'), 'GGA')
 
diff --git a/pyscf/dft/test/test_numint.py b/pyscf/dft/test/test_numint.py
index 5a17b8d828..d25826879a 100644
--- a/pyscf/dft/test/test_numint.py
+++ b/pyscf/dft/test/test_numint.py
@@ -481,11 +481,11 @@ def test_vv10nlc(self):
     def test_nr_uks_vxc_vv10(self):
         method = dft.UKS(h2o)
         dm = method.get_init_guess()
-        dm = (dm[0], dm[0])
         grids = dft.gen_grid.Grids(h2o)
         grids.atom_grid = {'H': (20, 50), 'O': (20,50)}
-        v = dft.numint.nr_vxc(h2o, grids, 'wB97M_V__vv10', dm, spin=1, hermi=0)[2]
-        self.assertAlmostEqual(lib.fp(v), 0.02293399033256055, 8)
+        ni = dft.numint.NumInt()
+        v = dft.numint.nr_nlc_vxc(ni, h2o, grids, 'wB97M_V', dm[0]*2, hermi=0)[2]
+        self.assertAlmostEqual(lib.fp([v, v]), 0.02293399033256055, 8)
 
     def test_uks_gga_wv1(self):
         numpy.random.seed(1)
diff --git a/pyscf/dft/test/test_xcfun.py b/pyscf/dft/test/test_xcfun.py
index d0be069f0a..a3778a2bdc 100644
--- a/pyscf/dft/test/test_xcfun.py
+++ b/pyscf/dft/test/test_xcfun.py
@@ -54,47 +54,47 @@ def test_parse_xc(self):
                                        (0.04, 0.36, 0.405, 0.595)))
         hyb, fn_facs = dft.xcfun.parse_xc('HF,')
         self.assertEqual(hyb[0], 1)
-        self.assertEqual(fn_facs, [])
+        self.assertEqual(fn_facs, ())
 
         hyb, fn_facs = dft.libxc.parse_xc('B88 - SLATER')
-        self.assertEqual(fn_facs, [(106, 1), (1, -1)])
+        self.assertEqual(fn_facs, ((106, 1), (1, -1)))
         hyb, fn_facs = dft.libxc.parse_xc('B88 -SLATER*.5')
-        self.assertEqual(fn_facs, [(106, 1), (1, -0.5)])
+        self.assertEqual(fn_facs, ((106, 1), (1, -0.5)))
 
         hyb, fn_facs = dft.xcfun.parse_xc('0.5*B3LYP+0.25*B3LYP')
-        self.assertTrue(numpy.allclose(hyb, [.15, 0, 0]))
+        self.assertTrue(numpy.allclose(hyb, (.15, 0, 0)))
         hyb = dft.libxc.hybrid_coeff('0.5*B3LYP+0.25*B3LYP')
         self.assertAlmostEqual(hyb, .15, 12)
 
         hyb, fn_facs = dft.xcfun.parse_xc('CAM_B3LYP')
-        self.assertTrue(numpy.allclose(hyb, [0.19, 0.65, 0.33]))
+        self.assertTrue(numpy.allclose(hyb, (0.19, 0.65, 0.33)))
 
         hyb, fn_facs = dft.xcfun.parse_xc('0.6*CAM_B3LYP+0.4*B3P86')
-        self.assertTrue(numpy.allclose(hyb, [.08+0.19*.6, 0.65*.6, 0.33]))
+        self.assertTrue(numpy.allclose(hyb, (.08+0.19*.6, 0.65*.6, 0.33)))
         self.assertTrue(numpy.allclose(fn_facs,
-                                       [(8, 0.276), (6, 0.498), (3, 0.19), (16, 0.486), (0, 0.032), (56, 0.324)]))
+                                       ((8, 0.276), (6, 0.498), (3, 0.19), (16, 0.486), (0, 0.032), (56, 0.324))))
         rsh = dft.xcfun.rsh_coeff('0.6*CAM_B3LYP+0.4*B3P86')
         self.assertTrue(numpy.allclose(rsh, (0.33, 0.39, -0.196)))
 
         hyb, fn_facs = dft.xcfun.parse_xc('0.4*B3P86+0.6*CAM_B3LYP')
-        self.assertTrue(numpy.allclose(hyb, [.08+0.19*.6, 0.65*.6, 0.33]))
+        self.assertTrue(numpy.allclose(hyb, (.08+0.19*.6, 0.65*.6, 0.33)))
         self.assertTrue(numpy.allclose(fn_facs,
-                                       [(0, 0.032), (6, 0.498), (56, 0.324), (3, 0.19), (8, 0.276), (16, 0.486)]))
+                                       ((0, 0.032), (6, 0.498), (56, 0.324), (3, 0.19), (8, 0.276), (16, 0.486))))
         rsh = dft.xcfun.rsh_coeff('0.4*B3P86+0.6*CAM_B3LYP')
         self.assertTrue(numpy.allclose(rsh, (0.33, 0.39, -0.196)))
 
         hyb, fn_facs = dft.xcfun.parse_xc('0.5*SR-HF(0.3) + .8*HF + .22*LR_HF')
-        self.assertEqual(hyb, [1.3, 1.02, 0.3])
+        self.assertEqual(hyb, (1.3, 1.02, 0.3))
 
         hyb, fn_facs = dft.xcfun.parse_xc('0.5*SR-HF + .22*LR_HF(0.3) + .8*HF')
-        self.assertEqual(hyb, [1.3, 1.02, 0.3])
+        self.assertEqual(hyb, (1.3, 1.02, 0.3))
 
         hyb, fn_facs = dft.xcfun.parse_xc('0.5*SR-HF + .8*HF + .22*LR_HF(0.3)')
-        self.assertEqual(hyb, [1.3, 1.02, 0.3])
+        self.assertEqual(hyb, (1.3, 1.02, 0.3))
 
         hyb, fn_facs = dft.xcfun.parse_xc('0.5*RSH(2.04;0.56;0.3) + 0.5*BP86')
-        self.assertEqual(hyb, [1.3, 1.02, 0.3])
-        self.assertEqual(fn_facs, [(6, 0.5), (56, 0.5)])
+        self.assertEqual(hyb, (1.3, 1.02, 0.3))
+        self.assertEqual(fn_facs, ((6, 0.5), (56, 0.5)))
 
         self.assertRaises(ValueError, dft.xcfun.parse_xc, 'SR_HF(0.3) + LR_HF(.5)')
         self.assertRaises(ValueError, dft.xcfun.parse_xc, 'LR-HF(0.3) + SR-HF(.5)')
@@ -106,30 +106,30 @@ def test_parse_xc(self):
         self.assertEqual(fn_facs[0][0], 68)
 
         hyb, fn_facs = dft.xcfun.parse_xc('VWN,')
-        self.assertEqual(fn_facs, [(3, 1)])
+        self.assertEqual(fn_facs, ((3, 1),))
 
         hyb, fn_facs = dft.xcfun.parse_xc('TF,')
-        self.assertEqual(fn_facs, [(24, 1)])
+        self.assertEqual(fn_facs, ((24, 1),))
 
-        ref = [(0, 1), (3, 1)]
+        ref = ((0, 1), (3, 1))
         self.assertEqual(dft.xcfun.parse_xc_name('LDA,VWN'), (0,3))
         self.assertEqual(dft.xcfun.parse_xc(('LDA','VWN'))[1], ref)
         self.assertEqual(dft.xcfun.parse_xc((0, 3))[1], ref)
         self.assertEqual(dft.xcfun.parse_xc('0, 3')[1], ref)
-        self.assertEqual(dft.xcfun.parse_xc(3)[1], [(3,1)])
+        self.assertEqual(dft.xcfun.parse_xc(3)[1], ((3,1),))
 
-        #self.assertEqual(dft.xcfun.parse_xc('M11-L')[1], [(226,1),(75,1)])
-        #self.assertEqual(dft.xcfun.parse_xc('M11L' )[1], [(226,1),(75,1)])
-        #self.assertEqual(dft.xcfun.parse_xc('M11-L,M11L' )[1], [(226,1),(75,1)])
-        #self.assertEqual(dft.xcfun.parse_xc('M11_L,M11-L')[1], [(226,1),(75,1)])
-        #self.assertEqual(dft.xcfun.parse_xc('M11L,M11_L' )[1], [(226,1),(75,1)])
+        #self.assertEqual(dft.xcfun.parse_xc('M11-L')[1], ((226,1),(75,1)))
+        #self.assertEqual(dft.xcfun.parse_xc('M11L' )[1], ((226,1),(75,1)))
+        #self.assertEqual(dft.xcfun.parse_xc('M11-L,M11L' )[1], ((226,1),(75,1)))
+        #self.assertEqual(dft.xcfun.parse_xc('M11_L,M11-L')[1], ((226,1),(75,1)))
+        #self.assertEqual(dft.xcfun.parse_xc('M11L,M11_L' )[1], ((226,1),(75,1)))
 
-        #self.assertEqual(dft.xcfun.parse_xc('Xpbe,')[1], [(123,1)])
-        #self.assertEqual(dft.xcfun.parse_xc('pbe,' )[1], [(101,1)])
+        #self.assertEqual(dft.xcfun.parse_xc('Xpbe,')[1], ((123,1),))
+        #self.assertEqual(dft.xcfun.parse_xc('pbe,' )[1], ((101,1),))
         hyb, fn_facs = dft.xcfun.parse_xc('PBE*.4+LDA')
-        self.assertEqual(fn_facs, [(5, 0.4), (4, 0.4), (0, 1)])
+        self.assertEqual(fn_facs, ((5, 0.4), (4, 0.4), (0, 1)))
         hyb, fn_facs = dft.xcfun.parse_xc('PBE*.4+VWN')
-        self.assertEqual(fn_facs, [(5, 0.4), (4, 0.4), (3, 1)])
+        self.assertEqual(fn_facs, ((5, 0.4), (4, 0.4), (3, 1)))
 
         self.assertTrue (dft.xcfun.is_meta_gga('m05'))
         self.assertFalse(dft.xcfun.is_meta_gga('pbe0'))
@@ -147,8 +147,8 @@ def test_parse_xc(self):
         self.assertTrue (dft.xcfun.is_hybrid_xc(('b3lyp', 4, 'vv10')))
 
     def test_nlc_coeff(self):
-        #self.assertEqual(dft.xcfun.nlc_coeff('0.5*vv10'), [5.9, 0.0093])
-        self.assertEqual(dft.xcfun.nlc_coeff('pbe__vv10'), [5.9, 0.0093])
+        self.assertEqual(dft.xcfun.nlc_coeff('0.5*vv10'), (((5.9, 0.0093), .5),))
+        self.assertEqual(dft.xcfun.nlc_coeff('pbe+vv10'), (((5.9, 0.0093), 1),))
 
     def test_lda(self):
         e,v,f,k = dft.xcfun.eval_xc('lda,', rho[0], deriv=3)
diff --git a/pyscf/dft/uks.py b/pyscf/dft/uks.py
index 49fdf0c54f..8596ac55c9 100644
--- a/pyscf/dft/uks.py
+++ b/pyscf/dft/uks.py
@@ -49,19 +49,21 @@ def get_veff(ks, mol=None, dm=None, dm_last=0, vhf_last=0, hermi=1):
     else:
         max_memory = ks.max_memory - lib.current_memory()[0]
         n, exc, vxc = ni.nr_uks(mol, ks.grids, ks.xc, dm, max_memory=max_memory)
-        if ks.nlc:
-            assert 'VV10' in ks.nlc.upper()
-            _, enlc, vnlc = ni.nr_rks(mol, ks.nlcgrids, ks.xc+'__'+ks.nlc, dm[0]+dm[1],
-                                      max_memory=max_memory)
+        logger.debug(ks, 'nelec by numeric integration = %s', n)
+        if ks.nlc or ni.libxc.is_nlc(ks.xc):
+            if ni.libxc.is_nlc(ks.xc):
+                xc = ks.xc
+            else:
+                assert ni.libxc.is_nlc(ks.nlc)
+                xc = ks.nlc
+            n, enlc, vnlc = ni.nr_nlc_vxc(mol, ks.nlcgrids, xc, dm[0]+dm[1],
+                                          max_memory=max_memory)
             exc += enlc
             vxc += vnlc
-        logger.debug(ks, 'nelec by numeric integration = %s', n)
+            logger.debug(ks, 'nelec with nlc grids = %s', n)
         t0 = logger.timer(ks, 'vxc', *t0)
 
-    #enabling range-separated hybrids
-    omega, alpha, hyb = ni.rsh_and_hybrid_coeff(ks.xc, spin=mol.spin)
-
-    if abs(hyb) < 1e-10 and abs(alpha) < 1e-10:
+    if not ni.libxc.is_hybrid_xc(ks.xc):
         vk = None
         if (ks._eri is None and ks.direct_scf and
             getattr(vhf_last, 'vj', None) is not None):
@@ -72,12 +74,13 @@ def get_veff(ks, mol=None, dm=None, dm_last=0, vhf_last=0, hermi=1):
             vj = ks.get_j(mol, dm[0]+dm[1], hermi)
         vxc += vj
     else:
+        omega, alpha, hyb = ni.rsh_and_hybrid_coeff(ks.xc, spin=mol.spin)
         if (ks._eri is None and ks.direct_scf and
             getattr(vhf_last, 'vk', None) is not None):
             ddm = numpy.asarray(dm) - numpy.asarray(dm_last)
             vj, vk = ks.get_jk(mol, ddm, hermi)
             vk *= hyb
-            if abs(omega) > 1e-10:
+            if omega != 0:
                 vklr = ks.get_k(mol, ddm, hermi, omega)
                 vklr *= (alpha - hyb)
                 vk += vklr
@@ -87,7 +90,7 @@ def get_veff(ks, mol=None, dm=None, dm_last=0, vhf_last=0, hermi=1):
             vj, vk = ks.get_jk(mol, dm, hermi)
             vj = vj[0] + vj[1]
             vk *= hyb
-            if abs(omega) > 1e-10:
+            if omega != 0:
                 vklr = ks.get_k(mol, dm, hermi, omega)
                 vklr *= (alpha - hyb)
                 vk += vklr
diff --git a/pyscf/dft/xcfun.py b/pyscf/dft/xcfun.py
index 27319f04f3..b9b537b0ee 100644
--- a/pyscf/dft/xcfun.py
+++ b/pyscf/dft/xcfun.py
@@ -24,6 +24,7 @@
 
 import copy
 import ctypes
+from functools import lru_cache
 import math
 import numpy
 from pyscf import lib
@@ -247,28 +248,28 @@
 MAX_DERIV_ORDER = 3
 
 VV10_XC = {
-    'B97M_V'    : [6.0, 0.01],
-    'WB97M_V'   : [6.0, 0.01],
-    'WB97X_V'   : [6.0, 0.01],
-    'VV10'      : [5.9, 0.0093],
-    'LC_VV10'   : [6.3, 0.0089],
-    'REVSCAN_VV10': [9.8, 0.0093],
-    'SCAN_RVV10'  : [15.7, 0.0093],
-    'SCAN_VV10'   : [14.0, 0.0093],
-    'SCANL_RVV10' : [15.7, 0.0093],
-    'SCANL_VV10'  : [14.0, 0.0093],
+    'B97M_V'    : (6.0, 0.01),
+    'WB97M_V'   : (6.0, 0.01),
+    'WB97X_V'   : (6.0, 0.01),
+    'VV10'      : (5.9, 0.0093),
+    'LC_VV10'   : (6.3, 0.0089),
+    'REVSCAN_VV10': (9.8, 0.0093),
+    'SCAN_RVV10'  : (15.7, 0.0093),
+    'SCAN_VV10'   : (14.0, 0.0093),
+    'SCANL_RVV10' : (15.7, 0.0093),
+    'SCANL_VV10'  : (14.0, 0.0093),
 }
 VV10_XC.update([(key.replace('_', ''), val) for key, val in VV10_XC.items()])
 
+@lru_cache(100)
 def xc_type(xc_code):
     if xc_code is None:
         return None
     elif isinstance(xc_code, str):
-        if is_nlc(xc_code):
-            return 'NLC'
         hyb, fn_facs = parse_xc(xc_code)
     else:
         fn_facs = [(xc_code, 1)]  # mimic fn_facs
+
     if not fn_facs:
         return 'HF'
     elif all(_itrf.XCFUN_xc_type(ctypes.c_int(xid)) == 0 for xid, val in fn_facs):
@@ -277,7 +278,7 @@ def xc_type(xc_code):
         return 'MGGA'
     else:
         # all((xid in GGA_IDS or xid in LDA_IDS for xid, val in fn_fns)):
-        # include hybrid_xc
+        # include hybrid_xc and NLC
         return 'GGA'
 
 def is_lda(xc_code):
@@ -299,27 +300,28 @@ def is_meta_gga(xc_code):
 def is_gga(xc_code):
     return xc_type(xc_code) == 'GGA'
 
+# Assign a temporary Id to VV10 functionals. parse_xc function needs them to
+# parse NLC functionals
+XC_CODES.update([(key, 5000+i) for i, key in enumerate(VV10_XC)])
+VV10_XC.update([(5000+i, VV10_XC[key]) for i, key in enumerate(VV10_XC)])
+
 def is_nlc(xc_code):
-    return '__VV10' in xc_code.upper()
+    fn_facs = parse_xc(xc_code)[1]
+    return any(xid >= 5000 for xid, c in fn_facs)
 
 def nlc_coeff(xc_code):
     '''Get NLC coefficients
     '''
     xc_code = xc_code.upper()
-
-    nlc_part = None
     if '__VV10' in xc_code:
-        xc_code, nlc_part = xc_code.split('__', 1)
+        raise RuntimeError('Deprecated notation for NLC functional.')
 
-    if xc_code in VV10_XC:
-        return VV10_XC[xc_code]
-    elif nlc_part is not None:
-        # Use VV10 NLC parameters by default for the general case
-        return VV10_XC[nlc_part]
-    else:
-        raise NotImplementedError(
-            '%s does not have NLC part. Available functionals are %s' %
-            (xc_code, ', '.join(VV10_XC.keys())))
+    fn_facs = parse_xc(xc_code)[1]
+    nlc_pars = []
+    for xid, fac in fn_facs:
+        if xid >= 5000:
+            nlc_pars.append((VV10_XC[xid], fac))
+    return tuple(nlc_pars)
 
 def rsh_coeff(xc_code):
     '''Get Range-separated-hybrid coefficients
@@ -330,7 +332,6 @@ def rsh_coeff(xc_code):
     return omega, alpha, beta
 
 def max_deriv_order(xc_code):
-    hyb, fn_facs = parse_xc(xc_code)
     return MAX_DERIV_ORDER
 
 def test_deriv_order(xc_code, deriv, raise_error=False):
@@ -341,8 +342,6 @@ def test_deriv_order(xc_code, deriv, raise_error=False):
     return support
 
 def hybrid_coeff(xc_code, spin=0):
-    if is_nlc(xc_code):
-        return 0
     hyb, fn_facs = parse_xc(xc_code)
     return hyb[0]
 
@@ -350,6 +349,7 @@ def parse_xc_name(xc_name):
     fn_facs = parse_xc(xc_name)[1]
     return fn_facs[0][0], fn_facs[1][0]
 
+@lru_cache(100)
 def parse_xc(description):
     r'''Rules to input functional description:
 
@@ -402,9 +402,9 @@ def parse_xc(description):
     '''
     hyb = [0, 0, 0]  # hybrid, alpha, omega
     if description is None:
-        return hyb, []
+        return tuple(hyb), ()
     elif isinstance(description, int):
-        return hyb, [(description, 1.)]
+        return tuple(hyb), ((description, 1.),)
     elif not isinstance(description, str): #isinstance(description, (tuple,list)):
         return parse_xc('%s,%s' % tuple(description))
 
@@ -494,7 +494,7 @@ def parse_token(token, suffix, search_xc_alias=False):
             parse_token(token, 'XC', search_xc_alias=True)
     if hyb[2] == 0: # No omega is assigned. LR_HF is 0 for normal Coulomb operator
         hyb[1] = 0
-    return hyb, remove_dup(fn_facs)
+    return tuple(hyb), tuple(remove_dup(fn_facs))
 
 _NAME_WITH_DASH = {'SR-HF'  : 'SR_HF',
                    'LR-HF'  : 'LR_HF',
@@ -512,7 +512,7 @@ def eval_xc(xc_code, rho, spin=0, relativity=0, deriv=1, omega=None, verbose=Non
     '''
     hyb, fn_facs = parse_xc(xc_code)
     if omega is not None:
-        hyb[2] = float(omega)
+        hyb = hyb[:2] + (float(omega),)
     return _eval_xc(hyb, fn_facs, rho, spin, relativity, deriv, verbose)
 
 XC_D0 = 0
diff --git a/pyscf/eph/rks.py b/pyscf/eph/rks.py
index a942f3eaa5..2cef97ebb6 100644
--- a/pyscf/eph/rks.py
+++ b/pyscf/eph/rks.py
@@ -109,6 +109,7 @@ def get_eph(ephobj, mo1, omega, vec, mo_rep):
     ni = mf._numint
     ni.libxc.test_deriv_order(mf.xc, 2, raise_error=True)
     omg, alpha, hyb = ni.rsh_and_hybrid_coeff(mf.xc, spin=mol.spin)
+    hybrid = ni.libxc.is_hybrid_xc(mf.xc)
 
     vnuc_deriv = ephobj.vnuc_generator(mol)
     aoslices = mol.aoslice_by_atom()
@@ -128,14 +129,14 @@ def get_eph(ephobj, mo1, omega, vec, mo_rep):
         shl0, shl1, p0, p1 = aoslices[ia]
         shls_slice = (shl0, shl1) + (0, mol.nbas)*3
 
-        if abs(hyb)>1e-10:
+        if hybrid:
             vj1, vk1 = \
                     rhf_hess._get_jk(mol, 'int2e_ip1', 3, 's2kl',
                                      ['ji->s2kl', -dm0[:,p0:p1], #vj1
                                       'li->s1kj', -dm0[:,p0:p1]], #vk1
                                      shls_slice=shls_slice)
             veff = vj1 - hyb * .5 * vk1
-            if abs(omg) > 1e-10:
+            if omg != 0:
                 with mol.with_range_coulomb(omg):
                     vk1 = \
                         rhf_hess._get_jk(mol, 'int2e_ip1', 3, 's2kl',
diff --git a/pyscf/eph/uks.py b/pyscf/eph/uks.py
index 2703c5d63f..56867bf19c 100644
--- a/pyscf/eph/uks.py
+++ b/pyscf/eph/uks.py
@@ -136,6 +136,7 @@ def get_eph(ephobj, mo1, omega, vec, mo_rep):
     ni = mf._numint
     ni.libxc.test_deriv_order(mf.xc, 2, raise_error=True)
     omg, alpha, hyb = ni.rsh_and_hybrid_coeff(mf.xc, spin=mol.spin)
+    hybrid = ni.libxc.is_hybrid_xc(mf.xc)
 
     vnuc_deriv = ephobj.vnuc_generator(mol)
     aoslices = mol.aoslice_by_atom()
@@ -161,7 +162,7 @@ def get_eph(ephobj, mo1, omega, vec, mo_rep):
         v1 = vind(moia)
         shl0, shl1, p0, p1 = aoslices[ia]
         shls_slice = (shl0, shl1) + (0, mol.nbas)*3
-        if abs(hyb)>1e-10:
+        if hybrid:
             vja, vjb, vka, vkb = \
                     rhf_hess._get_jk(mol, 'int2e_ip1', 3, 's2kl',
                                      ['ji->s2kl', -dm0a[:,p0:p1], #vja
@@ -171,7 +172,7 @@ def get_eph(ephobj, mo1, omega, vec, mo_rep):
                                      shls_slice=shls_slice)
             vhfa = vja + vjb - hyb * vka
             vhfb = vjb + vja - hyb * vkb
-            if abs(omg) > 1e-10:
+            if omg != 0:
                 with mol.with_range_coulomb(omg):
                     vka, vkb = \
                         rhf_hess._get_jk(mol, 'int2e_ip1', 3, 's2kl',
diff --git a/pyscf/fci/addons.py b/pyscf/fci/addons.py
index acd4fbe7c7..02e183faa9 100644
--- a/pyscf/fci/addons.py
+++ b/pyscf/fci/addons.py
@@ -33,7 +33,8 @@ def large_ci(ci, norb, nelec, tol=LARGE_CI_TOL, return_strs=RETURN_STRS):
     neleca, nelecb = _unpack_nelec(nelec)
     na = cistring.num_strings(norb, neleca)
     nb = cistring.num_strings(norb, nelecb)
-    assert (ci.shape == (na, nb))
+    assert ci.size == na * nb
+    ci = ci.reshape(na, nb)
     addra, addrb = numpy.where(abs(ci) > tol)
     if addra.size == 0:
         # No large CI coefficient > tol, search for the largest coefficient
@@ -83,116 +84,7 @@ def symm_initguess(norb, nelec, orbsym, wfnsym=0, irrep_nelec=None):
     Returns:
         CI coefficients 2D array which has the target symmetry.
     '''
-    neleca, nelecb = _unpack_nelec(nelec)
-    orbsym = numpy.asarray(orbsym)
-    if not isinstance(orbsym[0], numpy.number):
-        raise RuntimeError('TODO: convert irrep symbol to irrep id')
-
-    na = cistring.num_strings(norb, neleca)
-    nb = cistring.num_strings(norb, nelecb)
-    ci1 = numpy.zeros((na,nb))
-
-########################
-# pass 1: The fixed occs
-    orbleft = numpy.ones(norb, dtype=bool)
-    stra = numpy.zeros(norb, dtype=bool)
-    strb = numpy.zeros(norb, dtype=bool)
-    if irrep_nelec is not None:
-        for k,n in irrep_nelec.items():
-            orbleft[orbsym==k] = False
-            if isinstance(n, (int, numpy.number)):
-                idx = numpy.where(orbsym==k)[0][:n//2]
-                stra[idx] = True
-                strb[idx] = True
-            else:
-                na, nb = n
-                stra[numpy.where(orbsym==k)[0][:na]] = True
-                strb[numpy.where(orbsym==k)[0][:nb]] = True
-                if (na-nb)%2:
-                    wfnsym ^= k
-
-    orbleft = numpy.where(orbleft)[0]
-    neleca_left = neleca - stra.sum()
-    nelecb_left = nelecb - strb.sum()
-    spin = neleca_left - nelecb_left
-    assert (neleca_left >= 0)
-    assert (nelecb_left >= 0)
-    assert (spin >= 0)
-
-########################
-# pass 2: search pattern
-    def gen_str_iter(orb_list, nelec):
-        if nelec == 1:
-            for i in orb_list:
-                yield [i]
-        elif nelec >= len(orb_list):
-            yield orb_list
-        else:
-            restorb = orb_list[1:]
-            #yield from gen_str_iter(restorb, nelec)
-            for x in gen_str_iter(restorb, nelec):
-                yield x
-            for x in gen_str_iter(restorb, nelec-1):
-                yield [orb_list[0]] + x
-
-    # search for alpha and beta pattern which match to the required symmetry
-    def query(target, nelec_atmost, spin, orbsym):
-        norb = len(orbsym)
-        for excite_level in range(1, nelec_atmost+1):
-            for beta_only in gen_str_iter(list(range(norb)), excite_level):
-                alpha_allow = [i for i in range(norb) if i not in beta_only]
-                alpha_orbsym = orbsym[alpha_allow]
-                alpha_target = target
-                for i in beta_only:
-                    alpha_target ^= orbsym[i]
-                alpha_only = symm.route(alpha_target, spin+excite_level, alpha_orbsym)
-                if alpha_only:
-                    alpha_only = [alpha_allow[i] for i in alpha_only]
-                    return alpha_only, beta_only
-        raise RuntimeError('No pattern found for wfn irrep %s over orbsym %s'
-                           % (target, orbsym))
-
-    if spin == 0:
-        aonly = bonly = []
-        if wfnsym != 0:
-            aonly, bonly = query(wfnsym, neleca_left, spin, orbsym[orbleft])
-    else:
-        # 1. assume "nelecb_left" doubly occupied orbitals
-        # search for alpha pattern which match to the required symmetry
-        aonly, bonly = orbleft[symm.route(wfnsym, spin, orbsym[orbleft])], []
-        # dcompose doubly occupied orbitals, search for alpha and beta pattern
-        if len(aonly) != spin:
-            aonly, bonly = query(wfnsym, neleca_left, spin, orbsym[orbleft])
-
-    ndocc = neleca_left - len(aonly) # == nelecb_left - len(bonly)
-    docc_allow = numpy.ones(len(orbleft), dtype=bool)
-    docc_allow[aonly] = False
-    docc_allow[bonly] = False
-    docclst = orbleft[numpy.where(docc_allow)[0]][:ndocc]
-    stra[docclst] = True
-    strb[docclst] = True
-
-    def find_addr_(stra, aonly, nelec):
-        stra[orbleft[aonly]] = True
-        return cistring.str2addr(norb, nelec, ('%i'*norb)%tuple(stra)[::-1])
-    if bonly:
-        if spin > 0:
-            aonly, socc_only = aonly[:-spin], aonly[-spin:]
-            stra[orbleft[socc_only]] = True
-        stra1 = stra.copy()
-        strb1 = strb.copy()
-
-        addra = find_addr_(stra, aonly, neleca)
-        addrb = find_addr_(strb, bonly, nelecb)
-        addra1 = find_addr_(stra1, bonly, neleca)
-        addrb1 = find_addr_(strb1, aonly, nelecb)
-        ci1[addra,addrb] = ci1[addra1,addrb1] = numpy.sqrt(.5)
-    else:
-        addra = find_addr_(stra, aonly, neleca)
-        addrb = find_addr_(strb, bonly, nelecb)
-        ci1[addra,addrb] = 1
-
-    return ci1
+    raise DeprecationWarning
 
 
 def cylindrical_init_guess(mol, norb, nelec, orbsym, wfnsym=0, singlet=True,
@@ -246,9 +138,9 @@ def irrep_id2lz(irrep_id):
         raise NotImplementedError
         orb_lz = wfn_lz = d2h_wfnsym_id = None
 
-    occslsta = occslstb = cistring._gen_occslst(range(norb), neleca)
+    occslsta = occslstb = cistring.gen_occslst(range(norb), neleca)
     if neleca != nelecb:
-        occslstb = cistring._gen_occslst(range(norb), nelecb)
+        occslstb = cistring.gen_occslst(range(norb), nelecb)
     na = len(occslsta)
     nb = len(occslsta)
 
@@ -647,6 +539,9 @@ def fix_spin_(fciobj, shift=PENALTY, ss=None, **kwargs):
             A modified FCI object based on fciobj.
     '''
     import types
+    from pyscf.fci import direct_uhf
+    if isinstance(fciobj, direct_uhf.FCISolver):
+        raise NotImplementedError
 
     if 'ss_value' in kwargs:
         sys.stderr.write('fix_spin_: kwarg "ss_value" will be removed in future release. '
diff --git a/pyscf/fci/cistring.py b/pyscf/fci/cistring.py
index 62fc928e93..83597c97f0 100644
--- a/pyscf/fci/cistring.py
+++ b/pyscf/fci/cistring.py
@@ -41,7 +41,7 @@ def make_strings(orb_list, nelec):
     '''
     orb_list = list(orb_list)
     if len(orb_list) > 63:
-        return _gen_occslst(orb_list, nelec)
+        return gen_occslst(orb_list, nelec)
 
     assert (nelec >= 0)
     if nelec == 0:
@@ -68,8 +68,24 @@ def gen_str_iter(orb_list, nelec):
     return numpy.asarray(strings, dtype=numpy.int64)
 gen_strings4orblist = make_strings
 
-def _gen_occslst(orb_list, nelec):
+def gen_occslst(orb_list, nelec):
     '''Generate occupied orbital list for each string.
+
+    Returns:
+        List of lists of int32. Each inner list has length equal to the number of
+        electrons, and contains the occupied orbitals in the corresponding string.
+
+    Example:
+
+        >>> [bin(x) for x in make_strings((0, 1, 2, 3), 2)]
+        ['0b11', '0b101', '0b110', '0b1001', '0b1010', '0b1100']
+        >>> gen_occslst((0, 1, 2, 3), 2)
+        OIndexList([[0, 1],
+                    [0, 2],
+                    [1, 2],
+                    [0, 3],
+                    [1, 3],
+                    [2, 3]], dtype=int32)
     '''
     orb_list = list(orb_list)
     assert (nelec >= 0)
@@ -117,7 +133,7 @@ def gen_linkstr_index_o0(orb_list, nelec, strs=None):
     if strs is None:
         strs = make_strings(orb_list, nelec)
     strdic = dict(zip(strs,range(strs.__len__())))
-    def propgate1e(str0):
+    def propagate1e(str0):
         occ = []
         vir = []
         for i in orb_list:
@@ -135,7 +151,7 @@ def propgate1e(str0):
                 linktab.append((a, i, strdic[str1], cre_des_sign(a, i, str0)))
         return linktab
 
-    t = [propgate1e(s) for s in strs.astype(numpy.int64)]
+    t = [propagate1e(s) for s in strs.astype(numpy.int64)]
     return numpy.array(t, dtype=numpy.int32)
 
 def gen_linkstr_index_o1(orb_list, nelec, strs=None, tril=False):
@@ -143,7 +159,7 @@ def gen_linkstr_index_o1(orb_list, nelec, strs=None, tril=False):
         return numpy.zeros((0,0,4), dtype=numpy.int32)
 
     if strs is None:
-        strs = _gen_occslst(orb_list, nelec)
+        strs = gen_occslst(orb_list, nelec)
     occslst = strs
 
     orb_list = numpy.asarray(orb_list)
diff --git a/pyscf/fci/direct_nosym.py b/pyscf/fci/direct_nosym.py
index b9c597ae7b..6c2f002d73 100644
--- a/pyscf/fci/direct_nosym.py
+++ b/pyscf/fci/direct_nosym.py
@@ -40,7 +40,7 @@
 from pyscf.fci import cistring
 from pyscf.fci import direct_spin1
 
-libfci = lib.load_library('libfci')
+libfci = direct_spin1.libfci
 
 def contract_1e(h1e, fcivec, norb, nelec, link_index=None):
     h1e = numpy.asarray(h1e, order='C')
@@ -118,7 +118,7 @@ def absorb_h1e(h1e, eri, norb, nelec, fac=1):
     '''
     if not isinstance(nelec, (int, numpy.number)):
         nelec = sum(nelec)
-    h2e = ao2mo.restore(1, eri.copy(), norb)
+    h2e = ao2mo.restore(1, eri.copy(), norb).astype(h1e.dtype, copy=False)
     f1e = h1e - numpy.einsum('jiik->jk', h2e) * .5
     f1e = f1e * (1./(nelec+1e-100))
     for k in range(norb):
diff --git a/pyscf/fci/direct_spin0.py b/pyscf/fci/direct_spin0.py
index fe90b5bd08..7de41938a7 100644
--- a/pyscf/fci/direct_spin0.py
+++ b/pyscf/fci/direct_spin0.py
@@ -49,12 +49,15 @@
 from pyscf.fci import direct_spin1
 from pyscf.fci.spin_op import contract_ss
 
-libfci = lib.load_library('libfci')
+libfci = direct_spin1.libfci
 
 @lib.with_doc(direct_spin1.contract_1e.__doc__)
 def contract_1e(f1e, fcivec, norb, nelec, link_index=None):
     fcivec = numpy.asarray(fcivec, order='C')
-    link_index = _unpack(norb, nelec, link_index)
+    link_index = direct_spin1._unpack(norb, nelec, link_index)
+    if not isinstance(link_index, numpy.ndarray):
+        # Handle computability. link_index should be (nparray, nparray)
+        link_index = link_index[0]
     na, nlink = link_index.shape[:2]
     assert (fcivec.size == na**2)
     ci1 = numpy.empty_like(fcivec)
@@ -84,7 +87,10 @@ def contract_2e(eri, fcivec, norb, nelec, link_index=None):
     eri = ao2mo.restore(4, eri, norb)
     lib.transpose_sum(eri, inplace=True)
     eri *= .5
-    link_index = _unpack(norb, nelec, link_index)
+    link_index = direct_spin1._unpack(norb, nelec, link_index)
+    if not isinstance(link_index, numpy.ndarray):
+        # Handle computability. link_index should be (nparray, nparray)
+        link_index = link_index[0]
     na, nlink = link_index.shape[:2]
     assert (fcivec.size == na**2)
     ci1 = numpy.empty((na,na))
@@ -102,7 +108,7 @@ def contract_2e(eri, fcivec, norb, nelec, link_index=None):
 absorb_h1e = direct_spin1.absorb_h1e
 
 @lib.with_doc(direct_spin1.make_hdiag.__doc__)
-def make_hdiag(h1e, eri, norb, nelec):
+def make_hdiag(h1e, eri, norb, nelec, compress=False):
     hdiag = direct_spin1.make_hdiag(h1e, eri, norb, nelec)
     na = int(numpy.sqrt(hdiag.size))
 # symmetrize hdiag to reduce numerical error
@@ -234,91 +240,93 @@ def kernel_ms0(fci, h1e, eri, norb, nelec, ci0=None, link_index=None,
         max_memory = fci.max_memory - lib.current_memory()[0]
     log = logger.new_logger(fci, verbose)
 
-    assert (fci.spin is None or fci.spin == 0)
-    assert (0 <= numpy.sum(nelec) <= norb*2)
+    nelec = direct_spin1._unpack_nelec(nelec, fci.spin)
+    assert (0 <= nelec[0] <= norb and 0 <= nelec[1] <= norb)
 
-    link_index = _unpack(norb, nelec, link_index)
-    h1e = numpy.ascontiguousarray(h1e)
-    eri = numpy.ascontiguousarray(eri)
-    na = link_index.shape[0]
+    hdiag = fci.make_hdiag(h1e, eri, norb, nelec, compress=True)
+    if getattr(fci, 'sym_allowed_idx', None):
+        # Remove symmetry forbidden elements
+        sym_idx = numpy.hstack(fci.sym_allowed_idx)
+        civec_size = sym_idx.size
+    else:
+        sym_idx = None
+        civec_size = hdiag.size
 
-    if max_memory < na**2*6*8e-6:
+    if max_memory < hdiag.size*6*8e-6:
         log.warn('Not enough memory for FCI solver. '
-                 'The minimal requirement is %.0f MB', na**2*60e-6)
+                 'The minimal requirement is %.0f MB', hdiag.size*60e-6)
 
-    hdiag = fci.make_hdiag(h1e, eri, norb, nelec)
+    pspace_size = min(hdiag.size, pspace_size)
     nroots = min(hdiag.size, nroots)
-
-    try:
-        addr, h0 = fci.pspace(h1e, eri, norb, nelec, hdiag, max(pspace_size,nroots))
-        if pspace_size > 0:
-            pw, pv = fci.eig(h0)
+    na = cistring.num_strings(norb, nelec[0])
+    addr = [0]
+    pw = pv = None
+    if pspace_size > 0:
+        try:
+            addr, h0 = fci.pspace(h1e, eri, norb, nelec, hdiag, pspace_size)
+        except NotImplementedError:
+            pass
+        pw, pv = fci.eig(h0)
+
+    if pspace_size >= civec_size and ci0 is None and not davidson_only:
+        e = []
+        civec = []
+        for i in range(pspace_size):
+            c = numpy.empty(civec_size)
+            c[addr] = pv[:,i]
+            try:
+                civec.append(_check_(c.reshape(na,na)))
+            except ValueError:
+                continue
+            e.append(pw[i])
+            if len(civec) >= nroots:
+                break
+        if nroots == 1:
+            return e[0]+ecore, civec[0]
         else:
-            pw = pv = None
-
-        if pspace_size >= na*na and ci0 is None and not davidson_only:
-            # The degenerated wfn can break symmetry.  The davidson iteration with proper
-            # initial guess doesn't have this issue
-            if na*na == 1:
-                return pw[0]+ecore, pv[:,0].reshape(1,1).view(direct_spin1.FCIvector)
-            elif nroots > 1:
-                civec = numpy.empty((nroots,na*na))
-                civec[:,addr] = pv[:,:nroots].T
-                civec = civec.reshape(nroots,na,na)
-                try:
-                    return (pw[:nroots]+ecore,
-                            [_check_(ci).view(direct_spin1.FCIvector) for ci in civec])
-                except ValueError:
-                    pass
-            elif abs(pw[0]-pw[1]) > 1e-12:
-                civec = numpy.empty((na*na))
-                civec[addr] = pv[:,0]
-                civec = civec.reshape(na,na)
-                civec = lib.transpose_sum(civec) * .5
-                # direct diagonalization may lead to triplet ground state
-
-                #TODO: optimize initial guess.  Using pspace vector as initial guess may have
-                # spin problems.  The 'ground state' of psapce vector may have different spin
-                # state to the true ground state.
-                try:
-                    return (pw[0]+ecore,
-                            _check_(civec.reshape(na,na)).view(direct_spin1.FCIvector))
-                except ValueError:
-                    pass
-    except NotImplementedError:
-        addr = [0]
-        pw = pv = None
-
-    precond = fci.make_precond(hdiag, pw, pv, addr)
+            return numpy.array(e)+ecore, civec
+    pw = pv = h0 = None
+
+    if hdiag.size == civec_size:
+        precond = fci.make_precond(hdiag)
+    else:
+        precond = fci.make_precond(hdiag[sym_idx])
 
     h2e = fci.absorb_h1e(h1e, eri, norb, nelec, .5)
     if hop is None:
+        cpu0 = [logger.process_clock(), logger.perf_counter()]
         def hop(c):
             hc = fci.contract_2e(h2e, c.reshape(na,na), norb, nelec, link_index)
+            cpu0[:] = log.timer_debug1('contract_2e', *cpu0)
             return hc.ravel()
 
-#TODO: check spin of initial guess
-    if ci0 is None:
+    def init_guess():
         if callable(getattr(fci, 'get_init_guess', None)):
-            ci0 = lambda: fci.get_init_guess(norb, nelec, nroots, hdiag)
+            return fci.get_init_guess(norb, nelec, nroots, hdiag)
         else:
-            def ci0():
-                x0 = []
-                for i in range(nroots):
-                    x = numpy.zeros((na,na))
-                    addra = addr[i] // na
-                    addrb = addr[i] % na
-                    if addra == addrb:
-                        x[addra,addrb] = 1
-                    else:
-                        x[addra,addrb] = x[addrb,addra] = numpy.sqrt(.5)
-                    x0.append(x.ravel())
-                return x0
+            x0 = []
+            for i in range(nroots):
+                x = numpy.zeros((na,na))
+                addra = addr[i] // na
+                addrb = addr[i] % na
+                if addra == addrb:
+                    x[addra,addrb] = 1
+                else:
+                    x[addra,addrb] = x[addrb,addra] = numpy.sqrt(.5)
+                x0.append(x.ravel())
+            return x0
+
+    if ci0 is None:
+        ci0 = init_guess  # lazy initialization to reduce memory footprint
     elif not callable(ci0):
-        if isinstance(ci0, numpy.ndarray) and ci0.size == na*na:
+        if isinstance(ci0, numpy.ndarray):
             ci0 = [ci0.ravel()]
         else:
             ci0 = [x.ravel() for x in ci0]
+        if sym_idx is not None and ci0[0].size != civec_size:
+            ci0 = [x[sym_idx] for x in ci0]
+        if len(ci0) < nroots:
+            ci0.extend(init_guess()[len(ci0):])
 
     if tol is None: tol = fci.conv_tol
     if lindep is None: lindep = fci.lindep
@@ -327,16 +335,11 @@ def ci0():
     tol_residual = getattr(fci, 'conv_tol_residual', None)
 
     with lib.with_omp_threads(fci.threads):
-        #e, c = lib.davidson(hop, ci0, precond, tol=fci.conv_tol, lindep=fci.lindep)
         e, c = fci.eig(hop, ci0, precond, tol=tol, lindep=lindep,
                        max_cycle=max_cycle, max_space=max_space, nroots=nroots,
                        max_memory=max_memory, verbose=log, follow_state=True,
                        tol_residual=tol_residual, **kwargs)
-    if nroots > 1:
-        return (e+ecore,
-                [_check_(ci.reshape(na,na)).view(direct_spin1.FCIvector) for ci in c])
-    else:
-        return e+ecore, _check_(c.reshape(na,na)).view(direct_spin1.FCIvector)
+    return e+ecore, c
 
 def _check_(c):
     c = lib.transpose_sum(c, inplace=True)
@@ -349,8 +352,7 @@ def _check_(c):
 
 class FCISolver(direct_spin1.FCISolver):
 
-    def make_hdiag(self, h1e, eri, norb, nelec):
-        return make_hdiag(h1e, eri, norb, nelec)
+    make_hdiag = staticmethod(make_hdiag)
 
     def contract_1e(self, f1e, fcivec, norb, nelec, link_index=None, **kwargs):
         return contract_1e(f1e, fcivec, norb, nelec, link_index, **kwargs)
@@ -365,14 +367,24 @@ def kernel(self, h1e, eri, norb, nelec, ci0=None,
                tol=None, lindep=None, max_cycle=None, max_space=None,
                nroots=None, davidson_only=None, pspace_size=None,
                orbsym=None, wfnsym=None, ecore=0, **kwargs):
+        if nroots is None: nroots = self.nroots
         if self.verbose >= logger.WARN:
             self.check_sanity()
+        assert self.spin is None or self.spin == 0
         self.norb = norb
         self.nelec = nelec
-        self.eci, self.ci = \
-                kernel_ms0(self, h1e, eri, norb, nelec, ci0, None,
-                           tol, lindep, max_cycle, max_space, nroots,
-                           davidson_only, pspace_size, ecore=ecore, **kwargs)
+        link_index = direct_spin1._unpack(norb, nelec, None)
+        e, c = kernel_ms0(self, h1e, eri, norb, nelec, ci0, link_index,
+                          tol, lindep, max_cycle, max_space, nroots,
+                          davidson_only, pspace_size, ecore=ecore, **kwargs)
+        self.eci = e
+
+        na = link_index[0].shape[0]
+        if nroots > 1:
+            self.ci = [
+                _check_(x.reshape(na,na)).view(direct_spin1.FCIvector) for x in c]
+        else:
+            self.ci = _check_(c.reshape(na,na)).view(direct_spin1.FCIvector)
         return self.eci, self.ci
 
     def energy(self, h1e, eri, fcivec, norb, nelec, link_index=None):
@@ -416,18 +428,6 @@ def gen_linkstr(self, norb, nelec, tril=True, spin=None):
 FCI = FCISolver
 
 
-def _unpack(norb, nelec, link_index):
-    if link_index is None:
-        if isinstance(nelec, (int, numpy.number)):
-            neleca = nelec//2
-        else:
-            neleca, nelecb = nelec
-            assert (neleca == nelecb)
-        return cistring.gen_linkstr_index_trilidx(range(norb), neleca)
-    else:
-        return link_index
-
-
 if __name__ == '__main__':
     from functools import reduce
     from pyscf import gto
diff --git a/pyscf/fci/direct_spin0_symm.py b/pyscf/fci/direct_spin0_symm.py
index fc6be225f4..ed6feb92f9 100644
--- a/pyscf/fci/direct_spin0_symm.py
+++ b/pyscf/fci/direct_spin0_symm.py
@@ -44,105 +44,20 @@
 from pyscf.fci import direct_spin1
 from pyscf.fci import direct_spin1_symm
 from pyscf.fci import addons
-from pyscf.fci.spin_op import contract_ss
 from pyscf import __config__
 
-libfci = lib.load_library('libfci')
+libfci = direct_spin1.libfci
 
-TOTIRREPS = 8
-
-def contract_1e(f1e, fcivec, norb, nelec, link_index=None, orbsym=None):
-    return direct_spin0.contract_1e(f1e, fcivec, norb, nelec, link_index)
-
-# Note eri is NOT the 2e hamiltonian matrix, the 2e hamiltonian is
-# h2e = eri_{pq,rs} p^+ q r^+ s
-#     = (pq|rs) p^+ r^+ s q - (pq|rs) \delta_{qr} p^+ s
-# so eri is defined as
-#       eri_{pq,rs} = (pq|rs) - (1/Nelec) \sum_q (pq|qs)
-# to restore the symmetry between pq and rs,
-#       eri_{pq,rs} = (pq|rs) - (.5/Nelec) [\sum_q (pq|qs) + \sum_p (pq|rp)]
-# Please refer to the treatment in direct_spin1.absorb_h1e
-# the input fcivec should be symmetrized
 def contract_2e(eri, fcivec, norb, nelec, link_index=None, orbsym=None, wfnsym=0):
-    if orbsym is None:
-        return direct_spin0.contract_2e(eri, fcivec, norb, nelec, link_index)
-
-    eri = ao2mo.restore(4, eri, norb)
-    neleca, nelecb = direct_spin1._unpack_nelec(nelec)
-    assert (neleca == nelecb)
-    link_indexa = direct_spin0._unpack(norb, nelec, link_index)
-    na, nlinka = link_indexa.shape[:2]
-    eri_irs, rank_eri, irrep_eri = direct_spin1_symm.reorder_eri(eri, norb, orbsym)
-
-    strsa = numpy.asarray(cistring.gen_strings4orblist(range(norb), neleca))
-    aidx, link_indexa = direct_spin1_symm.gen_str_irrep(strsa, orbsym, link_indexa,
-                                                        rank_eri, irrep_eri)
-
-    Tirrep = ctypes.c_void_p*TOTIRREPS
-    linka_ptr = Tirrep(*[x.ctypes.data_as(ctypes.c_void_p) for x in link_indexa])
-    eri_ptrs = Tirrep(*[x.ctypes.data_as(ctypes.c_void_p) for x in eri_irs])
-    dimirrep = (ctypes.c_int*TOTIRREPS)(*[x.shape[0] for x in eri_irs])
-    fcivec_shape = fcivec.shape
-    fcivec = fcivec.reshape((na,na), order='C')
-    ci1new = numpy.zeros_like(fcivec)
-    nas = (ctypes.c_int*TOTIRREPS)(*[x.size for x in aidx])
-
-    ci0 = []
-    ci1 = []
-    wfnsym_in_d2h = wfnsym % 10
-    for ir in range(TOTIRREPS):
-        ma, mb = aidx[ir].size, aidx[wfnsym_in_d2h ^ ir].size
-        ci0.append(numpy.zeros((ma,mb)))
-        ci1.append(numpy.zeros((ma,mb)))
-        if ma > 0 and mb > 0:
-            lib.take_2d(fcivec, aidx[ir], aidx[wfnsym_in_d2h ^ ir], out=ci0[ir])
-    ci0_ptrs = Tirrep(*[x.ctypes.data_as(ctypes.c_void_p) for x in ci0])
-    ci1_ptrs = Tirrep(*[x.ctypes.data_as(ctypes.c_void_p) for x in ci1])
-    libfci.FCIcontract_2e_symm1(eri_ptrs, ci0_ptrs, ci1_ptrs,
-                                ctypes.c_int(norb), nas, nas,
-                                ctypes.c_int(nlinka), ctypes.c_int(nlinka),
-                                linka_ptr, linka_ptr, dimirrep,
-                                ctypes.c_int(wfnsym_in_d2h))
-    for ir in range(TOTIRREPS):
-        if ci0[ir].size > 0:
-            lib.takebak_2d(ci1new, ci1[ir], aidx[ir], aidx[wfnsym_in_d2h ^ ir])
-    ci1 = lib.transpose_sum(ci1new, inplace=True).reshape(fcivec_shape)
-    return ci1.view(direct_spin1.FCIvector)
-
+    link_index = direct_spin1._unpack(norb, nelec, link_index)
+    if isinstance(link_index, numpy.ndarray):
+        # For backward compatibility
+        link_index = (link_index, link_index)
+    return direct_spin1_symm.contract_2e(eri, fcivec, norb, nelec, link_index,
+                                         orbsym, wfnsym)
 
-def kernel(h1e, eri, norb, nelec, ci0=None, level_shift=1e-3, tol=1e-10,
-           lindep=1e-14, max_cycle=50, max_space=12, nroots=1,
-           davidson_only=False, pspace_size=400, orbsym=None, wfnsym=None,
-           ecore=0, **kwargs):
-    assert (len(orbsym) == norb)
-    cis = FCISolver(None)
-    cis.level_shift = level_shift
-    cis.conv_tol = tol
-    cis.lindep = lindep
-    cis.max_cycle = max_cycle
-    cis.max_space = max_space
-    cis.nroots = nroots
-    cis.davidson_only = davidson_only
-    cis.pspace_size = pspace_size
-    cis.orbsym = orbsym
-    cis.wfnsym = wfnsym
-
-    unknown = {}
-    for k, v in kwargs.items():
-        if not hasattr(cis, k):
-            unknown[k] = v
-        setattr(cis, k, v)
-    if unknown:
-        sys.stderr.write('Unknown keys %s for FCI kernel %s\n' %
-                         (str(unknown.keys()), __name__))
-
-    wfnsym = direct_spin1_symm._id_wfnsym(cis, norb, nelec, cis.orbsym,
-                                          cis.wfnsym)
-    if cis.wfnsym is not None and ci0 is None:
-        ci0 = addons.symm_initguess(norb, nelec, orbsym, wfnsym)
-
-    e, c = cis.kernel(h1e, eri, norb, nelec, ci0, ecore=ecore, **unknown)
-    return e, c
+energy = direct_spin1_symm.energy
+kernel = direct_spin1_symm.kernel
 
 make_rdm1 = direct_spin0.make_rdm1
 make_rdm1s = direct_spin0.make_rdm1s
@@ -152,36 +67,29 @@ def kernel(h1e, eri, norb, nelec, ci0=None, level_shift=1e-3, tol=1e-10,
 trans_rdm1 = direct_spin0.trans_rdm1
 trans_rdm12 = direct_spin0.trans_rdm12
 
-def energy(h1e, eri, fcivec, norb, nelec, link_index=None, orbsym=None, wfnsym=0):
-    h2e = direct_spin1.absorb_h1e(h1e, eri, norb, nelec) * .5
-    ci1 = contract_2e(h2e, fcivec, norb, nelec, link_index, orbsym, wfnsym)
-    return numpy.dot(fcivec.ravel(), ci1.ravel())
-
 def get_init_guess(norb, nelec, nroots, hdiag, orbsym, wfnsym=0):
     neleca, nelecb = direct_spin1._unpack_nelec(nelec)
     assert (neleca == nelecb)
+
     strsa = cistring.gen_strings4orblist(range(norb), neleca)
+    na = len(strsa)
     airreps = direct_spin1_symm._gen_strs_irrep(strsa, orbsym)
-    na = nb = len(airreps)
-    hdiag = hdiag.reshape(na,nb)
+    hdiag = hdiag.reshape(na,na)
 
-    ci0 = []
-    iroot = 0
     sym_allowed = (airreps[:,None] ^ airreps) == wfnsym
     idx = numpy.arange(na)
     sym_allowed[idx[:,None] < idx] = False
     idx_a, idx_b = numpy.where(sym_allowed)
-    for k in hdiag[idx_a,idx_b].argsort():
+
+    ci0 = []
+    for k in numpy.argpartition(hdiag[idx_a,idx_b], nroots-1)[:nroots]:
         addra, addrb = idx_a[k], idx_b[k]
-        x = numpy.zeros((na, nb))
+        x = numpy.zeros((na, na))
         if addra == addrb:
             x[addra,addrb] = 1
         else:
             x[addra,addrb] = x[addrb,addra] = numpy.sqrt(.5)
         ci0.append(x.ravel().view(direct_spin1.FCIvector))
-        iroot += 1
-        if iroot >= nroots:
-            break
 
     if len(ci0) == 0:
         raise RuntimeError(f'Initial guess for symmetry {wfnsym} not found')
@@ -195,8 +103,8 @@ def get_init_guess_cyl_sym(norb, nelec, nroots, hdiag, orbsym, wfnsym=0):
 
     wfnsym_in_d2h = wfnsym % 10
     wfn_momentum = symm.basis.linearmole_irrep2momentum(wfnsym)
-    na = nb = len(strsa)
-    hdiag = hdiag.reshape(na,nb)
+    na = len(strsa)
+    hdiag = hdiag.reshape(na,na)
     degen = orbsym.degen_mapping
     ci0 = []
     iroot = 0
@@ -224,7 +132,7 @@ def get_init_guess_cyl_sym(norb, nelec, nroots, hdiag, orbsym, wfnsym=0):
         else:
             x = x + x.T
             norm = numpy.linalg.norm(x)
-            if norm < 1e-7:
+            if norm < 1e-3:
                 continue
         x *= 1./norm
         ci0.append(x.ravel().view(direct_spin1.FCIvector))
@@ -240,90 +148,62 @@ def get_init_guess_cyl_sym(norb, nelec, nroots, hdiag, orbsym, wfnsym=0):
 class FCISolver(direct_spin0.FCISolver):
 
     davidson_only = getattr(__config__, 'fci_direct_spin1_symm_FCI_davidson_only', True)
-
-    # pspace may break point group symmetry
-    pspace_size = getattr(__config__, 'fci_direct_spin1_symm_FCI_pspace_size', 0)
+    pspace_size = getattr(__config__, 'fci_direct_spin1_symm_FCI_pspace_size', 400)
 
     def __init__(self, mol=None, **kwargs):
-        direct_spin0.FCISolver.__init__(self, mol, **kwargs)
         # wfnsym will be guessed based on initial guess if it is None
         self.wfnsym = None
+        self.sym_allowed_idx = None
+        direct_spin0.FCISolver.__init__(self, mol, **kwargs)
 
-    def dump_flags(self, verbose=None):
-        direct_spin0.FCISolver.dump_flags(self, verbose)
-        log = logger.new_logger(self, verbose)
-        if isinstance(self.wfnsym, str):
-            log.info('specified CI wfn symmetry = %s', self.wfnsym)
-        elif isinstance(self.wfnsym, (int, numpy.number)):
-            groupname = getattr(self.mol, 'groupname', None)
-            log.info('specified CI wfn symmetry = %s',
-                     symm.irrep_id2name(groupname, self.wfnsym))
-
-    def absorb_h1e(self, h1e, eri, norb, nelec, fac=1):
-        return direct_spin1.absorb_h1e(h1e, eri, norb, nelec, fac)
-
-    def make_hdiag(self, h1e, eri, norb, nelec):
-        return direct_spin0.make_hdiag(h1e, eri, norb, nelec)
-
-    def pspace(self, h1e, eri, norb, nelec, hdiag, np=400):
-        return direct_spin0.pspace(h1e, eri, norb, nelec, hdiag, np)
+    absorb_h1e = direct_spin1.FCISolver.absorb_h1e
 
-    def contract_1e(self, f1e, fcivec, norb, nelec, link_index=None, **kwargs):
-        return contract_1e(f1e, fcivec, norb, nelec, link_index, **kwargs)
+    dump_flags = direct_spin1_symm.FCISolver.dump_flags
+    make_hdiag = direct_spin1_symm.FCISolver.make_hdiag
+    pspace = direct_spin1_symm.FCISolver.pspace
+    contract_1e = direct_spin1_symm.FCISolver.contract_1e
+    contract_ss = direct_spin1_symm.FCISolver.contract_ss
+    guess_wfnsym = direct_spin1_symm.guess_wfnsym
+    kernel = direct_spin1_symm.FCISolver.kernel
 
     def contract_2e(self, eri, fcivec, norb, nelec, link_index=None,
                     orbsym=None, wfnsym=None, **kwargs):
         if orbsym is None: orbsym = self.orbsym
         if wfnsym is None: wfnsym = self.wfnsym
-        wfnsym = direct_spin1_symm._id_wfnsym(self, norb, nelec, orbsym,
-                                              wfnsym)
-        return contract_2e(eri, fcivec, norb, nelec, link_index, orbsym, wfnsym, **kwargs)
-
-    def get_init_guess(self, norb, nelec, nroots, hdiag):
-        wfnsym = direct_spin1_symm._id_wfnsym(self, norb, nelec, self.orbsym,
-                                              self.wfnsym)
-        if getattr(self.mol, 'groupname', None) in ('Dooh', 'Coov'):
-            return get_init_guess_cyl_sym(
-                norb, nelec, nroots, hdiag, self.orbsym, wfnsym)
+        wfnsym = direct_spin1_symm._id_wfnsym(self, norb, nelec, orbsym, wfnsym)
+        nelec = direct_spin1._unpack_nelec(nelec, self.spin)
+        civec = contract_2e(eri, fcivec, norb, nelec, link_index, orbsym, wfnsym)
+        na = cistring.num_strings(norb, nelec[0])
+        if civec.size != na**2:
+            s_idx = numpy.hstack(self.sym_allowed_idx)
+            x, y = divmod(s_idx, na)
+            ci1 = numpy.empty(na**2)
+            ci1[y*na+x] = civec
+            civec += ci1[s_idx]
+            civec *= .5
         else:
-            return get_init_guess(norb, nelec, nroots, hdiag, self.orbsym, wfnsym)
-
-    guess_wfnsym = direct_spin1_symm.guess_wfnsym
+            civec = lib.transpose_sum(civec.reshape(na,na), inplace=True)
+            civec *= .5
+        return civec
 
-    def kernel(self, h1e, eri, norb, nelec, ci0=None,
-               tol=None, lindep=None, max_cycle=None, max_space=None,
-               nroots=None, davidson_only=None, pspace_size=None,
-               orbsym=None, wfnsym=None, ecore=0, **kwargs):
-        if nroots is None: nroots = self.nroots
+    def get_init_guess(self, norb, nelec, nroots, hdiag, orbsym=None, wfnsym=None):
         if orbsym is None: orbsym = self.orbsym
-        if wfnsym is None: wfnsym = self.wfnsym
-        if self.verbose >= logger.WARN:
-            self.check_sanity()
-        self.norb = norb
-        self.nelec = nelec
-
-        if (not hasattr(orbsym, 'degen_mapping') and
-            getattr(self.mol, 'groupname', None) in ('Dooh', 'Coov')):
-            degen_mapping = map_degeneracy(h1e.diagonal(), orbsym)
-            orbsym = lib.tag_array(orbsym, degen_mapping=degen_mapping)
-
-        wfnsym = self.guess_wfnsym(norb, nelec, ci0, orbsym, wfnsym, **kwargs)
-
-        if wfnsym > 7:
-            # Symmetry broken for Dooh and Coov groups is often observed.
-            # A larger max_space is helpful to reduce the error. Also it is
-            # hard to converge to high precision.
-            if max_space is None and self.max_space == FCISolver.max_space:
-                max_space = 20 + 7 * nroots
-            if tol is None and self.conv_tol == FCISolver.conv_tol:
-                tol = 1e-7
-
-        with lib.temporary_env(self, orbsym=orbsym, wfnsym=wfnsym):
-            e, c = direct_spin0.kernel_ms0(self, h1e, eri, norb, nelec, ci0, None,
-                                           tol, lindep, max_cycle, max_space,
-                                           nroots, davidson_only, pspace_size,
-                                           ecore=ecore, **kwargs)
-        self.eci, self.ci = e, c
-        return e, c
+        if wfnsym is None:
+            wfnsym = direct_spin1_symm._id_wfnsym(
+                self, norb, nelec, orbsym, self.wfnsym)
+        s_idx = numpy.hstack(self.sym_allowed_idx)
+        if getattr(self.mol, 'groupname', None) in ('Dooh', 'Coov'):
+            ci0 = get_init_guess_cyl_sym(
+                norb, nelec, nroots, hdiag, orbsym, wfnsym)
+        else:
+            nelec = direct_spin1._unpack_nelec(nelec, self.spin)
+            na = cistring.num_strings(norb, nelec[0])
+            if hdiag.size != na * na:
+                hdiag, hdiag0 = numpy.empty(na*na), hdiag
+                hdiag[:] = 1e9
+                hdiag[numpy.hstack(self.sym_allowed_idx)] = hdiag0
+            ci0 = get_init_guess(norb, nelec, nroots, hdiag.ravel(),
+                                 orbsym, wfnsym)
+        return [x[s_idx] for x in ci0]
 
 FCI = FCISolver
diff --git a/pyscf/fci/direct_spin1.py b/pyscf/fci/direct_spin1.py
index da0450f356..7c0c024771 100644
--- a/pyscf/fci/direct_spin1.py
+++ b/pyscf/fci/direct_spin1.py
@@ -53,7 +53,7 @@
 from pyscf.fci.addons import _unpack_nelec
 from pyscf import __config__
 
-libfci = lib.load_library('libfci')
+libfci = cistring.libfci
 
 def contract_1e(f1e, fcivec, norb, nelec, link_index=None):
     '''Contract the 1-electron Hamiltonian with a FCI vector to get a new FCI
@@ -140,8 +140,11 @@ def contract_2e(eri, fcivec, norb, nelec, link_index=None):
                                 link_indexb.ctypes.data_as(ctypes.c_void_p))
     return ci1.view(FCIvector)
 
-def make_hdiag(h1e, eri, norb, nelec):
+def make_hdiag(h1e, eri, norb, nelec, compress=False):
     '''Diagonal Hamiltonian for Davidson preconditioner
+
+    Kwargs:
+        compress (bool) : whether to remove symmetry forbidden elements
     '''
     if h1e.dtype == numpy.complex128 or eri.dtype == numpy.complex128:
         raise NotImplementedError('Complex Hamiltonian')
@@ -149,9 +152,9 @@ def make_hdiag(h1e, eri, norb, nelec):
     neleca, nelecb = _unpack_nelec(nelec)
     h1e = numpy.asarray(h1e, order='C')
     eri = ao2mo.restore(1, eri, norb)
-    occslsta = occslstb = cistring._gen_occslst(range(norb), neleca)
+    occslsta = occslstb = cistring.gen_occslst(range(norb), neleca)
     if neleca != nelecb:
-        occslstb = cistring._gen_occslst(range(norb), nelecb)
+        occslstb = cistring.gen_occslst(range(norb), nelecb)
     na = len(occslsta)
     nb = len(occslstb)
 
@@ -198,10 +201,12 @@ def pspace(h1e, eri, norb, nelec, hdiag=None, np=400):
     neleca, nelecb = _unpack_nelec(nelec)
     h1e = numpy.ascontiguousarray(h1e)
     eri = ao2mo.restore(1, eri, norb)
+    na = cistring.num_strings(norb, neleca)
     nb = cistring.num_strings(norb, nelecb)
     if hdiag is None:
-        hdiag = make_hdiag(h1e, eri, norb, nelec)
-    if hdiag.size < np:
+        hdiag = make_hdiag(h1e, eri, norb, nelec, compress=False)
+    assert hdiag.size == na * nb
+    if hdiag.size <= np:
         addr = numpy.arange(hdiag.size)
     else:
         try:
@@ -434,8 +439,7 @@ def _get_init_guess(na, nb, nroots, hdiag, nelec):
     neleca, nelecb = _unpack_nelec(nelec)
     if neleca == nelecb and na == nb:
         hdiag = hdiag.reshape(na, na)
-        idx = numpy.arange(na)
-        addrs = numpy.argpartition(hdiag[idx[:,None]>=idx], nroots-1)[:nroots]
+        addrs = numpy.argpartition(lib.pack_tril(hdiag), nroots-1)[:nroots]
         for addr in addrs:
             addra = (int)((2*addr+.25)**.5 - .5 + 1e-7)
             addrb = addr - addra*(addra+1)//2
@@ -519,74 +523,84 @@ def kernel_ms1(fci, h1e, eri, norb, nelec, ci0=None, link_index=None,
 
     nelec = _unpack_nelec(nelec, fci.spin)
     assert (0 <= nelec[0] <= norb and 0 <= nelec[1] <= norb)
-    link_indexa, link_indexb = _unpack(norb, nelec, link_index)
-    na = link_indexa.shape[0]
-    nb = link_indexb.shape[0]
 
-    if max_memory < na*nb*6*8e-6:
-        log.warn('Not enough memory for FCI solver. '
-                 'The minimal requirement is %.0f MB', na*nb*60e-6)
+    hdiag = fci.make_hdiag(h1e, eri, norb, nelec, compress=False).ravel()
+    if getattr(fci, 'sym_allowed_idx', None):
+        # Remove symmetry forbidden elements
+        sym_idx = numpy.hstack(fci.sym_allowed_idx)
+        civec_size = sym_idx.size
+    else:
+        sym_idx = None
+        civec_size = hdiag.size
 
-    hdiag = fci.make_hdiag(h1e, eri, norb, nelec)
-    nroots = min(hdiag.size, nroots)
+    if max_memory < civec_size*6*8e-6:
+        log.warn('Not enough memory for FCI solver. '
+                 'The minimal requirement is %.0f MB', civec_size*60e-6)
 
-    try:
-        addr, h0 = fci.pspace(h1e, eri, norb, nelec, hdiag, max(pspace_size,nroots))
-        if pspace_size > 0:
-            pw, pv = fci.eig(h0)
-        else:
-            pw = pv = None
-
-        if pspace_size >= na*nb and ci0 is None and not davidson_only:
-            # The degenerated wfn can break symmetry.  The davidson iteration with proper
-            # initial guess doesn't have this issue
-            if na*nb == 1:
-                return pw[0]+ecore, pv[:,0].reshape(1,1).view(FCIvector)
-            elif nroots > 1:
-                civec = numpy.empty((nroots,na*nb))
-                civec[:,addr] = pv[:,:nroots].T
-                return pw[:nroots]+ecore, [c.reshape(na,nb).view(FCIvector) for c in civec]
-            elif abs(pw[0]-pw[1]) > 1e-12:
-                civec = numpy.empty((na*nb))
-                civec[addr] = pv[:,0]
-                return pw[0]+ecore, civec.reshape(na,nb).view(FCIvector)
-    except NotImplementedError:
-        addr = [0]
-        pw = pv = None
-
-    precond = fci.make_precond(hdiag, pw, pv, addr)
+    pspace_size = min(hdiag.size, pspace_size)
+    addr = [0]
+    pw = pv = None
+    if pspace_size > 0:
+        try:
+            addr, h0 = fci.pspace(h1e, eri, norb, nelec, hdiag, pspace_size)
+            pspace_size = len(addr)
+        except NotImplementedError:
+            pass
+        pw, pv = fci.eig(h0)
+
+    if pspace_size >= civec_size and ci0 is None and not davidson_only:
+        if nroots > 1:
+            nroots = min(civec_size, nroots)
+            civec = numpy.empty((nroots,civec_size))
+            civec[:,addr] = pv[:,:nroots].T
+            return pw[:nroots]+ecore, civec
+        elif pspace_size == 1 or abs(pw[0]-pw[1]) > 1e-12:
+            # Check degeneracy. Degenerated wfn may break point group symmetry.
+            # Davidson iteration with a proper initial guess can avoid this problem.
+            civec = numpy.empty(civec_size)
+            civec[addr] = pv[:,0]
+            return pw[0]+ecore, civec
+    pw = pv = h0 = None
+
+    if hdiag.size == civec_size:
+        precond = fci.make_precond(hdiag)
+    else:
+        precond = fci.make_precond(hdiag[sym_idx])
 
     h2e = fci.absorb_h1e(h1e, eri, norb, nelec, .5)
     if hop is None:
+        cpu0 = [logger.process_clock(), logger.perf_counter()]
         def hop(c):
-            hc = fci.contract_2e(h2e, c, norb, nelec, (link_indexa,link_indexb))
+            hc = fci.contract_2e(h2e, c, norb, nelec, link_index)
+            cpu0[:] = log.timer_debug1('contract_2e', *cpu0)
             return hc.ravel()
 
-    if ci0 is None:
+    def init_guess():
         if callable(getattr(fci, 'get_init_guess', None)):
-            ci0 = lambda: fci.get_init_guess(norb, nelec, nroots, hdiag)
+            return fci.get_init_guess(norb, nelec, nroots, hdiag)
         else:
-            def ci0():  # lazy initialization to reduce memory footprint
-                x0 = []
-                for i in range(nroots):
-                    x = numpy.zeros(na*nb)
-                    x[addr[i]] = 1
-                    x0.append(x)
-                return x0
+            x0 = []
+            for i in range(min(len(addr), nroots)):
+                x = numpy.zeros(civec_size)
+                x[addr[i]] = 1
+                x0.append(x)
+            return x0
+
+    if ci0 is None:
+        ci0 = init_guess  # lazy initialization to reduce memory footprint
     elif not callable(ci0):
-        if isinstance(ci0, numpy.ndarray) and ci0.size == na*nb:
+        if isinstance(ci0, numpy.ndarray):
             ci0 = [ci0.ravel()]
         else:
             ci0 = [x.ravel() for x in ci0]
-        # Add vectors if not enough initial guess is given
+        if sym_idx is not None and ci0[0].size != civec_size:
+            ci0 = [x[sym_idx] for x in ci0]
+        # If provided initial guess ci0 are accidentally the eigenvectors of the
+        # system, Davidson solver may be failed to find enough roots as it is
+        # unable to generate more subspace basis from ci0. Adding vectors so
+        # initial guess to help Davidson solver generate enough basis.
         if len(ci0) < nroots:
-            if callable(getattr(fci, 'get_init_guess', None)):
-                ci0.extend(fci.get_init_guess(norb, nelec, nroots, hdiag)[len(ci0):])
-            else:
-                for i in range(len(ci0), nroots):
-                    x = numpy.zeros(na*nb)
-                    x[addr[i]] = 1
-                    ci0.append(x)
+            ci0.extend(init_guess()[len(ci0):])
 
     if tol is None: tol = fci.conv_tol
     if lindep is None: lindep = fci.lindep
@@ -595,15 +609,11 @@ def ci0():  # lazy initialization to reduce memory footprint
     tol_residual = getattr(fci, 'conv_tol_residual', None)
 
     with lib.with_omp_threads(fci.threads):
-        #e, c = lib.davidson(hop, ci0, precond, tol=fci.conv_tol, lindep=fci.lindep)
         e, c = fci.eig(hop, ci0, precond, tol=tol, lindep=lindep,
                        max_cycle=max_cycle, max_space=max_space, nroots=nroots,
                        max_memory=max_memory, verbose=log, follow_state=True,
                        tol_residual=tol_residual, **kwargs)
-    if nroots > 1:
-        return e+ecore, [ci.reshape(na,nb).view(FCIvector) for ci in c]
-    else:
-        return e+ecore, c.reshape(na,nb).view(FCIvector)
+    return e+ecore, c
 
 def make_pspace_precond(hdiag, pspaceig, pspaceci, addr, level_shift=0):
     # precondition with pspace Hamiltonian, CPL, 169, 463
@@ -622,7 +632,7 @@ def precond(r, e0, x0, *args):
         x1 = r - e1*x0
         #pspace_x1 = x1[addr].copy()
         x1 *= hdiaginv
-# pspace (h0-e0)^{-1} cause diverging?
+        # pspace (h0-e0)^{-1} cause diverging?
         #x1[addr] = numpy.linalg.solve(h0e0, pspace_x1)
         return x1
     return precond
@@ -696,7 +706,8 @@ class FCIBase(lib.StreamObject):
     conv_tol_residual = getattr(__config__, 'fci_direct_spin1_FCI_conv_tol_residual', None)
     lindep = getattr(__config__, 'fci_direct_spin1_FCI_lindep', 1e-14)
 
-    # level shift in precond
+    # level shift in preconditioner is helpful to avoid singularity and linear
+    # dependence basis in davidson diagonalization solver
     level_shift = getattr(__config__, 'fci_direct_spin1_FCI_level_shift', 1e-3)
 
     # force the diagonlization use davidson iteration.  When the CI space
@@ -769,9 +780,9 @@ def absorb_h1e(self, h1e, eri, norb, nelec, fac=1):
         return absorb_h1e(h1e, eri, norb, nelec, fac)
 
     @lib.with_doc(make_hdiag.__doc__)
-    def make_hdiag(self, h1e, eri, norb, nelec):
+    def make_hdiag(self, h1e, eri, norb, nelec, compress=False):
         nelec = _unpack_nelec(nelec, self.spin)
-        return make_hdiag(h1e, eri, norb, nelec)
+        return make_hdiag(h1e, eri, norb, nelec, compress)
 
     @lib.with_doc(pspace.__doc__)
     def pspace(self, h1e, eri, norb, nelec, hdiag=None, np=400):
@@ -802,11 +813,12 @@ def eig(self, op, x0=None, precond=None, **kwargs):
             ci = ci[0]
         return e, ci
 
-    def make_precond(self, hdiag, pspaceig, pspaceci, addr):
+    def make_precond(self, hdiag, pspaceig=None, pspaceci=None, addr=None):
         if pspaceig is None:
             return make_diag_precond(hdiag, pspaceig, pspaceci, addr,
                                      self.level_shift)
         else:
+            # Note: H0 in pspace may break symmetry.
             return make_pspace_precond(hdiag, pspaceig, pspaceci, addr,
                                        self.level_shift)
 
@@ -819,14 +831,23 @@ def kernel(self, h1e, eri, norb, nelec, ci0=None,
                tol=None, lindep=None, max_cycle=None, max_space=None,
                nroots=None, davidson_only=None, pspace_size=None,
                orbsym=None, wfnsym=None, ecore=0, **kwargs):
+        if nroots is None: nroots = self.nroots
         if self.verbose >= logger.WARN:
             self.check_sanity()
         self.norb = norb
-        self.nelec = nelec
-        self.eci, self.ci = \
-                kernel_ms1(self, h1e, eri, norb, nelec, ci0, None,
-                           tol, lindep, max_cycle, max_space, nroots,
-                           davidson_only, pspace_size, ecore=ecore, **kwargs)
+        self.nelec = nelec = _unpack_nelec(nelec, self.spin)
+        link_index = _unpack(norb, nelec, None)
+        e, c = kernel_ms1(self, h1e, eri, norb, nelec, ci0, link_index,
+                          tol, lindep, max_cycle, max_space, nroots,
+                          davidson_only, pspace_size, ecore=ecore, **kwargs)
+        self.eci = e
+
+        na = link_index[0].shape[0]
+        nb = link_index[1].shape[0]
+        if nroots > 1:
+            self.ci = [x.reshape(na,nb).view(FCIvector) for x in c]
+        else:
+            self.ci = c.reshape(na,nb).view(FCIvector)
         return self.eci, self.ci
 
     @lib.with_doc(energy.__doc__)
@@ -899,9 +920,8 @@ def large_ci(self, fcivec, norb, nelec,
         return addons.large_ci(fcivec, norb, nelec, tol, return_strs)
 
     def contract_ss(self, fcivec, norb, nelec):  # noqa: F811
-        from pyscf.fci import spin_op
         nelec = _unpack_nelec(nelec, self.spin)
-        return spin_op.contract_ss(fcivec, norb, nelec)
+        return contract_ss(fcivec, norb, nelec)
 
     def gen_linkstr(self, norb, nelec, tril=True, spin=None):
         if spin is None:
@@ -946,43 +966,3 @@ def _unpack(norb, nelec, link_index, spin=None):
         return link_indexa, link_indexb
     else:
         return link_index
-
-
-if __name__ == '__main__':
-    from functools import reduce
-    from pyscf import gto
-    from pyscf import scf
-
-    mol = gto.Mole()
-    mol.verbose = 0
-    mol.output = None#"out_h2o"
-    mol.atom = [
-        ['H', ( 1.,-1.    , 0.   )],
-        ['H', ( 0.,-1.    ,-1.   )],
-        ['H', ( 1.,-0.5   ,-1.   )],
-        #['H', ( 0.,-0.5   ,-1.   )],
-        #['H', ( 0.,-0.5   ,-0.   )],
-        ['H', ( 0.,-0.    ,-1.   )],
-        ['H', ( 1.,-0.5   , 0.   )],
-        ['H', ( 0., 1.    , 1.   )],
-    ]
-
-    mol.basis = {'H': 'sto-3g'}
-    mol.build()
-
-    m = scf.RHF(mol)
-    ehf = m.scf()
-
-    cis = FCISolver(mol)
-    norb = m.mo_coeff.shape[1]
-    nelec = mol.nelectron - 2
-    h1e = reduce(numpy.dot, (m.mo_coeff.T, m.get_hcore(), m.mo_coeff))
-    eri = ao2mo.incore.general(m._eri, (m.mo_coeff,)*4, compact=False)
-    eri = eri.reshape(norb,norb,norb,norb)
-    nea = nelec//2 + 1
-    neb = nelec//2 - 1
-    nelec = (nea, neb)
-
-    e1 = cis.kernel(h1e, eri, norb, nelec, davidson_only=True)[0]
-    print(e1, e1 - -7.7466756526056004)
-
diff --git a/pyscf/fci/direct_spin1_cyl_sym.py b/pyscf/fci/direct_spin1_cyl_sym.py
index ad0cf5c55f..a666a393f2 100644
--- a/pyscf/fci/direct_spin1_cyl_sym.py
+++ b/pyscf/fci/direct_spin1_cyl_sym.py
@@ -30,7 +30,10 @@
 of direct_spin1_symm.
 '''
 
+import functools
+import ctypes
 import numpy
+import numpy as np
 from pyscf import lib
 from pyscf import ao2mo
 from pyscf.lib import logger
@@ -46,39 +49,253 @@
 from pyscf.fci import addons
 from pyscf import __config__
 
-def get_init_guess(norb, nelec, nroots, hdiag, orbsym, wfnsym=0):
+libfci = direct_spin1.libfci
+
+def contract_2e(eri, fcivec, norb, nelec, link_index=None, orbsym=None, wfnsym=0):
+    if orbsym is None:
+        return direct_nosym.contract_2e(eri, fcivec, norb, nelec, link_index)
+
+    neleca, nelecb = direct_spin1._unpack_nelec(nelec)
+    link_indexa, link_indexb = direct_nosym._unpack(norb, nelec, link_index)
+    na, nlinka = link_indexa.shape[:2]
+    nb, nlinkb = link_indexb.shape[:2]
+
+    wfn_momentum = symm.basis.linearmole_irrep2momentum(wfnsym)
+    wfnsym_in_d2h = wfnsym % 10
+    wfn_ungerade = wfnsym_in_d2h >= 4
+    orbsym_d2h = orbsym % 10
+    orb_ungerade = orbsym_d2h >= 4
+    if np.any(orb_ungerade) or wfn_ungerade:
+        max_gerades = 2
+    else:
+        max_gerades = 1
+
+    orb_l = _get_orb_l(orbsym)
+    max_eri_l = abs(orb_l).max() * 2
+
+    strsa = cistring.gen_strings4orblist(range(norb), neleca)
+    strsa_l = _strs_angular_momentum(strsa, orbsym)
+    max_stra_l = strsa_l.max()
+    if neleca == nelecb:
+        strsb_l = strsa_l
+        max_strb_l = max_stra_l
+    else:
+        strsb = cistring.gen_strings4orblist(range(norb), nelecb)
+        strsb_l = _strs_angular_momentum(strsb, orbsym)
+        max_strb_l = strsb_l.max()
+    max_momentum = max(max_stra_l, max_strb_l, max_eri_l)
+
+    eri, rank_eri, irrep_eri = reorder_eri(eri, norb, orbsym, max_momentum,
+                                           max_gerades)
+    eri_ir_dims = eri.ir_dims
+    aidx, link_indexa = gen_str_irrep(strsa, orbsym, link_indexa, rank_eri,
+                                      irrep_eri, max_momentum, max_gerades)
+    nas = nbs = np.array([x.size for x in aidx], dtype=np.int32)
+    if neleca == nelecb:
+        bidx, link_indexb = aidx, link_indexa
+    else:
+        bidx, link_indexb = gen_str_irrep(strsb, orbsym, link_indexb, rank_eri,
+                                          irrep_eri, max_momentum, max_gerades)
+        nbs = np.array([x.size for x in bidx], dtype=np.int32)
+
+    nirreps = (max_momentum * 2 + 1) * max_gerades
+    ug_offsets = max_momentum * 2 + 1
+    ab_idx = [np.zeros(0, dtype=int)] * nirreps
+    for ag in range(max_gerades):
+        bg = wfn_ungerade ^ ag
+        # abs(al) < max_stra_l and abs(bl := wfn_momentum-al) < max_strb_l
+        for al in range(max(-max_stra_l, wfn_momentum-max_strb_l),
+                        min( max_stra_l, wfn_momentum+max_strb_l)+1):
+            bl = wfn_momentum - al
+            stra_ir = al + max_momentum + ag * ug_offsets
+            strb_ir = bl + max_momentum + bg * ug_offsets
+            ab_idx[stra_ir] = (aidx[stra_ir][:,None] * nb + bidx[strb_ir]).ravel()
+    ci_size = np.array([x.size for x in ab_idx], dtype=np.int32)
+
+    if fcivec.size == na * nb:
+        ab_idx = np.hstack(ab_idx)
+        ci0 = fcivec.ravel()[ab_idx]
+    else:
+        ci0 = fcivec
+    ci1 = np.zeros_like(ci0)
+
+    libfci.FCIcontract_2e_cyl_sym(
+        eri.ctypes.data_as(ctypes.c_void_p),
+        ci0.ctypes.data_as(ctypes.c_void_p),
+        ci1.ctypes.data_as(ctypes.c_void_p),
+        eri_ir_dims.ctypes.data_as(ctypes.c_void_p),
+        ci_size.ctypes.data_as(ctypes.c_void_p),
+        nas.ctypes.data_as(ctypes.c_void_p),
+        nbs.ctypes.data_as(ctypes.c_void_p),
+        link_indexa.ctypes.data_as(ctypes.c_void_p),
+        link_indexb.ctypes.data_as(ctypes.c_void_p),
+        ctypes.c_int(norb), ctypes.c_int(nlinka), ctypes.c_int(nlinkb),
+        ctypes.c_int(max_momentum), ctypes.c_int(max_gerades),
+        ctypes.c_int(wfn_momentum), ctypes.c_int(wfn_ungerade))
+
+    if fcivec.size == na * nb:
+        ci1new = np.zeros(fcivec.size, dtype=fcivec.dtype)
+        ci1new[ab_idx] = ci1
+        ci1 = ci1new.reshape(fcivec.shape)
+    return ci1.view(direct_spin1.FCIvector)
+
+def _get_orb_l(orbsym):
+    '''angular momentum for each orbital'''
+    orb_l = (orbsym // 10) * 2
+    orbsym_d2h = orbsym % 10
+    e1_mask = np.isin(orbsym_d2h, (2, 3, 6, 7))
+    orb_l[e1_mask] += 1
+    ey_mask = np.isin(orbsym_d2h, (1, 3, 4, 6))
+    orb_l[ey_mask] *= -1
+    return orb_l
+
+def reorder_eri(eri, norb, orbsym, max_momentum, max_gerades):
+    eri = eri.reshape(norb,norb,norb,norb)
+    # Swap last two indices because they are contracted to the t1 intermediates
+    # in FCIcontract_2e_cyl_sym. t1 is generated with swapped orbital indices (a*norb+i).
+    eri = eri.transpose(0,1,3,2).reshape(norb**2, norb**2)
+
+    # % 10 to map irrep IDs of Dooh or Coov, etc. to irreps of D2h, C2v
+    orbsym_d2h = orbsym % 10
+    orb_ungerade = orbsym_d2h >= 4
+    nirreps = (max_momentum * 2 + 1) * max_gerades
+
+    # irrep of (ij| pair
+    orb_l = _get_orb_l(orbsym)
+    ll_prod = (orb_l[:,None] - orb_l).ravel()
+    maxll = abs(orb_l).max() * 2
+
+    old_eri_irrep = np.asarray(ll_prod+max_momentum, dtype=np.int32)
+    rank_in_irrep = np.empty_like(old_eri_irrep)
+    ir_idx_pairs = [None] * nirreps
+
+    if max_gerades == 2:
+        ug_offsets = max_momentum * 2 + 1
+        ug_prod = (orb_ungerade[:,None] ^ orb_ungerade).ravel()
+        old_eri_irrep[ug_prod] += ug_offsets
+
+        # gerade
+        idx = np.asarray(np.where((ll_prod == 0) & ~ug_prod)[0], dtype=np.int32)
+        ir_idx_pairs[max_momentum] = (idx, idx)
+        rank_in_irrep[idx] = np.arange(idx.size, dtype=np.int32)
+        # ungerade
+        idx = np.asarray(np.where((ll_prod == 0) & ug_prod)[0], dtype=np.int32)
+        ir_idx_pairs[max_momentum+ug_offsets] = (idx, idx)
+        rank_in_irrep[idx] = np.arange(idx.size, dtype=np.int32)
+
+        for ll in range(1, maxll+1):
+            # gerade
+            idx_p = np.asarray(np.where((ll_prod == ll) & ~ug_prod)[0], dtype=np.int32)
+            idx_m = np.asarray(np.where((ll_prod ==-ll) & ~ug_prod)[0], dtype=np.int32)
+            assert idx_p.size == idx_m.size
+            if idx_p.size > 0:
+                ir_idx_pairs[max_momentum+ll] = (idx_p, idx_p)
+                ir_idx_pairs[max_momentum-ll] = (idx_m, idx_m)
+                rank_in_irrep[idx_p] = np.arange(idx_p.size, dtype=np.int32)
+                rank_in_irrep[idx_m] = np.arange(idx_m.size, dtype=np.int32)
+            # ungerade
+            idx_p = np.asarray(np.where((ll_prod == ll) & ug_prod)[0], dtype=np.int32)
+            idx_m = np.asarray(np.where((ll_prod ==-ll) & ug_prod)[0], dtype=np.int32)
+            assert idx_p.size == idx_m.size
+            if idx_p.size > 0:
+                ir_idx_pairs[max_momentum+ll+ug_offsets] = (idx_p, idx_p)
+                ir_idx_pairs[max_momentum-ll+ug_offsets] = (idx_m, idx_m)
+                rank_in_irrep[idx_p] = np.arange(idx_p.size, dtype=np.int32)
+                rank_in_irrep[idx_m] = np.arange(idx_m.size, dtype=np.int32)
+    else:
+        idx = np.asarray(np.where(ll_prod == 0)[0], dtype=np.int32)
+        ir_idx_pairs[max_momentum] = (idx, idx)
+        rank_in_irrep[idx] = np.arange(idx.size, dtype=np.int32)
+
+        for ll in range(1, maxll+1):
+            idx_p = np.asarray(np.where(ll_prod == ll)[0], dtype=np.int32)
+            idx_m = np.asarray(np.where(ll_prod ==-ll)[0], dtype=np.int32)
+            assert idx_p.size == idx_m.size
+            if idx_p.size > 0:
+                ir_idx_pairs[max_momentum+ll] = (idx_p, idx_p)
+                ir_idx_pairs[max_momentum-ll] = (idx_m, idx_m)
+                rank_in_irrep[idx_p] = np.arange(idx_p.size, dtype=np.int32)
+                rank_in_irrep[idx_m] = np.arange(idx_m.size, dtype=np.int32)
+
+    ir_dims = np.hstack([0 if x is None else x[0].size for x in ir_idx_pairs])
+    eri_irs = np.empty((ir_dims**2).sum())
+    p1 = 0
+    for idx in ir_idx_pairs:
+        if idx is not None:
+            p0, p1 = p1, p1 + idx[0].size**2
+            lib.take_2d(eri, idx[0], idx[1], out=eri_irs[p0:p1])
+    eri_irs = lib.tag_array(eri_irs, ir_dims=np.asarray(ir_dims, dtype=np.int32))
+    return eri_irs, rank_in_irrep, old_eri_irrep
+
+def argsort_strs_by_irrep(strs, orbsym, max_momentum, max_gerades):
+    strs_ls = _strs_angular_momentum(strs, orbsym)
+    maxl = abs(strs_ls).max()
+    nirreps = (max_momentum * 2 + 1) * max_gerades
+    aidx = [np.zeros(0, dtype=np.int32)] * nirreps
+
+    if max_gerades == 2:
+        ug_offsets = max_momentum * 2 + 1
+        irreps_d2h = direct_spin1_symm._gen_strs_irrep(strs, orbsym)
+        strs_ug = irreps_d2h >= 4
+        for l in range(-maxl, maxl+1):
+            idx = np.where((strs_ls == l) & ~strs_ug)[0]
+            aidx[max_momentum+l] = idx
+            idx = np.where((strs_ls == l) & strs_ug)[0]
+            aidx[max_momentum+l+ug_offsets] = idx
+    else:
+        for l in range(-maxl, maxl+1):
+            idx = np.where(strs_ls == l)[0]
+            aidx[max_momentum+l] = idx
+    return aidx
+
+def gen_str_irrep(strs, orbsym, link_index, rank_eri, irrep_eri, max_momentum,
+                  max_gerades):
+    aidx = argsort_strs_by_irrep(strs, orbsym, max_momentum, max_gerades)
+    na = len(strs)
+    rank = np.zeros(na, dtype=np.int32)
+    for idx in aidx:
+        if idx.size > 0:
+            rank[idx] = np.arange(idx.size, dtype=np.int32)
+
+    link_index = link_index.copy()
+    norb = orbsym.size
+    ai_addr = link_index[:,:,0] * norb + link_index[:,:,1]
+    link_index[:,:,0] = rank_eri[ai_addr]
+    link_index[:,:,1] = irrep_eri[ai_addr]
+    link_index[:,:,2] = rank[link_index[:,:,2]]
+
+    link_index = link_index.take(np.hstack(aidx), axis=0)
+    return aidx, link_index
+
+def get_init_guess(norb, nelec, nroots, hdiag, orbsym, wfnsym=0,
+                   sym_allowed_idx=None):
     neleca, nelecb = direct_spin1._unpack_nelec(nelec)
     strsa = strsb = cistring.gen_strings4orblist(range(norb), neleca)
-    airreps_d2h = birreps_d2h = direct_spin1_symm._gen_strs_irrep(strsa, orbsym)
-    a_ls = b_ls = _strs_angular_momentum(strsa, orbsym)
     if neleca != nelecb:
         strsb = cistring.gen_strings4orblist(range(norb), nelecb)
-        birreps_d2h = direct_spin1_symm._gen_strs_irrep(strsb, orbsym)
-        b_ls = _strs_angular_momentum(strsb, orbsym)
-
-    wfnsym_in_d2h = wfnsym % 10
-    wfn_momentum = symm.basis.linearmole_irrep2momentum(wfnsym)
     na = len(strsa)
     nb = len(strsb)
-    hdiag = hdiag.reshape(na,nb)
     degen = orbsym.degen_mapping
-    ci0 = []
-    iroot = 0
-    wfn_ungerade = wfnsym_in_d2h >= 4
-    a_ungerade = airreps_d2h >= 4
-    b_ungerade = birreps_d2h >= 4
-    sym_allowed = a_ungerade[:,None] == b_ungerade ^ wfn_ungerade
-    # total angular momentum == wfn_momentum
-    sym_allowed &= a_ls[:,None] == wfn_momentum - b_ls
+
+    if sym_allowed_idx is None:
+        sym_allowed_idx = sym_allowed_indices(nelec, orbsym, wfnsym)
+    s_idx = np.hstack(sym_allowed_idx)
+    idx_a, idx_b = divmod(s_idx, nb)
+    if hdiag.size == na*nb:
+        hdiag = hdiag[s_idx]
+    civec_size = hdiag.size
+
     if neleca == nelecb and na == nb:
-        idx = numpy.arange(na)
-        sym_allowed[idx[:,None] < idx] = False
-    idx_a, idx_b = numpy.where(sym_allowed)
+        idx = np.arange(idx_a.size)[idx_a >= idx_b]
+        idx_a = idx_a[idx]
+        idx_b = idx_b[idx]
+        hdiag = hdiag[idx]
 
-    for k in hdiag[idx_a,idx_b].argsort():
+    ci0 = []
+    for k in hdiag.argsort():
         addra, addrb = idx_a[k], idx_b[k]
-        x = numpy.zeros((na, nb))
-        x[addra, addrb] = 1.
+        x = np.zeros(civec_size)
+        x[s_idx==addra*nb+addrb] = 1.
         if wfnsym in (0, 1, 4, 5):
             addra1, sign_a = _sv_associated_det(strsa[addra], degen)
             addrb1, sign_b = _sv_associated_det(strsb[addrb], degen)
@@ -87,18 +304,17 @@ def get_init_guess(norb, nelec, nroots, hdiag, orbsym, wfnsym=0):
             # (E+)(E-') - (E-)(E+') => A2
             if wfnsym in (0, 5):  # A1g, A1u
                 # ensure <|sigma_v|> = 1
-                x[addra1,addrb1] += sign_a * sign_b
+                x[s_idx==addra1*nb+addrb1] += sign_a * sign_b
             elif wfnsym in (1, 4):  # A2g, A2u
                 # ensure <|sigma_v|> = -1
-                x[addra1,addrb1] -= sign_a * sign_b
+                x[s_idx==addra1*nb+addrb1] -= sign_a * sign_b
 
-        norm = numpy.linalg.norm(x)
+        norm = np.linalg.norm(x)
         if norm < 1e-3:
             continue
         x *= 1./norm
-        ci0.append(x.ravel().view(direct_spin1.FCIvector))
-        iroot += 1
-        if iroot >= nroots:
+        ci0.append(x.view(direct_spin1.FCIvector))
+        if len(ci0) >= nroots:
             break
 
     if len(ci0) == 0:
@@ -191,7 +407,7 @@ def guess_wfnsym(solver, norb, nelec, fcivec=None, orbsym=None, wfnsym=None, **k
         if neleca != nelecb:
             strsb = cistring.gen_strings4orblist(range(norb), nelecb)
 
-        if not isinstance(fcivec, numpy.ndarray) or fcivec.ndim > 2:
+        if not isinstance(fcivec, np.ndarray) or fcivec.ndim > 2:
             fcivec = fcivec[0]
         wfnsym1 = _guess_wfnsym(fcivec, strsa, strsb, orbsym)
 
@@ -202,34 +418,128 @@ def guess_wfnsym(solver, norb, nelec, fcivec=None, orbsym=None, wfnsym=None, **k
         wfnsym = wfnsym1
     return wfnsym
 
+def sym_allowed_indices(nelec, orbsym, wfnsym):
+    '''Indices of symmetry allowed determinants for each irrep'''
+    norb = orbsym.size
+    neleca, nelecb = nelec
+    strsa = strsb = cistring.gen_strings4orblist(range(norb), neleca)
+    strsa_l = _strs_angular_momentum(strsa, orbsym)
+    max_stra_l = max_strb_l = strsa_l.max()
+    if neleca != nelecb:
+        strsb = cistring.gen_strings4orblist(range(norb), nelecb)
+        strsb_l = _strs_angular_momentum(strsb, orbsym)
+        max_strb_l = strsb_l.max()
+    nb = len(strsb)
+
+    wfn_momentum = symm.basis.linearmole_irrep2momentum(wfnsym)
+    wfnsym_in_d2h = wfnsym % 10
+    wfn_ungerade = wfnsym_in_d2h >= 4
+    orbsym_d2h = orbsym % 10
+    orb_ungerade = orbsym_d2h >= 4
+    if np.any(orb_ungerade) or wfn_ungerade:
+        max_gerades = 2
+    else:
+        max_gerades = 1
+    orb_l = _get_orb_l(orbsym)
+    max_eri_l = abs(orb_l).max() * 2
+    max_momentum = max(max_stra_l, max_strb_l, max_eri_l)
+
+    aidx = bidx = argsort_strs_by_irrep(strsa, orbsym, max_momentum, max_gerades)
+    if neleca != nelecb:
+        bidx = argsort_strs_by_irrep(strsb, orbsym, max_momentum, max_gerades)
+
+    nirreps = (max_momentum * 2 + 1) * max_gerades
+    ug_offsets = max_momentum * 2 + 1
+    ab_idx = [np.zeros(0, dtype=int)] * nirreps
+    for ag in range(max_gerades):
+        bg = wfn_ungerade ^ ag
+        # abs(al) < max_stra_l and abs(bl := wfn_momentum-al) < max_strb_l
+        for al in range(max(-max_stra_l, wfn_momentum-max_strb_l),
+                        min( max_stra_l, wfn_momentum+max_strb_l)+1):
+            bl = wfn_momentum - al
+            stra_ir = al + max_momentum + ag * ug_offsets
+            strb_ir = bl + max_momentum + bg * ug_offsets
+            ab_idx[stra_ir] = (aidx[stra_ir][:,None] * nb + bidx[strb_ir]).ravel()
+    return ab_idx
+
+def _dm_wrapper(fn_rdm):
+    def transform(dm, u):
+        if dm.ndim == 2:
+            dm = u.conj().T.dot(dm).dot(u)
+        else:
+            dm = lib.einsum('pqrs,pi,qj,rk,sl->ijkl', dm, u.conj(), u, u.conj(), u)
+        return dm.real.copy()
+
+    @functools.wraps(fn_rdm)
+    def make_dm(self, fcivec, norb, nelec, *args, **kwargs):
+        nelec = direct_spin1._unpack_nelec(nelec, self.spin)
+        dms = fn_rdm(fcivec, norb, nelec, *args, **kwargs)
+        orbsym = self.orbsym
+        degen_mapping = self.orbsym.degen_mapping
+        u = _cyl_sym_orbital_rotation(orbsym, degen_mapping)
+        if isinstance(dms, np.ndarray):
+            return transform(dms, u)
+        else:
+            return [transform(dm, u) for dm in dms]
+    return make_dm
+
 
 class FCISolver(direct_spin1_symm.FCISolver):
 
     def contract_1e(self, f1e, fcivec, norb, nelec, link_index=None, **kwargs):
-        return direct_nosym.contract_1e(f1e, fcivec, norb, nelec, link_index)
+        raise NotImplementedError
 
     def contract_2e(self, eri, fcivec, norb, nelec, link_index=None,
                     orbsym=None, wfnsym=None, **kwargs):
-        return direct_nosym.contract_2e(eri, fcivec, norb, nelec, link_index)
+        if orbsym is None: orbsym = self.orbsym
+        if wfnsym is None:
+            wfnsym = direct_spin1_symm._id_wfnsym(self, norb, nelec, orbsym, self.wfnsym)
+        return contract_2e(eri, fcivec, norb, nelec, link_index, orbsym, wfnsym)
+
+    def get_init_guess(self, norb, nelec, nroots, hdiag, orbsym=None, wfnsym=None):
+        if orbsym is None:
+            orbsym = self.orbsym
+        if wfnsym is None:
+            wfnsym = direct_spin1_symm._id_wfnsym(self, norb, nelec, orbsym, self.wfnsym)
+        return get_init_guess(norb, nelec, nroots, hdiag, orbsym, wfnsym,
+                              self.sym_allowed_idx)
+
+    def pspace(self, h1e, eri, norb, nelec, hdiag, np=400):
+        nelec = direct_spin1._unpack_nelec(nelec, self.spin)
+        na = cistring.num_strings(norb, nelec[0])
+        nb = cistring.num_strings(norb, nelec[1])
+        s_idx = numpy.hstack(self.sym_allowed_idx)
+        # Screen symmetry forbidden elements
+        hdiag, hdiag0 = numpy.empty(na*nb), hdiag.ravel()
+        hdiag[:] = 1e99
+        if hdiag0.size == s_idx.size:
+            hdiag[s_idx] = hdiag0
+        else:
+            hdiag[s_idx] = hdiag0[s_idx]
 
-    def get_init_guess(self, norb, nelec, nroots, hdiag):
-        wfnsym = direct_spin1_symm._id_wfnsym(self, norb, nelec, self.orbsym, self.wfnsym)
-        return get_init_guess(norb, nelec, nroots, hdiag, self.orbsym, wfnsym)
+        np = min(np, s_idx.size)
+        addr0, h = direct_spin1.pspace(h1e, eri, norb, nelec, hdiag, np)
+
+        # mapping the address in (na,nb) civec to address in sym-allowed civec
+        s_idx_allowed = numpy.where(numpy.isin(s_idx, addr0))[0]
+        addr1 = s_idx[s_idx_allowed]
+        new_idx = numpy.empty_like(s_idx_allowed)
+        new_idx[addr0.argsort()] = addr1.argsort()
+        addr = s_idx_allowed[new_idx]
+        return addr, h
 
     absorb_h1e = direct_nosym.FCISolver.absorb_h1e
-    make_hdiag = direct_nosym.FCISolver.make_hdiag
-    pspace = direct_spin1.FCISolver.pspace
+    make_hdiag = direct_spin1_symm.FCISolver.make_hdiag
     guess_wfnsym = guess_wfnsym
 
-    def make_rdm12(self, fcivec, norb, nelec, link_index=None, reorder=True):
-        nelec = direct_spin1._unpack_nelec(nelec, self.spin)
-        dm1, dm2 = direct_spin1.make_rdm12(fcivec, norb, nelec, link_index, reorder)
-        orbsym = self.orbsym
-        degen_mapping = self.orbsym.degen_mapping
-        u = _cyl_sym_orbital_rotation(orbsym, degen_mapping)
-        dm1 = u.conj().T.dot(dm1).dot(u)
-        dm2 = lib.einsum('pqrs,pi,qj,rk,sl->ijkl', dm2, u.conj(), u, u.conj(), u)
-        return dm1.real.copy(), dm2.real.copy()
+    make_rdm1 = _dm_wrapper(direct_spin1.make_rdm1)
+    make_rdm1s = _dm_wrapper(direct_spin1.make_rdm1s)
+    make_rdm12 = _dm_wrapper(direct_spin1.make_rdm12)
+    make_rdm12s = _dm_wrapper(direct_spin1.make_rdm12s)
+    trans_rdm1 = _dm_wrapper(direct_spin1.trans_rdm1)
+    trans_rdm1s = _dm_wrapper(direct_spin1.trans_rdm1s)
+    trans_rdm12 = _dm_wrapper(direct_spin1.trans_rdm12)
+    trans_rdm12s = _dm_wrapper(direct_spin1.trans_rdm12s)
 
     def kernel(self, h1e, eri, norb, nelec, ci0=None,
                tol=None, lindep=None, max_cycle=None, max_space=None,
@@ -239,38 +549,65 @@ def kernel(self, h1e, eri, norb, nelec, ci0=None,
         if orbsym is None: orbsym = self.orbsym
         if wfnsym is None: wfnsym = self.wfnsym
         if self.verbose >= logger.WARN:
+            if 'verbose' not in kwargs:
+                kwargs['verbose'] = self.verbose
             self.check_sanity()
         self.norb = norb
-        self.nelec = nelec
+        self.nelec = nelec = direct_spin1._unpack_nelec(nelec, self.spin)
 
         if not hasattr(orbsym, 'degen_mapping'):
             degen_mapping = map_degeneracy(h1e.diagonal(), orbsym)
             orbsym = lib.tag_array(orbsym, degen_mapping=degen_mapping)
-        self.orbsym = orbsym
-        u = _cyl_sym_orbital_rotation(orbsym, orbsym.degen_mapping)
-        wfnsym = self.guess_wfnsym(norb, nelec, ci0, orbsym, wfnsym, **kwargs)
 
+        u = _cyl_sym_orbital_rotation(orbsym, orbsym.degen_mapping)
         h1e = u.dot(h1e).dot(u.conj().T)
         eri = ao2mo.restore(1, eri, norb)
         eri = lib.einsum('pqrs,ip,jq,kr,ls->ijkl', eri, u, u.conj(), u, u.conj())
         assert abs(h1e.imag).max() < 1e-12, 'Cylindrical symmetry broken'
         assert abs(eri.imag).max() < 1e-12, 'Cylindrical symmetry broken'
         h1e = h1e.real.copy()
-        # Note: although eri is real, it does not have the permutation relation
+        # Note: eri is real but it does not have the permutation relation
         # (ij|kl) = (ji|kl) = (ij|lk) = (ji|lk)
         # The nosym version fci contraction is required
         eri = eri.real.copy()
 
+        wfnsym_ir = self.guess_wfnsym(norb, nelec, ci0, orbsym, wfnsym, **kwargs)
+        if wfnsym_ir in (1, 4):
+            # sym_allowed_idx does not distinguish A2g and A2u
+            davidson_only = True
+        self.sym_allowed_idx = sym_allowed_indices(nelec, orbsym, wfnsym_ir)
+        self.orbsym = orbsym
+        logger.debug(self, 'Num symmetry allowed elements %d',
+                     sum([x.size for x in self.sym_allowed_idx]))
+
         neleca, nelecb = direct_spin1._unpack_nelec(nelec)
         link_indexa = cistring.gen_linkstr_index(range(norb), neleca)
-        link_indexb = cistring.gen_linkstr_index(range(norb), nelecb)
-        with lib.temporary_env(self, orbsym=orbsym, wfnsym=wfnsym):
+        if neleca == nelecb:
+            link_indexb = link_indexa
+        else:
+            link_indexb = cistring.gen_linkstr_index(range(norb), nelecb)
+
+        with lib.temporary_env(self, wfnsym=wfnsym_ir):
             e, c = direct_spin1.kernel_ms1(self, h1e, eri, norb, nelec, ci0,
                                            (link_indexa,link_indexb),
                                            tol, lindep, max_cycle, max_space,
                                            nroots, davidson_only, pspace_size,
                                            ecore=ecore, **kwargs)
 
+        na = link_indexa.shape[0]
+        nb = link_indexb.shape[0]
+        s_idx = np.hstack(self.sym_allowed_idx)
+        if nroots > 1:
+            c, c_raw = [], c
+            for vec in c_raw:
+                c1 = np.zeros(na*nb)
+                c1[s_idx] = vec.T
+                c.append(c1)
+        else:
+            c1 = np.zeros(na*nb)
+            c1[s_idx] = c
+            c = c1.reshape(na, nb).view(direct_spin1.FCIvector)
+
         self.eci, self.ci = e, c
         return e, c
 
diff --git a/pyscf/fci/direct_spin1_symm.py b/pyscf/fci/direct_spin1_symm.py
index cf728a61e7..c59b4f096f 100644
--- a/pyscf/fci/direct_spin1_symm.py
+++ b/pyscf/fci/direct_spin1_symm.py
@@ -34,6 +34,7 @@
 import sys
 import ctypes
 import numpy
+import numpy as np
 from pyscf import ao2mo
 from pyscf import lib
 from pyscf.lib import logger
@@ -46,13 +47,10 @@
 from pyscf.fci.addons import _unpack_nelec
 from pyscf import __config__
 
-libfci = lib.load_library('libfci')
+libfci = direct_spin1.libfci
 
 TOTIRREPS = 8
 
-def contract_1e(f1e, fcivec, norb, nelec, link_index=None, orbsym=None):
-    return direct_spin1.contract_1e(f1e, fcivec, norb, nelec, link_index)
-
 # Note eri is NOT the 2e hamiltonian matrix, the 2e hamiltonian is
 # h2e = eri_{pq,rs} p^+ q r^+ s
 #     = (pq|rs) p^+ r^+ s q - (pq|rs) \delta_{qr} p^+ s
@@ -70,69 +68,77 @@ def contract_2e(eri, fcivec, norb, nelec, link_index=None, orbsym=None, wfnsym=0
     link_indexa, link_indexb = direct_spin1._unpack(norb, nelec, link_index)
     na, nlinka = link_indexa.shape[:2]
     nb, nlinkb = link_indexb.shape[:2]
-    eri_irs, rank_eri, irrep_eri = reorder_eri(eri, norb, orbsym)
 
+    eri_irs, rank_eri, irrep_eri = reorder_eri(eri, norb, orbsym)
     strsa = cistring.gen_strings4orblist(range(norb), neleca)
     aidx, link_indexa = gen_str_irrep(strsa, orbsym, link_indexa, rank_eri, irrep_eri)
+    nas = np.array([x.size for x in aidx], dtype=np.int32)
     if neleca == nelecb:
         bidx, link_indexb = aidx, link_indexa
+        nbs = nas
     else:
         strsb = cistring.gen_strings4orblist(range(norb), nelecb)
         bidx, link_indexb = gen_str_irrep(strsb, orbsym, link_indexb, rank_eri, irrep_eri)
+        nbs = np.array([x.size for x in bidx], dtype=np.int32)
 
-    Tirrep = ctypes.c_void_p*TOTIRREPS
-    linka_ptr = Tirrep(*[x.ctypes.data_as(ctypes.c_void_p) for x in link_indexa])
-    linkb_ptr = Tirrep(*[x.ctypes.data_as(ctypes.c_void_p) for x in link_indexb])
-    eri_ptrs = Tirrep(*[x.ctypes.data_as(ctypes.c_void_p) for x in eri_irs])
-    dimirrep = (ctypes.c_int*TOTIRREPS)(*[x.shape[0] for x in eri_irs])
-    fcivec_shape = fcivec.shape
-    fcivec = fcivec.reshape((na,nb), order='C')
-    ci1new = numpy.zeros_like(fcivec)
-    nas = (ctypes.c_int*TOTIRREPS)(*[x.size for x in aidx])
-    nbs = (ctypes.c_int*TOTIRREPS)(*[x.size for x in bidx])
-
-    # aa, ab
-    ci0 = []
-    ci1 = []
-    wfnsym_in_d2h = wfnsym % 10
-    for ir in range(TOTIRREPS):
-        ma, mb = aidx[ir].size, bidx[wfnsym_in_d2h ^ ir].size
-        ci0.append(numpy.zeros((ma,mb)))
-        ci1.append(numpy.zeros((ma,mb)))
-        if ma > 0 and mb > 0:
-            lib.take_2d(fcivec, aidx[ir], bidx[wfnsym_in_d2h ^ ir], out=ci0[ir])
-    ci0_ptrs = Tirrep(*[x.ctypes.data_as(ctypes.c_void_p) for x in ci0])
-    ci1_ptrs = Tirrep(*[x.ctypes.data_as(ctypes.c_void_p) for x in ci1])
-    libfci.FCIcontract_2e_symm1(eri_ptrs, ci0_ptrs, ci1_ptrs,
-                                ctypes.c_int(norb), nas, nbs,
-                                ctypes.c_int(nlinka), ctypes.c_int(nlinkb),
-                                linka_ptr, linkb_ptr, dimirrep,
-                                ctypes.c_int(wfnsym_in_d2h))
-    for ir in range(TOTIRREPS):
-        if ci0[ir].size > 0:
-            lib.takebak_2d(ci1new, ci1[ir], aidx[ir], bidx[wfnsym_in_d2h ^ ir])
+    eri_ir_dims = np.array([x.shape[0] for x in eri_irs], dtype=np.int32)
+    eri_irs = np.hstack([x.ravel() for x in eri_irs])
 
-    # bb, ba
-    ci0T = []
-    for ir in range(TOTIRREPS):
-        mb, ma = bidx[ir].size, aidx[wfnsym_in_d2h ^ ir].size
-        ci0T.append(numpy.zeros((mb,ma)))
-        if ma > 0 and mb > 0:
-            lib.transpose(ci0[wfnsym_in_d2h ^ ir], out=ci0T[ir])
-    ci0, ci0T = ci0T, None
-    ci1 = [numpy.zeros_like(x) for x in ci0]
-    ci0_ptrs = Tirrep(*[x.ctypes.data_as(ctypes.c_void_p) for x in ci0])
-    ci1_ptrs = Tirrep(*[x.ctypes.data_as(ctypes.c_void_p) for x in ci1])
-    libfci.FCIcontract_2e_symm1(eri_ptrs, ci0_ptrs, ci1_ptrs,
-                                ctypes.c_int(norb), nbs, nas,
-                                ctypes.c_int(nlinkb), ctypes.c_int(nlinka),
-                                linkb_ptr, linka_ptr, dimirrep,
-                                ctypes.c_int(wfnsym_in_d2h))
-    for ir in range(TOTIRREPS):
-        if ci0[ir].size > 0:
-            lib.takebak_2d(ci1new, lib.transpose(ci1[ir]),
-                           aidx[wfnsym_in_d2h ^ ir], bidx[ir])
-    return ci1new.reshape(fcivec_shape).view(direct_spin1.FCIvector)
+    wfnsym_in_d2h = wfnsym % 10
+    orbsym_in_d2h = np.asarray(orbsym) % 10
+    max_ir = orbsym_in_d2h.max()
+    if max_ir >= 4:
+        nirreps = 8
+    elif max_ir >= 2:
+        nirreps = 4
+    elif max_ir >= 1:
+        nirreps = 2
+    else:
+        nirreps = 1
+
+    if fcivec.size == na * nb:
+        fcivec_shape = fcivec.shape
+        fcivec = fcivec.reshape((na,nb), order='C')
+        ci0 = []
+        for ir in range(nirreps):
+            ma, mb = aidx[ir].size, bidx[wfnsym_in_d2h ^ ir].size
+            ci0.append(np.zeros((ma, mb)))
+            if ma * mb > 0:
+                lib.take_2d(fcivec, aidx[ir], bidx[wfnsym_in_d2h ^ ir], out=ci0[ir])
+        ci_size = np.array([x.size for x in ci0], dtype=np.int32)
+        ci0 = np.hstack([x.ravel() for x in ci0])
+    else:
+        ci_size = []
+        for ir in range(nirreps):
+            ma, mb = aidx[ir].size, bidx[wfnsym_in_d2h ^ ir].size
+            ci_size.append(ma * mb)
+        ci_size = np.array(ci_size, dtype=np.int32)
+        ci0 = fcivec
+    ci1 = np.zeros_like(ci0)
+
+    libfci.FCIcontract_2e_symm1(
+        eri_irs.ctypes.data_as(ctypes.c_void_p),
+        ci0.ctypes.data_as(ctypes.c_void_p),
+        ci1.ctypes.data_as(ctypes.c_void_p),
+        eri_ir_dims.ctypes.data_as(ctypes.c_void_p),
+        ci_size.ctypes.data_as(ctypes.c_void_p),
+        nas.ctypes.data_as(ctypes.c_void_p),
+        nbs.ctypes.data_as(ctypes.c_void_p),
+        link_indexa.ctypes.data_as(ctypes.c_void_p),
+        link_indexb.ctypes.data_as(ctypes.c_void_p),
+        ctypes.c_int(norb), ctypes.c_int(nlinka), ctypes.c_int(nlinkb),
+        ctypes.c_int(nirreps), ctypes.c_int(wfnsym_in_d2h))
+
+    if fcivec.size == na * nb:
+        ci_loc = np.append(0, np.cumsum(ci_size))
+        ci1new = np.zeros_like(fcivec)
+        for ir in range(nirreps):
+            if ci_size[ir] > 0:
+                ma, mb = aidx[ir].size, bidx[wfnsym_in_d2h ^ ir].size
+                buf = ci1[ci_loc[ir]:ci_loc[ir+1]].reshape(ma, mb)
+                lib.takebak_2d(ci1new, buf, aidx[ir], bidx[wfnsym_in_d2h ^ ir])
+        ci1 = ci1new.reshape(fcivec_shape)
+    return ci1.view(direct_spin1.FCIvector)
 
 
 def kernel(h1e, eri, norb, nelec, ci0=None, level_shift=1e-3, tol=1e-10,
@@ -175,7 +181,7 @@ def kernel(h1e, eri, norb, nelec, ci0=None, level_shift=1e-3, tol=1e-10,
 def energy(h1e, eri, fcivec, norb, nelec, link_index=None, orbsym=None, wfnsym=0):
     h2e = direct_spin1.absorb_h1e(h1e, eri, norb, nelec) * .5
     ci1 = contract_2e(h2e, fcivec, norb, nelec, link_index, orbsym, wfnsym)
-    return numpy.dot(fcivec.ravel(), ci1.ravel())
+    return np.dot(fcivec.ravel(), ci1.ravel())
 
 def _id_wfnsym(cisolver, norb, nelec, orbsym, wfnsym):
     '''Guess wfnsym or convert wfnsym to symmetry ID if it's a symmetry label'''
@@ -196,15 +202,15 @@ def _id_wfnsym(cisolver, norb, nelec, orbsym, wfnsym):
 
 def _gen_strs_irrep(strs, orbsym):
     # % 10 to convert irrep_ids to irrep of D2h
-    orbsym_in_d2h = numpy.asarray(orbsym) % 10
-    irreps = numpy.zeros(len(strs), dtype=numpy.int32)
+    orbsym_in_d2h = np.asarray(orbsym) % 10
+    irreps = np.zeros(len(strs), dtype=np.int32)
     if isinstance(strs, cistring.OIndexList):
         nocc = strs.shape[1]
         for i in range(nocc):
             irreps ^= orbsym_in_d2h[strs[:,i]]
     else:
         for i, ir in enumerate(orbsym_in_d2h):
-            irreps[numpy.bitwise_and(strs, 1 << i) > 0] ^= ir
+            irreps[np.bitwise_and(strs, 1 << i) > 0] ^= ir
     return irreps
 
 def _get_init_guess(airreps, birreps, nroots, hdiag, nelec, orbsym, wfnsym=0):
@@ -212,21 +218,18 @@ def _get_init_guess(airreps, birreps, nroots, hdiag, nelec, orbsym, wfnsym=0):
     na = len(airreps)
     nb = len(birreps)
     hdiag = hdiag.reshape(na,nb)
-    ci0 = []
-    iroot = 0
     sym_allowed = airreps[:,None] == wfnsym ^ birreps
     if neleca == nelecb and na == nb:
-        idx = numpy.arange(na)
+        idx = np.arange(na)
         sym_allowed[idx[:,None] < idx] = False
-    idx_a, idx_b = numpy.where(sym_allowed)
-    for k in hdiag[idx_a,idx_b].argsort():
+    idx_a, idx_b = np.where(sym_allowed)
+
+    ci0 = []
+    for k in np.argpartition(hdiag[idx_a,idx_b], nroots-1)[:nroots]:
         addra, addrb = idx_a[k], idx_b[k]
-        x = numpy.zeros((na, nb))
+        x = np.zeros((na, nb))
         x[addra,addrb] = 1
         ci0.append(x.ravel().view(direct_spin1.FCIvector))
-        iroot += 1
-        if iroot >= nroots:
-            break
 
     if len(ci0) == 0:
         raise RuntimeError(f'Initial guess for symmetry {wfnsym} not found')
@@ -234,12 +237,25 @@ def _get_init_guess(airreps, birreps, nroots, hdiag, nelec, orbsym, wfnsym=0):
 
 def get_init_guess(norb, nelec, nroots, hdiag, orbsym, wfnsym=0):
     neleca, nelecb = _unpack_nelec(nelec)
-    strsa = cistring.gen_strings4orblist(range(norb), neleca)
-    airreps = birreps = _gen_strs_irrep(strsa, orbsym)
-    if neleca != nelecb:
-        strsb = cistring.gen_strings4orblist(range(norb), nelecb)
-        birreps = _gen_strs_irrep(strsb, orbsym)
-    return _get_init_guess(airreps, birreps, nroots, hdiag, nelec, orbsym, wfnsym)
+    na = cistring.num_strings(norb, neleca)
+    nb = cistring.num_strings(norb, nelecb)
+    if hdiag.size == na * nb:
+        strsa = cistring.gen_strings4orblist(range(norb), neleca)
+        airreps = birreps = _gen_strs_irrep(strsa, orbsym)
+        if neleca != nelecb:
+            strsb = cistring.gen_strings4orblist(range(norb), nelecb)
+            birreps = _gen_strs_irrep(strsb, orbsym)
+        return _get_init_guess(airreps, birreps, nroots, hdiag, nelec, orbsym, wfnsym)
+
+    ci0 = []
+    for k in np.argpartition(hdiag, nroots-1)[:nroots]:
+        x = np.zeros_like(hdiag)
+        x[k] = 1.
+        ci0.append(x.ravel().view(direct_spin1.FCIvector))
+
+    if len(ci0) == 0:
+        raise RuntimeError(f'Initial guess for symmetry {wfnsym} not found')
+    return ci0
 
 def get_init_guess_cyl_sym(norb, nelec, nroots, hdiag, orbsym, wfnsym=0):
     neleca, nelecb = _unpack_nelec(nelec)
@@ -266,9 +282,9 @@ def get_init_guess_cyl_sym(norb, nelec, nroots, hdiag, orbsym, wfnsym=0):
     # total angular momentum == wfn_momentum
     sym_allowed &= a_ls[:,None] == wfn_momentum - b_ls
     if neleca == nelecb and na == nb:
-        idx = numpy.arange(na)
+        idx = np.arange(na)
         sym_allowed[idx[:,None] < idx] = False
-    idx_a, idx_b = numpy.where(sym_allowed)
+    idx_a, idx_b = np.where(sym_allowed)
 
     for k in hdiag[idx_a,idx_b].argsort():
         addra, addrb = idx_a[k], idx_b[k]
@@ -294,7 +310,7 @@ def get_init_guess_cyl_sym(norb, nelec, nroots, hdiag, orbsym, wfnsym=0):
             elif wfnsym in (1, 4):  # A2g, A2u
                 x -= sign_a * sign_b * ca[:,None] * cb
                 #assert (sign_a*sign_b==1 and x.real==0) or (sign_a*sign_b==-1 and x.imag==0)
-            if numpy.linalg.norm(x.real) > 1e-6:
+            if np.linalg.norm(x.real) > 1e-6:
                 x = x.real.copy()
             else:
                 x = x.imag.copy()
@@ -306,7 +322,7 @@ def get_init_guess_cyl_sym(norb, nelec, nroots, hdiag, orbsym, wfnsym=0):
             x = ca.imag[:,None] * cb.real
             x+= ca.real[:,None] * cb.imag
 
-        norm = numpy.linalg.norm(x)
+        norm = np.linalg.norm(x)
         if norm < 1e-3:
             continue
         x *= 1./norm
@@ -324,15 +340,15 @@ def _cyl_sym_csf2civec(strs, addr, orbsym, degen_mapping):
     transformation  addons.transform_ci(civec, (0, nelec), u)
     '''
     norb = orbsym.size
-    one_particle_strs = numpy.asarray([1 << i for i in range(norb)])
+    one_particle_strs = np.asarray([1 << i for i in range(norb)])
     occ_masks = (strs[:,None] & one_particle_strs) != 0
     na = strs.size
-    occ_idx_all_strs = numpy.where(occ_masks)[1].reshape(na,-1)
+    occ_idx_all_strs = np.where(occ_masks)[1].reshape(na,-1)
 
     u = _cyl_sym_orbital_rotation(orbsym, degen_mapping)
     ui = u[occ_masks[addr]].T.copy()
     minors = ui[occ_idx_all_strs]
-    civec = numpy.linalg.det(minors)
+    civec = np.linalg.det(minors)
     return civec
 
 def _cyl_sym_orbital_rotation(orbsym, degen_mapping):
@@ -340,7 +356,7 @@ def _cyl_sym_orbital_rotation(orbsym, degen_mapping):
     |Ex/Ey> = |E(+/-)> * u
     '''
     norb = orbsym.size
-    u = numpy.zeros((norb, norb), dtype=numpy.complex128)
+    u = np.zeros((norb, norb), dtype=np.complex128)
     sqrth = .5**.5
     sqrthi = sqrth * 1j
     for i, j in enumerate(degen_mapping):
@@ -371,69 +387,73 @@ def _sv_associated_det(ci_str, degen_mapping):
     return cistring.str2addr(degen_mapping.size, nelec, ci_str1), sign
 
 def _strs_angular_momentum(strs, orbsym):
-    # angular momentum for each orbitals
+    # angular momentum for each orbital
     orb_l = (orbsym // 10) * 2
-    e1_mask = numpy.isin(orbsym % 10, (2, 3, 6, 7))
+    e1_mask = np.isin(orbsym % 10, (2, 3, 6, 7))
     orb_l[e1_mask] += 1
-    ey_mask = numpy.isin(orbsym % 10, (1, 3, 4, 6))
+    ey_mask = np.isin(orbsym % 10, (1, 3, 4, 6))
     orb_l[ey_mask] *= -1
 
     # total angular for each determinant (CSF)
-    ls = numpy.zeros(len(strs), dtype=int)
+    ls = np.zeros(len(strs), dtype=int)
     if isinstance(strs, cistring.OIndexList):
         nocc = strs.shape[1]
         for i in range(nocc):
             ls += orb_l[strs[:,i]]
     else:
         for i, l in enumerate(orb_l):
-            ls[numpy.bitwise_and(strs, 1 << i) > 0] += l
+            ls[np.bitwise_and(strs, 1 << i) > 0] += l
     return ls
 
 def reorder_eri(eri, norb, orbsym):
     if orbsym is None:
-        return [eri], numpy.arange(norb), numpy.zeros(norb,dtype=numpy.int32)
+        return [eri], np.arange(norb), np.zeros(norb,dtype=np.int32)
 
     # % 10 to map irrep IDs of Dooh or Coov, etc. to irreps of D2h, C2v
-    orbsym = numpy.asarray(orbsym) % 10
+    orbsym = np.asarray(orbsym) % 10
 
     # irrep of (ij| pair
-    trilirrep = (orbsym[:,None] ^ orbsym)[numpy.tril_indices(norb)]
+    trilirrep = (orbsym[:,None] ^ orbsym)[np.tril_indices(norb)]
     # and the number of occurence for each irrep
-    dimirrep = numpy.asarray(numpy.bincount(trilirrep), dtype=numpy.int32)
+    dimirrep = np.asarray(np.bincount(trilirrep), dtype=np.int32)
     # we sort the irreps of (ij| pair, to group the pairs which have same irreps
     # "order" is irrep-id-sorted index. The (ij| paired is ordered that the
     # pair-id given by order[0] comes first in the sorted pair
     # "rank" is a sorted "order". Given nth (ij| pair, it returns the place(rank)
     # of the sorted pair
-    old_eri_irrep = numpy.asarray(trilirrep, dtype=numpy.int32)
-    rank_in_irrep = numpy.empty_like(old_eri_irrep)
-    p0 = 0
-    eri_irs = [numpy.zeros((0,0))] * TOTIRREPS
+    old_eri_irrep = np.asarray(trilirrep, dtype=np.int32)
+    rank_in_irrep = np.empty_like(old_eri_irrep)
+    eri_irs = [np.zeros((0,0))] * TOTIRREPS
     for ir, nnorb in enumerate(dimirrep):
-        idx = numpy.asarray(numpy.where(trilirrep == ir)[0], dtype=numpy.int32)
-        rank_in_irrep[idx] = numpy.arange(nnorb, dtype=numpy.int32)
+        idx = np.asarray(np.where(trilirrep == ir)[0], dtype=np.int32)
+        rank_in_irrep[idx] = np.arange(nnorb, dtype=np.int32)
         eri_ir = lib.take_2d(eri, idx, idx)
         # Drop small integrals which may break symmetry?
         #eri_ir[abs(eri_ir) < 1e-13] = 0
         eri_irs[ir] = eri_ir
-        p0 += nnorb
     return eri_irs, rank_in_irrep, old_eri_irrep
 
-def gen_str_irrep(strs, orbsym, link_index, rank_eri, irrep_eri):
+def argsort_strs_by_irrep(strs, orbsym):
     airreps = _gen_strs_irrep(strs, orbsym)
-    na = len(airreps)
-    rank = numpy.zeros(na, dtype=numpy.int32)
-    aidx = [numpy.zeros(0,dtype=numpy.int32)] * TOTIRREPS
+    aidx = [np.zeros(0,dtype=np.int32)] * TOTIRREPS
     for ir in range(TOTIRREPS):
-        aidx[ir] = numpy.where(airreps == ir)[0]
-        ma = len(aidx[ir])
-        if ma > 0:
-            rank[aidx[ir]] = numpy.arange(ma, dtype=numpy.int32)
+        aidx[ir] = np.where(airreps == ir)[0]
+    return aidx
+
+def gen_str_irrep(strs, orbsym, link_index, rank_eri, irrep_eri):
+    aidx = argsort_strs_by_irrep(strs, orbsym)
+    na = len(strs)
+    rank = np.zeros(na, dtype=np.int32)
+    for idx in aidx:
+        if idx.size > 0:
+            rank[idx] = np.arange(idx.size, dtype=np.int32)
+
     link_index = link_index.copy()
+    link_index[:,:,2] = rank[link_index[:,:,2]]
     link_index[:,:,1] = irrep_eri[link_index[:,:,0]]
     link_index[:,:,0] = rank_eri[link_index[:,:,0]]
-    link_index[:,:,2] = rank[link_index[:,:,2]]
-    link_index = [link_index.take(aidx[ir], axis=0) for ir in range(TOTIRREPS)]
+
+    link_index = link_index.take(np.hstack(aidx), axis=0)
     return aidx, link_index
 
 def _guess_wfnsym_cyl_sym(civec, strsa, strsb, orbsym):
@@ -453,8 +473,8 @@ def _guess_wfnsym_cyl_sym(civec, strsa, strsb, orbsym):
         ca1 = _cyl_sym_csf2civec(strsa, addra1, orbsym, degen_mapping)
     if addrb != addrb1:
         cb1 = _cyl_sym_csf2civec(strsb, addrb1, orbsym, degen_mapping)
-    ua = numpy.stack([ca, ca1])
-    ub = numpy.stack([cb, cb1])
+    ua = np.stack([ca, ca1])
+    ub = np.stack([cb, cb1])
     # civec is in the Ex/Ey basis. Transform the largest coefficient to
     # (E+)/(E-) basis.
     c_max = ua.conj().dot(civec.reshape(na,nb)).dot(ub.conj().T)
@@ -537,7 +557,7 @@ def guess_wfnsym(solver, norb, nelec, fcivec=None, orbsym=None, wfnsym=None, **k
             strsa = strsb = cistring.gen_strings4orblist(range(norb), neleca)
             if neleca != nelecb:
                 strsb = cistring.gen_strings4orblist(range(norb), nelecb)
-            if not isinstance(fcivec, numpy.ndarray) or fcivec.ndim > 2:
+            if not isinstance(fcivec, np.ndarray) or fcivec.ndim > 2:
                 fcivec = fcivec[0]
             wfnsym = _guess_wfnsym_cyl_sym(fcivec, strsa, strsb, orbsym)
         else:
@@ -551,7 +571,7 @@ def guess_wfnsym(solver, norb, nelec, fcivec=None, orbsym=None, wfnsym=None, **k
             strsb = cistring.gen_strings4orblist(range(norb), nelecb)
 
         if groupname in ('Dooh', 'Coov'):
-            if not isinstance(fcivec, numpy.ndarray) or fcivec.ndim > 2:
+            if not isinstance(fcivec, np.ndarray) or fcivec.ndim > 2:
                 fcivec = fcivec[0]
             wfnsym1 = _guess_wfnsym_cyl_sym(fcivec, strsa, strsb, orbsym)
             if wfnsym1 != _id_wfnsym(solver, norb, nelec, orbsym, wfnsym):
@@ -560,43 +580,55 @@ def guess_wfnsym(solver, norb, nelec, fcivec=None, orbsym=None, wfnsym=None, **k
             wfnsym = wfnsym1
         else:
             na, nb = strsa.size, strsb.size
-            orbsym_in_d2h = numpy.asarray(orbsym) % 10
-            airreps = numpy.zeros(na, dtype=numpy.int32)
-            birreps = numpy.zeros(nb, dtype=numpy.int32)
+            orbsym_in_d2h = np.asarray(orbsym) % 10
+            airreps = np.zeros(na, dtype=np.int32)
+            birreps = np.zeros(nb, dtype=np.int32)
             for i, ir in enumerate(orbsym_in_d2h):
-                airreps[numpy.bitwise_and(strsa, 1 << i) > 0] ^= ir
-                birreps[numpy.bitwise_and(strsb, 1 << i) > 0] ^= ir
+                airreps[np.bitwise_and(strsa, 1 << i) > 0] ^= ir
+                birreps[np.bitwise_and(strsb, 1 << i) > 0] ^= ir
 
             wfnsym = _id_wfnsym(solver, norb, nelec, orbsym, wfnsym)
             groupname = getattr(solver.mol, 'groupname', None)
             mask = airreps[:,None] == (wfnsym % 10) ^ birreps
 
-            if isinstance(fcivec, numpy.ndarray) and fcivec.ndim <= 2:
+            if isinstance(fcivec, np.ndarray) and fcivec.ndim <= 2:
                 fcivec = [fcivec]
             if all(abs(c.reshape(na, nb)[mask]).max() < 1e-5 for c in fcivec):
                 raise RuntimeError('Input wfnsym {wfnsym} is not consistent with '
                                    'fcivec coefficients')
-
     return wfnsym
 
+def sym_allowed_indices(nelec, orbsym, wfnsym):
+    '''Indices of symmetry allowed determinants for each irrep'''
+    norb = orbsym.size
+    neleca, nelecb = nelec
+    strsa = strsb = cistring.gen_strings4orblist(range(norb), neleca)
+    aidx = bidx = argsort_strs_by_irrep(strsa, orbsym)
+    if neleca != nelecb:
+        strsb = cistring.gen_strings4orblist(range(norb), nelecb)
+        bidx = argsort_strs_by_irrep(strsb, orbsym)
+    nb = len(strsb)
+    wfnsym_in_d2h = wfnsym % 10
+    ab_idx = [(aidx[ir][:,None] * nb + bidx[wfnsym_in_d2h ^ ir]).ravel()
+              for ir in range(TOTIRREPS)]
+    return ab_idx
 
 class FCISolver(direct_spin1.FCISolver):
 
-    davidson_only = getattr(__config__, 'fci_direct_spin1_symm_FCI_davidson_only', True)
-    # pspace may break point group symmetry
-    pspace_size = getattr(__config__, 'fci_direct_spin1_symm_FCI_pspace_size', 0)
+    pspace_size = getattr(__config__, 'fci_direct_spin1_symm_FCI_pspace_size', 400)
 
     def __init__(self, mol=None, **kwargs):
-        direct_spin1.FCISolver.__init__(self, mol, **kwargs)
         # wfnsym will be guessed based on initial guess if it is None
         self.wfnsym = None
+        self.sym_allowed_idx = None
+        direct_spin1.FCISolver.__init__(self, mol, **kwargs)
 
     def dump_flags(self, verbose=None):
         direct_spin1.FCISolver.dump_flags(self, verbose)
         log = logger.new_logger(self, verbose)
         if isinstance(self.wfnsym, str):
             log.info('Input CI wfn symmetry = %s', self.wfnsym)
-        elif isinstance(self.wfnsym, (int, numpy.number)):
+        elif isinstance(self.wfnsym, (int, np.number)):
             groupname = getattr(self.mol, 'groupname', None)
             if groupname is not None:
                 try:
@@ -611,21 +643,54 @@ def dump_flags(self, verbose=None):
             log.info('CI wfn symmetry = %s', self.wfnsym)
         return self
 
-    def absorb_h1e(self, h1e, eri, norb, nelec, fac=1):
-        nelec = _unpack_nelec(nelec, self.spin)
-        return direct_spin1.absorb_h1e(h1e, eri, norb, nelec, fac)
+    absorb_h1e = direct_spin1.FCISolver.absorb_h1e
 
-    def make_hdiag(self, h1e, eri, norb, nelec):
+    def make_hdiag(self, h1e, eri, norb, nelec, compress=False):
         nelec = _unpack_nelec(nelec, self.spin)
-        return direct_spin1.make_hdiag(h1e, eri, norb, nelec)
+        hdiag = direct_spin1.make_hdiag(h1e, eri, norb, nelec)
+        # TODO: hdiag should return symmetry allowed elements only. However,
+        # get_init_guess_cyl_sym does not strictly follow the D2h (and subgroup)
+        # symmetry treatments. The diagonal of entire Hamiltonian is required.
+        if compress and self.sym_allowed_idx is not None:
+            hdiag = hdiag.ravel()[np.hstack(self.sym_allowed_idx)]
+        return hdiag
 
     def pspace(self, h1e, eri, norb, nelec, hdiag, np=400):
         nelec = _unpack_nelec(nelec, self.spin)
-        return direct_spin1.pspace(h1e, eri, norb, nelec, hdiag, np)
+        na = cistring.num_strings(norb, nelec[0])
+        nb = cistring.num_strings(norb, nelec[1])
+        s_idx = numpy.hstack(self.sym_allowed_idx)
+        if hdiag.size == s_idx.size:
+            hdiag, hdiag0 = numpy.empty(na*nb), hdiag.ravel()
+            hdiag[:] = 1e9
+            hdiag[s_idx] = hdiag0
+        elif not getattr(self.mol, 'groupname', None) in ('Dooh', 'Coov'):
+            # Screen symmetry forbidden elements
+            hdiag, hdiag0 = numpy.empty(na*nb), hdiag.ravel()
+            hdiag[:] = 1e9
+            hdiag[s_idx] = hdiag0[s_idx]
+
+        np = min(np, hdiag.size)
+        addr0, h = direct_spin1.pspace(h1e, eri, norb, nelec, hdiag, np)
+
+        # mapping the address in (na,nb) civec to address in sym-allowed civec
+        addr0_sym_allow = numpy.where(numpy.isin(addr0, s_idx))[0]
+        addr0 = addr0[addr0_sym_allow]
+        s_idx_allowed = numpy.where(numpy.isin(s_idx, addr0))[0]
+        addr1 = s_idx[s_idx_allowed]
+        new_idx = numpy.empty_like(s_idx_allowed)
+        new_idx[addr0.argsort()] = addr1.argsort()
+        addr = s_idx_allowed[new_idx]
+        return addr, h[addr0_sym_allow[:,None],addr0_sym_allow]
 
     def contract_1e(self, f1e, fcivec, norb, nelec, link_index=None, **kwargs):
-        nelec = _unpack_nelec(nelec, self.spin)
-        return contract_1e(f1e, fcivec, norb, nelec, link_index, **kwargs)
+        nelec = direct_spin1._unpack_nelec(nelec)
+        na = cistring.num_strings(norb, nelec[0])
+        nb = cistring.num_strings(norb, nelec[1])
+        if fcivec.size != na * nb:
+            fcivec, ci0 = np.zeros(na*nb), fcivec
+            fcivec[np.hstack(self.sym_allowed_idx)] = ci0
+        return direct_spin1.contract_1e(f1e, fcivec, norb, nelec, link_index, **kwargs)
 
     def contract_2e(self, eri, fcivec, norb, nelec, link_index=None,
                     orbsym=None, wfnsym=None, **kwargs):
@@ -635,13 +700,31 @@ def contract_2e(self, eri, fcivec, norb, nelec, link_index=None,
         nelec = _unpack_nelec(nelec, self.spin)
         return contract_2e(eri, fcivec, norb, nelec, link_index, orbsym, wfnsym, **kwargs)
 
-    def get_init_guess(self, norb, nelec, nroots, hdiag):
-        wfnsym = _id_wfnsym(self, norb, nelec, self.orbsym, self.wfnsym)
+    def contract_ss(self, fcivec, norb, nelec):
+        nelec = direct_spin1._unpack_nelec(nelec)
+        na = cistring.num_strings(norb, nelec[0])
+        nb = cistring.num_strings(norb, nelec[1])
+        if fcivec.size == na*nb:
+            return contract_ss(fcivec, norb, nelec)
+
+        fcivec, ci0 = np.zeros(na*nb), fcivec
+        s_idx = np.hstack(self.sym_allowed_idx)
+        fcivec[s_idx] = ci0
+        ci1 = contract_ss(fcivec, norb, nelec)
+        return ci1.ravel()[s_idx]
+
+    def get_init_guess(self, norb, nelec, nroots, hdiag, orbsym=None, wfnsym=None):
+        if orbsym is None: orbsym = self.orbsym
+        if wfnsym is None:
+            wfnsym = _id_wfnsym(self, norb, nelec, orbsym, self.wfnsym)
+        s_idx = np.hstack(self.sym_allowed_idx)
         if getattr(self.mol, 'groupname', None) in ('Dooh', 'Coov'):
-            return get_init_guess_cyl_sym(
-                norb, nelec, nroots, hdiag, self.orbsym, wfnsym)
+            ci0 = get_init_guess_cyl_sym(
+                norb, nelec, nroots, hdiag, orbsym, wfnsym)
+            return [x[s_idx] for x in ci0]
         else:
-            return get_init_guess(norb, nelec, nroots, hdiag, self.orbsym, wfnsym)
+            return get_init_guess(norb, nelec, nroots, hdiag.ravel()[s_idx],
+                                  orbsym, wfnsym)
 
     guess_wfnsym = guess_wfnsym
 
@@ -653,18 +736,31 @@ def kernel(self, h1e, eri, norb, nelec, ci0=None,
         if orbsym is None: orbsym = self.orbsym
         if wfnsym is None: wfnsym = self.wfnsym
         if self.verbose >= logger.WARN:
+            if 'verbose' not in kwargs:
+                kwargs['verbose'] = self.verbose
             self.check_sanity()
         self.norb = norb
-        self.nelec = nelec
-
-        if (not hasattr(orbsym, 'degen_mapping') and
-            getattr(self.mol, 'groupname', None) in ('Dooh', 'Coov')):
-            degen_mapping = map_degeneracy(h1e.diagonal(), orbsym)
-            orbsym = lib.tag_array(orbsym, degen_mapping=degen_mapping)
-
-        wfnsym = self.guess_wfnsym(norb, nelec, ci0, orbsym, wfnsym, **kwargs)
+        self.nelec = nelec = _unpack_nelec(nelec, self.spin)
+        link_index = direct_spin1._unpack(norb, nelec, None)
 
-        if wfnsym > 7:
+        if getattr(self.mol, 'groupname', None) in ('Dooh', 'Coov'):
+            if not hasattr(orbsym, 'degen_mapping'):
+                degen_mapping = map_degeneracy(h1e.diagonal(), orbsym)
+                orbsym = lib.tag_array(orbsym, degen_mapping=degen_mapping)
+            if davidson_only is None:
+                davidson_only = True
+
+        wfnsym_ir = self.guess_wfnsym(norb, nelec, ci0, orbsym, wfnsym, **kwargs)
+        self.sym_allowed_idx = sym_allowed_indices(nelec, orbsym, wfnsym_ir)
+        s_idx = np.hstack(self.sym_allowed_idx)
+        self.orbsym = orbsym
+        logger.debug(self, 'Num symmetry allowed elements %d',
+                     sum([x.size for x in self.sym_allowed_idx]))
+        if s_idx.size == 0:
+            raise RuntimeError(
+                f'Symmetry allowed determinants not found for wfnsym {wfnsym}')
+
+        if wfnsym_ir > 7:
             # Symmetry broken for Dooh and Coov groups is often observed.
             # A larger max_space is helpful to reduce the error. Also it is
             # hard to converge to high precision.
@@ -673,11 +769,33 @@ def kernel(self, h1e, eri, norb, nelec, ci0=None,
             if tol is None and self.conv_tol == FCISolver.conv_tol:
                 tol = 1e-7
 
-        with lib.temporary_env(self, orbsym=orbsym, wfnsym=wfnsym):
-            e, c = direct_spin1.kernel_ms1(self, h1e, eri, norb, nelec, ci0, None,
-                                           tol, lindep, max_cycle, max_space,
-                                           nroots, davidson_only, pspace_size,
-                                           ecore=ecore, **kwargs)
+        if ci0 is None and getattr(self.mol, 'groupname', None) in ('Dooh', 'Coov'):
+            # self.hdiag returns stripped H_diag (for D2h symmetry).
+            # Different convention of symmetry representations were used in
+            # get_init_guess_cyl_sym (which follows direct_spin1_cyl_sym.py).
+            # Some symmetry forbidden elements for D2h are needed in
+            # get_init_guess_cyl_sym function. Thus the entire hdiag is computed.
+            hdiag = self.make_hdiag(h1e, eri, norb, nelec, compress=False)
+            ci0 = self.get_init_guess(norb, nelec, nroots, hdiag, orbsym, wfnsym_ir)
+
+        with lib.temporary_env(self, wfnsym=wfnsym_ir):
+            e, c = direct_spin1.kernel_ms1(
+                self, h1e, eri, norb, nelec, ci0, link_index, tol, lindep, max_cycle,
+                max_space, nroots, davidson_only, pspace_size, ecore=ecore, **kwargs)
+
+        na = link_index[0].shape[0]
+        nb = link_index[1].shape[0]
+        if nroots > 1:
+            c, c_raw = [], c
+            for vec in c_raw:
+                c1 = np.zeros(na*nb)
+                c1[s_idx] = vec.T
+                c.append(c1.reshape(na, nb).view(direct_spin1.FCIvector))
+        else:
+            c1 = np.zeros(na*nb)
+            c1[s_idx] = c
+            c = c1.reshape(na, nb).view(direct_spin1.FCIvector)
+
         self.eci, self.ci = e, c
         return e, c
 
diff --git a/pyscf/fci/direct_uhf.py b/pyscf/fci/direct_uhf.py
index a67e6135f4..5cd928e75e 100644
--- a/pyscf/fci/direct_uhf.py
+++ b/pyscf/fci/direct_uhf.py
@@ -37,8 +37,9 @@
 from pyscf import ao2mo
 from pyscf.fci import cistring
 from pyscf.fci import direct_spin1
+from pyscf.fci.spin_op import spin_square
 
-libfci = lib.load_library('libfci')
+libfci = direct_spin1.libfci
 
 # When the spin-orbitals do not have the degeneracy on spacial part,
 # there is only one version of FCI which is close to _spin1 solver.
@@ -122,7 +123,7 @@ def contract_2e_hubbard(u, fcivec, norb, nelec, opt=None):
             fcinew[:,maskb] += u_bb * fcivec[:,maskb]
     return fcinew.view(direct_spin1.FCIvector)
 
-def make_hdiag(h1e, eri, norb, nelec):
+def make_hdiag(h1e, eri, norb, nelec, compress=False):
     neleca, nelecb = direct_spin1._unpack_nelec(nelec)
     h1e_a = numpy.ascontiguousarray(h1e[0])
     h1e_b = numpy.ascontiguousarray(h1e[1])
@@ -130,9 +131,9 @@ def make_hdiag(h1e, eri, norb, nelec):
     g2e_ab = ao2mo.restore(1, eri[1], norb)
     g2e_bb = ao2mo.restore(1, eri[2], norb)
 
-    occslsta = occslstb = cistring._gen_occslst(range(norb), neleca)
+    occslsta = occslstb = cistring.gen_occslst(range(norb), neleca)
     if neleca != nelecb:
-        occslstb = cistring._gen_occslst(range(norb), nelecb)
+        occslstb = cistring.gen_occslst(range(norb), nelecb)
     na = len(occslsta)
     nb = len(occslstb)
 
@@ -187,7 +188,10 @@ def pspace(h1e, eri, norb, nelec, hdiag=None, np=400):
     g2e_ab = ao2mo.restore(1, eri[1], norb)
     g2e_bb = ao2mo.restore(1, eri[2], norb)
     if hdiag is None:
-        hdiag = make_hdiag(h1e, eri, norb, nelec)
+        hdiag = make_hdiag(h1e, eri, norb, nelec, compress=False)
+    na = cistring.num_strings(norb, neleca)
+    nb = cistring.num_strings(norb, nelecb)
+    assert hdiag.size == na * nb
     if hdiag.size < np:
         addr = numpy.arange(hdiag.size)
     else:
@@ -195,7 +199,6 @@ def pspace(h1e, eri, norb, nelec, hdiag=None, np=400):
             addr = numpy.argpartition(hdiag, np-1)[:np]
         except AttributeError:
             addr = numpy.argsort(hdiag)[:np]
-    nb = cistring.num_strings(norb, nelecb)
     addra = addr // nb
     addrb = addr % nb
     stra = cistring.addrs2str(norb, neleca, addra)
@@ -234,25 +237,20 @@ def energy(h1e, eri, fcivec, norb, nelec, link_index=None):
     return numpy.dot(fcivec.reshape(-1), ci1.reshape(-1))
 
 # dm_pq = <|p^+ q|>
-def make_rdm1s(fcivec, norb, nelec, link_index=None):
-    return direct_spin1.make_rdm1s(fcivec, norb, nelec, link_index)
+make_rdm1s = direct_spin1.make_rdm1s
 
 # spacial part of DM, dm_pq = <|p^+ q|>
 def make_rdm1(fcivec, norb, nelec, link_index=None):
     raise ValueError('Spin trace for UHF-FCI density matrices.')
 
-def make_rdm12s(fcivec, norb, nelec, link_index=None, reorder=True):
-    return direct_spin1.make_rdm12s(fcivec, norb, nelec, link_index, reorder)
-
-def trans_rdm1s(cibra, ciket, norb, nelec, link_index=None):
-    return direct_spin1.trans_rdm1s(cibra, ciket, norb, nelec, link_index)
+make_rdm12s = direct_spin1.make_rdm12s
+trans_rdm1s = direct_spin1.trans_rdm1s
 
 # spacial part of DM
 def trans_rdm1(cibra, ciket, norb, nelec, link_index=None):
     raise ValueError('Spin trace for UHF-FCI density matrices.')
 
-def trans_rdm12s(cibra, ciket, norb, nelec, link_index=None, reorder=True):
-    return direct_spin1.trans_rdm12s(cibra, ciket, norb, nelec, link_index, reorder)
+trans_rdm12s = direct_spin1.trans_rdm12s
 
 
 ###############################################################
@@ -261,81 +259,14 @@ def trans_rdm12s(cibra, ciket, norb, nelec, link_index=None, reorder=True):
 
 class FCISolver(direct_spin1.FCISolver):
 
-    def absorb_h1e(self, h1e, eri, norb, nelec, fac=1):
-        return absorb_h1e(h1e, eri, norb, nelec, fac)
-
-    def make_hdiag(self, h1e, eri, norb, nelec):
-        return make_hdiag(h1e, eri, norb, nelec)
-
-    def pspace(self, h1e, eri, norb, nelec, hdiag, np=400):
-        return pspace(h1e, eri, norb, nelec, hdiag, np)
+    absorb_h1e = staticmethod(absorb_h1e)
+    make_hdiag = staticmethod(make_hdiag)
+    pspace = staticmethod(pspace)
 
-    def contract_1e(self, f1e, fcivec, norb, nelec, link_index=None, **kwargs):
-        return contract_1e(f1e, fcivec, norb, nelec, link_index, **kwargs)
-
-    def contract_2e(self, eri, fcivec, norb, nelec, link_index=None, **kwargs):
-        return contract_2e(eri, fcivec, norb, nelec, link_index, **kwargs)
-
-    def spin_square(self, fcivec, norb, nelec):
-        from pyscf.fci import spin_op
-        return spin_op.spin_square(fcivec, norb, nelec)
-
-    def make_rdm1(self, cibra, ciket, norb, nelec, link_index=None):
-        return trans_rdm1(cibra, ciket, norb, nelec, link_index)
-
-    def trans_rdm1(self, cibra, ciket, norb, nelec, link_index=None):
-        return trans_rdm1(cibra, ciket, norb, nelec, link_index)
+    contract_1e = lib.module_method(contract_1e)
+    contract_2e = lib.module_method(contract_2e)
+    make_rdm1 = lib.module_method(make_rdm1)
+    trans_rdm1 = lib.module_method(trans_rdm1)
+    spin_square = lib.module_method(spin_square)
 
 FCI = FCISolver
-
-if __name__ == '__main__':
-    from functools import reduce
-    from pyscf import gto
-    from pyscf import scf
-
-    mol = gto.Mole()
-    mol.verbose = 0
-    mol.output = None#"out_h2o"
-    mol.atom = [
-        ['H', ( 1.,-1.    , 0.   )],
-        ['H', ( 0.,-1.    ,-1.   )],
-        ['H', ( 1.,-0.5   ,-1.   )],
-        #['H', ( 0.,-0.5   ,-1.   )],
-        #['H', ( 0.,-0.5   ,-0.   )],
-        ['H', ( 0.,-0.    ,-1.   )],
-        ['H', ( 1.,-0.5   , 0.   )],
-        ['H', ( 0., 1.    , 1.   )],
-    ]
-
-    mol.basis = {'H': 'sto-3g'}
-    mol.charge = 1
-    mol.spin = 1
-    mol.build()
-
-    m = scf.UHF(mol)
-    ehf = m.scf()
-
-    cis = FCISolver(mol)
-    norb = m.mo_energy[0].size
-    nea = (mol.nelectron+1) // 2
-    neb = (mol.nelectron-1) // 2
-    nelec = (nea, neb)
-    mo_a = m.mo_coeff[0]
-    mo_b = m.mo_coeff[1]
-    h1e_a = reduce(numpy.dot, (mo_a.T, m.get_hcore(), mo_a))
-    h1e_b = reduce(numpy.dot, (mo_b.T, m.get_hcore(), mo_b))
-    g2e_aa = ao2mo.incore.general(m._eri, (mo_a,)*4, compact=False)
-    g2e_aa = g2e_aa.reshape(norb,norb,norb,norb)
-    g2e_ab = ao2mo.incore.general(m._eri, (mo_a,mo_a,mo_b,mo_b), compact=False)
-    g2e_ab = g2e_ab.reshape(norb,norb,norb,norb)
-    g2e_bb = ao2mo.incore.general(m._eri, (mo_b,)*4, compact=False)
-    g2e_bb = g2e_bb.reshape(norb,norb,norb,norb)
-    h1e = (h1e_a, h1e_b)
-    eri = (g2e_aa, g2e_ab, g2e_bb)
-    na = cistring.num_strings(norb, nea)
-    nb = cistring.num_strings(norb, neb)
-    numpy.random.seed(15)
-    fcivec = numpy.random.random((na,nb))
-
-    e = kernel(h1e, eri, norb, nelec)[0]
-    print(e, e - -8.65159903476)
diff --git a/pyscf/fci/fci_dhf_slow.py b/pyscf/fci/fci_dhf_slow.py
index d4fb61087e..35f00ece4e 100644
--- a/pyscf/fci/fci_dhf_slow.py
+++ b/pyscf/fci/fci_dhf_slow.py
@@ -52,7 +52,7 @@ def absorb_h1e(h1e, eri, norb, nelec, fac=1):
 
 
 def make_hdiag(h1e, eri, norb, nelec, opt=None):
-    occslist = cistring._gen_occslst(range(norb), nelec)
+    occslist = cistring.gen_occslst(range(norb), nelec)
     diagjk = numpy.einsum('iijj->ij', eri.copy(), optimize=True)
     diagjk -= numpy.einsum('ijji->ij', eri, optimize=True)
     hdiag = []
diff --git a/pyscf/fci/fci_slow.py b/pyscf/fci/fci_slow.py
index 0041d331b7..a7757e40ed 100644
--- a/pyscf/fci/fci_slow.py
+++ b/pyscf/fci/fci_slow.py
@@ -131,7 +131,7 @@ def absorb_h1e(h1e, eri, norb, nelec, fac=1):
     '''
     if not isinstance(nelec, (int, numpy.integer)):
         nelec = sum(nelec)
-    h2e = ao2mo.restore(1, eri.copy(), norb)
+    h2e = ao2mo.restore(1, eri.copy(), norb).astype(h1e.dtype, copy=False)
     f1e = h1e - numpy.einsum('jiik->jk', h2e) * .5
     f1e = f1e * (1./(nelec+1e-100))
     for k in range(norb):
@@ -147,8 +147,8 @@ def make_hdiag(h1e, eri, norb, nelec, opt=None):
     else:
         neleca, nelecb = nelec
 
-    occslista = cistring._gen_occslst(range(norb), neleca)
-    occslistb = cistring._gen_occslst(range(norb), nelecb)
+    occslista = cistring.gen_occslst(range(norb), neleca)
+    occslistb = cistring.gen_occslst(range(norb), nelecb)
     eri = ao2mo.restore(1, eri, norb)
     diagj = numpy.einsum('iijj->ij', eri)
     diagk = numpy.einsum('ijji->ij', eri)
diff --git a/pyscf/fci/rdm.py b/pyscf/fci/rdm.py
index 21dd0302cb..58be01c4fe 100644
--- a/pyscf/fci/rdm.py
+++ b/pyscf/fci/rdm.py
@@ -30,7 +30,7 @@
 from pyscf.fci import cistring
 from pyscf.fci.addons import _unpack_nelec
 
-librdm = lib.load_library('libfci')
+librdm = cistring.libfci
 
 def reorder_rdm(rdm1, rdm2, inplace=False):
     nmo = rdm1.shape[0]
diff --git a/pyscf/fci/selected_ci.py b/pyscf/fci/selected_ci.py
index c31d3e46cd..c51b8a811a 100644
--- a/pyscf/fci/selected_ci.py
+++ b/pyscf/fci/selected_ci.py
@@ -43,7 +43,7 @@
 from pyscf.fci import rdm
 from pyscf import __config__
 
-libfci = lib.load_library('libfci')
+libfci = direct_spin1.libfci
 
 @lib.with_doc(direct_spin1.contract_2e.__doc__)
 def contract_2e(eri, civec_strs, norb, nelec, link_index=None):
@@ -286,7 +286,7 @@ def gen_cre_linkstr(strs, norb, nelec):
     return link_index
 
 
-def make_hdiag(h1e, eri, ci_strs, norb, nelec):
+def make_hdiag(h1e, eri, ci_strs, norb, nelec, compress=False):
     ci_coeff, nelec, ci_strs = _unpack(None, nelec, ci_strs)
     na = len(ci_strs[0])
     nb = len(ci_strs[1])
@@ -332,7 +332,7 @@ def kernel_fixed_space(myci, h1e, eri, norb, nelec, ci_strs, ci0=None,
     h2e = ao2mo.restore(1, h2e, norb)
 
     link_index = _all_linkstr_index(ci_strs, norb, nelec)
-    hdiag = myci.make_hdiag(h1e, eri, ci_strs, norb, nelec)
+    hdiag = myci.make_hdiag(h1e, eri, ci_strs, norb, nelec, compress=True)
 
     if isinstance(ci0, SCIvector):
         if ci0.size == na*nb:
@@ -342,8 +342,10 @@ def kernel_fixed_space(myci, h1e, eri, norb, nelec, ci_strs, ci0=None,
     else:
         ci0 = myci.get_init_guess(ci_strs, norb, nelec, nroots, hdiag)
 
+    cpu0 = [logger.process_clock(), logger.perf_counter()]
     def hop(c):
         hc = myci.contract_2e(h2e, _as_SCIvector(c, ci_strs), norb, nelec, link_index)
+        cpu0[:] = log.timer_debug1('contract_2e', *cpu0)
         return hc.reshape(-1)
     precond = lambda x, e, *args: x/(hdiag-e+1e-4)
 
@@ -400,7 +402,7 @@ def kernel_float_space(myci, h1e, eri, norb, nelec, ci0=None,
         if ci0.size < nroots:
             raise RuntimeError('Not enough selected-CI space for %d states' % nroots)
         ci_strs = ci0._strs
-        hdiag = myci.make_hdiag(h1e, eri, ci_strs, norb, nelec)
+        hdiag = myci.make_hdiag(h1e, eri, ci_strs, norb, nelec, compress=True)
         ci0 = myci.get_init_guess(ci_strs, norb, nelec, nroots, hdiag)
 
     def hop(c):
@@ -422,7 +424,7 @@ def hop(c):
 
         ci0 = [c.ravel() for c in ci0]
         link_index = _all_linkstr_index(ci_strs, norb, nelec)
-        hdiag = myci.make_hdiag(h1e, eri, ci_strs, norb, nelec)
+        hdiag = myci.make_hdiag(h1e, eri, ci_strs, norb, nelec, compress=True)
         #e, ci0 = lib.davidson(hop, ci0.reshape(-1), precond, tol=float_tol)
         e, ci0 = myci.eig(hop, ci0, precond, tol=float_tol, lindep=lindep,
                           max_cycle=max_cycle, max_space=max_space, nroots=nroots,
@@ -453,7 +455,7 @@ def hop(c):
     log.debug('Extra CI in selected space %s', (len(ci_strs[0]), len(ci_strs[1])))
     ci0 = [c.ravel() for c in ci0]
     link_index = _all_linkstr_index(ci_strs, norb, nelec)
-    hdiag = myci.make_hdiag(h1e, eri, ci_strs, norb, nelec)
+    hdiag = myci.make_hdiag(h1e, eri, ci_strs, norb, nelec, compress=True)
     e, c = myci.eig(hop, ci0, precond, tol=tol, lindep=lindep,
                     max_cycle=max_cycle, max_space=max_space, nroots=nroots,
                     max_memory=max_memory, verbose=log, **kwargs)
@@ -765,8 +767,7 @@ def get_init_guess(self, ci_strs, norb, nelec, nroots, hdiag):
         ci0 = direct_spin1._get_init_guess(na, nb, nroots, hdiag, nelec)
         return [_as_SCIvector(x, ci_strs) for x in ci0]
 
-    def make_hdiag(self, h1e, eri, ci_strs, norb, nelec):
-        return make_hdiag(h1e, eri, ci_strs, norb, nelec)
+    make_hdiag = staticmethod(make_hdiag)
 
     enlarge_space = enlarge_space
     kernel = kernel_float_space
diff --git a/pyscf/fci/selected_ci_spin0.py b/pyscf/fci/selected_ci_spin0.py
index 586ec2afdd..cffca71f76 100644
--- a/pyscf/fci/selected_ci_spin0.py
+++ b/pyscf/fci/selected_ci_spin0.py
@@ -23,7 +23,7 @@
 from pyscf.fci import direct_spin1
 from pyscf.fci import selected_ci
 
-libfci = lib.load_library('libfci')
+libfci = direct_spin1.libfci
 
 def contract_2e(eri, civec_strs, norb, nelec, link_index=None):
     ci_coeff, nelec, ci_strs = selected_ci._unpack(civec_strs, nelec)
@@ -117,7 +117,7 @@ def enlarge_space(myci, civec_strs, eri, norb, nelec):
     return cs
 
 
-def make_hdiag(h1e, eri, ci_strs, norb, nelec):
+def make_hdiag(h1e, eri, ci_strs, norb, nelec, compress=False):
     hdiag = selected_ci.make_hdiag(h1e, eri, ci_strs, norb, nelec)
     na = len(ci_strs[0])
     lib.transpose_sum(hdiag.reshape(na,na), inplace=True)
@@ -156,8 +156,7 @@ def contract_2e(self, eri, civec_strs, norb, nelec, link_index=None, **kwargs):
             civec_strs = selected_ci._as_SCIvector(civec_strs, self._strs)
         return contract_2e(eri, civec_strs, norb, nelec, link_index)
 
-    def make_hdiag(self, h1e, eri, ci_strs, norb, nelec):
-        return make_hdiag(h1e, eri, ci_strs, norb, nelec)
+    make_hdiag = staticmethod(make_hdiag)
 
     enlarge_space = enlarge_space
 
diff --git a/pyscf/fci/selected_ci_spin0_symm.py b/pyscf/fci/selected_ci_spin0_symm.py
index 87ea3105a2..a8875a2fcf 100644
--- a/pyscf/fci/selected_ci_spin0_symm.py
+++ b/pyscf/fci/selected_ci_spin0_symm.py
@@ -26,7 +26,7 @@
 from pyscf.fci import selected_ci_symm
 from pyscf.fci import selected_ci_spin0
 
-libfci = lib.load_library('libfci')
+libfci = direct_spin1.libfci
 
 def contract_2e(eri, civec_strs, norb, nelec, link_index=None, orbsym=None):
     ci_coeff, nelec, ci_strs = selected_ci._unpack(civec_strs, nelec)
@@ -114,8 +114,7 @@ def contract_2e(self, eri, civec_strs, norb, nelec, link_index=None,
             civec_strs = selected_ci._as_SCIvector(civec_strs, self._strs)
         return contract_2e(eri, civec_strs, norb, nelec, link_index, orbsym)
 
-    def make_hdiag(self, h1e, eri, ci_strs, norb, nelec):
-        return selected_ci_spin0.make_hdiag(h1e, eri, ci_strs, norb, nelec)
+    make_hdiag = staticmethod(selected_ci_spin0.make_hdiag)
 
     enlarge_space = selected_ci_spin0.enlarge_space
 
diff --git a/pyscf/fci/selected_ci_symm.py b/pyscf/fci/selected_ci_symm.py
index 0ee292b39a..6d3578f9d8 100644
--- a/pyscf/fci/selected_ci_symm.py
+++ b/pyscf/fci/selected_ci_symm.py
@@ -27,7 +27,7 @@
 from pyscf.fci import selected_ci
 from pyscf.fci import addons
 
-libfci = lib.load_library('libfci')
+libfci = direct_spin1.libfci
 
 def reorder4irrep(eri, norb, link_index, orbsym, offdiag=0):
     if orbsym is None:
diff --git a/pyscf/fci/spin_op.py b/pyscf/fci/spin_op.py
index af1cf03a24..973e74be04 100644
--- a/pyscf/fci/spin_op.py
+++ b/pyscf/fci/spin_op.py
@@ -19,7 +19,7 @@
 from pyscf.fci import cistring
 from pyscf.fci.addons import _unpack_nelec
 
-librdm = lib.load_library('libfci')
+librdm = cistring.libfci
 
 ######################################################
 # Spin squared operator
diff --git a/pyscf/fci/test/test_addons.py b/pyscf/fci/test/test_addons.py
index d6b83c0265..7dcbe409a6 100644
--- a/pyscf/fci/test/test_addons.py
+++ b/pyscf/fci/test/test_addons.py
@@ -378,36 +378,6 @@ def finger(ci1):
         val = finger(fci.addons.symmetrize_wfn(ci, norb, nelec, [0,6,0,3,5,2], 2))
         self.assertAlmostEqual(val, 3.010642818688976, 12)
 
-    def test_symm_initguess(self):
-        norb = 6
-        nelec = (4,2)
-        orbsym = [6,5,7,2,3,0]
-        ci1 = fci.addons.symm_initguess(norb, nelec, orbsym, wfnsym=0)
-        ci2 = fci.addons.symmetrize_wfn(ci1, norb, nelec, orbsym, wfnsym=0)
-        self.assertEqual(abs(ci1-ci2).max(), 0)
-
-        ci1 = fci.addons.symm_initguess(norb, nelec, orbsym, wfnsym=5)
-        ci2 = fci.addons.symmetrize_wfn(ci1, norb, nelec, orbsym, wfnsym=5)
-        self.assertEqual(abs(ci1-ci2).max(), 0)
-
-        ci1 = fci.addons.symm_initguess(norb, nelec, orbsym, wfnsym=3)
-        ci2 = fci.addons.symmetrize_wfn(ci1, norb, nelec, orbsym, wfnsym=3)
-        self.assertEqual(abs(ci1-ci2).max(), 0)
-
-        ci1 = fci.addons.symm_initguess(6, (4,3), [0,1,5,4,3,7], wfnsym=1, irrep_nelec=None)
-        self.assertEqual(numpy.argwhere(ci1!=0).tolist(), [[0,2]])
-        ci1 = fci.addons.symm_initguess(6, (4,3), [0,1,5,4,3,7], wfnsym=0, irrep_nelec={0:[3,2],3:2})
-        self.assertEqual(numpy.argwhere(ci1!=0).tolist(), [[2,5], [3,4]])
-        ci1 = fci.addons.symm_initguess(6, (3,3), [0,1,5,4,3,7], wfnsym=2, irrep_nelec={1:[0,1],3:[1,0]})
-        self.assertEqual(numpy.argwhere(ci1!=0).tolist(), [[5,0]])
-        ci1 = fci.addons.symm_initguess(6, (3,3), [0,1,5,4,3,7], wfnsym=3, irrep_nelec={5:[0,1],3:[1,0]})
-        self.assertEqual(numpy.argwhere(ci1!=0).tolist(), [[4,2], [7,0]])
-
-        self.assertRaises(RuntimeError, fci.addons.symm_initguess, 6, (3,2), [3,3,3,3,3,3], wfnsym=2)
-
-        ci1 = fci.addons.symm_initguess(6, (3,3), [0,1,5,4,3,7], wfnsym=3, irrep_nelec={5:[0,1],3:[1,0]})
-        self.assertEqual(fci.addons.guess_wfnsym(ci1, 6, (3,3), [0,1,5,4,3,7]), 3)
-
     def test_des_and_cre(self):
         a4 = 10*numpy.arange(4)[:,None]
         a6 = 10*numpy.arange(6)[:,None]
diff --git a/pyscf/fci/test/test_cistring.py b/pyscf/fci/test/test_cistring.py
index 4431eab12d..89c62da79e 100644
--- a/pyscf/fci/test/test_cistring.py
+++ b/pyscf/fci/test/test_cistring.py
@@ -30,7 +30,7 @@ def test_strings4orblist(self):
             self.assertEqual(bin(x), ref[i])
 
         strs = cistring.gen_strings4orblist(range(8), 4)
-        occlst = cistring._gen_occslst(range(8), 4)
+        occlst = cistring.gen_occslst(range(8), 4)
         self.assertAlmostEqual(abs(occlst - cistring._strs2occslst(strs, 8)).sum(), 0, 12)
         self.assertAlmostEqual(abs(strs - cistring._occslst2strs(occlst)).sum(), 0, 12)
 
diff --git a/pyscf/fci/test/test_direct_nosym.py b/pyscf/fci/test/test_direct_nosym.py
index 3449571040..fb32235908 100644
--- a/pyscf/fci/test/test_direct_nosym.py
+++ b/pyscf/fci/test/test_direct_nosym.py
@@ -65,6 +65,11 @@ def test_absorb_h1e(self):
         h1 = fci.direct_nosym.absorb_h1e(h1e, h2e, norb, nelec)
         self.assertTrue(numpy.allclose(href, h1))
 
+    def test_absorb_h1e_complex(self):
+        href = fci_slow.absorb_h1e(h1e.astype(complex), h2e, norb, nelec)
+        h1 = fci.direct_nosym.absorb_h1e(h1e.astype(complex), h2e, norb, nelec)
+        self.assertTrue(numpy.allclose(href, h1))
+
     def test_kernel(self):
         h1 = h1e + h1e.T
         eri = .5* ao2mo.restore(1, ao2mo.restore(8, h2e, norb), norb)
diff --git a/pyscf/fci/test/test_ep.py b/pyscf/fci/test/test_ep.py
new file mode 100644
index 0000000000..19dbd9dc4e
--- /dev/null
+++ b/pyscf/fci/test/test_ep.py
@@ -0,0 +1,87 @@
+#!/usr/bin/env python
+# Copyright 2014-2018 The PySCF Developers. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+from functools import reduce
+import numpy
+from pyscf import lib
+from pyscf.fci import direct_ep
+
+nsite = 2
+nelec = 2
+nphonon = 3
+
+t = numpy.zeros((nsite,nsite))
+idx = numpy.arange(nsite-1)
+t[idx+1,idx] = t[idx,idx+1] = -1
+u = 1.5
+g = 0.5
+hpp = numpy.eye(nsite) * 1.1
+hpp[idx+1,idx] = hpp[idx,idx+1] = .1
+
+class KnownValues(unittest.TestCase):
+    def test_kernel(self):
+        es = []
+        nelecs = [(ia,ib) for ia in range(nsite+1) for ib in range(ia+1)]
+        for nelec in nelecs:
+            e,c = direct_ep.kernel(t, u, g, hpp, nsite, nelec, nphonon,
+                                   tol=1e-10, verbose=0, nroots=1)
+            #print('nelec =', nelec, 'E =', e)
+            es.append(e)
+        es = numpy.hstack(es)
+        idx = numpy.argsort(es)
+        #print(es[idx])
+        ref = [-1.43147218e+00, -1.04287040e+00, 0, 0, 4.57129605e-01, 3.00000000e+00]
+        self.assertAlmostEqual(abs(es[idx] - ref).max(), 0, 8)
+
+    def test_make_rdm(self):
+        nelec = (1,1)
+        e,c = direct_ep.kernel(t, u, g, hpp, nsite, nelec, nphonon,
+                     tol=1e-10, verbose=0, nroots=1)
+        dm1 = direct_ep.make_rdm1e(c, nsite, nelec)
+
+        dm1a, dm2 = direct_ep.make_rdm12e(c, nsite, nelec)
+        print('check 1e DM')
+        self.assertTrue(numpy.allclose(dm1, dm1a))
+        print('check 2e DM')
+        self.assertTrue(numpy.allclose(dm1, numpy.einsum('ijkk->ij', dm2)/(sum(nelec)-1.)))
+        print('check 2e DM')
+        self.assertTrue(numpy.allclose(dm1, numpy.einsum('kkij->ij', dm2)/(sum(nelec)-1.)))
+
+        dm1 = direct_ep.make_rdm1p(c, nsite, nelec, nphonon)
+        dm1a = numpy.empty_like(dm1)
+        for i in range(nsite):
+            for j in range(nsite):
+                c1 = direct_ep.des_phonon(c, nsite, nelec, nphonon, j)
+                c1 = direct_ep.cre_phonon(c1, nsite, nelec, nphonon, i)
+                dm1a[i,j] = numpy.dot(c.ravel(), c1.ravel())
+        print('check phonon DM')
+        self.assertTrue(numpy.allclose(dm1, dm1a))
+
+    def test_contract_2e_hubbard(self):
+        cishape = direct_ep.make_shape(nsite, nelec, nphonon)
+        eri = numpy.zeros((nsite,nsite,nsite,nsite))
+        for i in range(nsite):
+            eri[i,i,i,i] = u
+        numpy.random.seed(3)
+        ci0 = numpy.random.random(cishape)
+        ci1 = direct_ep.contract_2e([eri*0,eri*.5,eri*0], ci0, nsite, nelec, nphonon)
+        ci2 = direct_ep.contract_2e_hubbard(u, ci0, nsite, nelec, nphonon)
+        self.assertAlmostEqual(abs(ci1-ci2).sum(), 0, 12)
+
+
+if __name__ == "__main__":
+    print("Full Tests for direct_ep (electron-phonon coupled system)")
+    unittest.main()
diff --git a/pyscf/fci/test/test_spin0.py b/pyscf/fci/test/test_spin0.py
index 74cf24ecb2..04fb52f504 100644
--- a/pyscf/fci/test/test_spin0.py
+++ b/pyscf/fci/test/test_spin0.py
@@ -155,11 +155,9 @@ def test_davidson_only(self):
         self.assertAlmostEqual(e, -0.80755526695538049, 7)
 
         cis = fci.direct_spin0_symm.FCISolver(mol)
-        # Test the default initial guess. It should give "0" in the results
-        cis.get_init_guess = None
-        cis.dump_flags()
-        e, c = cis.kernel(h1e, eri, 2, 2, orbsym=mf.mo_coeff.orbsym[2:4])
-        self.assertAlmostEqual(e, 0, 10)
+        cis.wfnsym = 5
+        self.assertRaises(RuntimeError,
+                          cis.kernel, h1e, eri, 2, 2, orbsym=mf.mo_coeff.orbsym[2:4])
 
     def test_gen_linkstr(self):
         sol = fci.direct_spin0.FCI(mol)
diff --git a/pyscf/fci/test/test_spin0_symm.py b/pyscf/fci/test/test_spin0_symm.py
index 478cc85707..9b8a40127c 100644
--- a/pyscf/fci/test/test_spin0_symm.py
+++ b/pyscf/fci/test/test_spin0_symm.py
@@ -120,6 +120,8 @@ def test_linearmole(self):
         ey, ci_y = mci.kernel(wfnsym='E1uy')
         self.assertAlmostEqual(ex - ey, 0, 7)
         self.assertAlmostEqual(ex - -14.79681308052051, 0, 7)
+        ss, sz = mci.spin_square(ci_x, mf.mo_energy.size, mol.nelec)
+        self.assertAlmostEqual(ss, 0, 6)
 
         swap_xy = numpy.array([
             [0, 1, 0],
diff --git a/pyscf/fci/test/test_spin1_cyl_sym.py b/pyscf/fci/test/test_spin1_cyl_sym.py
new file mode 100644
index 0000000000..84e014d690
--- /dev/null
+++ b/pyscf/fci/test/test_spin1_cyl_sym.py
@@ -0,0 +1,217 @@
+#!/usr/bin/env python
+# Copyright 2014-2018 The PySCF Developers. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+from pyscf import gto, lib
+from pyscf import scf
+from pyscf import ao2mo
+from pyscf import fci
+from pyscf.fci import cistring, direct_spin1, direct_spin1_symm
+from pyscf.fci import direct_nosym
+from pyscf.fci import direct_spin1_cyl_sym
+from pyscf import mcscf
+from pyscf.symm.basis import linearmole_irrep2momentum
+
+class KnownValues(unittest.TestCase):
+    def test_contract_2e(self):
+        mol = gto.M(
+            atom = 'Li 0 0 0; Li 0 0 2.',
+            basis = {'Li': [[0, [4.5, 1]], [2, [0.5, 1]]]},
+            spin=0,
+            symmetry = True,
+        )
+        mf = mol.RHF().run()
+        norb = mf.mo_coeff.shape[1]
+        nelec = mol.nelec
+        h1e = mf.mo_coeff.T.dot(scf.hf.get_hcore(mol)).dot(mf.mo_coeff)
+        eri = ao2mo.restore(1, ao2mo.kernel(mol, mf.mo_coeff), norb)
+        orbsym = mf.orbsym
+
+        degen_mapping = direct_spin1_cyl_sym.map_degeneracy(h1e.diagonal(), orbsym)
+        orbsym = lib.tag_array(orbsym, degen_mapping=degen_mapping)
+        u = direct_spin1_cyl_sym._cyl_sym_orbital_rotation(orbsym, degen_mapping)
+
+        mo = mf.mo_coeff.dot(u.conj().T)
+        h1e = mo.conj().T.dot(mf.get_hcore()).dot(mo)
+        eri = mol.intor('int2e_sph').reshape([norb]*4)
+        eri = lib.einsum('pqrs,pi,qj,rk,sl->ijkl', eri, mo.conj(), mo, mo.conj(), mo)
+        h1e = h1e.real.copy()
+        g2e = eri.real.copy()
+
+        neleca, nelecb = direct_spin1._unpack_nelec(nelec)
+        strsa = strsb = cistring.gen_strings4orblist(range(norb), neleca)
+        airreps_d2h = birreps_d2h = direct_spin1_symm._gen_strs_irrep(strsa, orbsym)
+        a_ls = b_ls = direct_spin1_cyl_sym._strs_angular_momentum(strsa, orbsym)
+        if neleca != nelecb:
+            strsb = cistring.gen_strings4orblist(range(norb), nelecb)
+            birreps_d2h = direct_spin1_symm._gen_strs_irrep(strsb, orbsym)
+            b_ls = direct_spin1_cyl_sym._strs_angular_momentum(strsb, orbsym)
+        a_ungerade = airreps_d2h >= 4
+        b_ungerade = birreps_d2h >= 4
+        na = len(strsa)
+        nb = len(strsb)
+
+        def check(wfnsym):
+            wfn_momentum = linearmole_irrep2momentum(wfnsym)
+            wfn_ungerade = (wfnsym % 10) >= 4
+            np.random.seed(15)
+            ci0 = np.random.random((na,nb))
+            sym_allowed = a_ungerade[:,None] == b_ungerade ^ wfn_ungerade
+            # total angular momentum == wfn_momentum
+            sym_allowed &= a_ls[:,None] == wfn_momentum - b_ls
+            ci0[~sym_allowed] = 0
+            ci1ref = direct_nosym.contract_2e(g2e, ci0, norb, nelec)
+            ci1 = direct_spin1_cyl_sym.contract_2e(g2e, ci0, norb, nelec,
+                                                   orbsym=orbsym, wfnsym=wfnsym)
+            self.assertAlmostEqual(abs(ci1ref - ci1).max(), 0, 9)
+
+        check(0)
+        check(10)
+        check(15)
+        check(14)
+        check(7)
+        check(3)
+        check(5)
+
+    def test_contract_2e_1(self):
+        mol = gto.M(
+            atom = 'Li 0 0 0; Li 0 0 2.',
+            basis = {'Li': [[0, [4.5, 1]], [1, [0.5, 1]]]},
+            spin=0,
+            symmetry = True,
+        )
+        mf = mol.RHF().run()
+        norb = mf.mo_coeff.shape[1]
+        nelec = mol.nelec
+        h1e = mf.mo_coeff.T.dot(scf.hf.get_hcore(mol)).dot(mf.mo_coeff)
+        eri = ao2mo.restore(1, ao2mo.kernel(mol, mf.mo_coeff), norb)
+        orbsym = mf.orbsym
+
+        degen_mapping = direct_spin1_cyl_sym.map_degeneracy(h1e.diagonal(), orbsym)
+        orbsym = lib.tag_array(orbsym, degen_mapping=degen_mapping)
+        u = direct_spin1_cyl_sym._cyl_sym_orbital_rotation(orbsym, degen_mapping)
+
+        mo = mf.mo_coeff.dot(u.conj().T)
+        h1e = mo.conj().T.dot(mf.get_hcore()).dot(mo)
+        eri = mol.intor('int2e_sph').reshape([norb]*4)
+        eri = lib.einsum('pqrs,pi,qj,rk,sl->ijkl', eri, mo.conj(), mo, mo.conj(), mo)
+        h1e = h1e.real.copy()
+        g2e = eri.real.copy()
+
+        neleca, nelecb = direct_spin1._unpack_nelec(nelec)
+        strsa = strsb = cistring.gen_strings4orblist(range(norb), neleca)
+        airreps_d2h = birreps_d2h = direct_spin1_symm._gen_strs_irrep(strsa, orbsym)
+        a_ls = b_ls = direct_spin1_cyl_sym._strs_angular_momentum(strsa, orbsym)
+        if neleca != nelecb:
+            strsb = cistring.gen_strings4orblist(range(norb), nelecb)
+            birreps_d2h = direct_spin1_symm._gen_strs_irrep(strsb, orbsym)
+            b_ls = direct_spin1_cyl_sym._strs_angular_momentum(strsb, orbsym)
+        a_ungerade = airreps_d2h >= 4
+        b_ungerade = birreps_d2h >= 4
+        na = len(strsa)
+        nb = len(strsb)
+
+        def check(wfnsym):
+            wfn_momentum = linearmole_irrep2momentum(wfnsym)
+            wfn_ungerade = (wfnsym % 10) >= 4
+            np.random.seed(15)
+            ci0 = np.random.random((na,nb))
+            sym_allowed = a_ungerade[:,None] == b_ungerade ^ wfn_ungerade
+            # total angular momentum == wfn_momentum
+            sym_allowed &= a_ls[:,None] == wfn_momentum - b_ls
+            ci0[~sym_allowed] = 0
+            ci1ref = direct_nosym.contract_2e(g2e, ci0, norb, nelec)
+            ci1 = direct_spin1_cyl_sym.contract_2e(g2e, ci0, norb, nelec,
+                                                   orbsym=orbsym, wfnsym=wfnsym)
+            self.assertAlmostEqual(abs(ci1ref - ci1).max(), 0, 9)
+
+        check(0)
+        check(11)
+        check(6)
+        check(3)
+        check(5)
+
+    def test_spin1_cyl_sym(self):
+        mol = gto.M(
+            atom = 'N 0 0 0; N 0 0 1.5',
+            basis = 'cc-pVDZ',
+            spin = 0,
+            symmetry = True,
+        )
+        mc = mol.RHF().run().CASCI(12, 6)
+        mc.fcisolver.wfnsym = 'E1ux'
+        mc.run()
+        e1 = mc.e_tot
+        ci1 = mc.ci
+        self.assertAlmostEqual(e1, -108.683383569227, 7)
+
+        mc.fcisolver = direct_spin1_cyl_sym.FCI(mol)
+        mc.fcisolver.wfnsym = 'E1ux'
+        mc.fcisolver.davidson_only = True
+        mc.run()
+        e2 = mc.e_tot
+        self.assertAlmostEqual(e2, -108.683383569227, 7)
+        orbsym = mc.fcisolver.orbsym
+        degen_mapping = orbsym.degen_mapping
+        u = direct_spin1_symm._cyl_sym_orbital_rotation(orbsym, degen_mapping)
+        ci2 = fci.addons.transform_ci(mc.ci, (3,3), u)
+        ci2 = ci2.real / np.linalg.norm(ci2.real)
+        self.assertAlmostEqual(abs(ci1.ravel().dot(ci2.ravel())), 1, 6)
+
+    def test_wrong_initial_guess(self):
+        mol = gto.M(
+            atom = 'H 0 0 0; H 0 0 1.2',
+            basis = [[0, [3, 1]], [1, [1, 1]]],
+            spin = 1,
+            charge = 1,
+            symmetry = True)
+        mf = mol.RHF().run()
+        mc = mcscf.CASCI(mf, mf.mo_energy.size, mol.nelec)
+        mc.fcisolver.wfnsym = 'A2g'
+        self.assertRaises(RuntimeError, mc.run)
+
+        mc.fcisolver = direct_spin1_cyl_sym.FCI(mol)
+        mc.fcisolver.wfnsym = 'A2g'
+        self.assertRaises(RuntimeError, mc.run)
+
+    def test_linearmole_a2(self):
+        mol = gto.M(
+            atom = 'H 0 0 0; H 0 0 1.2',
+            basis = [[0, [3, 1]], [1, [1, 1]]],
+            symmetry = True)
+        mf = mol.RHF().run()
+
+        mc = mcscf.CASCI(mf, mf.mo_energy.size, mol.nelec)
+        mc.fcisolver.wfnsym = 'A2g'
+        mc.run()
+        self.assertAlmostEqual(mc.e_tot, 2.6561956585409616, 8)
+        mc.fcisolver = direct_spin1_cyl_sym.FCI(mol)
+        mc.fcisolver.wfnsym = 'A2g'
+        mc.run()
+        self.assertAlmostEqual(mc.e_tot, 2.6561956585409616, 8)
+
+        mc = mcscf.CASCI(mf, mf.mo_energy.size, mol.nelec)
+        mc.fcisolver.wfnsym = 'A2u'
+        mc.run()
+        self.assertAlmostEqual(mc.e_tot, 2.8999951068356475, 8)
+        mc.fcisolver = direct_spin1_cyl_sym.FCI(mol)
+        mc.fcisolver.wfnsym = 'A2u'
+        mc.run()
+        self.assertAlmostEqual(mc.e_tot, 2.8999951068356475, 8)
+
+if __name__ == "__main__":
+    print("Full Tests for spin1-symm")
+    unittest.main()
diff --git a/pyscf/fci/test/test_spin1_symm.py b/pyscf/fci/test/test_spin1_symm.py
index 8e94f6ede3..462009e3dd 100644
--- a/pyscf/fci/test/test_spin1_symm.py
+++ b/pyscf/fci/test/test_spin1_symm.py
@@ -20,8 +20,6 @@
 from pyscf import ao2mo
 from pyscf import fci
 from pyscf.fci import cistring, direct_spin1, direct_spin1_symm
-from pyscf.fci import direct_spin1_cyl_sym
-import pyscf.symm
 from pyscf import mcscf
 
 def setUpModule():
@@ -44,7 +42,7 @@ def setUpModule():
     h1e = m.mo_coeff.T.dot(scf.hf.get_hcore(mol)).dot(m.mo_coeff)
     g2e = ao2mo.incore.full(m._eri, m.mo_coeff)
     orbsym = m.orbsym
-    cis = fci.direct_spin1_symm.FCISolver(mol)
+    cis = direct_spin1_symm.FCISolver(mol)
     cis.orbsym = orbsym
 
     numpy.random.seed(15)
@@ -183,6 +181,8 @@ def test_linearmole(self):
         ey, ci_y = mci.kernel(wfnsym='E2uy')
         self.assertAlmostEqual(ex - ey, 0, 7)
         self.assertAlmostEqual(ex - -14.70061197088, 0, 7)
+        ss, sz = mci.spin_square(ci_x, mf.mo_energy.size, mol.nelec)
+        self.assertAlmostEqual(ss, 2, 6)
 
         swap_xy = numpy.array([
             [0, 1, 0],
@@ -197,73 +197,6 @@ def test_linearmole(self):
         ci1 = fci.addons.transform_ci(ci_y, (3,3), u.T)
         self.assertAlmostEqual(abs(ci1.ravel().dot(ci_y.ravel())), 1, 9)
 
-    def test_spin1_cyl_sym(self):
-        mol = gto.M(
-            atom = 'N 0 0 0; N 0 0 1.5',
-            basis = 'cc-pVDZ',
-            spin = 0,
-            symmetry = True,
-        )
-        mc = mol.RHF().run().CASCI(12, 6)
-        mc.fcisolver.wfnsym = 'E1ux'
-        mc.run()
-        e1 = mc.e_tot
-        ci1 = mc.ci
-        self.assertAlmostEqual(e1, -108.683383569227, 7)
-
-        mc.fcisolver = direct_spin1_cyl_sym.FCI(mol)
-        mc.fcisolver.wfnsym = 'E1ux'
-        mc.run()
-        e2 = mc.e_tot
-        self.assertAlmostEqual(e2, -108.683383569227, 7)
-        orbsym = mc.fcisolver.orbsym
-        degen_mapping = orbsym.degen_mapping
-        u = direct_spin1_symm._cyl_sym_orbital_rotation(orbsym, degen_mapping)
-        ci2 = fci.addons.transform_ci(mc.ci, (3,3), u)
-        ci2 = ci2.real / numpy.linalg.norm(ci2.real)
-        self.assertAlmostEqual(abs(ci1.ravel().dot(ci2.ravel())), 1, 6)
-
-    def test_wrong_initial_guess(self):
-        mol = gto.M(
-            atom = 'H 0 0 0; H 0 0 1.2',
-            basis = [[0, [3, 1]], [1, [1, 1]]],
-            spin = 1,
-            charge = 1,
-            symmetry = True)
-        mf = mol.RHF().run()
-        mc = mcscf.CASCI(mf, mf.mo_energy.size, mol.nelec)
-        mc.fcisolver.wfnsym = 'A2g'
-        self.assertRaises(RuntimeError, mc.run)
-
-        mc.fcisolver = direct_spin1_cyl_sym.FCI(mol)
-        mc.fcisolver.wfnsym = 'A2g'
-        self.assertRaises(RuntimeError, mc.run)
-
-    def test_linearmole_a2(self):
-        mol = gto.M(
-            atom = 'H 0 0 0; H 0 0 1.2',
-            basis = [[0, [3, 1]], [1, [1, 1]]],
-            symmetry = True)
-        mf = mol.RHF().run()
-
-        mc = mcscf.CASCI(mf, mf.mo_energy.size, mol.nelec)
-        mc.fcisolver.wfnsym = 'A2g'
-        mc.run()
-        self.assertAlmostEqual(mc.e_tot, 2.6561956585409616, 8)
-        mc.fcisolver = direct_spin1_cyl_sym.FCI(mol)
-        mc.fcisolver.wfnsym = 'A2g'
-        mc.run()
-        self.assertAlmostEqual(mc.e_tot, 2.6561956585409616, 8)
-
-        mc = mcscf.CASCI(mf, mf.mo_energy.size, mol.nelec)
-        mc.fcisolver.wfnsym = 'A2u'
-        mc.run()
-        self.assertAlmostEqual(mc.e_tot, 2.8999951068356475, 8)
-        mc.fcisolver = direct_spin1_cyl_sym.FCI(mol)
-        mc.fcisolver.wfnsym = 'A2u'
-        mc.run()
-        self.assertAlmostEqual(mc.e_tot, 2.8999951068356475, 8)
-
 if __name__ == "__main__":
     print("Full Tests for spin1-symm")
     unittest.main()
diff --git a/pyscf/fci/test/test_spin_op.py b/pyscf/fci/test/test_spin_op.py
index 0d38931178..df9f7a90c4 100644
--- a/pyscf/fci/test/test_spin_op.py
+++ b/pyscf/fci/test/test_spin_op.py
@@ -102,11 +102,10 @@ def test_contract_ss(self):
         # decrease the convergence tolerance. Otherwise the davidson solver
         # may produce vectors that break the symmetry required by direct_spin0.
         nelec = (5,5)
-        fci.addons.fix_spin_(fci.direct_spin0)
+        fci.addons.fix_spin_(fci.direct_spin0, shift=0.02)
         na = fci.cistring.num_strings(norb, nelec[0])
         c0 = numpy.zeros((na,na))
         c0[0,0] = 1
-        c0[-1,-1] = 1e-4
         e, ci0 = fci.direct_spin0.kernel(h1, h2, norb, nelec, ci0=c0,
                                          conv_tol=1e-8)
 
@@ -172,7 +171,3 @@ def test_local_spin(self):
 if __name__ == "__main__":
     print("Full Tests for fci.spin_op")
     unittest.main()
-
-
-
-
diff --git a/pyscf/fci/test/test_symm_init_guess.py b/pyscf/fci/test/test_symm_init_guess.py
index 988b7f376f..5304b5f21f 100644
--- a/pyscf/fci/test/test_symm_init_guess.py
+++ b/pyscf/fci/test/test_symm_init_guess.py
@@ -42,9 +42,9 @@ def test_symm_spin0(self):
         self.assertAlmostEqual(e[0], -19.286003160337+mol.energy_nuc(), 9)
         self.assertAlmostEqual(e[1], -18.812177419921+mol.energy_nuc(), 9)
         self.assertAlmostEqual(e[2], -18.786684534678+mol.energy_nuc(), 9)
-        self.assertAlmostEqual(fci.spin_op.spin_square0(c[0], norb, nelec)[0], 0, 9)
-        self.assertAlmostEqual(fci.spin_op.spin_square0(c[1], norb, nelec)[0], 6, 9)
-        self.assertAlmostEqual(fci.spin_op.spin_square0(c[2], norb, nelec)[0], 0, 9)
+        self.assertAlmostEqual(fci.spin_op.spin_square0(c[0], norb, nelec)[0], 0, 7)
+        self.assertAlmostEqual(fci.spin_op.spin_square0(c[1], norb, nelec)[0], 6, 7)
+        self.assertAlmostEqual(fci.spin_op.spin_square0(c[2], norb, nelec)[0], 0, 7)
 
     def test_symm_spin1(self):
         fs = fci.FCI(mol, m.mo_coeff, singlet=False)
@@ -60,4 +60,3 @@ def test_symm_spin1(self):
 if __name__ == "__main__":
     print("Full Tests for init_guess")
     unittest.main()
-
diff --git a/pyscf/fci/test/test_uhf.py b/pyscf/fci/test/test_uhf.py
index 8ccb8ce124..d4c31c402f 100644
--- a/pyscf/fci/test/test_uhf.py
+++ b/pyscf/fci/test/test_uhf.py
@@ -184,4 +184,3 @@ def test_contract2e_hubbard(self):
 if __name__ == "__main__":
     print("Full Tests for uhf-based fci")
     unittest.main()
-
diff --git a/pyscf/geomopt/geometric_solver.py b/pyscf/geomopt/geometric_solver.py
index bede4e3d70..f11c46f9c6 100644
--- a/pyscf/geomopt/geometric_solver.py
+++ b/pyscf/geomopt/geometric_solver.py
@@ -18,12 +18,14 @@
 '''
 
 import os
+import uuid
 import tempfile
 import numpy
 import geometric
 import geometric.molecule
 #from geometric import molecule
 from pyscf import lib
+from pyscf.lib import logger
 from pyscf.geomopt.addons import (as_pyscf_method, dump_mol_geometry,
                                   symmetrize)  # noqa
 from pyscf import __config__
@@ -72,11 +74,11 @@ def calc_new(self, coords, dirname):
         g_scanner = self.scanner
         mol = self.mol
         self.cycle += 1
-        lib.logger.note(g_scanner, '\nGeometry optimization cycle %d', self.cycle)
+        logger.note(g_scanner, '\nGeometry optimization cycle %d', self.cycle)
 
         # geomeTRIC requires coords and gradients in atomic unit
         coords = coords.reshape(-1,3)
-        if g_scanner.verbose >= lib.logger.NOTE:
+        if g_scanner.verbose >= logger.NOTE:
             dump_mol_geometry(mol, coords*lib.param.BOHR)
 
         if mol.symmetry:
@@ -84,9 +86,8 @@ def calc_new(self, coords, dirname):
 
         mol.set_geom_(coords, unit='Bohr')
         energy, gradients = g_scanner(mol)
-        lib.logger.note(g_scanner,
-                        'cycle %d: E = %.12g  dE = %g  norm(grad) = %g', self.cycle,
-                        energy, energy - self.e_last, numpy.linalg.norm(gradients))
+        logger.note(g_scanner, 'cycle %d: E = %.12g  dE = %g  norm(grad) = %g',
+                    self.cycle, energy, energy - self.e_last, numpy.linalg.norm(gradients))
         self.e_last = energy
 
         if callable(self.callback):
@@ -128,7 +129,6 @@ def kernel(method, assert_convergence=ASSERT_CONV,
     if not include_ghost:
         g_scanner.atmlst = numpy.where(method.mol.atom_charges() != 0)[0]
 
-    tmpf = tempfile.mktemp(dir=lib.param.TMPDIR)
     engine = PySCFEngine(g_scanner)
     engine.callback = callback
     engine.maxsteps = maxsteps
@@ -142,21 +142,28 @@ def kernel(method, assert_convergence=ASSERT_CONV,
     # detection code in Mole.build function).
     if engine.mol.symmetry:
         engine.mol.symmetry = engine.mol.topgroup
+    engine.assert_convergence = assert_convergence
 
     # geomeTRIC library on pypi requires to provide config file log.ini.
     if not os.path.exists(os.path.abspath(
             os.path.join(geometric.optimize.__file__, '..', 'log.ini'))) and kwargs.get('logIni') is None:
         kwargs['logIni'] = os.path.abspath(os.path.join(__file__, '..', 'log.ini'))
 
-    engine.assert_convergence = assert_convergence
-    try:
-        geometric.optimize.run_optimizer(customengine=engine, input=tmpf,
-                                         constraints=constraints, **kwargs)
-        conv = True
-        # method.mol.set_geom_(m.xyzs[-1], unit='Angstrom')
-    except NotConvergedError as e:
-        lib.logger.note(method, str(e))
-        conv = False
+    with tempfile.TemporaryDirectory(dir=lib.param.TMPDIR) as tmpdir:
+        tmpf = os.path.join(tmpdir, str(uuid.uuid4()))
+
+        if 'hessian' in kwargs:
+            kwargs['hessian'] = _make_hessian(g_scanner, kwargs['hessian'], tmpdir)
+            logger.debug(g_scanner, 'Analytical hessian saved in %s', kwargs['hessian'])
+
+        try:
+            geometric.optimize.run_optimizer(customengine=engine, input=tmpf,
+                                             constraints=constraints, **kwargs)
+            conv = True
+            # method.mol.set_geom_(m.xyzs[-1], unit='Angstrom')
+        except NotConvergedError as e:
+            logger.note(method, str(e))
+            conv = False
     return conv, engine.mol
 
 def optimize(method, assert_convergence=ASSERT_CONV,
@@ -182,6 +189,30 @@ def optimize(method, assert_convergence=ASSERT_CONV,
     return kernel(method, assert_convergence=assert_convergence, include_ghost=include_ghost,
                   constraints=constraints, callback=callback, maxsteps=maxsteps, **kwargs)[1]
 
+def _make_hessian(g_scanner, hessian_option, tmpdir):
+    '''calculate hessian and saved to a file.
+    Returns the filename in the geomeTRIC supported format, e.g.
+    file:/path/to/hessian_file
+    '''
+    if not isinstance(hessian_option, str):
+        hessian_option = os.path.join(tmpdir, str(uuid.uuid4()))
+    if ':' in hessian_option:
+        hessian_file = hessian_option.split(':', 1)[1]
+    else:
+        hessian_file, hessian_option = hessian_option, f'first:{hessian_file}'
+
+    method = g_scanner.base
+    natm = method.mol.natm
+    try:
+        h = method.Hessian().kernel()
+    except (TypeError, NotImplementedError):
+        logger.warn(g_scanner, 'Analytical hessian for %s is not available', method)
+        hessian_option = False
+    else:
+        h = h.transpose(0,2,1,3).reshape(3*natm, 3*natm)
+        numpy.savetxt(hessian_file, h)
+    return hessian_option
+
 class GeometryOptimizer(lib.StreamObject):
     '''Optimize the molecular geometry for the input method.
 
@@ -212,48 +243,3 @@ def kernel(self, params=None):
 
 class NotConvergedError(RuntimeError):
     pass
-
-del (INCLUDE_GHOST, ASSERT_CONV)
-
-
-if __name__ == '__main__':
-    from pyscf import gto
-    from pyscf import scf, dft, cc, mp
-    mol = gto.M(atom='''
-C       1.1879  -0.3829 0.0000
-C       0.0000  0.5526  0.0000
-O       -1.1867 -0.2472 0.0000
-H       -1.9237 0.3850  0.0000
-H       2.0985  0.2306  0.0000
-H       1.1184  -1.0093 0.8869
-H       1.1184  -1.0093 -0.8869
-H       -0.0227 1.1812  0.8852
-H       -0.0227 1.1812  -0.8852
-                ''',
-                basis='3-21g')
-
-    mf = scf.RHF(mol)
-    conv_params = {
-        'convergence_energy': 1e-4,  # Eh
-        'convergence_grms': 3e-3,    # Eh/Bohr
-        'convergence_gmax': 4.5e-3,  # Eh/Bohr
-        'convergence_drms': 1.2e-2,  # Angstrom
-        'convergence_dmax': 1.8e-2,  # Angstrom
-    }
-    opt = GeometryOptimizer(mf).set(params=conv_params)#.run()
-    opt.max_cycle=1
-    opt.run()
-    mol1 = opt.mol
-    print(mf.kernel() - -153.219208484874)
-    print(scf.RHF(mol1).kernel() - -153.222680852335)
-
-    mf = dft.RKS(mol)
-    mf.xc = 'pbe,'
-    mf.conv_tol = 1e-7
-    mol1 = optimize(mf)
-
-    mymp2 = mp.MP2(scf.RHF(mol))
-    mol1 = optimize(mymp2)
-
-    mycc = cc.CCSD(scf.RHF(mol))
-    mol1 = optimize(mycc)
diff --git a/pyscf/geomopt/test/test_geometric_solver.py b/pyscf/geomopt/test/test_geometric_solver.py
index 45a409dd53..e7116cbc14 100644
--- a/pyscf/geomopt/test/test_geometric_solver.py
+++ b/pyscf/geomopt/test/test_geometric_solver.py
@@ -71,4 +71,3 @@ def test_optimize_high_cost(self):
 if __name__ == "__main__":
     print("Tests for geometric_solver")
     unittest.main()
-
diff --git a/pyscf/grad/casci.py b/pyscf/grad/casci.py
index 6e56f3cd6d..fb192dece6 100644
--- a/pyscf/grad/casci.py
+++ b/pyscf/grad/casci.py
@@ -224,6 +224,7 @@ def __call__(self, mol_or_geom, state=state, **kwargs):
                 mol = mol_or_geom
             else:
                 mol = self.mol.set_geom_(mol_or_geom, inplace=False)
+            self.reset(mol)
 
             if state is None:
                 state = self.state
@@ -242,7 +243,6 @@ def __call__(self, mol_or_geom, state=state, **kwargs):
                 # in self.kernel
                 ci = ci[state]
 
-            self.mol = mol
             de = self.kernel(ci=ci, state=state, **kwargs)
             return e_tot, de
     return CASCI_GradScanner(mcscf_grad)
diff --git a/pyscf/grad/casscf.py b/pyscf/grad/casscf.py
index 49d03dd3d4..31b8011d6c 100644
--- a/pyscf/grad/casscf.py
+++ b/pyscf/grad/casscf.py
@@ -171,13 +171,13 @@ def __call__(self, mol_or_geom, **kwargs):
                 mol = mol_or_geom
             else:
                 mol = self.mol.set_geom_(mol_or_geom, inplace=False)
+            self.reset(mol)
 
             mc_scanner = self.base
             e_tot = mc_scanner(mol)
             if isinstance(mc_scanner, StateAverageMCSCFSolver):
                 e_tot = mc_scanner.e_average
 
-            self.mol = mol
             de = self.kernel(**kwargs)
             return e_tot, de
     return CASSCF_GradScanner(mcscf_grad)
diff --git a/pyscf/grad/ccsd.py b/pyscf/grad/ccsd.py
index 9366599323..141435ccfe 100644
--- a/pyscf/grad/ccsd.py
+++ b/pyscf/grad/ccsd.py
@@ -238,7 +238,7 @@ def __call__(self, mol_or_geom, **kwargs):
             else:
                 last_size = 0
 
-            cc.reset(mol)
+            self.reset(mol)
             mf_scanner = cc._scf
             mf_scanner(mol)
             cc.mo_coeff = mf_scanner.mo_coeff
@@ -246,7 +246,6 @@ def __call__(self, mol_or_geom, **kwargs):
             if last_size != cc.vector_size():
                 cc.t1 = cc.t2 = cc.l1 = cc.l2 = None
 
-            self.mol = mol
             eris = cc.ao2mo(cc.mo_coeff)
             # Update cc.t1 and cc.t2
             cc.kernel(t1=cc.t1, t2=cc.t2, eris=eris)
diff --git a/pyscf/grad/cisd.py b/pyscf/grad/cisd.py
index 0bf52a6df8..d77638932f 100644
--- a/pyscf/grad/cisd.py
+++ b/pyscf/grad/cisd.py
@@ -78,6 +78,7 @@ def __call__(self, mol_or_geom, state=state, **kwargs):
                 mol = mol_or_geom
             else:
                 mol = self.mol.set_geom_(mol_or_geom, inplace=False)
+            self.reset(mol)
 
             ci_scanner = self.base
             if ci_scanner.nroots > 1 and state >= ci_scanner.nroots:
@@ -101,7 +102,6 @@ def __call__(self, mol_or_geom, state=state, **kwargs):
                 e_tot = ci_scanner.e_tot
                 civec = ci_scanner.ci
 
-            self.mol = mol
             de = self.kernel(civec, eris=eris, **kwargs)
             return e_tot, de
         @property
diff --git a/pyscf/grad/lagrange.py b/pyscf/grad/lagrange.py
index 0c4be22fd1..eb66360323 100644
--- a/pyscf/grad/lagrange.py
+++ b/pyscf/grad/lagrange.py
@@ -147,22 +147,25 @@ def kernel (self, level_shift=None, **kwargs):
 
         self.converged, self.Lvec, bvec, Aop, Adiag = self.solve_lagrange (
             level_shift=level_shift, **kwargs)
-        self.debug_lagrange (self.Lvec, bvec, Aop, Adiag, **kwargs)
-        cput1 = logger.timer (self, 'Lagrange gradient multiplier solution', *cput0)
+        if self.verbose >= logger.INFO:
+            self.debug_lagrange (self.Lvec, bvec, Aop, Adiag, **kwargs)
+            cput1 = logger.timer (self, 'Lagrange gradient multiplier solution', *cput0)
 
         ham_response = self.get_ham_response (**kwargs)
-        logger.info(self, '--------------- %s gradient Hamiltonian response ---------------',
-                    self.base.__class__.__name__)
-        rhf_grad._write(self, self.mol, ham_response, self.atmlst)
-        logger.info(self, '----------------------------------------------')
-        cput1 = logger.timer (self, 'Lagrange gradient Hellmann-Feynman determination', *cput1)
+        if self.verbose >= logger.INFO:
+            logger.info(self, '--------------- %s gradient Hamiltonian response ---------------',
+                        self.base.__class__.__name__)
+            rhf_grad._write(self, self.mol, ham_response, self.atmlst)
+            logger.info(self, '----------------------------------------------')
+            cput1 = logger.timer (self, 'Lagrange gradient Hellmann-Feynman determination', *cput1)
 
         LdotJnuc = self.get_LdotJnuc (self.Lvec, **kwargs)
-        logger.info(self, '--------------- %s gradient Lagrange response ---------------',
-                    self.base.__class__.__name__)
-        rhf_grad._write(self, self.mol, LdotJnuc, self.atmlst)
-        logger.info(self, '----------------------------------------------')
-        cput1 = logger.timer (self, 'Lagrange gradient Jacobian', *cput1)
+        if self.verbose >= logger.INFO:
+            logger.info(self, '--------------- %s gradient Lagrange response ---------------',
+                        self.base.__class__.__name__)
+            rhf_grad._write(self, self.mol, LdotJnuc, self.atmlst)
+            logger.info(self, '----------------------------------------------')
+            cput1 = logger.timer (self, 'Lagrange gradient Jacobian', *cput1)
 
         self.de = ham_response + LdotJnuc
         log.timer('Lagrange gradients', *cput0)
diff --git a/pyscf/grad/mp2.py b/pyscf/grad/mp2.py
index 81cd511658..294be7a116 100644
--- a/pyscf/grad/mp2.py
+++ b/pyscf/grad/mp2.py
@@ -224,6 +224,7 @@ def __call__(self, mol_or_geom, **kwargs):
                 mol = mol_or_geom
             else:
                 mol = self.mol.set_geom_(mol_or_geom, inplace=False)
+            self.reset(mol)
 
             mp_scanner = self.base
             mp_scanner(mol, with_t2=True)
@@ -279,7 +280,7 @@ class Gradients(rhf_grad.GradientsMixin):
     def kernel(self, t2=None, atmlst=None, verbose=None):
         log = logger.new_logger(self, verbose)
         if t2 is None: t2 = self.base.t2
-        if t2 is None: t2 = self.base.kernel()
+        if t2 is None: t2 = self.base.kernel()[1]
         if atmlst is None:
             atmlst = self.atmlst
         else:
diff --git a/pyscf/grad/rhf.py b/pyscf/grad/rhf.py
index 59b86d56a3..87ef29c468 100644
--- a/pyscf/grad/rhf.py
+++ b/pyscf/grad/rhf.py
@@ -26,7 +26,7 @@
 from pyscf import gto
 from pyscf import lib
 from pyscf.lib import logger
-from pyscf.scf import _vhf
+from pyscf.scf import hf, _vhf
 from pyscf.gto.mole import is_au
 
 
@@ -255,14 +255,15 @@ def __call__(self, mol_or_geom, **kwargs):
             else:
                 mol = self.mol.set_geom_(mol_or_geom, inplace=False)
 
+            self.reset(mol)
             mf_scanner = self.base
             e_tot = mf_scanner(mol)
-            self.mol = mol
 
-            # If second integration grids are created for RKS and UKS
-            # gradients
-            if getattr(self, 'grids', None):
-                self.grids.reset(mol)
+            if isinstance(mf_scanner, hf.KohnShamDFT):
+                if getattr(self, 'grids', None):
+                    self.grids.reset(mol)
+                if getattr(self, 'nlcgrids', None):
+                    self.nlcgrids.reset(mol)
 
             de = self.kernel(**kwargs)
             return e_tot, de
@@ -301,6 +302,12 @@ def dump_flags(self, verbose=None):
                  self.max_memory, lib.current_memory()[0])
         return self
 
+    def reset(self, mol=None):
+        if mol is not None:
+            self.mol = mol
+        self.base.reset(mol)
+        return self
+
     def get_hcore(self, mol=None):
         if mol is None: mol = self.mol
         return get_hcore(mol)
@@ -312,27 +319,39 @@ def get_ovlp(self, mol=None):
         return get_ovlp(mol)
 
     @lib.with_doc(get_jk.__doc__)
-    def get_jk(self, mol=None, dm=None, hermi=0):
+    def get_jk(self, mol=None, dm=None, hermi=0, omega=None):
         if mol is None: mol = self.mol
         if dm is None: dm = self.base.make_rdm1()
         cpu0 = (logger.process_clock(), logger.perf_counter())
-        vj, vk = get_jk(mol, dm)
+        if omega is None:
+            vj, vk = get_jk(mol, dm)
+        else:
+            with mol.with_range_coulomb(omega):
+                vj, vk = get_jk(mol, dm)
         logger.timer(self, 'vj and vk', *cpu0)
         return vj, vk
 
-    def get_j(self, mol=None, dm=None, hermi=0):
+    def get_j(self, mol=None, dm=None, hermi=0, omega=None):
         if mol is None: mol = self.mol
         if dm is None: dm = self.base.make_rdm1()
         intor = mol._add_suffix('int2e_ip1')
-        return -_vhf.direct_mapdm(intor, 's2kl', 'lk->s1ij', dm, 3,
-                                  mol._atm, mol._bas, mol._env)
-
-    def get_k(self, mol=None, dm=None, hermi=0):
+        if omega is None:
+            return -_vhf.direct_mapdm(intor, 's2kl', 'lk->s1ij', dm, 3,
+                                      mol._atm, mol._bas, mol._env)
+        with mol.with_range_coulomb(omega):
+            return -_vhf.direct_mapdm(intor, 's2kl', 'lk->s1ij', dm, 3,
+                                      mol._atm, mol._bas, mol._env)
+
+    def get_k(self, mol=None, dm=None, hermi=0, omega=None):
         if mol is None: mol = self.mol
         if dm is None: dm = self.base.make_rdm1()
         intor = mol._add_suffix('int2e_ip1')
-        return -_vhf.direct_mapdm(intor, 's2kl', 'jk->s1il', dm, 3,
-                                  mol._atm, mol._bas, mol._env)
+        if omega is None:
+            return -_vhf.direct_mapdm(intor, 's2kl', 'jk->s1il', dm, 3,
+                                      mol._atm, mol._bas, mol._env)
+        with mol.with_range_coulomb(omega):
+            return -_vhf.direct_mapdm(intor, 's2kl', 'jk->s1il', dm, 3,
+                                      mol._atm, mol._bas, mol._env)
 
     def get_veff(self, mol=None, dm=None):
         raise NotImplementedError
diff --git a/pyscf/grad/rks.py b/pyscf/grad/rks.py
index 43a621827e..75e193079a 100644
--- a/pyscf/grad/rks.py
+++ b/pyscf/grad/rks.py
@@ -44,22 +44,7 @@ def get_veff(ks_grad, mol=None, dm=None):
 
     mf = ks_grad.base
     ni = mf._numint
-    if ks_grad.grids is not None:
-        grids = ks_grad.grids
-    else:
-        grids = mf.grids
-    if mf.nlc != '':
-        if ks_grad.nlcgrids is not None:
-            nlcgrids = ks_grad.nlcgrids
-        else:
-            nlcgrids = mf.nlcgrids
-        if nlcgrids.coords is None:
-            nlcgrids.build(with_non0tab=True)
-    if grids.coords is None:
-        grids.build(with_non0tab=True)
-
-    #enabling range-separated hybrids
-    omega, alpha, hyb = ni.rsh_and_hybrid_coeff(mf.xc, spin=mol.spin)
+    grids, nlcgrids = _initialize_grids(ks_grad)
 
     mem_now = lib.current_memory()[0]
     max_memory = max(2000, ks_grad.max_memory*.9-mem_now)
@@ -67,39 +52,64 @@ def get_veff(ks_grad, mol=None, dm=None):
         exc, vxc = get_vxc_full_response(ni, mol, grids, mf.xc, dm,
                                          max_memory=max_memory,
                                          verbose=ks_grad.verbose)
-        if mf.nlc:
-            assert 'VV10' in mf.nlc.upper()
-            enlc, vnlc = get_vxc_full_response(ni, mol, nlcgrids,
-                                               mf.xc+'__'+mf.nlc, dm,
-                                               max_memory=max_memory,
-                                               verbose=ks_grad.verbose)
+        if mf.nlc or ni.libxc.is_nlc(mf.xc):
+            if ni.libxc.is_nlc(mf.xc):
+                xc = mf.xc
+            else:
+                xc = mf.nlc
+            enlc, vnlc = get_nlc_vxc_full_response(
+                ni, mol, nlcgrids, xc, dm,
+                max_memory=max_memory, verbose=ks_grad.verbose)
             exc += enlc
             vxc += vnlc
         logger.debug1(ks_grad, 'sum(grids response) %s', exc.sum(axis=0))
     else:
         exc, vxc = get_vxc(ni, mol, grids, mf.xc, dm,
                            max_memory=max_memory, verbose=ks_grad.verbose)
-        if mf.nlc:
-            assert 'VV10' in mf.nlc.upper()
-            enlc, vnlc = get_vxc(ni, mol, nlcgrids, mf.xc+'__'+mf.nlc, dm,
-                                 max_memory=max_memory,
-                                 verbose=ks_grad.verbose)
+        if mf.nlc or ni.libxc.is_nlc(mf.xc):
+            if ni.libxc.is_nlc(mf.xc):
+                xc = mf.xc
+            else:
+                xc = mf.nlc
+            enlc, vnlc = get_nlc_vxc(
+                ni, mol, nlcgrids, xc, dm,
+                max_memory=max_memory, verbose=ks_grad.verbose)
             vxc += vnlc
     t0 = logger.timer(ks_grad, 'vxc', *t0)
 
-    if abs(hyb) < 1e-10 and abs(alpha) < 1e-10:
+    if not ni.libxc.is_hybrid_xc(mf.xc):
         vj = ks_grad.get_j(mol, dm)
         vxc += vj
     else:
+        omega, alpha, hyb = ni.rsh_and_hybrid_coeff(mf.xc, spin=mol.spin)
         vj, vk = ks_grad.get_jk(mol, dm)
         vk *= hyb
-        if abs(omega) > 1e-10:  # For range separated Coulomb operator
-            with mol.with_range_coulomb(omega):
-                vk += ks_grad.get_k(mol, dm) * (alpha - hyb)
+        if omega != 0:
+            vk += ks_grad.get_k(mol, dm, omega=omega) * (alpha - hyb)
         vxc += vj - vk * .5
 
     return lib.tag_array(vxc, exc1_grid=exc)
 
+def _initialize_grids(ks_grad):
+    mf = ks_grad.base
+    ni = mf._numint
+    if ks_grad.grids is not None:
+        grids = ks_grad.grids
+    else:
+        grids = mf.grids
+    if grids.coords is None:
+        grids.build(with_non0tab=True)
+
+    nlcgrids = None
+    if mf.nlc or ni.libxc.is_nlc(mf.xc):
+        if ks_grad.nlcgrids is not None:
+            nlcgrids = ks_grad.nlcgrids
+        else:
+            nlcgrids = mf.nlcgrids
+        if nlcgrids.coords is None:
+            nlcgrids.build(with_non0tab=True)
+    return grids, nlcgrids
+
 
 def get_vxc(ni, mol, grids, xc_code, dms, relativity=0, hermi=1,
             max_memory=2000, verbose=None):
@@ -130,31 +140,6 @@ def get_vxc(ni, mol, grids, xc_code, dms, relativity=0, hermi=1,
                 wv[0] *= .5
                 _gga_grad_sum_(vmat[idm], mol, ao, wv, mask, ao_loc)
 
-    elif xctype == 'NLC':
-        nlc_pars = ni.nlc_coeff(xc_code)
-        ao_deriv = 2
-        vvrho = []
-        for ao, mask, weight, coords \
-                in ni.block_loop(mol, grids, nao, ao_deriv, max_memory):
-            vvrho.append([make_rho(idm, ao[:4], mask, 'GGA')
-                          for idm in range(nset)])
-
-        vv_vxc = []
-        for idm in range(nset):
-            rho = numpy.hstack([r[idm] for r in vvrho])
-            vxc = numint._vv10nlc(rho, grids.coords, rho, grids.weights,
-                                  grids.coords, nlc_pars)[1]
-            vv_vxc.append(xc_deriv.transform_vxc(rho, vxc, 'GGA', spin=0))
-
-        p1 = 0
-        for ao, mask, weight, coords \
-                in ni.block_loop(mol, grids, nao, ao_deriv, max_memory):
-            p0, p1 = p1, p1 + weight.size
-            for idm in range(nset):
-                wv = vv_vxc[idm][:,p0:p1] * weight
-                wv[0] *= .5  # *.5 because vmat + vmat.T at the end
-                _gga_grad_sum_(vmat[idm], mol, ao, wv, mask, ao_loc)
-
     elif xctype == 'MGGA':
         ao_deriv = 2
         for ao, mask, weight, coords \
@@ -174,6 +159,40 @@ def get_vxc(ni, mol, grids, xc_code, dms, relativity=0, hermi=1,
     # - sign because nabla_X = -nabla_x
     return exc, -vmat
 
+def get_nlc_vxc(ni, mol, grids, xc_code, dm, relativity=0, hermi=1,
+                max_memory=2000, verbose=None):
+    make_rho, nset, nao = ni._gen_rho_evaluator(mol, dm, hermi, False, grids)
+    assert nset == 1
+    ao_loc = mol.ao_loc_nr()
+
+    vmat = numpy.zeros((3,nao,nao))
+    nlc_coefs = ni.nlc_coeff(xc_code)
+    if len(nlc_coefs) != 1:
+        raise NotImplementedError('Additive NLC')
+    nlc_pars, fac = nlc_coefs[0]
+    ao_deriv = 2
+    vvrho = []
+    for ao, mask, weight, coords \
+            in ni.block_loop(mol, grids, nao, ao_deriv, max_memory):
+        vvrho.append(make_rho(0, ao[:4], mask, 'GGA'))
+    rho = numpy.hstack(vvrho)
+
+    vxc = numint._vv10nlc(rho, grids.coords, rho, grids.weights,
+                          grids.coords, nlc_pars)[1]
+    vv_vxc = xc_deriv.transform_vxc(rho, vxc, 'GGA', spin=0)
+
+    p1 = 0
+    for ao, mask, weight, coords \
+            in ni.block_loop(mol, grids, nao, ao_deriv, max_memory):
+        p0, p1 = p1, p1 + weight.size
+        wv = vv_vxc[:,p0:p1] * weight
+        wv[0] *= .5  # *.5 because vmat + vmat.T at the end
+        _gga_grad_sum_(vmat, mol, ao, wv, mask, ao_loc)
+
+    exc = None
+    # - sign because nabla_X = -nabla_x
+    return exc, -vmat
+
 def _make_dR_dao_w(ao, wv):
     #:aow = numpy.einsum('npi,p->npi', ao[1:4], wv[0])
     aow = [
@@ -297,7 +316,7 @@ def get_vxc_full_response(ni, mol, grids, xc_code, dms, relativity=0, hermi=1,
     make_rho, nset, nao = ni._gen_rho_evaluator(mol, dms, hermi, False, grids)
     ao_loc = mol.ao_loc_nr()
 
-    excsum = 0
+    excsum = numpy.zeros((mol.natm,3))
     vmat = numpy.zeros((3,nao,nao))
     if xctype == 'LDA':
         ao_deriv = 1
@@ -342,56 +361,6 @@ def get_vxc_full_response(ni, mol, grids, xc_code, dms, relativity=0, hermi=1,
             excsum[atm_id] += numpy.einsum('xij,ji->x', vtmp, dms) * 2
             rho = vxc = wv = None
 
-    elif xctype == 'NLC':
-        nlc_pars = ni.nlc_coeff(xc_code)
-        ao_deriv = 2
-        vvrho = []
-        vvcoords = []
-        vvweights = []
-        for atm_id, (coords, weight) in enumerate(grids_noresponse_cc(grids)):
-            mask = gen_grid.make_mask(mol, coords)
-            ao = ni.eval_ao(mol, coords, deriv=ao_deriv, non0tab=mask,
-                            cutoff=grids.cutoff)
-            vvrho.append(make_rho(0, ao[:4], mask, 'GGA'))
-            vvcoords.append(coords)
-            vvweights.append(weight)
-
-        vv_vxc = []
-        vvcoords_flat = numpy.vstack(vvcoords)
-        vvweights_flat = numpy.concatenate(vvweights)
-        vvrho_flat = numpy.hstack(vvrho)
-        for atm_id, (coords, weight, weight1) in enumerate(grids_response_cc(grids)):
-            rho = vvrho[atm_id]
-            mask = gen_grid.make_mask(mol, coords)
-            ao = ni.eval_ao(mol, coords, deriv=ao_deriv, non0tab=mask,
-                            cutoff=grids.cutoff)
-
-            exc, vxc = numint._vv10nlc(rho, coords, vvrho_flat, vvweights_flat,
-                                       vvcoords_flat, nlc_pars)
-            vv_vxc = xc_deriv.transform_vxc(rho, vxc, 'GGA', spin=0)
-            wv = vv_vxc * weight
-            wv[0] *= .5
-            vtmp = numpy.zeros((3,nao,nao))
-            _gga_grad_sum_(vtmp, mol, ao, wv, mask, ao_loc)
-            vmat += vtmp
-
-            vvrho_sub = numpy.hstack(
-                [r for i, r in enumerate(vvrho) if i != atm_id])
-            vvcoords_sub = numpy.vstack(
-                [r for i, r in enumerate(vvcoords) if i != atm_id])
-            vvweights_sub = numpy.concatenate(
-                [r for i, r in enumerate(vvweights) if i != atm_id])
-            egrad, Beta = _vv10nlc_grad(rho, coords, vvrho_sub,
-                                        vvweights_sub, vvcoords_sub,
-                                        nlc_pars)
-            # account for factor of 2 in double integration
-            exc -= 0.5 * Beta
-            # response of weights
-            excsum += 2 * numpy.einsum('r,r,nxr->nx', exc, rho[0], weight1)
-            # response of grids coordinates
-            excsum[atm_id] += 2 * numpy.einsum('xij,ji->x', vtmp, dms)
-            excsum[atm_id] += numpy.einsum('r,rx->x', rho[0]*weight, egrad)
-
     elif xctype == 'MGGA':
         ao_deriv = 2
         for atm_id, (coords, weight, weight1) in enumerate(grids_response_cc(grids)):
@@ -418,6 +387,67 @@ def get_vxc_full_response(ni, mol, grids, xc_code, dms, relativity=0, hermi=1,
     # - sign because nabla_X = -nabla_x
     return excsum, -vmat
 
+def get_nlc_vxc_full_response(ni, mol, grids, xc_code, dms, relativity=0, hermi=1,
+                              max_memory=2000, verbose=None):
+    '''Full NLC functional response including the response of the grids'''
+    make_rho, nset, nao = ni._gen_rho_evaluator(mol, dms, hermi, False, grids)
+    ao_loc = mol.ao_loc_nr()
+
+    excsum = numpy.zeros((mol.natm,3))
+    vmat = numpy.zeros((3,nao,nao))
+    nlc_coefs = ni.nlc_coeff(xc_code)
+    if len(nlc_coefs) != 1:
+        raise NotImplementedError('Additive NLC')
+    nlc_pars, fac = nlc_coefs[0]
+    ao_deriv = 2
+    vvrho = []
+    vvcoords = []
+    vvweights = []
+    for atm_id, (coords, weight) in enumerate(grids_noresponse_cc(grids)):
+        mask = gen_grid.make_mask(mol, coords)
+        ao = ni.eval_ao(mol, coords, deriv=ao_deriv, non0tab=mask,
+                        cutoff=grids.cutoff)
+        vvrho.append(make_rho(0, ao[:4], mask, 'GGA'))
+        vvcoords.append(coords)
+        vvweights.append(weight)
+    vvcoords_flat = numpy.vstack(vvcoords)
+    vvweights_flat = numpy.concatenate(vvweights)
+    vvrho_flat = numpy.hstack(vvrho)
+
+    vv_vxc = []
+    for atm_id, (coords, weight, weight1) in enumerate(grids_response_cc(grids)):
+        rho = vvrho[atm_id]
+        mask = gen_grid.make_mask(mol, coords)
+        ao = ni.eval_ao(mol, coords, deriv=ao_deriv, non0tab=mask,
+                        cutoff=grids.cutoff)
+
+        exc, vxc = numint._vv10nlc(rho, coords, vvrho_flat, vvweights_flat,
+                                   vvcoords_flat, nlc_pars)
+        vv_vxc = xc_deriv.transform_vxc(rho, vxc, 'GGA', spin=0)
+        wv = vv_vxc * weight
+        wv[0] *= .5
+        vtmp = numpy.zeros((3,nao,nao))
+        _gga_grad_sum_(vtmp, mol, ao, wv, mask, ao_loc)
+        vmat += vtmp
+
+        vvrho_sub = numpy.hstack(
+            [r for i, r in enumerate(vvrho) if i != atm_id])
+        vvcoords_sub = numpy.vstack(
+            [r for i, r in enumerate(vvcoords) if i != atm_id])
+        vvweights_sub = numpy.concatenate(
+            [r for i, r in enumerate(vvweights) if i != atm_id])
+        egrad, Beta = _vv10nlc_grad(rho, coords, vvrho_sub,
+                                    vvweights_sub, vvcoords_sub, nlc_pars)
+        # account for factor of 2 in double integration
+        exc -= 0.5 * Beta
+        # response of weights
+        excsum += 2 * numpy.einsum('r,r,nxr->nx', exc, rho[0], weight1)
+        # response of grids coordinates
+        excsum[atm_id] += 2 * numpy.einsum('xij,ji->x', vtmp, dms)
+        excsum[atm_id] += numpy.einsum('r,rx->x', rho[0]*weight, egrad)
+    # - sign because nabla_X = -nabla_x
+    return excsum, -vmat
+
 
 # JCP 98, 5612 (1993); DOI:10.1063/1.464906
 def grids_response_cc(grids):
diff --git a/pyscf/grad/sacasscf.py b/pyscf/grad/sacasscf.py
index 68942e8e1c..b27a6c9da7 100644
--- a/pyscf/grad/sacasscf.py
+++ b/pyscf/grad/sacasscf.py
@@ -373,13 +373,13 @@ def __call__(self, mol_or_geom, **kwargs):
                 mol = mol_or_geom
             else:
                 mol = self.mol.set_geom_(mol_or_geom, inplace=False)
+            self.reset(mol)
             if 'state' in kwargs: self.state = kwargs['state']
             mc_scanner = self.base
             e_tot = mc_scanner(mol)
             if hasattr (mc_scanner, 'e_mcscf'): self.e_mcscf = mc_scanner.e_mcscf
             if hasattr (mc_scanner, 'e_states') and self.state is not None:
                 e_tot = mc_scanner.e_states[self.state]
-            self.mol = mol
             if not ('state' in kwargs):
                 kwargs['state'] = self.state
             de = self.kernel(**kwargs)
@@ -447,10 +447,33 @@ def unpack_uniq_var (self, x):
         return xorb, xci
 
     def make_fcasscf (self, state=None, casscf_attr={}, fcisolver_attr={}):
-        ''' Make a fake CASSCF object for ostensible single-state calculations '''
+        ''' SA-CASSCF nuclear gradients require 1) first derivatives wrt wave function variables
+        and nuclear shifts of the target state's energy, AND 2) first and second derivatives of the
+        objective function used to determine the MO coefficients and CI vectors. This function
+        addresses 1).
+
+        Kwargs:
+            state : integer
+                The specific state whose energy is being differentiated. This kwarg is necessary
+                in the context of state_average_mix, where the number of electrons and the
+                make_rdm* functions differ from state to state.
+            casscf_attr : dictionary
+                Extra attributes to apply to fcasscf. Relevant to child methods (i.e., MC-PDFT;
+                NACs)
+            fcisolver_attr : dictionary
+                Extra attributes to apply to fcasscf.fcisolver. Relevant to child methods (i.e.,
+                MC-PDFT; NACs)
+
+        Returns:
+            fcasscf : object of :class:`mc1step.CASSCF`
+                Set up to evaluate first derivatives of state "state". Only functions, classes,
+                and the nelecas variable are set up; the caller should assign MO coefficients
+                and CI vectors explicitly post facto.
+        '''
         fcasscf = mcscf.CASSCF (self.base._scf, self.base.ncas, self.base.nelecas)
         fcasscf.__dict__.update (self.base.__dict__)
 
+        nelecas = self.base.nelecas
         if isinstance (fcasscf.fcisolver, StateAverageFCISolver):
             if isinstance (fcasscf.fcisolver, StateAverageMixFCISolver):
                 p0 = 0
@@ -459,6 +482,7 @@ def make_fcasscf (self, state=None, casscf_attr={}, fcisolver_attr={}):
                     if p0 <= state < p1:
                         solver_class = solver.__class__
                         solver_obj = solver
+                        nelecas = fcasscf.fcisolver._get_nelec (solver_obj, nelecas)
                         break
                     p0 = p1
             else:
@@ -473,13 +497,30 @@ def make_fcasscf (self, state=None, casscf_attr={}, fcisolver_attr={}):
             fcasscf.fcisolver = copy.copy (fcasscf.fcisolver)
             fcasscf.fcisolver.ss_penalty = 0
         fcasscf.__dict__.update (casscf_attr)
+        fcasscf.nelecas = nelecas
         fcasscf.fcisolver.__dict__.update (fcisolver_attr)
         fcasscf.verbose, fcasscf.stdout = self.verbose, self.stdout
         fcasscf._tag_gfock_ov_nonzero = True
         return fcasscf
 
     def make_fcasscf_sa (self, casscf_attr={}, fcisolver_attr={}):
-        ''' Make a fake SA-CASSCF object to get around weird inheritance conflicts '''
+        ''' SA-CASSCF nuclear gradients require 1) first derivatives wrt wave function variables
+        and nuclear shifts of the target state's energy, AND 2) first and second derivatives of the
+        objective function used to determine the MO coefficients and CI vectors. This function
+        addresses 2). Note that penalty methods etc. must be removed, and that child methods such
+        as MC-PDFT which do not reoptimize the orbitals also do not alter this function.
+
+        Kwargs:
+            casscf_attr : dictionary
+                Extra attributes to apply to fcasscf. Just in case.
+            fcisolver_attr : dictionary
+                Extra attributes to apply to fcasscf.fcisolver. Just in case.
+
+        Returns:
+            fcasscf : object of :class:`StateAverageMCSCFSolver`
+                Set up to evaluate second derivatives of SA-CASSCF average energy in the
+                absence of (i.e., spin) penalties.
+        '''
         fcasscf = self.make_fcasscf (state=0, casscf_attr={}, fcisolver_attr={})
         fcasscf.__dict__.update (self.base.__dict__)
         if isinstance (self.base, StateAverageMCSCFSolver):
@@ -579,6 +620,8 @@ def get_ham_response (self, state=None, atmlst=None, verbose=None, mo=None, ci=N
         elif eris is None:
             eris = self.eris
         fcasscf_grad = casscf_grad.Gradients (self.make_fcasscf (state))
+        # Mute some misleading messages
+        fcasscf_grad._finalize = lambda: None
         return fcasscf_grad.kernel (mo_coeff=mo, ci=ci[state], atmlst=atmlst, verbose=verbose)
 
     def get_LdotJnuc (self, Lvec, state=None, atmlst=None, verbose=None, mo=None, ci=None,
diff --git a/pyscf/grad/tdrhf.py b/pyscf/grad/tdrhf.py
index c92ee42a01..5c0dcc86e6 100644
--- a/pyscf/grad/tdrhf.py
+++ b/pyscf/grad/tdrhf.py
@@ -126,10 +126,11 @@ def fvind(x):  # For singlet, closed shell ground state
                                   dmxmy-dmxmy.T))
     vj = vj.reshape(-1,3,nao,nao)
     vk = vk.reshape(-1,3,nao,nao)
+    vhf1 = -vk
     if singlet:
-        vhf1 = vj * 2 - vk
+        vhf1 += vj * 2
     else:
-        vhf1 = numpy.vstack((vj[:2]*2-vk[:2], -vk[2:]))
+        vhf1[:2] += vj[:2]*2
     time1 = log.timer('2e AO integral derivatives', *time1)
 
     if atmlst is None:
@@ -199,10 +200,10 @@ def __call__(self, mol_or_geom, state=state, **kwargs):
                 mol = mol_or_geom
             else:
                 mol = self.mol.set_geom_(mol_or_geom, inplace=False)
+            self.reset(mol)
 
             td_scanner = self.base
             td_scanner(mol)
-            self.mol = mol
 # TODO: Check root flip.  Maybe avoid the initial guess in TDHF otherwise
 # large error may be found in the excited states amplitudes
             de = self.kernel(state=state, **kwargs)
diff --git a/pyscf/grad/tdrks.py b/pyscf/grad/tdrks.py
index 86c33b8c2c..51e13139f1 100644
--- a/pyscf/grad/tdrks.py
+++ b/pyscf/grad/tdrks.py
@@ -76,26 +76,22 @@ def grad_elec(td_grad, x_y, singlet=True, atmlst=None,
     ni = mf._numint
     ni.libxc.test_deriv_order(mf.xc, 3, raise_error=True)
     omega, alpha, hyb = ni.rsh_and_hybrid_coeff(mf.xc, mol.spin)
-    # dm0 = mf.make_rdm1(mo_coeff, mo_occ), but it is not used when computing
-    # fxc since rho0 is passed to fxc function.
-    rho0, vxc, fxc = ni.cache_xc_kernel(mf.mol, mf.grids, mf.xc,
-                                        [mo_coeff]*2, [mo_occ*.5]*2, spin=1)
     f1vo, f1oo, vxc1, k1ao = \
             _contract_xc_kernel(td_grad, mf.xc, dmxpy,
                                 dmzoo, True, True, singlet, max_memory)
 
-    if abs(hyb) > 1e-10:
+    if ni.libxc.is_hybrid_xc(mf.xc):
         dm = (dmzoo, dmxpy+dmxpy.T, dmxmy-dmxmy.T)
         vj, vk = mf.get_jk(mol, dm, hermi=0)
         vk *= hyb
-        if abs(omega) > 1e-10:
+        if omega != 0:
             vk += mf.get_k(mol, dm, hermi=0, omega=omega) * (alpha-hyb)
         veff0doo = vj[0] * 2 - vk[0] + f1oo[0] + k1ao[0] * 2
         wvo = reduce(numpy.dot, (orbv.T, veff0doo, orbo)) * 2
         if singlet:
             veff = vj[1] * 2 - vk[1] + f1vo[0] * 2
         else:
-            veff = -vk[1] + f1vo[0] * 2
+            veff = f1vo[0] - vk[1]
         veff0mop = reduce(numpy.dot, (mo_coeff.T, veff, mo_coeff))
         wvo -= numpy.einsum('ki,ai->ak', veff0mop[:nocc,:nocc], xpy) * 2
         wvo += numpy.einsum('ac,ai->ci', veff0mop[nocc:,nocc:], xpy) * 2
@@ -110,7 +106,7 @@ def grad_elec(td_grad, x_y, singlet=True, atmlst=None,
         if singlet:
             veff = vj[1] * 2 + f1vo[0] * 2
         else:
-            veff = f1vo[0] * 2
+            veff = f1vo[0]
         veff0mop = reduce(numpy.dot, (mo_coeff.T, veff, mo_coeff))
         wvo -= numpy.einsum('ki,ai->ak', veff0mop[:nocc,:nocc], xpy) * 2
         wvo += numpy.einsum('ac,ai->ci', veff0mop[nocc:,nocc:], xpy) * 2
@@ -128,7 +124,7 @@ def fvind(x):
     z1 = z1.reshape(nvir,nocc)
     time1 = log.timer('Z-vector using CPHF solver', *time0)
 
-    z1ao  = reduce(numpy.dot, (orbv, z1, orbo.T))
+    z1ao = reduce(numpy.dot, (orbv, z1, orbo.T))
     veff = vresp(z1ao+z1ao.T)
 
     im0 = numpy.zeros((nmo,nmo))
@@ -158,19 +154,19 @@ def fvind(x):
 
     dmz1doo = z1ao + dmzoo
     oo0 = reduce(numpy.dot, (orbo, orbo.T))
-    if abs(hyb) > 1e-10:
+    if ni.libxc.is_hybrid_xc(mf.xc):
         dm = (oo0, dmz1doo+dmz1doo.T, dmxpy+dmxpy.T, dmxmy-dmxmy.T)
         vj, vk = td_grad.get_jk(mol, dm)
         vk *= hyb
-        if abs(omega) > 1e-10:
-            with mol.with_range_coulomb(omega):
-                vk += td_grad.get_k(mol, dm) * (alpha-hyb)
+        if omega != 0:
+            vk += td_grad.get_k(mol, dm, omega=omega) * (alpha-hyb)
         vj = vj.reshape(-1,3,nao,nao)
         vk = vk.reshape(-1,3,nao,nao)
+        veff1 = -vk
         if singlet:
-            veff1 = vj * 2 - vk
+            veff1 += vj * 2
         else:
-            veff1 = numpy.vstack((vj[:2]*2-vk[:2], -vk[2:]))
+            veff1[:2] += vj[:2] * 2
     else:
         vj = td_grad.get_j(mol, (oo0, dmz1doo+dmz1doo.T, dmxpy+dmxpy.T))
         vj = vj.reshape(-1,3,nao,nao)
@@ -185,7 +181,10 @@ def fvind(x):
 
     veff1[0] += vxc1[1:]
     veff1[1] +=(f1oo[1:] + fxcz1[1:] + k1ao[1:]*2)*2 # *2 for dmz1doo+dmz1oo.T
-    veff1[2] += f1vo[1:] * 2
+    if singlet:
+        veff1[2] += f1vo[1:] * 2
+    else:
+        veff1[2] += f1vo[1:]
     time1 = log.timer('2e AO integral derivatives', *time1)
 
     if atmlst is None:
@@ -208,8 +207,8 @@ def fvind(x):
 
         e1 += numpy.einsum('xij,ij->x', veff1[1,:,p0:p1], oo0[p0:p1])
         e1 += numpy.einsum('xij,ij->x', veff1[2,:,p0:p1], dmxpy[p0:p1,:]) * 2
-        e1 += numpy.einsum('xij,ij->x', veff1[3,:,p0:p1], dmxmy[p0:p1,:]) * 2
         e1 += numpy.einsum('xji,ij->x', veff1[2,:,p0:p1], dmxpy[:,p0:p1]) * 2
+        e1 += numpy.einsum('xij,ij->x', veff1[3,:,p0:p1], dmxmy[p0:p1,:]) * 2
         e1 -= numpy.einsum('xji,ij->x', veff1[3,:,p0:p1], dmxmy[:,p0:p1]) * 2
 
         e1 += td_grad.extra_force(ia, locals())
@@ -257,118 +256,120 @@ def _contract_xc_kernel(td_grad, xc_code, dmvo, dmoo=None, with_vxc=True,
     else:
         k1ao = None
 
-    if xctype == 'LDA':
-        ao_deriv = 1
-        if singlet:
-            def lda_sum_(vmat, ao, wv, mask):
-                aow = numint._scale_ao(ao[0], wv)
-                for k in range(4):
-                    vmat[k] += numint._dot_ao_ao(mol, ao[k], aow, mask, shls_slice, ao_loc)
-
-            for ao, mask, weight, coords \
-                    in ni.block_loop(mol, grids, nao, ao_deriv, max_memory):
-                rho = ni.eval_rho2(mol, ao[0], mo_coeff, mo_occ, mask, xctype)
-                vxc, fxc, kxc = ni.eval_xc(xc_code, rho, 0, deriv=deriv)[1:]
-
-                wfxc = fxc[0] * weight * 2  # *2 for alpha+beta
-                rho1 = ni.eval_rho(mol, ao[0], dmvo, mask, xctype, hermi=1)
-                lda_sum_(f1vo, ao, wfxc * rho1, mask)
-                if dmoo is not None:
-                    rho2 = ni.eval_rho(mol, ao[0], dmoo, mask, xctype, hermi=1)
-                    lda_sum_(f1oo, ao, wfxc * rho2, mask)
-                if with_vxc:
-                    lda_sum_(v1ao, ao, vxc[0] * weight, mask)
-                if with_kxc:
-                    lda_sum_(k1ao, ao, kxc[0] * weight * rho1**2, mask)
-            if with_kxc:  # for (rho1*2)^2, *2 for alpha+beta in singlet
-                k1ao *= 4
-
-        else:
-            raise NotImplementedError('LDA triplet')
-
+    if xctype == 'HF':
+        return f1vo, f1oo, v1ao, k1ao
+    elif xctype == 'LDA':
+        fmat_, ao_deriv = _lda_eval_mat_, 1
     elif xctype == 'GGA':
-        if singlet:
-            def gga_sum_(vmat, ao, wv, mask):
-                aow = numint._scale_ao(ao[:4], wv[:4])
-                tmp = numint._dot_ao_ao(mol, ao[0], aow, mask, shls_slice, ao_loc)
-                vmat[0] += tmp + tmp.T
-                rks_grad._gga_grad_sum_(vmat[1:], mol, ao, wv, mask, ao_loc)
-            ao_deriv = 2
-            for ao, mask, weight, coords \
-                    in ni.block_loop(mol, grids, nao, ao_deriv, max_memory):
-                rho = ni.eval_rho2(mol, ao, mo_coeff, mo_occ, mask, xctype)
-                vxc, fxc, kxc = ni.eval_xc(xc_code, rho, 0, deriv=deriv)[1:]
-
-                rho1 = ni.eval_rho(mol, ao, dmvo, mask, xctype, hermi=1) * 2  # *2 for alpha + beta
-                wv = numint._rks_gga_wv1(rho, rho1, vxc, fxc, weight)
-                gga_sum_(f1vo, ao, wv, mask)
-
-                if dmoo is not None:
-                    rho2 = ni.eval_rho(mol, ao, dmoo, mask, xctype, hermi=1) * 2
-                    wv = numint._rks_gga_wv1(rho, rho2, vxc, fxc, weight)
-                    gga_sum_(f1oo, ao, wv, mask)
-                if with_vxc:
-                    wv = numint._rks_gga_wv0(rho, vxc, weight)
-                    gga_sum_(v1ao, ao, wv, mask)
-                if with_kxc:
-                    wv = numint._rks_gga_wv2(rho, rho1, fxc, kxc, weight)
-                    gga_sum_(k1ao, ao, wv, mask)
-                vxc = fxc = kxc = rho = rho1 = None
-
-        else:
-            raise NotImplementedError('GGA triplet')
-
+        fmat_, ao_deriv = _gga_eval_mat_, 2
     elif xctype == 'MGGA':
-        if singlet:
-            def mgga_sum_(vmat, ao, wv, mask):
-                aow = numint._scale_ao(ao[:4], wv[:4])
-                tmp = numint._dot_ao_ao(mol, ao[0], aow, mask, shls_slice, ao_loc)
-                aow = numint._scale_ao(ao[1], wv[5], aow)
-                tmp += numint._dot_ao_ao(mol, ao[1], aow, mask, shls_slice, ao_loc)
-                aow = numint._scale_ao(ao[2], wv[5], aow)
-                tmp += numint._dot_ao_ao(mol, ao[2], aow, mask, shls_slice, ao_loc)
-                aow = numint._scale_ao(ao[3], wv[5], aow)
-                tmp += numint._dot_ao_ao(mol, ao[3], aow, mask, shls_slice, ao_loc)
-                vmat[0] += tmp + tmp.T
-
-                rks_grad._gga_grad_sum_(vmat[1:], mol, ao, wv[:4], mask, ao_loc)
-                rks_grad._tau_grad_dot_(vmat[1:], mol, ao, wv[5]*2, mask, ao_loc, True)
-
-            ao_deriv = 2
-            for ao, mask, weight, coords \
-                    in ni.block_loop(mol, grids, nao, ao_deriv, max_memory):
-                rho = ni.eval_rho2(mol, ao, mo_coeff, mo_occ, mask, xctype)
-                vxc, fxc, kxc = ni.eval_xc(xc_code, rho, 0, deriv=deriv)[1:]
-
-                rho1 = ni.eval_rho(mol, ao, dmvo, mask, xctype, hermi=1) * 2  # *2 for alpha + beta
-                wv = numint._rks_mgga_wv1(rho, rho1, vxc, fxc, weight)
-                mgga_sum_(f1vo, ao, wv, mask)
-
-                if dmoo is not None:
-                    rho2 = ni.eval_rho(mol, ao, dmoo, mask, xctype, hermi=1) * 2
-                    wv = numint._rks_mgga_wv1(rho, rho2, vxc, fxc, weight)
-                    mgga_sum_(f1oo, ao, wv, mask)
-                if with_vxc:
-                    wv = numint._rks_mgga_wv0(rho, vxc, weight)
-                    mgga_sum_(v1ao, ao, wv, mask)
-                if with_kxc:
-                    wv = numint._rks_mgga_wv2(rho, rho1, fxc, kxc, weight)
-                    mgga_sum_(k1ao, ao, wv, mask)
-                vxc = fxc = kxc = rho = rho1 = None
-        else:
-            raise NotImplementedError('MGGA triplet')
-
-    elif xctype == 'HF':
-        pass
+        fmat_, ao_deriv = _mgga_eval_mat_, 2
+        logger.warn(td_grad, 'TDRKS-MGGA Gradients may be inaccurate due to grids response')
     else:
         raise NotImplementedError(f'td-rks for functional {xc_code}')
 
+    if singlet:
+        for ao, mask, weight, coords \
+                in ni.block_loop(mol, grids, nao, ao_deriv, max_memory):
+            if xctype == 'LDA':
+                ao0 = ao[0]
+            else:
+                ao0 = ao
+            rho = ni.eval_rho2(mol, ao0, mo_coeff, mo_occ, mask, xctype, with_lapl=False)
+            vxc, fxc, kxc = ni.eval_xc_eff(xc_code, rho, deriv, xctype=xctype)[1:]
+
+            rho1 = ni.eval_rho(mol, ao0, dmvo, mask, xctype, hermi=1,
+                               with_lapl=False) * 2  # *2 for alpha + beta
+            if xctype == 'LDA':
+                rho1 = rho1[numpy.newaxis]
+            wv = numpy.einsum('yg,xyg,g->xg', rho1, fxc, weight)
+            fmat_(mol, f1vo, ao, wv, mask, shls_slice, ao_loc)
+
+            if dmoo is not None:
+                rho2 = ni.eval_rho(mol, ao0, dmoo, mask, xctype, hermi=1, with_lapl=False) * 2
+                if xctype == 'LDA':
+                    rho2 = rho2[numpy.newaxis]
+                wv = numpy.einsum('yg,xyg,g->xg', rho2, fxc, weight)
+                fmat_(mol, f1oo, ao, wv, mask, shls_slice, ao_loc)
+            if with_vxc:
+                fmat_(mol, v1ao, ao, vxc * weight, mask, shls_slice, ao_loc)
+            if with_kxc:
+                wv = numpy.einsum('yg,zg,xyzg,g->xg', rho1, rho1, kxc, weight)
+                fmat_(mol, k1ao, ao, wv, mask, shls_slice, ao_loc)
+    else:
+        for ao, mask, weight, coords \
+                in ni.block_loop(mol, grids, nao, ao_deriv, max_memory):
+            if xctype == 'LDA':
+                ao0 = ao[0]
+            else:
+                ao0 = ao
+            rho = ni.eval_rho2(mol, ao0, mo_coeff, mo_occ, mask, xctype, with_lapl=False)
+            rho *= .5
+            rho = numpy.repeat(rho[numpy.newaxis], 2, axis=0)
+            vxc, fxc, kxc = ni.eval_xc_eff(xc_code, rho, deriv, xctype=xctype)[1:]
+            # fxc_t couples triplet excitation amplitues
+            # 1/2 int (tia - tIA) fxc (tjb - tJB) = tia fxc_t tjb
+            fxc_t = fxc[:,:,0] - fxc[:,:,1]
+            fxc_t = fxc_t[0] - fxc_t[1]
+
+            rho1 = ni.eval_rho(mol, ao0, dmvo, mask, xctype, hermi=1, with_lapl=False)
+            if xctype == 'LDA':
+                rho1 = rho1[numpy.newaxis]
+            wv = numpy.einsum('yg,xyg,g->xg', rho1, fxc_t, weight)
+            fmat_(mol, f1vo, ao, wv, mask, shls_slice, ao_loc)
+
+            if dmoo is not None:
+                # fxc_s == 2 * fxc of spin restricted xc kernel
+                # provides f1oo to couple the interaction between first order MO
+                # and density response of tddft amplitudes, which is described by dmoo
+                fxc_s = fxc[0,:,0] + fxc[0,:,1]
+                rho2 = ni.eval_rho(mol, ao0, dmoo, mask, xctype, hermi=1, with_lapl=False)
+                if xctype == 'LDA':
+                    rho2 = rho2[numpy.newaxis]
+                wv = numpy.einsum('yg,xyg,g->xg', rho2, fxc_s, weight)
+                fmat_(mol, f1oo, ao, wv, mask, shls_slice, ao_loc)
+            if with_vxc:
+                vxc = vxc[0]
+                fmat_(mol, v1ao, ao, vxc * weight, mask, shls_slice, ao_loc)
+            if with_kxc:
+                # kxc in terms of the triplet coupling
+                # 1/2 int (tia - tIA) kxc (tjb - tJB) = tia kxc_t tjb
+                kxc = kxc[0,:,0] - kxc[0,:,1]
+                kxc = kxc[:,:,0] - kxc[:,:,1]
+                wv = numpy.einsum('yg,zg,xyzg,g->xg', rho1, rho1, kxc, weight)
+                fmat_(mol, k1ao, ao, wv, mask, shls_slice, ao_loc)
+
     f1vo[1:] *= -1
     if f1oo is not None: f1oo[1:] *= -1
     if v1ao is not None: v1ao[1:] *= -1
     if k1ao is not None: k1ao[1:] *= -1
     return f1vo, f1oo, v1ao, k1ao
 
+def _lda_eval_mat_(mol, vmat, ao, wv, mask, shls_slice, ao_loc):
+    aow = numint._scale_ao(ao[0], wv[0])
+    for k in range(4):
+        vmat[k] += numint._dot_ao_ao(mol, ao[k], aow, mask, shls_slice, ao_loc)
+    return vmat
+
+def _gga_eval_mat_(mol, vmat, ao, wv, mask, shls_slice, ao_loc):
+    wv[0] *= .5  # *.5 because vmat + vmat.T at the end
+    aow = numint._scale_ao(ao[:4], wv[:4])
+    tmp = numint._dot_ao_ao(mol, ao[0], aow, mask, shls_slice, ao_loc)
+    vmat[0] += tmp + tmp.T
+    rks_grad._gga_grad_sum_(vmat[1:], mol, ao, wv, mask, ao_loc)
+    return vmat
+
+def _mgga_eval_mat_(mol, vmat, ao, wv, mask, shls_slice, ao_loc):
+    wv[0] *= .5  # *.5 because vmat + vmat.T at the end
+    wv[4] *= .5  # *.5 for 1/2 in tau
+    aow = numint._scale_ao(ao[:4], wv[:4])
+    tmp = numint._dot_ao_ao(mol, ao[0], aow, mask, shls_slice, ao_loc)
+    vmat[0] += tmp + tmp.T
+    vmat[0] += numint._tau_dot(mol, ao, ao, wv[4], mask, shls_slice, ao_loc)
+    rks_grad._gga_grad_sum_(vmat[1:], mol, ao, wv[:4], mask, ao_loc)
+    rks_grad._tau_grad_dot_(vmat[1:], mol, ao, wv[4], mask, ao_loc, True)
+    return vmat
+
 
 class Gradients(tdrhf.Gradients):
     @lib.with_doc(grad_elec.__doc__)
diff --git a/pyscf/grad/tduks.py b/pyscf/grad/tduks.py
index e81692b5be..f0f0d7d79c 100644
--- a/pyscf/grad/tduks.py
+++ b/pyscf/grad/tduks.py
@@ -26,7 +26,7 @@
 from pyscf.lib import logger
 from pyscf.dft import numint
 from pyscf.grad import tdrhf as tdrhf_grad
-from pyscf.grad import rks as rks_grad
+from pyscf.grad import tdrks as tdrks_grad
 from pyscf.scf import ucphf
 
 
@@ -98,12 +98,12 @@ def grad_elec(td_grad, x_y, atmlst=None, max_memory=2000, verbose=logger.INFO):
             _contract_xc_kernel(td_grad, mf.xc, (dmxpya,dmxpyb),
                                 (dmzooa,dmzoob), True, True, max_memory)
 
-    if abs(hyb) > 1e-10:
+    if ni.libxc.is_hybrid_xc(mf.xc):
         dm = (dmzooa, dmxpya+dmxpya.T, dmxmya-dmxmya.T,
               dmzoob, dmxpyb+dmxpyb.T, dmxmyb-dmxmyb.T)
         vj, vk = mf.get_jk(mol, dm, hermi=0)
         vk *= hyb
-        if abs(omega) > 1e-10:
+        if omega != 0:
             vk += mf.get_k(mol, dm, hermi=0, omega=omega) * (alpha-hyb)
         vj = vj.reshape(2,3,nao,nao)
         vk = vk.reshape(2,3,nao,nao)
@@ -215,15 +215,14 @@ def fvind(x):
     oo0b = reduce(numpy.dot, (orbob, orbob.T))
     as_dm1 = oo0a + oo0b + (dmz1dooa + dmz1doob) * .5
 
-    if abs(hyb) > 1e-10:
+    if ni.libxc.is_hybrid_xc(mf.xc):
         dm = (oo0a, dmz1dooa+dmz1dooa.T, dmxpya+dmxpya.T, dmxmya-dmxmya.T,
               oo0b, dmz1doob+dmz1doob.T, dmxpyb+dmxpyb.T, dmxmyb-dmxmyb.T)
         vj, vk = td_grad.get_jk(mol, dm)
         vj = vj.reshape(2,4,3,nao,nao)
         vk = vk.reshape(2,4,3,nao,nao) * hyb
-        if abs(omega) > 1e-10:
-            with mol.with_range_coulomb(omega):
-                vk += td_grad.get_k(mol, dm).reshape(2,4,3,nao,nao) * (alpha-hyb)
+        if omega != 0:
+            vk += td_grad.get_k(mol, dm, omega=omega).reshape(2,4,3,nao,nao) * (alpha-hyb)
         veff1 = vj[0] + vj[1] - vk
     else:
         dm = (oo0a, dmz1dooa+dmz1dooa.T, dmxpya+dmxpya.T,
@@ -320,123 +319,55 @@ def _contract_xc_kernel(td_grad, xc_code, dmvo, dmoo=None, with_vxc=True,
     else:
         k1ao = None
 
-    if xctype == 'LDA':
-        def lda_sum_(vmat, ao, wv, mask):
-            aow = numint._scale_ao(ao[0], wv)
-            for k in range(4):
-                vmat[k] += numint._dot_ao_ao(mol, ao[k], aow, mask, shls_slice, ao_loc)
-
-        ao_deriv = 1
-        for ao, mask, weight, coords \
-                in ni.block_loop(mol, grids, nao, ao_deriv, max_memory):
-            rho = (ni.eval_rho2(mol, ao[0], mo_coeff[0], mo_occ[0], mask, xctype),
-                   ni.eval_rho2(mol, ao[0], mo_coeff[1], mo_occ[1], mask, xctype))
-            vxc, fxc, kxc = ni.eval_xc(xc_code, rho, 1, deriv=deriv)[1:]
-
-            u_u, u_d, d_d = fxc[0].T * weight
-            rho1a = ni.eval_rho(mol, ao[0], dmvo[0], mask, xctype, hermi=1)
-            rho1b = ni.eval_rho(mol, ao[0], dmvo[1], mask, xctype, hermi=1)
-            lda_sum_(f1vo[0], ao, u_u*rho1a+u_d*rho1b, mask)
-            lda_sum_(f1vo[1], ao, u_d*rho1a+d_d*rho1b, mask)
-            if dmoo is not None:
-                rho2a = ni.eval_rho(mol, ao[0], dmoo[0], mask, xctype, hermi=1)
-                rho2b = ni.eval_rho(mol, ao[0], dmoo[1], mask, xctype, hermi=1)
-                lda_sum_(f1oo[0], ao, u_u*rho2a+u_d*rho2b, mask)
-                lda_sum_(f1oo[1], ao, u_d*rho2a+d_d*rho2b, mask)
-            if with_vxc:
-                vrho = vxc[0].T * weight
-                lda_sum_(v1ao[0], ao, vrho[0], mask)
-                lda_sum_(v1ao[1], ao, vrho[1], mask)
-            if with_kxc:
-                u_u_u, u_u_d, u_d_d, d_d_d = kxc[0].T * weight
-                lda_sum_(k1ao[0], ao, u_u_u*rho1a*rho1a+u_u_d*rho1a*rho1b*2+u_d_d*rho1b*rho1b, mask)
-                lda_sum_(k1ao[1], ao, u_u_d*rho1a*rho1a+u_d_d*rho1a*rho1b*2+d_d_d*rho1b*rho1b, mask)
-
+    if xctype == 'HF':
+        return f1vo, f1oo, v1ao, k1ao
+    elif xctype == 'LDA':
+        fmat_, ao_deriv = tdrks_grad._lda_eval_mat_, 1
     elif xctype == 'GGA':
-        def gga_sum_(vmat, ao, wv, mask):
-            aow = numint._scale_ao(ao[:4], wv[:4])
-            tmp = numint._dot_ao_ao(mol, ao[0], aow, mask, shls_slice, ao_loc)
-            vmat[0] += tmp + tmp.T
-            rks_grad._gga_grad_sum_(vmat[1:], mol, ao, wv, mask, ao_loc)
-        ao_deriv = 2
-        for ao, mask, weight, coords \
-                in ni.block_loop(mol, grids, nao, ao_deriv, max_memory):
-            rho = (ni.eval_rho2(mol, ao, mo_coeff[0], mo_occ[0], mask, xctype),
-                   ni.eval_rho2(mol, ao, mo_coeff[1], mo_occ[1], mask, xctype))
-            vxc, fxc, kxc = ni.eval_xc(xc_code, rho, 1, deriv=deriv)[1:]
-
-            rho1 = (ni.eval_rho(mol, ao, dmvo[0], mask, xctype, hermi=1),
-                    ni.eval_rho(mol, ao, dmvo[1], mask, xctype, hermi=1))
-            wv = numint._uks_gga_wv1(rho, rho1, vxc, fxc, weight)
-            gga_sum_(f1vo[0], ao, wv[0], mask)
-            gga_sum_(f1vo[1], ao, wv[1], mask)
-
-            if dmoo is not None:
-                rho2 = (ni.eval_rho(mol, ao, dmoo[0], mask, xctype, hermi=1),
-                        ni.eval_rho(mol, ao, dmoo[1], mask, xctype, hermi=1))
-                wv = numint._uks_gga_wv1(rho, rho2, vxc, fxc, weight)
-                gga_sum_(f1oo[0], ao, wv[0], mask)
-                gga_sum_(f1oo[1], ao, wv[1], mask)
-            if with_vxc:
-                wv = numint._uks_gga_wv0(rho, vxc, weight)
-                gga_sum_(v1ao[0], ao, wv[0], mask)
-                gga_sum_(v1ao[1], ao, wv[1], mask)
-            if with_kxc:
-                wv = numint._uks_gga_wv2(rho, rho1, fxc, kxc, weight)
-                gga_sum_(k1ao[0], ao, wv[0], mask)
-                gga_sum_(k1ao[1], ao, wv[1], mask)
-            vxc = fxc = kxc = rho = rho1 = None
-
+        fmat_, ao_deriv = tdrks_grad._gga_eval_mat_, 2
     elif xctype == 'MGGA':
-        logger.warn(mol, 'TD-MGGA gradients may be incorrect.')
-        def mgga_sum_(vmat, ao, wv, mask):
-            aow = numint._scale_ao(ao[:4], wv[:4])
-            tmp = numint._dot_ao_ao(mol, ao[0], aow, mask, shls_slice, ao_loc)
-            aow = numint._scale_ao(ao[1], wv[5], aow)
-            tmp += numint._dot_ao_ao(mol, ao[1], aow, mask, shls_slice, ao_loc)
-            aow = numint._scale_ao(ao[2], wv[5], aow)
-            tmp += numint._dot_ao_ao(mol, ao[2], aow, mask, shls_slice, ao_loc)
-            aow = numint._scale_ao(ao[3], wv[5], aow)
-            tmp += numint._dot_ao_ao(mol, ao[3], aow, mask, shls_slice, ao_loc)
-            vmat[0] += tmp + tmp.T
-
-            rks_grad._gga_grad_sum_(vmat[1:], mol, ao, wv[:4], mask, ao_loc)
-            rks_grad._tau_grad_dot_(vmat[1:], mol, ao, wv[5]*2, mask, ao_loc, True)
-
-        ao_deriv = 2
-        for ao, mask, weight, coords \
-                in ni.block_loop(mol, grids, nao, ao_deriv, max_memory):
-            rho = (ni.eval_rho2(mol, ao, mo_coeff[0], mo_occ[0], mask, xctype),
-                   ni.eval_rho2(mol, ao, mo_coeff[1], mo_occ[1], mask, xctype))
-            vxc, fxc, kxc = ni.eval_xc(xc_code, rho, 1, deriv=deriv)[1:]
-
-            rho1 = (ni.eval_rho(mol, ao, dmvo[0], mask, xctype, hermi=1),
-                    ni.eval_rho(mol, ao, dmvo[1], mask, xctype, hermi=1))
-            wv = numint._uks_mgga_wv1(rho, rho1, vxc, fxc, weight)
-            mgga_sum_(f1vo[0], ao, wv[0], mask)
-            mgga_sum_(f1vo[1], ao, wv[1], mask)
-
-            if dmoo is not None:
-                rho2 = (ni.eval_rho(mol, ao, dmoo[0], mask, xctype, hermi=1),
-                        ni.eval_rho(mol, ao, dmoo[1], mask, xctype, hermi=1))
-                wv = numint._uks_mgga_wv1(rho, rho2, vxc, fxc, weight)
-                mgga_sum_(f1oo[0], ao, wv[0], mask)
-                mgga_sum_(f1oo[1], ao, wv[1], mask)
-            if with_vxc:
-                wv = numint._uks_mgga_wv0(rho, vxc, weight)
-                mgga_sum_(v1ao[0], ao, wv[0], mask)
-                mgga_sum_(v1ao[1], ao, wv[1], mask)
-            if with_kxc:
-                wv = numint._uks_mgga_wv2(rho, rho1, fxc, kxc, weight)
-                mgga_sum_(k1ao[0], ao, wv[0], mask)
-                mgga_sum_(k1ao[1], ao, wv[1], mask)
-            vxc = fxc = kxc = rho = rho1 = None
-
-    elif xctype == 'HF':
-        pass
+        fmat_, ao_deriv = tdrks_grad._mgga_eval_mat_, 2
+        logger.warn(td_grad, 'TDUKS-MGGA Gradients may be inaccurate due to grids response')
     else:
         raise NotImplementedError(f'td-uks for functional {xc_code}')
 
+    for ao, mask, weight, coords \
+            in ni.block_loop(mol, grids, nao, ao_deriv, max_memory):
+        if xctype == 'LDA':
+            ao0 = ao[0]
+        else:
+            ao0 = ao
+        rho = (ni.eval_rho2(mol, ao0, mo_coeff[0], mo_occ[0], mask, xctype, with_lapl=False),
+               ni.eval_rho2(mol, ao0, mo_coeff[1], mo_occ[1], mask, xctype, with_lapl=False))
+        vxc, fxc, kxc = ni.eval_xc_eff(xc_code, rho, deriv, xctype=xctype)[1:]
+
+        rho1 = numpy.asarray((
+            ni.eval_rho(mol, ao0, dmvo[0], mask, xctype, hermi=1, with_lapl=False),
+            ni.eval_rho(mol, ao0, dmvo[1], mask, xctype, hermi=1, with_lapl=False)))
+        if xctype == 'LDA':
+            rho1 = rho1[:,numpy.newaxis]
+        wv = numpy.einsum('axg,axbyg,g->byg', rho1, fxc, weight)
+        fmat_(mol, f1vo[0], ao, wv[0], mask, shls_slice, ao_loc)
+        fmat_(mol, f1vo[1], ao, wv[1], mask, shls_slice, ao_loc)
+
+        if dmoo is not None:
+            rho2 = numpy.asarray((
+                ni.eval_rho(mol, ao0, dmoo[0], mask, xctype, hermi=1, with_lapl=False),
+                ni.eval_rho(mol, ao0, dmoo[1], mask, xctype, hermi=1, with_lapl=False)))
+            if xctype == 'LDA':
+                rho2 = rho2[:,numpy.newaxis]
+            wv = numpy.einsum('axg,axbyg,g->byg', rho2, fxc, weight)
+            fmat_(mol, f1oo[0], ao, wv[0], mask, shls_slice, ao_loc)
+            fmat_(mol, f1oo[1], ao, wv[1], mask, shls_slice, ao_loc)
+        if with_vxc:
+            wv = vxc * weight
+            fmat_(mol, v1ao[0], ao, wv[0], mask, shls_slice, ao_loc)
+            fmat_(mol, v1ao[1], ao, wv[1], mask, shls_slice, ao_loc)
+        if with_kxc:
+            wv = numpy.einsum('axg,byg,axbyczg,g->czg', rho1, rho1, kxc, weight)
+            fmat_(mol, k1ao[0], ao, wv[0], mask, shls_slice, ao_loc)
+            fmat_(mol, k1ao[1], ao, wv[1], mask, shls_slice, ao_loc)
+
     f1vo[:,1:] *= -1
     if f1oo is not None: f1oo[:,1:] *= -1
     if v1ao is not None: v1ao[:,1:] *= -1
diff --git a/pyscf/grad/test/test_mp2.py b/pyscf/grad/test/test_mp2.py
index 382c82af16..1d920cb772 100644
--- a/pyscf/grad/test/test_mp2.py
+++ b/pyscf/grad/test/test_mp2.py
@@ -52,6 +52,17 @@ def test_mp2_grad(self):
 # H     0.0000000000    -0.0222745046    -0.0044605683
         self.assertAlmostEqual(lib.fp(g1), -0.035681131697586257, 6)
 
+        geom1 = [
+            [8 , (0. , 0.     , 0.)],
+            [1 , (0. , -0.757 , 0.55)],
+            [1 , (0. , 0.757  , 0.54)]]
+        mol1 = gto.M(atom=geom1, basis='631g')
+        pt1 = mol1.MP2().Gradients()
+        de_ref = pt1.kernel()
+        e, de = pt.Gradients().as_scanner()(geom1)
+        self.assertAlmostEqual(pt1.base.e_tot, e, 7)
+        self.assertAlmostEqual(abs(de - de_ref).max(), 0, 5)
+
     def test_mp2_grad_finite_diff(self):
         mol = gto.M(
             verbose = 0,
@@ -146,4 +157,3 @@ def test_symmetrize(self):
 if __name__ == "__main__":
     print("Tests for MP2 gradients")
     unittest.main()
-
diff --git a/pyscf/grad/test/test_tdrhf_grad.py b/pyscf/grad/test/test_tdrhf_grad.py
index 155ed8b79f..6da2e61006 100644
--- a/pyscf/grad/test/test_tdrhf_grad.py
+++ b/pyscf/grad/test/test_tdrhf_grad.py
@@ -208,5 +208,3 @@ def test_symmetrize(self):
 if __name__ == "__main__":
     print("Full Tests for TD-RHF gradients")
     unittest.main()
-
-
diff --git a/pyscf/grad/test/test_tdrks_grad.py b/pyscf/grad/test/test_tdrks_grad.py
index b518832d77..6da0a4b965 100644
--- a/pyscf/grad/test/test_tdrks_grad.py
+++ b/pyscf/grad/test/test_tdrks_grad.py
@@ -54,18 +54,17 @@ def test_tda_singlet_lda(self):
         g1 = tdg.kernel(td.xy[2])
         self.assertAlmostEqual(g1[0,2], -9.23916667e-02, 6)
 
-    @unittest.skip('not implmented')
     def test_tda_triplet_lda(self):
         td = tdscf.TDA(mf_lda).run(singlet=False, nstates=3)
         tdg = td.nuc_grad_method()
         g1 = tdg.kernel(state=3)
-        self.assertAlmostEqual(g1[0,2], -0.3633334, 6)
+        self.assertAlmostEqual(g1[0,2], -0.3311324654, 6)
 
         td_solver = td.as_scanner()
         pmol = mol.copy()
         e1 = td_solver(pmol.set_geom_('H 0 0 1.805; F 0 0 0', unit='B'))
         e2 = td_solver(pmol.set_geom_('H 0 0 1.803; F 0 0 0', unit='B'))
-        self.assertAlmostEqual(abs((e1[2]-e2[2])/.002 - g1[0,2]).max(), 0, 5)
+        self.assertAlmostEqual(abs((e1[2]-e2[2])/.002 - g1[0,2]).max(), 0, 4)
 
     def test_tda_singlet_b88(self):
         td = tdscf.TDA(mf_gga).run(nstates=3)
@@ -93,9 +92,8 @@ def test_tda_singlet_b3lyp_xcfun(self):
         pmol = mol.copy()
         e1 = td_solver(pmol.set_geom_('H 0 0 1.805; F 0 0 0', unit='B'))
         e2 = td_solver(pmol.set_geom_('H 0 0 1.803; F 0 0 0', unit='B'))
-        self.assertAlmostEqual(abs((e1[2]-e2[2])/.002 - g1[0,2]).max(), 0, 5)
+        self.assertAlmostEqual(abs((e1[2]-e2[2])/.002 - g1[0,2]).max(), 0, 4)
 
-    @unittest.skip('not implmented')
     def test_tda_triplet_b3lyp(self):
         mf = dft.RKS(mol)
         mf.xc = 'b3lyp'
@@ -104,13 +102,13 @@ def test_tda_triplet_b3lyp(self):
         td = tdscf.TDA(mf).run(singlet=False, nstates=3)
         tdg = td.nuc_grad_method()
         g1 = tdg.kernel(state=3)
-        self.assertAlmostEqual(g1[0,2], -0.3633334, 6)
+        self.assertAlmostEqual(g1[0,2], -0.36333834, 6)
 
         td_solver = td.as_scanner()
         pmol = mol.copy()
         e1 = td_solver(pmol.set_geom_('H 0 0 1.805; F 0 0 0', unit='B'))
         e2 = td_solver(pmol.set_geom_('H 0 0 1.803; F 0 0 0', unit='B'))
-        self.assertAlmostEqual(abs((e1[2]-e2[2])/.002 - g1[0,2]).max(), 0, 5)
+        self.assertAlmostEqual(abs((e1[2]-e2[2])/.002 - g1[0,2]).max(), 0, 4)
 
     def test_tda_singlet_mgga(self):
         mf = dft.RKS(mol)
@@ -126,7 +124,7 @@ def test_tda_singlet_mgga(self):
         pmol = mol.copy()
         e1 = td_solver(pmol.set_geom_('H 0 0 1.805; F 0 0 0', unit='B'))
         e2 = td_solver(pmol.set_geom_('H 0 0 1.803; F 0 0 0', unit='B'))
-        # FIXME: why the error is larger than 1e-4?
+        # FIXME: why the error is larger than 1e-4? Issue of grids response?
         self.assertAlmostEqual(abs((e1[2]-e2[2])/.002 - g1[0,2]).max(), 0, 3)
 
     def test_tddft_lda(self):
diff --git a/pyscf/grad/test/test_tduks_grad.py b/pyscf/grad/test/test_tduks_grad.py
index 90120d57e5..9708ab6e76 100644
--- a/pyscf/grad/test/test_tduks_grad.py
+++ b/pyscf/grad/test/test_tduks_grad.py
@@ -48,82 +48,82 @@ def tearDownModule():
     del mol, pmol, mf_lda, mf_gga
 
 class KnownValues(unittest.TestCase):
-#    def test_tda_lda(self):
-#        td = tdscf.TDA(mf_lda).run(nstates=3)
-#        tdg = td.nuc_grad_method()
-#        g1 = tdg.kernel(td.xy[2])
-#        self.assertAlmostEqual(g1[0,2], -0.40279473514282405, 6)
-#
-#        td_solver = td.as_scanner()
-#        e1 = td_solver(pmol.set_geom_('H 0 0 1.805; F 0 0 0', unit='B'))
-#        e2 = td_solver(pmol.set_geom_('H 0 0 1.803; F 0 0 0', unit='B'))
-#        self.assertAlmostEqual((e1[2]-e2[2])/.002, g1[0,2], 5)
-#
-#    def test_tda_b88(self):
-#        td = tdscf.TDA(mf_gga).run(nstates=3)
-#        tdg = td.nuc_grad_method()
-#        g1 = tdg.kernel(state=3)
-#        self.assertAlmostEqual(g1[0,2], -0.8120037135120326, 6)
-#
-#        td_solver = td.as_scanner()
-#        e1 = td_solver(pmol.set_geom_('H 0 0 1.805; F 0 0 0', unit='B'))
-#        e2 = td_solver(pmol.set_geom_('H 0 0 1.803; F 0 0 0', unit='B'))
-#        self.assertAlmostEqual((e1[2]-e2[2])/.002, g1[0,2], 5)
-#
-#    def test_tddft_lda(self):
-#        td = tdscf.TDDFT(mf_lda).run(nstates=3)
-#        tdg = td.nuc_grad_method()
-#        g1 = tdg.kernel(state=3)
-#        self.assertAlmostEqual(g1[0,2], -0.39791714992157035, 6)
-#
-#        td_solver = td.as_scanner()
-#        e1 = td_solver(pmol.set_geom_('H 0 0 1.805; F 0 0 0', unit='B'))
-#        e2 = td_solver(pmol.set_geom_('H 0 0 1.803; F 0 0 0', unit='B'))
-#        self.assertAlmostEqual((e1[2]-e2[2])/.002, g1[0,2], 5)
-#
-#    @unittest.skip('has bug')
-#    def test_tda_mgga(self):
-#        mf = dft.UKS(mol)
-#        mf.xc = 'm06l'
-#        mf.conv_tol = 1e-12
-#        mf.kernel()
-#        td = mf.TDA().run(nstates=3)
-#        tdg = td.Gradients()
-#        g1 = tdg.kernel(state=2)
-#        self.assertAlmostEqual(g1[0,2], -0.31324464083043635, 4)
-#
-#        td_solver = td.as_scanner()
-#        pmol = mol.copy()
-#        e1 = td_solver(pmol.set_geom_('H 0 0 1.805; F 0 0 0', unit='B'))
-#        e2 = td_solver(pmol.set_geom_('H 0 0 1.803; F 0 0 0', unit='B'))
-#        self.assertAlmostEqual(abs((e1[2]-e2[2])/.002 - g1[0,2]).max(), 0, 4)
-#        self.assertAlmostEqual(abs((e1[2]-e2[2])/.002 - g1[1,2]).max(), 0, 4)
-#
-#    def test_tddft_b3lyp(self):
-#        mf = dft.UKS(mol).set(conv_tol=1e-12)
-#        mf.xc = '.2*HF + .8*b88, vwn'
-#        mf.scf()
-#        td = tdscf.TDDFT(mf).run(nstates=3)
-#        tdg = td.nuc_grad_method()
-#        g1 = tdg.kernel(state=3)
-#        self.assertAlmostEqual(g1[0,2], -0.80446691153291727, 6)
-#
-#        td_solver = td.as_scanner()
-#        e1 = td_solver(pmol.set_geom_('H 0 0 1.805; F 0 0 0', unit='B'))
-#        e2 = td_solver(pmol.set_geom_('H 0 0 1.803; F 0 0 0', unit='B'))
-#        self.assertAlmostEqual((e1[2]-e2[2])/.002, g1[0,2], 4)
-#
-#    def test_range_separated(self):
-#        mol = gto.M(atom="H; H 1 1.", basis='631g', verbose=0)
-#        mf = dft.UKS(mol).set(xc='CAMB3LYP')
-#        td = mf.apply(tdscf.TDA)
-#        tdg_scanner = td.nuc_grad_method().as_scanner()
-#        g = tdg_scanner(mol, state=3)[1]
-#        self.assertAlmostEqual(lib.fp(g), -0.46656653988919661, 6)
-#        smf = td.as_scanner()
-#        e1 = smf(mol.set_geom_("H; H 1 1.001"))[2]
-#        e2 = smf(mol.set_geom_("H; H 1 0.999"))[2]
-#        self.assertAlmostEqual((e1-e2)/0.002*lib.param.BOHR, g[1,0], 4)
+    def test_tda_lda(self):
+        td = tdscf.TDA(mf_lda).run(nstates=3)
+        tdg = td.nuc_grad_method()
+        g1 = tdg.kernel(td.xy[2])
+        self.assertAlmostEqual(g1[0,2], -0.40279473514282405, 6)
+
+        td_solver = td.as_scanner()
+        e1 = td_solver(pmol.set_geom_('H 0 0 1.805; F 0 0 0', unit='B'))
+        e2 = td_solver(pmol.set_geom_('H 0 0 1.803; F 0 0 0', unit='B'))
+        self.assertAlmostEqual((e1[2]-e2[2])/.002, g1[0,2], 4)
+
+    def test_tda_b88(self):
+        td = tdscf.TDA(mf_gga).run(nstates=3)
+        tdg = td.nuc_grad_method()
+        g1 = tdg.kernel(state=3)
+        self.assertAlmostEqual(g1[0,2], -0.8120037135120326, 6)
+
+        td_solver = td.as_scanner()
+        e1 = td_solver(pmol.set_geom_('H 0 0 1.805; F 0 0 0', unit='B'))
+        e2 = td_solver(pmol.set_geom_('H 0 0 1.803; F 0 0 0', unit='B'))
+        self.assertAlmostEqual((e1[2]-e2[2])/.002, g1[0,2], 4)
+
+    def test_tddft_lda(self):
+        td = tdscf.TDDFT(mf_lda).run(nstates=3)
+        tdg = td.nuc_grad_method()
+        g1 = tdg.kernel(state=3)
+        self.assertAlmostEqual(g1[0,2], -0.39791714992157035, 6)
+
+        td_solver = td.as_scanner()
+        e1 = td_solver(pmol.set_geom_('H 0 0 1.805; F 0 0 0', unit='B'))
+        e2 = td_solver(pmol.set_geom_('H 0 0 1.803; F 0 0 0', unit='B'))
+        self.assertAlmostEqual((e1[2]-e2[2])/.002, g1[0,2], 4)
+
+    @unittest.skip('tduks-mgga has large error due to grids response')
+    def test_tda_mgga(self):
+        mf = dft.UKS(mol)
+        mf.xc = 'm06l'
+        mf.conv_tol = 1e-12
+        mf.kernel()
+        td = mf.TDA().run(nstates=3)
+        tdg = td.Gradients()
+        g1 = tdg.kernel(state=2)
+        self.assertAlmostEqual(g1[0,2], -0.31324464083043635, 4)
+
+        td_solver = td.as_scanner()
+        pmol = mol.copy()
+        e1 = td_solver(pmol.set_geom_('H 0 0 1.805; F 0 0 0', unit='B'))
+        e2 = td_solver(pmol.set_geom_('H 0 0 1.803; F 0 0 0', unit='B'))
+        self.assertAlmostEqual(abs((e1[2]-e2[2])/.002 - g1[0,2]).max(), 0, 4)
+        self.assertAlmostEqual(abs((e1[2]-e2[2])/.002 - g1[1,2]).max(), 0, 4)
+
+    def test_tddft_b3lyp(self):
+        mf = dft.UKS(mol).set(conv_tol=1e-12)
+        mf.xc = '.2*HF + .8*b88, vwn'
+        mf.scf()
+        td = tdscf.TDDFT(mf).run(nstates=3)
+        tdg = td.nuc_grad_method()
+        g1 = tdg.kernel(state=3)
+        self.assertAlmostEqual(g1[0,2], -0.80446691153291727, 6)
+
+        td_solver = td.as_scanner()
+        e1 = td_solver(pmol.set_geom_('H 0 0 1.805; F 0 0 0', unit='B'))
+        e2 = td_solver(pmol.set_geom_('H 0 0 1.803; F 0 0 0', unit='B'))
+        self.assertAlmostEqual((e1[2]-e2[2])/.002, g1[0,2], 4)
+
+    def test_range_separated(self):
+        mol = gto.M(atom="H; H 1 1.", basis='631g', verbose=0)
+        mf = dft.UKS(mol).set(xc='CAMB3LYP')
+        td = mf.apply(tdscf.TDA)
+        tdg_scanner = td.nuc_grad_method().as_scanner()
+        g = tdg_scanner(mol, state=3)[1]
+        self.assertAlmostEqual(lib.fp(g), -0.46656653988919661, 6)
+        smf = td.as_scanner()
+        e1 = smf(mol.set_geom_("H; H 1 1.001"))[2]
+        e2 = smf(mol.set_geom_("H; H 1 0.999"))[2]
+        self.assertAlmostEqual((e1-e2)/0.002*lib.param.BOHR, g[1,0], 4)
 
     def test_custom_xc(self):
         mol = gto.Mole()
diff --git a/pyscf/grad/uks.py b/pyscf/grad/uks.py
index 105ca68313..cdc586823f 100644
--- a/pyscf/grad/uks.py
+++ b/pyscf/grad/uks.py
@@ -41,57 +41,49 @@ def get_veff(ks_grad, mol=None, dm=None):
 
     mf = ks_grad.base
     ni = mf._numint
-    if ks_grad.grids is not None:
-        grids = ks_grad.grids
-    else:
-        grids = mf.grids
-    if mf.nlc != '':
-        if ks_grad.nlcgrids is not None:
-            nlcgrids = ks_grad.nlcgrids
-        else:
-            nlcgrids = mf.nlcgrids
-        if nlcgrids.coords is None:
-            nlcgrids.build(with_non0tab=True)
-    if grids.coords is None:
-        grids.build(with_non0tab=True)
-
-    #enabling range-separated hybrids
-    omega, alpha, hyb = ni.rsh_and_hybrid_coeff(mf.xc, spin=mol.spin)
+    grids, nlcgrids = rks_grad._initialize_grids(ks_grad)
 
+    ni = mf._numint
     mem_now = lib.current_memory()[0]
     max_memory = max(2000, ks_grad.max_memory*.9-mem_now)
     if ks_grad.grid_response:
         exc, vxc = get_vxc_full_response(ni, mol, grids, mf.xc, dm,
                                          max_memory=max_memory,
                                          verbose=ks_grad.verbose)
-        logger.debug1(ks_grad, 'sum(grids response) %s', exc.sum(axis=0))
-        if mf.nlc:
-            assert 'VV10' in mf.nlc.upper()
-            enlc, vnlc = rks_grad.get_vxc_full_response(
-                ni, mol, nlcgrids, mf.xc+'__'+mf.nlc, dm[0]+dm[1],
+        if mf.nlc or ni.libxc.is_nlc(mf.xc):
+            if ni.libxc.is_nlc(mf.xc):
+                xc = mf.xc
+            else:
+                xc = mf.nlc
+            enlc, vnlc = rks_grad.get_nlc_vxc_full_response(
+                ni, mol, nlcgrids, xc, dm[0]+dm[1],
                 max_memory=max_memory, verbose=ks_grad.verbose)
             exc += enlc
             vxc += vnlc
+        logger.debug1(ks_grad, 'sum(grids response) %s', exc.sum(axis=0))
     else:
         exc, vxc = get_vxc(ni, mol, grids, mf.xc, dm,
                            max_memory=max_memory, verbose=ks_grad.verbose)
-        if mf.nlc:
-            assert 'VV10' in mf.nlc.upper()
-            enlc, vnlc = rks_grad.get_vxc(ni, mol, nlcgrids, mf.xc+'__'+mf.nlc,
-                                          dm[0]+dm[1], max_memory=max_memory,
-                                          verbose=ks_grad.verbose)
+        if mf.nlc or ni.libxc.is_nlc(mf.xc):
+            if ni.libxc.is_nlc(mf.xc):
+                xc = mf.xc
+            else:
+                xc = mf.nlc
+            enlc, vnlc = rks_grad.get_nlc_vxc(
+                ni, mol, nlcgrids, xc, dm[0]+dm[1],
+                max_memory=max_memory, verbose=ks_grad.verbose)
             vxc += vnlc
     t0 = logger.timer(ks_grad, 'vxc', *t0)
 
-    if abs(hyb) < 1e-10:
+    if not ni.libxc.is_hybrid_xc(mf.xc):
         vj = ks_grad.get_j(mol, dm)
         vxc += vj[0] + vj[1]
     else:
+        omega, alpha, hyb = ni.rsh_and_hybrid_coeff(mf.xc, spin=mol.spin)
         vj, vk = ks_grad.get_jk(mol, dm)
         vk *= hyb
-        if abs(omega) > 1e-10:  # For range separated Coulomb operator
-            with mol.with_range_coulomb(omega):
-                vk += ks_grad.get_k(mol, dm) * (alpha - hyb)
+        if omega != 0:
+            vk += ks_grad.get_k(mol, dm, omega=omega) * (alpha - hyb)
         vxc += vj[0] + vj[1] - vk
 
     return lib.tag_array(vxc, exc1_grid=exc)
diff --git a/pyscf/gto/basis/__init__.py b/pyscf/gto/basis/__init__.py
index 07fae5e2ee..1601b6e03f 100644
--- a/pyscf/gto/basis/__init__.py
+++ b/pyscf/gto/basis/__init__.py
@@ -513,13 +513,30 @@ def load(filename_or_basisname, symb, optimize=OPTIMIZE_CONTRACTION):
     if not (name in ALIAS or _is_pople_basis(name)):
         try:
             return parse_nwchem.parse(filename_or_basisname, symb)
-        except BasisNotFoundError:
-            try:
-                return parse_nwchem.parse(filename_or_basisname)
-            except IndexError:
-                raise BasisNotFoundError('Invalid basis name %s' % filename_or_basisname)
         except IndexError:
             raise BasisNotFoundError(filename_or_basisname)
+        except BasisNotFoundError as basis_err:
+            pass
+
+        try:
+            return parse_nwchem.parse(filename_or_basisname)
+        except IndexError:
+            raise BasisNotFoundError('Invalid basis name %s' % filename_or_basisname)
+        except BasisNotFoundError:
+            pass
+
+        # Last, a trial to access Basis Set Exchange database
+        from pyscf.basis import bse
+        if bse.basis_set_exchange is not None:
+            try:
+                bse_obj = bse.basis_set_exchange.api.get_basis(
+                    filename_or_basisname, elements=symb)
+            except KeyError:
+                raise BasisNotFoundError(filename_or_basisname)
+            else:
+                return bse._orbital_basis(bse_obj)[0]
+
+        raise basis_err
 
     if name in ALIAS:
         basmod = ALIAS[name]
@@ -565,8 +582,33 @@ def load_ecp(filename_or_basisname, symb):
     if name in ALIAS:
         basmod = ALIAS[name]
         return parse_nwchem.load_ecp(join(_BASIS_DIR, basmod), symb)
-    else:
+
+    try:
         return parse_ecp(filename_or_basisname, symb)
+    except IndexError:
+        raise BasisNotFoundError(filename_or_basisname)
+    except BasisNotFoundError as basis_err:
+        pass
+
+    try:
+        return parse_nwchem.parse_ecp(filename_or_basisname)
+    except IndexError:
+        raise BasisNotFoundError('Invalid basis name %s' % filename_or_basisname)
+    except BasisNotFoundError:
+        pass
+
+    # Last, a trial to access Basis Set Exchange database
+    from pyscf.basis import bse
+    if bse.basis_set_exchange is not None:
+        try:
+            bse_obj = bse.basis_set_exchange.api.get_basis(
+                filename_or_basisname, elements=symb)
+        except KeyError:
+            raise BasisNotFoundError(filename_or_basisname)
+        else:
+            return bse._ecp_basis(bse_obj)[0]
+
+    raise basis_err
 
 def _format_basis_name(basisname):
     return basisname.lower().replace('-', '').replace('_', '').replace(' ', '')
diff --git a/pyscf/gto/basis/bse.py b/pyscf/gto/basis/bse.py
new file mode 100644
index 0000000000..b9b6e9561e
--- /dev/null
+++ b/pyscf/gto/basis/bse.py
@@ -0,0 +1,139 @@
+'''
+Conversion from the Basis Set Exchange format to PySCF format
+
+17 Nov 2021 Susi Lehtola
+'''
+
+try:
+    from basis_set_exchange import lut, manip, sort
+except ImportError:
+    basis_set_exchange = None
+
+
+def _orbital_basis(basis):
+    '''Extracts the orbital basis from the BSE format in PySCF format'''
+
+    r = {}
+
+    basis = manip.make_general(basis, False, True)
+    basis = sort.sort_basis(basis, False)
+
+    # Elements for which we have electron basis
+    electron_elements = [k for k, v in basis['elements'].items() if 'electron_shells' in v]
+
+    # List of references in the used basis
+    reference_list = []
+
+    # Electron Basis
+    if electron_elements:
+        for z in electron_elements:
+            data = basis['elements'][z]
+
+            sym = lut.element_sym_from_Z(z, True)
+
+            # List of shells
+            atom_shells = []
+            for shell in data['electron_shells']:
+                exponents = shell['exponents']
+                coefficients = shell['coefficients']
+                ncontr = len(coefficients)
+                nprim = len(exponents)
+                am = shell['angular_momentum']
+                assert len(am) == 1
+
+                shell_data = [am[0]]
+                for iprim in range(nprim):
+                    row = [float(coefficients[ic][iprim]) for ic in range(ncontr)]
+                    row.insert(0, float(exponents[iprim]))
+                    shell_data.append(row)
+                atom_shells.append(shell_data)
+            r[sym] = atom_shells
+
+            # Collect the literature references
+            for ref in data['references']:
+                for key in ref['reference_keys']:
+                    if key not in reference_list:
+                        reference_list.append(key)
+
+    return r, reference_list
+
+
+def _ecp_basis(basis):
+    '''Extracts the ECP from the BSE format in PySCF format'''
+
+    r = {}
+
+    basis = manip.make_general(basis, False, True)
+    basis = sort.sort_basis(basis, False)
+
+    # Elements for which we have ECP
+    ecp_elements = [k for k, v in basis['elements'].items() if 'ecp_potentials' in v]
+
+    # Electron Basis
+    if ecp_elements:
+        for z in ecp_elements:
+            data = basis['elements'][z]
+            sym = lut.element_sym_from_Z(z, True)
+
+            # Sort lowest->highest
+            ecp_list = sorted(data['ecp_potentials'], key=lambda x: x['angular_momentum'])
+
+            # List of ECP
+            atom_ecp = [data['ecp_electrons'], []]
+            for ir, pot in enumerate(ecp_list):
+                rexponents = pot['r_exponents']
+                gexponents = pot['gaussian_exponents']
+                coefficients = pot['coefficients']
+                am = pot['angular_momentum']
+                nprim = len(rexponents)
+
+                shell_data = [am[0], []]
+                # PySCF wants the data in order of rexp=0, 1, 2, ..
+                for rexpval in range(max(rexponents) + 1):
+                    rcontr = []
+                    for i in range(nprim):
+                        if rexponents[i] == rexpval:
+                            rcontr.append([float(gexponents[i]), float(coefficients[0][i])])
+                    shell_data[1].append(rcontr)
+                atom_ecp[1].append(shell_data)
+            r[sym] = atom_ecp
+
+    return r
+
+def _print_basis_information(basis):
+    name = basis['name']
+    version = basis['version']
+    revision_description = basis['revision_description']
+    revision_date = basis['revision_date']
+    print('{} basis set, version {}'.format(name, version))
+    print('Last revised on {}'.format(revision_date))
+    print('Revision description: {}'.format(revision_description))
+
+if __name__ == '__main__':
+    from basis_set_exchange import api, references
+
+    # Get reference data
+    reference_data = api.get_reference_data()
+    #print(references)
+
+    o631gbas = api.get_basis('6-31g', elements='O')
+    #print('O 6-31G basis, BSE format\n{}'.format(o631gbas))
+    _print_basis_information(o631gbas)
+    o631gorb, o631gref = _orbital_basis(o631gbas)
+    print('O 6-31G orbital basis, PySCF format\n{}'.format(o631gorb))
+    print('Literature references')
+    for ref in o631gref:
+        print(references.reference_text(ref, reference_data[ref]))
+    print('')
+
+    nalanl2dzbas = api.get_basis('lanl2dz', elements='Na')
+    #print('Na LANL2DZ basis, BSE format\n{}'.format(nalanl2dzbas))
+    _print_basis_information(nalanl2dzbas)
+    nalanl2dzorb, nalanl2dzref = _orbital_basis(nalanl2dzbas)
+    print('Na LANL2DZ orbital basis, PySCF format\n{}'.format(nalanl2dzorb))
+    nalanl2dzecp = _ecp_basis(nalanl2dzbas)
+    print('Na LANL2DZ ECP basis, PySCF format\n{}'.format(nalanl2dzecp))
+    print('Literature references')
+    for ref in nalanl2dzref:
+        print(references.reference_text(ref, reference_data[ref]))
+    print('')
diff --git a/pyscf/gto/basis/parse_gaussian.py b/pyscf/gto/basis/parse_gaussian.py
index 1ec58f814a..6d1e305b39 100644
--- a/pyscf/gto/basis/parse_gaussian.py
+++ b/pyscf/gto/basis/parse_gaussian.py
@@ -28,6 +28,7 @@
 except ImportError:
     optimize_contraction = lambda basis: basis
     remove_zero = lambda basis: basis
+from pyscf.lib.exceptions import BasisNotFoundError
 
 MAXL = 12
 SPDF = 'SPDFGHIJKLMN'
@@ -109,6 +110,8 @@ def _parse(raw_basis, optimize=True):
     basis_sorted = []
     for l in range(MAXL):
         basis_sorted.extend([b for b in basis_add if b[0] == l])
+    if not basis_sorted:
+        raise BasisNotFoundError(f'Basis data not found in "{raw_basis}"')
 
     if optimize:
         basis_sorted = optimize_contraction(basis_sorted)
diff --git a/pyscf/gto/basis/parse_nwchem.py b/pyscf/gto/basis/parse_nwchem.py
index 47a8f31c03..b741e89468 100644
--- a/pyscf/gto/basis/parse_nwchem.py
+++ b/pyscf/gto/basis/parse_nwchem.py
@@ -136,6 +136,8 @@ def _parse(raw_basis, optimize=True):
             else:
                 current_basis.append(dat)
     basis_sorted = [b for bs in basis_parsed for b in bs]
+    if not basis_sorted:
+        raise BasisNotFoundError(f'Basis data not found in "{raw_basis}"')
 
     if optimize:
         basis_sorted = optimize_contraction(basis_sorted)
@@ -205,11 +207,13 @@ def _parse_ecp(raw_ecp):
 
     if nelec is None:
         return []
-    else:
-        bsort = []
-        for l in range(-1, MAXL):
-            bsort.extend([b for b in ecp_add if b[0] == l])
-        return [nelec, bsort]
+
+    bsort = []
+    for l in range(-1, MAXL):
+        bsort.extend([b for b in ecp_add if b[0] == l])
+    if not bsort:
+        raise BasisNotFoundError(f'ECP data not found in "{raw_ecp}"')
+    return [nelec, bsort]
 
 def load_ecp(basisfile, symb):
     return _parse_ecp(search_ecp(basisfile, symb))
diff --git a/pyscf/gto/ecp.py b/pyscf/gto/ecp.py
index 7683edde31..c30465299e 100644
--- a/pyscf/gto/ecp.py
+++ b/pyscf/gto/ecp.py
@@ -37,6 +37,7 @@
 import numpy
 from pyscf import lib
 from pyscf.gto import moleintor
+from pyscf.data.elements import ELEMENTS
 
 libecp = moleintor.libcgto
 libecp.ECPscalar_cache_size.restype = ctypes.c_int
@@ -124,7 +125,7 @@ def so_by_shell(mol, shls):
        cache.ctypes.data_as(ctypes.c_void_p))
     return buf
 
-def core_configuration(nelec_core):
+def core_configuration(nelec_core, atom_symbol=None):
     conf_dic = {
         0 : '0s0p0d0f',
         2 : '1s0p0d0f',
@@ -139,6 +140,16 @@ def core_configuration(nelec_core):
         78: '5s4p3d1f',
         92: '5s4p3d2f',
     }
+    # Core configurations for f-in-core ECPs defined in the following references
+    # 10.1007/BF00528565 , 10.1007/s00214-005-0629-0 , 10.1007/s00214-009-0584-2
+    elements_4f = ELEMENTS[57:71]
+    elements_5f = ELEMENTS[89:103]
+    if atom_symbol in elements_4f:
+        for i in range(46, 60):
+            conf_dic[i] = '4s3p2d1f'
+    if atom_symbol in elements_5f:
+        for i in range(78, 92):
+            conf_dic[i] = '5s4p3d2f'
     if nelec_core not in conf_dic:
         raise RuntimeError('Core configuration for %d core electrons is not available.' % nelec_core)
     coreshell = [int(x) for x in conf_dic[nelec_core][::2]]
diff --git a/pyscf/gto/mole.py b/pyscf/gto/mole.py
index fd6af2f529..1017fa80fe 100644
--- a/pyscf/gto/mole.py
+++ b/pyscf/gto/mole.py
@@ -916,11 +916,15 @@ def make_atm_env(atom, ptr=0, nuclear_model=NUC_POINT, nucprop={}):
 def make_bas_env(basis_add, atom_id=0, ptr=0):
     '''Convert :attr:`Mole.basis` to the argument ``bas`` for ``libcint`` integrals
     '''
+    # First sort basis accroding to l. This is important for method
+    # decontract_basis, which assumes that basis functions with the same angular
+    # momentum are grouped together
+    basis_add = [b for b in basis_add if b]
+    basis_add = sorted(basis_add, key=lambda b: b[0])
+
     _bas = []
     _env = []
     for b in basis_add:
-        if not b:  # == []
-            continue
         angl = b[0]
         if angl > 14:
             sys.stderr.write('Warning: integral library does not support basis '
@@ -1516,7 +1520,7 @@ def sph_labels(mol, fmt=True, base=BASE):
         if nelec_ecp == 0 or l > 3:
             shl_start = count[ia,l]+l+1
         else:
-            coreshl = core_configuration(nelec_ecp)
+            coreshl = core_configuration(nelec_ecp, atom_symbol=_std_symbol(symb))
             shl_start = coreshl[l]+count[ia,l]+l+1
         count[ia,l] += nc
         for n in range(shl_start, shl_start+nc):
@@ -1566,7 +1570,7 @@ def cart_labels(mol, fmt=True, base=BASE):
         if nelec_ecp == 0 or l > 3:
             shl_start = count[ia,l]+l+1
         else:
-            coreshl = core_configuration(nelec_ecp)
+            coreshl = core_configuration(nelec_ecp, atom_symbol=_std_symbol(symb))
             shl_start = coreshl[l]+count[ia,l]+l+1
         count[ia,l] += nc
         ncart = (l + 1) * (l + 2) // 2
@@ -1616,7 +1620,7 @@ def spinor_labels(mol, fmt=True, base=BASE):
         if nelec_ecp == 0 or l > 3:
             shl_start = count[ia,l]+l+1
         else:
-            coreshl = core_configuration(nelec_ecp)
+            coreshl = core_configuration(nelec_ecp, atom_symbol=_std_symbol(symb))
             shl_start = coreshl[l]+count[ia,l]+l+1
         count[ia,l] += nc
         for n in range(shl_start, shl_start+nc):
diff --git a/pyscf/gto/moleintor.py b/pyscf/gto/moleintor.py
index 6b70b385cd..2c518e63ed 100644
--- a/pyscf/gto/moleintor.py
+++ b/pyscf/gto/moleintor.py
@@ -723,6 +723,8 @@ def num_cgto_of(basid):
         l = bas[shls[2],ANG_OF]
         if intor_name.endswith('_ssc'): # mixed spherical-cartesian
             dk = (l+1)*(l+2)//2 * bas[shls[2],NCTR_OF]
+        elif intor_name.endswith('_cart'):
+            dk = (l+1)*(l+2)//2 * bas[shls[2],NCTR_OF]
         else:
             dk = (l*2+1) * bas[shls[2],NCTR_OF]
         buf = numpy.empty((di,dj,dk,comp), dtype, order='F')
diff --git a/pyscf/gto/test/test_basis_parser.py b/pyscf/gto/test/test_basis_parser.py
index 0d800daa70..d886e252e5 100644
--- a/pyscf/gto/test/test_basis_parser.py
+++ b/pyscf/gto/test/test_basis_parser.py
@@ -36,9 +36,8 @@ def test_parse_pople(self):
         self.assertRaises(KeyError, gto.basis._parse_pople_basis, '631g++', 'C')
 
     def test_basis_load(self):
-        self.assertEqual(gto.basis.load(__file__, 'H'), [])
+        self.assertRaises(BasisNotFoundError, gto.basis.load, __file__, 'H')
         self.assertRaises(BasisNotFoundError, gto.basis.load, 'abas', 'H')
-        #self.assertRaises(BasisNotFoundError, gto.basis.load(__file__, 'C'), [])
 
         self.assertEqual(len(gto.basis.load('631++g**', 'C')), 8)
         self.assertEqual(len(gto.basis.load('ccpcvdz', 'C')), 7)
diff --git a/pyscf/gto/test/test_ecp.py b/pyscf/gto/test/test_ecp.py
index 44b40148a2..f35d0e1f9e 100644
--- a/pyscf/gto/test/test_ecp.py
+++ b/pyscf/gto/test/test_ecp.py
@@ -325,6 +325,60 @@ def test_ecp_hessian1(self):
         mat2 = mol.set_geom_('Na, 0.00, 0.00, 0.00; Cl, 0.00, 0.00, 2.051').intor('ECPscalar_ipnuc')
         self.assertAlmostEqual(abs(mat0.reshape(3,3,nao,nao)[:,2] - (mat2 - mat1) / 0.002).max(), 0, 5)
 
+    def test_ecp_f_in_core(self):
+        mol = gto.M(atom='Eu1, 0.00, 0.00, 0.00',
+                    basis={'Eu': gto.basis.parse('''
+Eu    S
+    0.749719700E+01   -0.288775043E+00
+    0.617255600E+01    0.708008105E+00
+    0.260816600E+01   -0.136569920E+01
+Eu    S
+    0.530389000E+00    0.100000000E+01
+Eu    S
+    0.254033000E+00    0.100000000E+01
+Eu    S
+    0.522020000E-01    0.100000000E+01
+Eu    S
+    0.221100000E-01    0.100000000E+01
+Eu    P
+    0.399434200E+01    0.110821693E+01
+    0.350361700E+01   -0.152518191E+01
+    0.722399000E+00    0.119866293E+01
+Eu    P
+    0.324354000E+00    0.100000000E+01
+Eu    P
+    0.127842000E+00    0.100000000E+01
+Eu    P
+    0.330280000E-01    0.100000000E+01
+Eu    D
+    0.206170800E+01   -0.127297005E+00
+    0.967971000E+00    0.377785014E+00
+    0.369101000E+00    0.765795028E+00
+Eu    D
+    0.128958000E+00    0.100000000E+01
+Eu    D
+    0.419270000E-01    0.100000000E+01
+                    ''')},
+                    ecp={'Eu': gto.basis.parse_ecp('''
+Eu nelec  53
+Eu ul
+2      1.0000000000        0.0000000000
+Eu S
+2      5.1852000000      172.7978960000
+2      2.5926000000      -10.0922600000
+Eu P
+2      4.3588000000      111.3150270000
+2      2.1794000000       -3.4025580000
+Eu D
+2      2.8902000000       41.8677290000
+2      1.4451000000       -1.2874330000
+Eu F
+2      5.3988000000      -63.6010500000
+                    ''')}, charge=2, verbose=0)
+        mf = scf.RHF(mol)
+        self.assertEqual(mol.ao_labels()[0], '0 Eu1 5s    ')
+        self.assertAlmostEqual(lib.fp(mf.get_hcore()), 22.59028455662168)
+
 
 if __name__ == '__main__':
     print("Full Tests for ECP")
diff --git a/pyscf/gto/test/test_mole.py b/pyscf/gto/test/test_mole.py
index c42151c697..294233e168 100644
--- a/pyscf/gto/test/test_mole.py
+++ b/pyscf/gto/test/test_mole.py
@@ -1034,6 +1034,13 @@ def test_decontract_basis(self):
         s = ctr_coeff.T.dot(pmol.intor('int1e_ovlp')).dot(ctr_coeff)
         self.assertAlmostEqual(abs(s - mol.intor('int1e_ovlp')).max(), 0, 12)
 
+        mol = gto.M(atom='He',
+                    basis=('ccpvdz', [[0, [5, 1]], [1, [3, 1]]]))
+        pmol, contr_coeff = mol.decontract_basis()
+        contr_coeff = scipy.linalg.block_diag(*contr_coeff)
+        s = contr_coeff.T.dot(pmol.intor('int1e_ovlp')).dot(contr_coeff)
+        self.assertAlmostEqual(abs(s - mol.intor('int1e_ovlp')).max(), 0, 12)
+
     def test_ao_rotation_matrix(self):
         mol = gto.M(atom='O 0 0 0.2; H1 0 -.8 -.5; H2 0 .8 -.5', basis='ccpvdz')
         numpy.random.seed(1)
diff --git a/pyscf/gw/rpa.py b/pyscf/gw/rpa.py
index e83c795cfc..7a157f40bc 100755
--- a/pyscf/gw/rpa.py
+++ b/pyscf/gw/rpa.py
@@ -39,14 +39,14 @@
 # core routines, kernel, rpa_ecorr, rho_response
 # ****************************************************************************
 
-def kernel(rpa, mo_energy, mo_coeff, Lpq=None, nw=None, verbose=logger.NOTE):
+def kernel(rpa, mo_energy, mo_coeff, Lpq=None, nw=40, x0=0.5, verbose=logger.NOTE):
     """
     RPA correlation and total energy
 
     Args:
         Lpq : density fitting 3-center integral in MO basis.
         nw : number of frequency point on imaginary axis.
-        vhf_df : using density fitting integral to compute HF exchange.
+        x0: scaling factor for frequency grid.
 
     Returns:
         e_tot : RPA total energy
@@ -63,7 +63,7 @@ def kernel(rpa, mo_energy, mo_coeff, Lpq=None, nw=None, verbose=logger.NOTE):
         Lpq = rpa.ao2mo(mo_coeff)
 
     # Grids for integration on imaginary axis
-    freqs, wts = _get_scaled_legendre_roots(nw)
+    freqs, wts = _get_scaled_legendre_roots(nw, x0)
 
     # Compute HF exchange energy (EXX)
     dm = mf.make_rdm1()
@@ -222,13 +222,14 @@ def nmo(self, n):
     get_nmo = get_nmo
     get_frozen_mask = get_frozen_mask
 
-    def kernel(self, mo_energy=None, mo_coeff=None, Lpq=None, nw=40):
+    def kernel(self, mo_energy=None, mo_coeff=None, Lpq=None, nw=40, x0=0.5):
         """
         Args:
             mo_energy : 1D array (nmo), mean-field mo energy
             mo_coeff : 2D array (nmo, nmo), mean-field mo coefficient
             Lpq : 3D array (naux, nmo, nmo), 3-index ERI
             nw: interger, grid number
+            x0: real, scaling factor for frequency grid
 
         Returns:
             self.e_tot : RPA total eenrgy
@@ -243,7 +244,7 @@ def kernel(self, mo_energy=None, mo_coeff=None, Lpq=None, nw=40):
         cput0 = (logger.process_clock(), logger.perf_counter())
         self.dump_flags()
         self.e_tot, self.e_hf, self.e_corr = \
-                        kernel(self, mo_energy, mo_coeff, Lpq=Lpq, nw=nw, verbose=self.verbose)
+                        kernel(self, mo_energy, mo_coeff, Lpq=Lpq, nw=nw, x0=x0, verbose=self.verbose)
 
         logger.timer(self, 'RPA', *cput0)
         return self.e_corr
diff --git a/pyscf/gw/urpa.py b/pyscf/gw/urpa.py
index 06fa277200..cc1324e777 100755
--- a/pyscf/gw/urpa.py
+++ b/pyscf/gw/urpa.py
@@ -40,14 +40,14 @@
 # core routines, kernel, rpa_ecorr, rho_response
 # ****************************************************************************
 
-def kernel(rpa, mo_energy, mo_coeff, Lpq=None, nw=None, verbose=logger.NOTE):
+def kernel(rpa, mo_energy, mo_coeff, Lpq=None, nw=40, x0=0.5, verbose=logger.NOTE):
     """
     RPA correlation and total energy
 
     Args:
         Lpq : density fitting 3-center integral in MO basis.
         nw : number of frequency point on imaginary axis.
-        vhf_df : using density fitting integral to compute HF exchange.
+        x0: scaling factor for frequency grid.
 
     Returns:
         e_tot : RPA total energy
@@ -64,7 +64,7 @@ def kernel(rpa, mo_energy, mo_coeff, Lpq=None, nw=None, verbose=logger.NOTE):
         Lpq = rpa.ao2mo(mo_coeff)
 
     # Grids for integration on imaginary axis
-    freqs, wts = _get_scaled_legendre_roots(nw)
+    freqs, wts = _get_scaled_legendre_roots(nw, x0)
 
     # Compute HF exchange energy (EXX)
     dm = mf.make_rdm1()
@@ -153,13 +153,14 @@ def dump_flags(self):
     get_nmo = get_nmo
     get_frozen_mask = get_frozen_mask
 
-    def kernel(self, mo_energy=None, mo_coeff=None, Lpq=None, nw=40):
+    def kernel(self, mo_energy=None, mo_coeff=None, Lpq=None, nw=40, x0=0.5):
         """
         Args:
             mo_energy : 2D array (2, nmo), mean-field mo energy
             mo_coeff : 3D array (2, nmo, nmo), mean-field mo coefficient
             Lpq : 4D array (2, naux, nmo, nmo), 3-index ERI
             nw: interger, grid number
+            x0: real, scaling factor for frequency grid
 
         Returns:
             self.e_tot : RPA total eenrgy
@@ -174,7 +175,7 @@ def kernel(self, mo_energy=None, mo_coeff=None, Lpq=None, nw=40):
         cput0 = (logger.process_clock(), logger.perf_counter())
         self.dump_flags()
         self.e_tot, self.e_hf, self.e_corr = \
-                        kernel(self, mo_energy, mo_coeff, Lpq=Lpq, nw=nw, verbose=self.verbose)
+                        kernel(self, mo_energy, mo_coeff, Lpq=Lpq, nw=nw, x0=x0, verbose=self.verbose)
 
         logger.timer(self, 'RPA', *cput0)
         return self.e_corr
diff --git a/pyscf/hessian/rks.py b/pyscf/hessian/rks.py
index 19741943ac..ffc32e69e3 100644
--- a/pyscf/hessian/rks.py
+++ b/pyscf/hessian/rks.py
@@ -40,6 +40,10 @@ def partial_hess_elec(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
 
     mol = hessobj.mol
     mf = hessobj.base
+    ni = mf._numint
+    if mf.nlc or ni.libxc.is_nlc(mf.xc):
+        raise NotImplementedError('RKS Hessian for NLC functional')
+
     if mo_energy is None: mo_energy = mf.mo_energy
     if mo_occ is None:    mo_occ = mf.mo_occ
     if mo_coeff is None:  mo_coeff = mf.mo_coeff
@@ -49,23 +53,18 @@ def partial_hess_elec(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
     mocc = mo_coeff[:,mo_occ>0]
     dm0 = numpy.dot(mocc, mocc.T) * 2
 
-    if mf.nlc != '':
-        raise NotImplementedError
-    #enabling range-separated hybrids
-    omega, alpha, beta = mf._numint.rsh_coeff(mf.xc)
-    if abs(omega) > 1e-10:
-        hyb = alpha + beta
-    else:
-        hyb = mf._numint.hybrid_coeff(mf.xc, spin=mol.spin)
+    omega, alpha, hyb = ni.rsh_and_hybrid_coeff(mf.xc, spin=mol.spin)
+    hybrid = ni.libxc.is_hybrid_xc(mf.xc)
+
     de2, ej, ek = rhf_hess._partial_hess_ejk(hessobj, mo_energy, mo_coeff, mo_occ,
                                              atmlst, max_memory, verbose,
-                                             abs(hyb) > 1e-10)
+                                             with_k=hybrid)
     de2 += ej - hyb * ek  # (A,B,dR_A,dR_B)
 
     mem_now = lib.current_memory()[0]
     max_memory = max(2000, mf.max_memory*.9-mem_now)
     veff_diag = _get_vxc_diag(hessobj, mo_coeff, mo_occ, max_memory)
-    if abs(omega) > 1e-10:
+    if hybrid and omega != 0:
         with mol.with_range_coulomb(omega):
             vk1 = rhf_hess._get_jk(mol, 'int2e_ipip1', 9, 's2kl',
                                    ['jk->s1il', dm0])[0]
@@ -80,7 +79,7 @@ def partial_hess_elec(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
 
         shls_slice = (shl0, shl1) + (0, mol.nbas)*3
         veff = vxc[ia]
-        if abs(omega) > 1e-10:
+        if hybrid and omega != 0:
             with mol.with_range_coulomb(omega):
                 vk1, vk2 = rhf_hess._get_jk(mol, 'int2e_ip1ip2', 9, 's1',
                                             ['li->s1kj', dm0[:,p0:p1],  # vk1
@@ -122,6 +121,7 @@ def make_h1(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, verbose=None):
     ni = mf._numint
     ni.libxc.test_deriv_order(mf.xc, 2, raise_error=True)
     omega, alpha, hyb = ni.rsh_and_hybrid_coeff(mf.xc, spin=mol.spin)
+    hybrid = ni.libxc.is_hybrid_xc(mf.xc)
 
     mem_now = lib.current_memory()[0]
     max_memory = max(2000, mf.max_memory*.9-mem_now)
@@ -130,7 +130,7 @@ def make_h1(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, verbose=None):
     for i0, ia in enumerate(atmlst):
         shl0, shl1, p0, p1 = aoslices[ia]
         shls_slice = (shl0, shl1) + (0, mol.nbas)*3
-        if abs(hyb) > 1e-10:
+        if hybrid:
             vj1, vj2, vk1, vk2 = \
                     rhf_hess._get_jk(mol, 'int2e_ip1', 3, 's2kl',
                                      ['ji->s2kl', -dm0[:,p0:p1],  # vj1
@@ -140,7 +140,7 @@ def make_h1(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, verbose=None):
                                      shls_slice=shls_slice)
             veff = vj1 - hyb * .5 * vk1
             veff[:,p0:p1] += vj2 - hyb * .5 * vk2
-            if abs(omega) > 1e-10:
+            if omega != 0:
                 with mol.with_range_coulomb(omega):
                     vk1, vk2 = \
                         rhf_hess._get_jk(mol, 'int2e_ip1', 3, 's2kl',
diff --git a/pyscf/hessian/uks.py b/pyscf/hessian/uks.py
index 39ee82f973..43ebaa5b59 100644
--- a/pyscf/hessian/uks.py
+++ b/pyscf/hessian/uks.py
@@ -39,6 +39,10 @@ def partial_hess_elec(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
 
     mol = hessobj.mol
     mf = hessobj.base
+    ni = mf._numint
+    if mf.nlc or ni.libxc.is_nlc(mf.xc):
+        raise NotImplementedError('RKS Hessian for NLC functional')
+
     if mo_energy is None: mo_energy = mf.mo_energy
     if mo_occ is None:    mo_occ = mf.mo_occ
     if mo_coeff is None:  mo_coeff = mf.mo_coeff
@@ -56,19 +60,17 @@ def partial_hess_elec(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
     dme0 = numpy.einsum('pi,qi,i->pq', mocca, mocca, mo_ea)
     dme0+= numpy.einsum('pi,qi,i->pq', moccb, moccb, mo_eb)
 
-    if mf.nlc != '':
-        raise NotImplementedError
-    #enabling range-separated hybrids
-    omega, alpha, hyb = mf._numint.rsh_and_hybrid_coeff(mf.xc, spin=mol.spin)
+    omega, alpha, hyb = ni.rsh_and_hybrid_coeff(mf.xc, spin=mol.spin)
+    hybrid = ni.libxc.is_hybrid_xc(mf.xc)
     de2, ej, ek = uhf_hess._partial_hess_ejk(hessobj, mo_energy, mo_coeff, mo_occ,
                                              atmlst, max_memory, verbose,
-                                             abs(hyb) > 1e-10)
+                                             with_k=hybrid)
     de2 += ej - hyb * ek  # (A,B,dR_A,dR_B)
 
     mem_now = lib.current_memory()[0]
     max_memory = max(2000, mf.max_memory*.9-mem_now)
     veffa_diag, veffb_diag = _get_vxc_diag(hessobj, mo_coeff, mo_occ, max_memory)
-    if abs(omega) > 1e-10:
+    if hybrid and omega != 0:
         with mol.with_range_coulomb(omega):
             vk1a, vk1b = _get_jk(mol, 'int2e_ipip1', 9, 's2kl',
                                  ['jk->s1il', dm0a, 'jk->s1il', dm0b])
@@ -85,7 +87,7 @@ def partial_hess_elec(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
         veffa = vxca[ia]
         veffb = vxcb[ia]
         shls_slice = (shl0, shl1) + (0, mol.nbas)*3
-        if abs(omega) > 1e-10:
+        if hybrid and omega != 0:
             with mol.with_range_coulomb(omega):
                 vk1a, vk1b, vk2a, vk2b = \
                         _get_jk(mol, 'int2e_ip1ip2', 9, 's1',
@@ -138,6 +140,7 @@ def make_h1(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, verbose=None):
     ni = mf._numint
     ni.libxc.test_deriv_order(mf.xc, 2, raise_error=True)
     omega, alpha, hyb = ni.rsh_and_hybrid_coeff(mf.xc, spin=mol.spin)
+    hybrid = ni.libxc.is_hybrid_xc(mf.xc)
 
     mem_now = lib.current_memory()[0]
     max_memory = max(2000, mf.max_memory*.9-mem_now)
@@ -146,7 +149,7 @@ def make_h1(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, verbose=None):
     for i0, ia in enumerate(atmlst):
         shl0, shl1, p0, p1 = aoslices[ia]
         shls_slice = (shl0, shl1) + (0, mol.nbas)*3
-        if abs(hyb) > 1e-10:
+        if hybrid:
             vj1a, vj1b, vj2a, vj2b, vk1a, vk1b, vk2a, vk2b = \
                     _get_jk(mol, 'int2e_ip1', 3, 's2kl',
                             ['ji->s2kl', -dm0a[:,p0:p1], 'ji->s2kl', -dm0b[:,p0:p1],
@@ -160,7 +163,7 @@ def make_h1(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, verbose=None):
             veffb = vj1 - hyb * vk1b
             veffa[:,p0:p1] += vj2 - hyb * vk2a
             veffb[:,p0:p1] += vj2 - hyb * vk2b
-            if abs(omega) > 1e-10:
+            if omega != 0:
                 with mol.with_range_coulomb(omega):
                     vk1a, vk1b, vk2a, vk2b = \
                             _get_jk(mol, 'int2e_ip1', 3, 's2kl',
diff --git a/pyscf/lib/CMakeLists.txt b/pyscf/lib/CMakeLists.txt
index e31e3063bf..8e4367be01 100644
--- a/pyscf/lib/CMakeLists.txt
+++ b/pyscf/lib/CMakeLists.txt
@@ -165,7 +165,7 @@ if(BUILD_LIBCINT)
 
   ExternalProject_Add(libcint
     GIT_REPOSITORY ${LIBCINT_GIT}
-    GIT_TAG v5.1.8
+    GIT_TAG v5.3.0
     PREFIX ${PROJECT_BINARY_DIR}/deps
     INSTALL_DIR ${PROJECT_SOURCE_DIR}/deps
     CMAKE_CACHE_ARGS
diff --git a/pyscf/lib/ao2mo/CMakeLists.txt b/pyscf/lib/ao2mo/CMakeLists.txt
index 9e5839c0a4..d980bf2885 100644
--- a/pyscf/lib/ao2mo/CMakeLists.txt
+++ b/pyscf/lib/ao2mo/CMakeLists.txt
@@ -13,7 +13,7 @@
 # limitations under the License.
 
 add_library(ao2mo SHARED
-  restore_eri.c nr_ao2mo.c nr_incore.c r_ao2mo.c)
+  restore_eri.c nr_ao2mo.c nr_incore.c r_ao2mo.c nrr_ao2mo.c)
 add_dependencies(ao2mo cvhf)
 
 set_target_properties(ao2mo PROPERTIES
diff --git a/pyscf/lib/ao2mo/nrr_ao2mo.c b/pyscf/lib/ao2mo/nrr_ao2mo.c
new file mode 100644
index 0000000000..84c7c43548
--- /dev/null
+++ b/pyscf/lib/ao2mo/nrr_ao2mo.c
@@ -0,0 +1,271 @@
+/* Copyright 2014-2022 The PySCF Developers. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+    you may not use this file except in compliance with the License.
+    You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
+
+ *
+ * Author: Xubo Wang <wangxubo0201@outlook.com>
+ *         Qiming Sun <osirpt.sun@gmail.com>
+ */
+
+#include <stdlib.h>
+#include <complex.h>
+#include <math.h>
+#include <assert.h>
+
+//#include <omp.h>
+#include "config.h"
+#include "cint.h"
+#include "np_helper/np_helper.h"
+#include "vhf/cvhf.h"
+#include "vhf/fblas.h"
+#include "vhf/nr_direct.h"
+#include "r_ao2mo.h"
+
+#define MIN(X,Y)        ((X) < (Y) ? (X) : (Y))
+#define MAX(X,Y)        ((X) > (Y) ? (X) : (Y))
+#define NCTRMAX         128
+
+int AO2MOmmm_nrr_iltj(double complex *vout, double *eri,
+                    struct _AO2MOEnvs *envs, int seekdim)
+{
+        switch (seekdim) {
+                case 1: return envs->bra_count * envs->ket_count;
+                case 2: return envs->nao * envs->nao;
+        }
+        const double D0 = 0;
+        const double D1 = 1;
+        const char TRANS_T = 'T';
+        const char TRANS_N = 'N';
+        int n2c = envs->nao;
+        int i_start = envs->bra_start;
+        int i_count = envs->bra_count;
+        int j_start = envs->ket_start;
+        int j_count = envs->ket_count;
+        int i;
+        double *buf1 = malloc(sizeof(double)*n2c*i_count*3);
+        double *buf2 = buf1 + n2c*i_count;
+        double *buf3 = buf2 + n2c*i_count;
+        double *bufr, *bufi;
+        double *mo1 = malloc(sizeof(double) * n2c*MAX(i_count,j_count)*2);
+        double *mo2, *mo_r, *mo_i;
+        double *eri_r = malloc(sizeof(double) * n2c*n2c*3);
+        double *eri_i = eri_r + n2c*n2c;
+        double *eri1  = eri_i + n2c*n2c;
+        double *vout1, *vout2, *vout3;
+
+        // Gauss complex multiplication, C_pi^* (pq| = (iq|, where (pq| is in C-order
+        mo_r = envs->mo_r + i_start * n2c;
+        mo_i = envs->mo_i + i_start * n2c;
+        mo2 = mo1 + n2c*i_count;
+        for (i = 0; i < n2c*i_count; i++) {
+                mo1[i] = mo_r[i] - mo_i[i];
+                mo2[i] =-mo_i[i] - mo_r[i];
+        }
+        for (i = 0; i < n2c*n2c; i++) {
+                eri_r[i] = eri[i];
+                eri_i[i] = 0.0;
+                eri1 [i] = eri_r[i] + eri_i[i];
+        }
+        dgemm_(&TRANS_N, &TRANS_N, &n2c, &i_count, &n2c,
+               &D1, eri1, &n2c, mo_r, &n2c, &D0, buf1, &n2c);
+        dgemm_(&TRANS_N, &TRANS_N, &n2c, &i_count, &n2c,
+               &D1, eri_r, &n2c, mo2, &n2c, &D0, buf2, &n2c);
+        dgemm_(&TRANS_N, &TRANS_N, &n2c, &i_count, &n2c,
+               &D1, eri_i, &n2c, mo1, &n2c, &D0, buf3, &n2c);
+        free(eri_r);
+
+        // C_qj^* (iq| = (ij|
+        bufr = buf3;
+        bufi = buf2;
+        for (i = 0; i < n2c*i_count; i++) {
+                buf3[i] = buf1[i] - buf3[i];
+                buf2[i] = buf1[i] + buf2[i];
+        }
+        for (i = 0; i < n2c*i_count; i++) {
+                buf1[i] = bufr[i] + bufi[i];
+        }
+        mo_r = envs->mo_r + j_start * n2c;
+        mo_i = envs->mo_i + j_start * n2c;
+        mo2 = mo1 + n2c*j_count;
+        for (i = 0; i < n2c*j_count; i++) {
+                mo1[i] = mo_r[i] + mo_i[i];
+                mo2[i] = mo_i[i] - mo_r[i];
+        }
+        vout1 = malloc(sizeof(double)*i_count*j_count*3);
+        vout2 = vout1 + i_count * j_count;
+        vout3 = vout2 + i_count * j_count;
+        dgemm_(&TRANS_T, &TRANS_N, &j_count, &i_count, &n2c,
+               &D1, mo_r, &n2c, buf1, &n2c, &D0, vout1, &j_count);
+        dgemm_(&TRANS_T, &TRANS_N, &j_count, &i_count, &n2c,
+               &D1, mo2, &n2c, bufr, &n2c, &D0, vout2, &j_count);
+        dgemm_(&TRANS_T, &TRANS_N, &j_count, &i_count, &n2c,
+               &D1, mo1, &n2c, bufi, &n2c, &D0, vout3, &j_count);
+        for (i = 0; i < i_count*j_count; i++) {
+                vout[i] = (vout1[i]-vout3[i]) + (vout1[i]+vout2[i])*_Complex_I;
+        }
+        free(vout1);
+        free(buf1);
+        free(mo1);
+        return 0;
+}
+int AO2MOmmm_nrr_s1_iltj(double complex *vout, double *eri,
+                       struct _AO2MOEnvs *envs, int seekdim)
+{
+        return AO2MOmmm_nrr_iltj(vout, eri, envs, seekdim);
+}
+void AO2MOfill_nrr_s1(int (*intor)(), int (*fprescreen)(),
+                    double *eri, int nkl, int ish,
+                    struct _AO2MOEnvs *envs)
+{
+        const int nao = envs->nao;
+        const size_t nao2 = nao * nao;
+        const int *ao_loc = envs->ao_loc;
+        const int klsh_start = envs->klsh_start;
+        const int klsh_end = klsh_start + envs->klsh_count;
+        const int di = ao_loc[ish+1] - ao_loc[ish];
+        const int jshtot = envs->nbas;
+        int kl, jsh, ksh, lsh, dj, dk, dl;
+        int icomp, i, j, k, l, n;
+        int shls[4];
+        double *buf = malloc(sizeof(double) *di*nao*NCTRMAX*NCTRMAX*envs->ncomp);
+        assert(buf);
+        double *pbuf, *pbuf1, *peri;
+
+        shls[0] = ish;
+
+        for (kl = klsh_start; kl < klsh_end; kl++) {
+                ksh = kl / envs->nbas;
+                lsh = kl - ksh * envs->nbas;
+                dk = ao_loc[ksh+1] - ao_loc[ksh];
+                dl = ao_loc[lsh+1] - ao_loc[lsh];
+                shls[2] = ksh;
+                shls[3] = lsh;
+
+                pbuf = buf;
+                for (jsh = 0; jsh < jshtot; jsh++) {
+                        dj = ao_loc[jsh+1] - ao_loc[jsh];
+                        shls[1] = jsh;
+                        n = di * dj * dk * dl * envs->ncomp;
+                        if ((*fprescreen)(shls, envs->vhfopt,
+                                          envs->atm, envs->bas, envs->env)) {
+                                (*intor)(pbuf, NULL, shls, envs->atm, envs->natm,
+                                         envs->bas, envs->nbas, envs->env,
+                                         envs->cintopt, NULL);
+                        } else {
+                                NPdset0(pbuf, n);
+                        }
+                        pbuf += n;
+                }
+
+                pbuf = buf;
+                for (jsh = 0; jsh < jshtot; jsh++) {
+                        dj = ao_loc[jsh+1] - ao_loc[jsh];
+                        for (icomp = 0; icomp < envs->ncomp; icomp++) {
+                                peri = eri + nao2 * nkl * icomp
+                                     + ao_loc[ish] * nao + ao_loc[jsh];
+                                for (k = 0; k < dk; k++) {
+                                for (l = 0; l < dl; l++) {
+                                        pbuf1 = pbuf + di * dj * (l*dk+k);
+                                        for (i = 0; i < di; i++) {
+                                        for (j = 0; j < dj; j++) {
+                                                peri[i*nao+j] = pbuf1[j*di+i];
+                                        } }
+                                        peri += nao2;
+                                } }
+                                pbuf += di * dj * dk * dl;
+                        }
+                }
+                eri += nao2 * dk * dl;
+        }
+        free(buf);
+}
+void AO2MOtranse1_nrr_s1(int (*fmmm)(),
+                       double complex *vout, double *vin, int row_id,
+                       struct _AO2MOEnvs *envs)
+{
+        size_t ij_pair = (*fmmm)(NULL, NULL, envs, 1);
+        size_t nao2 = envs->nao * envs->nao;
+        (*fmmm)(vout+ij_pair*row_id, vin+nao2*row_id, envs, 0);
+}
+void AO2MOnrr_e1_drv(int (*intor)(), void (*fill)(),
+                   void (*ftrans)(), int (*fmmm)(),
+                   double complex *eri, double complex *mo_a,
+                   double complex *mo_b,
+                   int klsh_start, int klsh_count, int nkl, int ncomp,
+                   int *orbs_slice, int *tao, int *ao_loc,
+                   CINTOpt *cintopt, CVHFOpt *vhfopt,
+                   int *atm, int natm, int *bas, int nbas, double *env)
+{
+        const int i_start = orbs_slice[0];
+        const int i_count = orbs_slice[1] - orbs_slice[0];
+        const int j_start = orbs_slice[2];
+        const int j_count = orbs_slice[3] - orbs_slice[2];
+        int ij_count = i_count*j_count;
+        int nao = ao_loc[nbas];
+        int nmo = MAX(orbs_slice[1], orbs_slice[3]);
+        int i;
+        double *mo_ra = malloc(sizeof(double) * nao * nmo);
+        double *mo_ia = malloc(sizeof(double) * nao * nmo);
+        double *mo_rb = malloc(sizeof(double) * nao * nmo);
+        double *mo_ib = malloc(sizeof(double) * nao * nmo);
+        for (i = 0; i < nao*nmo; i++) {
+                mo_ra[i] = creal(mo_a[i]);
+                mo_ia[i] = cimag(mo_a[i]);
+                mo_rb[i] = creal(mo_b[i]);
+                mo_ib[i] = cimag(mo_b[i]);
+        }
+        struct _AO2MOEnvs envs = {natm, nbas, atm, bas, env, nao,
+                                  klsh_start, klsh_count,
+                                  i_start, i_count, j_start, j_count,
+                                  ncomp, tao, ao_loc, mo_a,
+                                  mo_ra, mo_ia, cintopt, vhfopt};
+        struct _AO2MOEnvs envs2 = {natm, nbas, atm, bas, env, nao,
+                                  klsh_start, klsh_count,
+                                  i_start, i_count, j_start, j_count,
+                                  ncomp, tao, ao_loc, mo_b,
+                                  mo_rb, mo_ib, cintopt, vhfopt};
+
+
+        double *eri_ao = malloc(sizeof(double)* nao*nao*nkl*ncomp);
+        assert(eri_ao);
+        int ish, kl;
+        int (*fprescreen)();
+        if (vhfopt) {
+                fprescreen = vhfopt->fprescreen;
+        } else {
+                fprescreen = CVHFnoscreen;
+        }
+
+#pragma omp parallel default(none) \
+        shared(fill, fprescreen, eri_ao, envs, intor, nkl, nbas) \
+        private(ish)
+#pragma omp for nowait schedule(dynamic)
+        for (ish = 0; ish < nbas; ish++) {
+                (*fill)(intor, fprescreen, eri_ao, nkl, ish, &envs, 0);
+        }
+
+#pragma omp parallel default(none) \
+        shared(ftrans, fmmm, eri, eri_ao, nkl, ncomp, ij_count, envs, envs2) \
+        private(kl)
+#pragma omp for nowait schedule(static)
+        for (kl = 0; kl < nkl*ncomp; kl++) {
+                (*ftrans)(fmmm, eri, eri_ao, kl, &envs);
+                (*ftrans)(fmmm, eri+ncomp*nkl*ij_count, eri_ao, kl, &envs2);
+        }
+
+        free(eri_ao);
+        free(mo_ra);
+        free(mo_rb);
+        free(mo_ia);
+        free(mo_ib);
+}
diff --git a/pyscf/lib/dft/libxc_itrf.c b/pyscf/lib/dft/libxc_itrf.c
index ae4be66a5d..989d4519d6 100644
--- a/pyscf/lib/dft/libxc_itrf.c
+++ b/pyscf/lib/dft/libxc_itrf.c
@@ -373,6 +373,9 @@ int LIBXC_is_lda(int xc_id)
         switch(func.info->family)
         {
                 case XC_FAMILY_LDA:
+#ifdef XC_FAMILY_HYB_LDA
+                case XC_FAMILY_HYB_LDA:
+#endif
                         lda = 1;
                         break;
                 default:
@@ -909,7 +912,7 @@ void LIBXC_xc_reference(int xc_id, const char **refs)
         xc_func_type func;
         if(xc_func_init(&func, xc_id, XC_UNPOLARIZED) != 0){
                 fprintf(stderr, "XC functional %d not found\n", xc_id);
-                exit(1);
+                raise_error;
         }
 
         int i;
@@ -921,3 +924,13 @@ void LIBXC_xc_reference(int xc_id, const char **refs)
                 refs[i] = func.info->refs[i]->ref;
         }
 }
+
+int LIBXC_is_nlc(int xc_id)
+{
+        xc_func_type func;
+        if(xc_func_init(&func, xc_id, XC_UNPOLARIZED) != 0){
+                fprintf(stderr, "XC functional %d not found\n", xc_id);
+                raise_error -1;
+        }
+        return func.info->flags & XC_FLAGS_VV10;
+}
diff --git a/pyscf/lib/linalg_helper.py b/pyscf/lib/linalg_helper.py
index 8b4c1bacc6..5de6a1e630 100644
--- a/pyscf/lib/linalg_helper.py
+++ b/pyscf/lib/linalg_helper.py
@@ -389,7 +389,7 @@ def davidson1(aop, x0, precond, tol=1e-12, max_cycle=50, max_space=12,
     if isinstance(x0, numpy.ndarray) and x0.ndim == 1:
         x0 = [x0]
     #max_cycle = min(max_cycle, x0[0].size)
-    max_space = max_space + (nroots-1) * 3
+    max_space = max_space + (nroots-1) * 4
     # max_space*2 for holding ax and xs, nroots*2 for holding axt and xt
     _incore = max_memory*1e6/x0[0].nbytes > max_space*2+nroots*3
     lessio = lessio and not _incore
@@ -398,11 +398,11 @@ def davidson1(aop, x0, precond, tol=1e-12, max_cycle=50, max_space=12,
     dtype = None
     heff = None
     fresh_start = True
-    e = 0
+    e = None
     v = None
-    conv = [False] * nroots
+    conv = numpy.zeros(nroots, dtype=bool)
     emin = None
-    norm_min = 1
+    level_shift = 0
 
     for icyc in range(max_cycle):
         if fresh_start:
@@ -419,8 +419,7 @@ def davidson1(aop, x0, precond, tol=1e-12, max_cycle=50, max_space=12,
             x0len = len(x0)
             xt = _qr(x0, dot, lindep)[0]
             if len(xt) != x0len:
-                log.warn('QR decomposition removed %d vectors.  The davidson may fail.',
-                         x0len - len(xt))
+                log.warn('QR decomposition removed %d vectors.', x0len - len(xt))
                 if callable(pick):
                     log.warn('Check to see if `pick` function %s is providing '
                              'linear dependent vectors', pick.__name__)
@@ -436,7 +435,7 @@ def davidson1(aop, x0, precond, tol=1e-12, max_cycle=50, max_space=12,
             x0 = None
             max_dx_last = 1e9
             if SORT_EIG_BY_SIMILARITY:
-                conv = [False] * nroots
+                conv = numpy.zeros(nroots, dtype=bool)
         elif len(xt) > 1:
             xt = _qr(xt, dot, lindep)[0]
             xt = xt[:40]  # 40 trial vectors at most
@@ -473,13 +472,21 @@ def davidson1(aop, x0, precond, tol=1e-12, max_cycle=50, max_space=12,
 
         if SORT_EIG_BY_SIMILARITY:
             e, v = _sort_by_similarity(w, v, nroots, conv, vlast, emin)
-            if elast.size != e.size:
-                de = e
-            else:
-                de = e - elast
         else:
             e = w[:nroots]
             v = v[:,:nroots]
+            conv = numpy.zeros(nroots, dtype=bool)
+            elast, conv_last = _sort_elast(elast, conv_last, vlast, v,
+                                           fresh_start, log)
+
+        if elast is None:
+            de = e
+        elif elast.size != e.size:
+            log.debug('Number of roots different from the previous step (%d,%d)',
+                      e.size, elast.size)
+            de = e
+        else:
+            de = e - elast
 
         x0 = None
         x0 = _gen_x0(v, xs)
@@ -488,27 +495,12 @@ def davidson1(aop, x0, precond, tol=1e-12, max_cycle=50, max_space=12,
         else:
             ax0 = _gen_x0(v, ax)
 
-        if SORT_EIG_BY_SIMILARITY:
-            dx_norm = [0] * nroots
-            xt = [None] * nroots
-            for k, ek in enumerate(e):
-                if not conv[k]:
-                    xt[k] = ax0[k] - ek * x0[k]
-                    dx_norm[k] = numpy.sqrt(dot(xt[k].conj(), xt[k]).real)
-                    if abs(de[k]) < tol and dx_norm[k] < toloose:
-                        log.debug('root %d converged  |r|= %4.3g  e= %s  max|de|= %4.3g',
-                                  k, dx_norm[k], ek, de[k])
-                        conv[k] = True
-        else:
-            elast, conv_last = _sort_elast(elast, conv_last, vlast, v,
-                                           fresh_start, log)
-            de = e - elast
-            dx_norm = []
-            xt = []
-            conv = [False] * nroots
-            for k, ek in enumerate(e):
-                xt.append(ax0[k] - ek * x0[k])
-                dx_norm.append(numpy.sqrt(dot(xt[k].conj(), xt[k]).real))
+        dx_norm = numpy.zeros(nroots)
+        xt = [None] * nroots
+        for k, ek in enumerate(e):
+            if not conv[k]:
+                xt[k] = ax0[k] - ek * x0[k]
+                dx_norm[k] = numpy.sqrt(dot(xt[k].conj(), xt[k]).real)
                 conv[k] = abs(de[k]) < tol and dx_norm[k] < toloose
                 if conv[k] and not conv_last[k]:
                     log.debug('root %d converged  |r|= %4.3g  e= %s  max|de|= %4.3g',
@@ -522,8 +514,8 @@ def davidson1(aop, x0, precond, tol=1e-12, max_cycle=50, max_space=12,
             break
         elif (follow_state and max_dx_norm > 1 and
               max_dx_norm/max_dx_last > 3 and space > nroots+2):
-            log.debug('davidson %d %d  |r|= %4.3g  e= %s  max|de|= %4.3g  lindep= %4.3g',
-                      icyc, space, max_dx_norm, e, de[ide], norm_min)
+            log.debug('davidson %d %d  |r|= %4.3g  e= %s  max|de|= %4.3g',
+                      icyc, space, max_dx_norm, e, de[ide])
             log.debug('Large |r| detected, restore to previous x0')
             x0 = _gen_x0(vlast, xs)
             fresh_start = True
@@ -534,44 +526,32 @@ def davidson1(aop, x0, precond, tol=1e-12, max_cycle=50, max_space=12,
                 emin = min(e)
 
         # remove subspace linear dependency
-        if any(((not conv[k]) and n**2>lindep) for k, n in enumerate(dx_norm)):
-            for k, ek in enumerate(e):
-                if (not conv[k]) and dx_norm[k]**2 > lindep:
-                    xt[k] = precond(xt[k], e[0], x0[k])
-                    xt[k] *= 1/numpy.sqrt(dot(xt[k].conj(), xt[k]).real)
-                else:
-                    xt[k] = None
-        else:
-            for k, ek in enumerate(e):
-                if dx_norm[k]**2 > lindep:
-                    xt[k] = precond(xt[k], e[0], x0[k])
-                    xt[k] *= 1/numpy.sqrt(dot(xt[k].conj(), xt[k]).real)
-                else:
-                    xt[k] = None
-                    log.debug1('Throwing out eigenvector %d with norm=%4.3g', k, dx_norm[k])
-        xt = [xi for xi in xt if xi is not None]
-
-        for i in range(space):
-            xsi = numpy.asarray(xs[i])
-            for xi in xt:
-                xi -= xsi * dot(xsi.conj(), xi)
-            xsi = None
-        norm_min = 1
-        for i,xi in enumerate(xt):
-            norm = numpy.sqrt(dot(xi.conj(), xi).real)
-            if norm**2 > lindep:
-                xt[i] *= 1/norm
-                norm_min = min(norm_min, norm)
+        for k, ek in enumerate(e):
+            if (not conv[k]) and dx_norm[k]**2 > lindep:
+                xt[k] = precond(xt[k], e[0]-level_shift, x0[k])
+                xt[k] *= dot(xt[k].conj(), xt[k]).real ** -.5
+            elif not conv[k]:
+                # Remove linearly dependent vector
+                xt[k] = None
+                log.debug1('Drop eigenvector %d, norm=%4.3g', k, dx_norm[k])
             else:
-                xt[i] = None
-        xt = [xi for xi in xt if xi is not None]
-        xi = None
+                xt[k] = None
+
+        xt, ill_precond = _project_xt_(xt, xs, e, lindep, dot, precond)
+        if ill_precond:
+            # Manually adjust the precond because precond function may not be
+            # able to generate linearly dependent basis vectors. e.g. issue 1362
+            log.warn('Matrix may be already a diagonal matrix. '
+                     'level_shift is applied to precond')
+            level_shift = 0.1
+
+        xt, norm_min = _normalize_xt_(xt, lindep, dot)
         log.debug('davidson %d %d  |r|= %4.3g  e= %s  max|de|= %4.3g  lindep= %4.3g',
                   icyc, space, max_dx_norm, e, de[ide], norm_min)
         if len(xt) == 0:
             log.debug('Linear dependency in trial subspace. |r| for each state %s',
                       dx_norm)
-            conv = [conv[k] or (norm < toloose) for k,norm in enumerate(dx_norm)]
+            conv[dx_norm < toloose] = True
             break
 
         max_dx_last = max_dx_norm
@@ -598,6 +578,9 @@ def davidson1(aop, x0, precond, tol=1e-12, max_cycle=50, max_space=12,
 
 def make_diag_precond(diag, level_shift=0):
     '''Generate the preconditioner function with the diagonal function.'''
+    # For diagonal matrix A, precond (Ax-x*e)/(diag(A)-e) is not able to
+    # generate linearly independent basis. Use level_shift to break the
+    # correlation between Ax-x*e and diag(A)-e.
     def precond(dx, e, *args):
         diagd = diag - (e - level_shift)
         diagd[abs(diagd)<1e-8] = 1e-8
@@ -665,7 +648,7 @@ def _eigs_cmplx2real(w, v, real_idx, real_eigenvectors=True):
         v = v.real
     return w.real, v, idx
 
-def eig(aop, x0, precond, tol=1e-12, max_cycle=50, max_space=12,
+def eig(aop, x0, precond, tol=1e-12, max_cycle=50, max_space=20,
         lindep=DAVIDSON_LINDEP, max_memory=MAX_MEMORY,
         dot=numpy.dot, callback=None,
         nroots=1, lessio=False, left=False, pick=pick_real_eigs,
@@ -766,7 +749,7 @@ def eig(aop, x0, precond, tol=1e-12, max_cycle=50, max_space=12,
             return e, x
 davidson_nosym = eig
 
-def davidson_nosym1(aop, x0, precond, tol=1e-12, max_cycle=50, max_space=12,
+def davidson_nosym1(aop, x0, precond, tol=1e-12, max_cycle=50, max_space=20,
                     lindep=DAVIDSON_LINDEP, max_memory=MAX_MEMORY,
                     dot=numpy.dot, callback=None,
                     nroots=1, lessio=False, left=False, pick=pick_real_eigs,
@@ -791,7 +774,7 @@ def davidson_nosym1(aop, x0, precond, tol=1e-12, max_cycle=50, max_space=12,
     if isinstance(x0, numpy.ndarray) and x0.ndim == 1:
         x0 = [x0]
     #max_cycle = min(max_cycle, x0[0].size)
-    max_space = max_space + (nroots-1) * 4
+    max_space = max_space + (nroots-1) * 6
     # max_space*2 for holding ax and xs, nroots*2 for holding axt and xt
     _incore = max_memory*1e6/x0[0].nbytes > max_space*2+nroots*3
     lessio = lessio and not _incore
@@ -800,11 +783,11 @@ def davidson_nosym1(aop, x0, precond, tol=1e-12, max_cycle=50, max_space=12,
     dtype = None
     heff = None
     fresh_start = True
-    e = 0
+    e = None
     v = None
-    conv = [False] * nroots
+    conv = numpy.zeros(nroots, dtype=bool)
     emin = None
-    norm_min = 1
+    level_shift = 0
 
     for icyc in range(max_cycle):
         if fresh_start:
@@ -821,12 +804,12 @@ def davidson_nosym1(aop, x0, precond, tol=1e-12, max_cycle=50, max_space=12,
             x0len = len(x0)
             xt, x0 = _qr(x0, dot, lindep)[0], None
             if len(xt) != x0len:
-                log.warn('QR decomposition removed %d vectors.  The davidson may fail.'
+                log.warn('QR decomposition removed %d vectors. '
                          'Check to see if `pick` function :%s: is providing linear dependent '
                          'vectors' % (x0len - len(xt), pick.__name__))
             max_dx_last = 1e9
             if SORT_EIG_BY_SIMILARITY:
-                conv = [False] * nroots
+                conv = numpy.zeros(nroots, dtype=bool)
         elif len(xt) > 1:
             xt = _qr(xt, dot, lindep)[0]
             xt = xt[:40]  # 40 trial vectors at most
@@ -863,45 +846,40 @@ def davidson_nosym1(aop, x0, precond, tol=1e-12, max_cycle=50, max_space=12,
         if SORT_EIG_BY_SIMILARITY:
             e, v = _sort_by_similarity(w, v, nroots, conv, vlast, emin,
                                        heff[:space,:space])
-            if e.size != elast.size:
-                de = e
-            else:
-                de = e - elast
         else:
             e = w[:nroots]
             v = v[:,:nroots]
+            conv = numpy.zeros(nroots, dtype=bool)
+            elast, conv_last = _sort_elast(elast, conv_last, vlast, v,
+                                           fresh_start, log)
 
+        if elast is None:
+            de = e
+        elif elast.size != e.size:
+            log.debug('Number of roots different from the previous step (%d,%d)',
+                      e.size, elast.size)
+            de = e
+        else:
+            de = e - elast
+
+        x0 = None
         x0 = _gen_x0(v, xs)
         if lessio:
             ax0 = aop(x0)
         else:
             ax0 = _gen_x0(v, ax)
 
-        if SORT_EIG_BY_SIMILARITY:
-            dx_norm = [0] * nroots
-            xt = [None] * nroots
-            for k, ek in enumerate(e):
-                if not conv[k]:
-                    xt[k] = ax0[k] - ek * x0[k]
-                    dx_norm[k] = numpy.sqrt(dot(xt[k].conj(), xt[k]).real)
-                    if abs(de[k]) < tol and dx_norm[k] < toloose:
-                        log.debug('root %d converged  |r|= %4.3g  e= %s  max|de|= %4.3g',
-                                  k, dx_norm[k], ek, de[k])
-                        conv[k] = True
-        else:
-            elast, conv_last = _sort_elast(elast, conv_last, vlast, v,
-                                           fresh_start, log)
-            de = e - elast
-            dx_norm = []
-            xt = []
-            for k, ek in enumerate(e):
-                xt.append(ax0[k] - ek * x0[k])
-                dx_norm.append(numpy.sqrt(dot(xt[k].conj(), xt[k]).real))
-                if not conv_last[k] and abs(de[k]) < tol and dx_norm[k] < toloose:
+        dx_norm = numpy.zeros(nroots)
+        xt = [None] * nroots
+        for k, ek in enumerate(e):
+            if not conv[k]:
+                xt[k] = ax0[k] - ek * x0[k]
+                dx_norm[k] = numpy.sqrt(dot(xt[k].conj(), xt[k]).real)
+                conv[k] = abs(de[k]) < tol and dx_norm[k] < toloose
+                if conv[k] and not conv_last[k]:
                     log.debug('root %d converged  |r|= %4.3g  e= %s  max|de|= %4.3g',
                               k, dx_norm[k], ek, de[k])
-            dx_norm = numpy.asarray(dx_norm)
-            conv = (abs(de) < tol) & (dx_norm < toloose)
+                    conv[k] = True
         ax0 = None
         max_dx_norm = max(dx_norm)
         ide = numpy.argmax(abs(de))
@@ -911,8 +889,8 @@ def davidson_nosym1(aop, x0, precond, tol=1e-12, max_cycle=50, max_space=12,
             break
         elif (follow_state and max_dx_norm > 1 and
               max_dx_norm/max_dx_last > 3 and space > nroots+4):
-            log.debug('davidson %d %d  |r|= %4.3g  e= %s  max|de|= %4.3g  lindep= %4.3g',
-                      icyc, space, max_dx_norm, e, de[ide], norm_min)
+            log.debug('davidson %d %d  |r|= %4.3g  e= %s  max|de|= %4.3g',
+                      icyc, space, max_dx_norm, e, de[ide])
             log.debug('Large |r| detected, restore to previous x0')
             x0 = _gen_x0(vlast, xs)
             fresh_start = True
@@ -923,38 +901,26 @@ def davidson_nosym1(aop, x0, precond, tol=1e-12, max_cycle=50, max_space=12,
                 emin = min(e)
 
         # remove subspace linear dependency
-        if any(((not conv[k]) and n**2>lindep) for k, n in enumerate(dx_norm)):
-            for k, ek in enumerate(e):
-                if (not conv[k]) and dx_norm[k]**2 > lindep:
-                    xt[k] = precond(xt[k], e[0], x0[k])
-                    xt[k] *= 1/numpy.sqrt(dot(xt[k].conj(), xt[k]).real)
-                else:
-                    xt[k] = None
-                    log.debug1('Throwing out eigenvector %d with norm=%4.3g', k, dx_norm[k])
-        else:
-            for k, ek in enumerate(e):
-                if dx_norm[k]**2 > lindep:
-                    xt[k] = precond(xt[k], e[0], x0[k])
-                    xt[k] *= 1/numpy.sqrt(dot(xt[k].conj(), xt[k]).real)
-                else:
-                    xt[k] = None
-        xt = [xi for xi in xt if xi is not None]
-
-        for i in range(space):
-            xsi = numpy.asarray(xs[i])
-            for xi in xt:
-                xi -= xsi * dot(xsi.conj(), xi)
-            xsi = None
-        norm_min = 1
-        for i,xi in enumerate(xt):
-            norm = numpy.sqrt(dot(xi.conj(), xi).real)
-            if norm**2 > lindep:
-                xt[i] *= 1/norm
-                norm_min = min(norm_min, norm)
+        for k, ek in enumerate(e):
+            if (not conv[k]) and dx_norm[k]**2 > lindep:
+                xt[k] = precond(xt[k], e[0]-level_shift, x0[k])
+                xt[k] *= dot(xt[k].conj(), xt[k]).real ** -.5
+            elif not conv[k]:
+                # Remove linearly dependent vector
+                xt[k] = None
+                log.debug1('Drop eigenvector %d, norm=%4.3g', k, dx_norm[k])
             else:
-                xt[i] = None
-        xt = [xi for xi in xt if xi is not None]
-        xi = None
+                xt[k] = None
+
+        xt, ill_precond = _project_xt_(xt, xs, e, lindep, dot, precond)
+        if ill_precond:
+            # Manually adjust the precond because precond function may not be
+            # able to generate linearly dependent basis vectors. e.g. issue 1362
+            log.warn('Matrix may be already a diagonal matrix. '
+                     'level_shift is applied to precond')
+            level_shift = 0.1
+
+        xt, norm_min = _normalize_xt_(xt, lindep, dot)
         log.debug('davidson %d %d  |r|= %4.3g  e= %s  max|de|= %4.3g  lindep= %4.3g',
                   icyc, space, max_dx_norm, e, de[ide], norm_min)
         if len(xt) == 0:
@@ -986,7 +952,7 @@ def davidson_nosym1(aop, x0, precond, tol=1e-12, max_cycle=50, max_space=12,
         e, v, idx = pick(w, v, nroots, locals())
         if len(e) == 0:
             raise RuntimeError(f'Not enough eigenvalues found by {pick}')
-        xl = _gen_x0(vl[:,idx[:nroots]].conj(), xs)
+        xl = _gen_x0(vl[:,idx[:nroots]], xs)
         x0 = _gen_x0(v[:,:nroots], xs)
         xl = [x for x in xl]  # nparray -> list
         x0 = [x for x in x0]  # nparray -> list
@@ -1129,7 +1095,7 @@ def dgeev1(abop, x0, precond, type=1, tol=1e-12, max_cycle=50, max_space=12,
     if isinstance(x0, numpy.ndarray) and x0.ndim == 1:
         x0 = [x0]
     #max_cycle = min(max_cycle, x0[0].size)
-    max_space = max_space + (nroots-1) * 3
+    max_space = max_space + (nroots-1) * 4
     # max_space*3 for holding ax, bx and xs, nroots*3 for holding axt, bxt and xt
     _incore = max_memory*1e6/x0[0].nbytes > max_space*3+nroots*3
     lessio = lessio and not _incore
@@ -1137,6 +1103,7 @@ def dgeev1(abop, x0, precond, type=1, tol=1e-12, max_cycle=50, max_space=12,
     seff = numpy.empty((max_space,max_space), dtype=x0[0].dtype)
     fresh_start = True
     conv = False
+    level_shift = 0
 
     for icyc in range(max_cycle):
         if fresh_start:
@@ -1226,15 +1193,14 @@ def dgeev1(abop, x0, precond, type=1, tol=1e-12, max_cycle=50, max_space=12,
             conv = True
             break
 
-        dx_norm = []
-        xt = []
+        dx_norm = numpy.zeros(nroots)
+        xt = [None] * nroots
         for k, ek in enumerate(e):
             if type == 1:
-                dxtmp = ax0[k] - ek * bx0[k]
+                xt[k] = ax0[k] - ek * bx0[k]
             else:
-                dxtmp = ax0[k] - ek * x0[k]
-            xt.append(dxtmp)
-            dx_norm.append(numpy_helper.norm(dxtmp))
+                xt[k] = ax0[k] - ek * x0[k]
+            dx_norm[k] = dot(xt[k].conj(), xt[k]).real ** .5
         ax0 = bx0 = None
 
         if max(dx_norm) < toloose:
@@ -1245,28 +1211,24 @@ def dgeev1(abop, x0, precond, type=1, tol=1e-12, max_cycle=50, max_space=12,
 
         # remove subspace linear dependency
         for k, ek in enumerate(e):
-            if dx_norm[k] > toloose:
-                xt[k] = precond(xt[k], e[0], x0[k])
-                xt[k] *= 1/numpy_helper.norm(xt[k])
+            if dx_norm[k]**2 > lindep:
+                xt[k] = precond(xt[k], e[0]-level_shift, x0[k])
+                xt[k] *= dot(xt[k].conj(), xt[k]).real ** -.5
             else:
+                log.debug1('Drop eigenvector %d, norm=%4.3g', k, dx_norm[k])
                 xt[k] = None
-        xt = [xi for xi in xt if xi is not None]
-        for i in range(space):
-            xsi = numpy.asarray(xs[i])
-            for xi in xt:
-                xi -= xsi * numpy.dot(xi, xsi)
-            xsi = None
-        norm_min = 1
-        for i,xi in enumerate(xt):
-            norm = numpy_helper.norm(xi)
-            if norm > toloose:
-                xt[i] *= 1/norm
-                norm_min = min(norm_min, norm)
-            else:
-                xt[i] = None
-        xt = [xi for xi in xt if xi is not None]
+
+        xt, ill_precond = _project_xt_(xt, xs, e, lindep, dot, precond)
+        if ill_precond:
+            # Manually adjust the precond because precond function may not be
+            # able to generate linearly dependent basis vectors. e.g. issue 1362
+            log.warn('Matrix may be already a diagonal matrix. '
+                     'level_shift is applied to precond')
+            level_shift = 0.1
+
+        xt, norm_min = _normalize_xt_(xt, lindep, dot)
         log.debug('davidson %d %d  |r|= %4.3g  e= %s  max|de|= %4.3g  lindep= %4.3g',
-                  icyc, space, max(dx_norm), e, de[ide], norm)
+                  icyc, space, max(dx_norm), e, de[ide], norm_min)
         if len(xt) == 0:
             log.debug('Linear dependency in trial subspace. |r| for each state %s',
                       dx_norm)
@@ -1447,11 +1409,13 @@ def krylov(aop, b, x0=None, tol=1e-10, max_cycle=30, dot=numpy.dot,
     return x
 
 
-def dsolve(aop, b, precond, tol=1e-12, max_cycle=30, dot=numpy.dot,
-           lindep=DSOLVE_LINDEP, verbose=0, tol_residual=None):
-    '''Davidson iteration to solve linear equation.  It works bad.
+def solve(aop, b, precond, tol=1e-12, max_cycle=30, dot=numpy.dot,
+          lindep=DSOLVE_LINDEP, verbose=0, tol_residual=None):
+    '''Davidson iteration to solve linear equation.
     '''
-
+    msg = ('linalg_helper.solve is a bad solver for linear equations. '
+           'You should not use this solver in product.')
+    warnings.warn(msg)
     if tol_residual is None:
         toloose = numpy.sqrt(tol)
     else:
@@ -1466,14 +1430,14 @@ def dsolve(aop, b, precond, tol=1e-12, max_cycle=30, dot=numpy.dot,
     aeff = numpy.zeros((max_cycle,max_cycle), dtype=dtype)
     beff = numpy.zeros((max_cycle), dtype=dtype)
     for istep in range(max_cycle):
-        beff[istep] = dot(xs[istep], b)
+        beff[istep] = dot(xs[istep].conj(), b)
         for i in range(istep+1):
-            aeff[istep,i] = dot(xs[istep], ax[i])
-            aeff[i,istep] = dot(xs[i], ax[istep])
+            aeff[istep,i] = dot(xs[istep].conj(), ax[i])
+            aeff[i,istep] = dot(xs[i].conj(), ax[istep])
 
         v = scipy.linalg.solve(aeff[:istep+1,:istep+1], beff[:istep+1])
-        xtrial = dot(v, xs)
-        dx = b - dot(v, ax)
+        xtrial = _outprod_to_subspace(v, xs)
+        dx = b - _outprod_to_subspace(v, ax)
         rr = numpy_helper.norm(dx)
         if verbose:
             print('davidson', istep, rr)
@@ -1487,6 +1451,7 @@ def dsolve(aop, b, precond, tol=1e-12, max_cycle=30, dot=numpy.dot,
 
     return xtrial
 
+dsolve = solve
 
 def cho_solve(a, b, strict_sym_pos=True):
     '''Solve ax = b, where a is a positive definite hermitian matrix
@@ -1533,14 +1498,20 @@ def _qr(xs, dot, lindep=1e-14):
             nv += 1
     return qs[:nv], numpy.linalg.inv(rmat[:nv,:nv])
 
-def _gen_x0(v, xs):
+def _outprod_to_subspace(v, xs):
+    ndim = v.ndim
+    if ndim == 1:
+        v = v[:,None]
     space, nroots = v.shape
     x0 = numpy.einsum('c,x->cx', v[space-1], numpy.asarray(xs[space-1]))
     for i in reversed(range(space-1)):
         xsi = numpy.asarray(xs[i])
         for k in range(nroots):
             x0[k] += v[i,k] * xsi
+    if ndim == 1:
+        x0 = x0[0]
     return x0
+_gen_x0 = _outprod_to_subspace
 
 def _sort_by_similarity(w, v, nroots, conv, vlast, emin=None, heff=None):
     if not any(conv) or vlast is None:
@@ -1580,7 +1551,40 @@ def _sort_elast(elast, conv_last, vlast, v, fresh_start, log):
             for i in numpy.where(ordering_diff)[0]:
                 log.debug('  %3d     ->   %3d ', idx[i], i)
 
-    return [elast[i] for i in idx], [conv_last[i] for i in idx]
+    return elast[idx], conv_last[idx]
+
+def _project_xt_(xt, xs, e, threshold, dot, precond):
+    '''Projects out existing basis vectors xs. Also checks whether the precond
+    function is ill-conditioned'''
+    ill_precond = False
+    for i, xsi in enumerate(xs):
+        xsi = numpy.asarray(xsi)
+        for k, xi in enumerate(xt):
+            if xi is None:
+                continue
+            ovlp = dot(xsi.conj(), xi)
+            # xs[i] == xt[k]
+            if abs(1 - ovlp)**2 < threshold:
+                ill_precond = True
+                # rebuild xt[k] to remove correlation between xt[k] and xs[i]
+                xi[:] = precond(xi, e[k], xi)
+                ovlp = dot(xsi.conj(), xi)
+            xi -= xsi * ovlp
+        xsi = None
+    return xt, ill_precond
+
+def _normalize_xt_(xt, threshold, dot):
+    norm_min = 1
+    out = []
+    for i, xi in enumerate(xt):
+        if xi is None:
+            continue
+        norm = dot(xi.conj(), xi).real ** .5
+        if norm**2 > threshold:
+            xt[i] *= 1/norm
+            norm_min = min(norm_min, norm)
+            out.append(xt[i])
+    return out, norm_min
 
 
 class LinearDependenceError(RuntimeError):
@@ -1596,6 +1600,9 @@ def __getitem__(self, n):
         key = self.index[n]
         return self.scr_h5[str(key)]
 
+    def __iter__(self):
+        return (self[i] for i in range(len(self)))
+
     def append(self, x):
         length = len(self.index)
         key = length + 1
@@ -1624,173 +1631,3 @@ def pop(self, index):
         del (self.scr_h5[str(key)])
 
 del (SAFE_EIGH_LINDEP, DAVIDSON_LINDEP, DSOLVE_LINDEP, MAX_MEMORY)
-
-
-if __name__ == '__main__':
-    a = numpy.random.random((9,5))+numpy.random.random((9,5))*1j
-    q, r = _qr(a.T, numpy.dot)
-    print(abs(r.T.dot(q)-a.T).max())
-
-    numpy.random.seed(12)
-    n = 1000
-    #a = numpy.random.random((n,n))
-    a = numpy.arange(n*n).reshape(n,n)
-    a = numpy.sin(numpy.sin(a)) + a*1e-3j
-    a = a + a.T.conj() + numpy.diag(numpy.random.random(n))*10
-
-    e,u = scipy.linalg.eigh(a)
-    #a = numpy.dot(u[:,:15]*e[:15], u[:,:15].T)
-    print(e[0], u[0,0])
-
-    def aop(x):
-        return numpy.dot(a, x)
-
-    def precond(r, e0, x0):
-        idx = numpy.argwhere(abs(x0)>.1).ravel()
-        #idx = numpy.arange(20)
-        m = idx.size
-        if m > 2:
-            h0 = a[idx][:,idx] - numpy.eye(m)*e0
-            h0x0 = x0 / (a.diagonal() - e0)
-            h0x0[idx] = numpy.linalg.solve(h0, h0x0[idx])
-            h0r = r / (a.diagonal() - e0)
-            h0r[idx] = numpy.linalg.solve(h0, r[idx])
-            e1 = numpy.dot(x0, h0r) / numpy.dot(x0, h0x0)
-            x1 = (r - e1*x0) / (a.diagonal() - e0)
-            x1[idx] = numpy.linalg.solve(h0, (r-e1*x0)[idx])
-            return x1
-        else:
-            return r / (a.diagonal() - e0)
-
-    x0 = [a[0]/numpy.linalg.norm(a[0]),
-          a[1]/numpy.linalg.norm(a[1]),
-          a[2]/numpy.linalg.norm(a[2]),
-          a[3]/numpy.linalg.norm(a[3])]
-    e0,x0 = dsyev(aop, x0, precond, max_cycle=30, max_space=12,
-                  max_memory=.0001, verbose=5, nroots=4, follow_state=True)
-    print(e0[0] - e[0])
-    print(e0[1] - e[1])
-    print(e0[2] - e[2])
-    print(e0[3] - e[3])
-
-##########
-    a = a + numpy.diag(numpy.random.random(n)+1.1)* 10
-    b = numpy.random.random(n)
-    def aop(x):
-        return numpy.dot(a,x)
-    def precond(x, *args):
-        return x / a.diagonal()
-    x = numpy.linalg.solve(a, b)
-    x1 = dsolve(aop, b, precond, max_cycle=50)
-    print(abs(x - x1).sum())
-    a_diag = a.diagonal()
-    log = logger.Logger(sys.stdout, 5)
-    aop = lambda x: numpy.dot(a-numpy.diag(a_diag), x.ravel())/a_diag
-    x1 = krylov(aop, b/a_diag, max_cycle=50, verbose=log)
-    print(abs(x - x1).sum())
-    x1 = krylov(aop, b/a_diag, None, max_cycle=10, verbose=log)
-    x1 = krylov(aop, b/a_diag, x1, max_cycle=30, verbose=log)
-    print(abs(x - x1).sum())
-
-##########
-    numpy.random.seed(12)
-    n = 500
-    #a = numpy.random.random((n,n))
-    a = numpy.arange(n*n).reshape(n,n)
-    a = numpy.sin(numpy.sin(a))
-    a = a + a.T + numpy.diag(numpy.random.random(n))*10
-    b = numpy.random.random((n,n))
-    b = numpy.dot(b,b.T) + numpy.eye(n)*5
-
-    def abop(x):
-        return numpy.dot(numpy.asarray(x), a.T), numpy.dot(numpy.asarray(x), b.T)
-
-    def precond(r, e0, x0):
-        return r / (a.diagonal() - e0)
-
-    e,u = scipy.linalg.eigh(a, b)
-    x0 = [a[0]/numpy.linalg.norm(a[0]),
-          a[1]/numpy.linalg.norm(a[1]),]
-    e0,x0 = dgeev1(abop, x0, precond, type=1, max_cycle=100, max_space=18,
-                   verbose=5, nroots=4)[1:]
-    print(e0[0] - e[0])
-    print(e0[1] - e[1])
-    print(e0[2] - e[2])
-    print(e0[3] - e[3])
-
-
-    e,u = scipy.linalg.eigh(a, b, type=2)
-    x0 = [a[0]/numpy.linalg.norm(a[0]),
-          a[1]/numpy.linalg.norm(a[1]),]
-    e0,x0 = dgeev1(abop, x0, precond, type=2, max_cycle=100, max_space=18,
-                   verbose=5, nroots=4)[1:]
-    print(e0[0] - e[0])
-    print(e0[1] - e[1])
-    print(e0[2] - e[2])
-    print(e0[3] - e[3])
-
-    e,u = scipy.linalg.eigh(a, b, type=2)
-    x0 = [a[0]/numpy.linalg.norm(a[0]),
-          a[1]/numpy.linalg.norm(a[1]),]
-    abdiag = numpy.dot(a,b).diagonal().copy()
-    def abop(x):
-        x = numpy.asarray(x).T
-        return numpy.dot(a, numpy.dot(b, x)).T.copy()
-    def precond(r, e0, x0):
-        return r / (abdiag-e0)
-    e0, x0 = eig(abop, x0, precond, max_cycle=100, max_space=30, verbose=5,
-                 nroots=4, pick=pick_real_eigs)
-    print(e0[0] - e[0])
-    print(e0[1] - e[1])
-    print(e0[2] - e[2])
-    print(e0[3] - e[3])
-
-    e, ul, u = scipy.linalg.eig(numpy.dot(a, b), left=True)
-    idx = numpy.argsort(e)
-    e = e[idx]
-    ul = ul[:,idx]
-    u  = u [:,idx]
-    u  /= numpy.linalg.norm(u, axis=0)
-    ul /= numpy.linalg.norm(ul, axis=0)
-    x0 = [a[0]/numpy.linalg.norm(a[0]),
-          a[1]/numpy.linalg.norm(a[1]),]
-    abdiag = numpy.dot(a,b).diagonal().copy()
-    e0, vl, vr = eig(abop, x0, precond, max_cycle=100, max_space=30, verbose=5,
-                     nroots=4, pick=pick_real_eigs, left=True)
-    print(e0[0] - e[0])
-    print(e0[1] - e[1])
-    print(e0[2] - e[2])
-    print(e0[3] - e[3])
-    print((abs(vr[0]) - abs(u[:,0])).sum())
-    print((abs(vr[1]) - abs(u[:,1])).sum())
-    print((abs(vr[2]) - abs(u[:,2])).sum())
-    print((abs(vr[3]) - abs(u[:,3])).sum())
-#    print((abs(vl[0]) - abs(ul[:,0])).max())
-#    print((abs(vl[1]) - abs(ul[:,1])).max())
-#    print((abs(vl[2]) - abs(ul[:,2])).max())
-#    print((abs(vl[3]) - abs(ul[:,3])).max())
-
-##########
-    N = 200
-    neig = 4
-    A = numpy.zeros((N,N))
-    k = N/2
-    for ii in range(N):
-        i = ii+1
-        for jj in range(N):
-            j = jj+1
-            if j <= k:
-                A[ii,jj] = i*(i==j)-(i-j-k**2)
-            else:
-                A[ii,jj] = i*(i==j)+(i-j-k**2)
-    def matvec(x):
-        return numpy.dot(A,x)
-
-    def precond(r, e0, x0):
-        return (r+e0*x0) / A.diagonal()  # Converged
-        #return (r+e0*x0) / (A.diagonal()-e0)  # Does not converge
-        #return r / (A.diagonal()-e0)  # Does not converge
-    e, c = eig(matvec, A[:,0], precond, nroots=4, verbose=5,
-                   max_cycle=200,max_space=40, tol=1e-5)
-    print("# davidson evals =", e)
-
diff --git a/pyscf/lib/mcscf/fci_contract.c b/pyscf/lib/mcscf/fci_contract.c
index 374316be87..4c05a00e9b 100644
--- a/pyscf/lib/mcscf/fci_contract.c
+++ b/pyscf/lib/mcscf/fci_contract.c
@@ -769,9 +769,10 @@ static void pick_link_by_irrep(_LinkTrilT *clink, int *link_index,
         }
 }
 
-static void ctr_rhf2esym_kern1(double *eri, double *ci0, double *ci1ab,
-                              double *ci1buf, double *t1buf, int ncol_ci1buf,
-                              int bcount, int stra_id, int strb_id,
+static void ctr_rhf2esym_kern(double *eri, double *ci0a, double *ci0b,
+                              double *ci1a, double *ci1b,
+                              double *t1buf, int ncol_ci1buf,
+                              int bcount, int intera_id, int interb_id,
                               int nnorb, int nb_intermediate,
                               int na, int nb, int nlinka, int nlinkb,
                               _LinkTrilT *clink_indexa, _LinkTrilT *clink_indexb)
@@ -783,90 +784,238 @@ static void ctr_rhf2esym_kern1(double *eri, double *ci0, double *ci1ab,
         double *vt1 = t1buf + nnorb*bcount;
 
         NPdset0(t1, nnorb*bcount);
-        FCIprog_a_t1(ci0, t1, bcount, stra_id, strb_id,
-                     0, nb, nlinka, clink_indexa);
+        if (na > 0) {
+                // (stra,interb) * ia(alpha) -> (intera,interb)
+                FCIprog_a_t1(ci0a, t1, bcount, intera_id, interb_id,
+                             0, nb_intermediate, nlinka, clink_indexa);
+        }
+        if (nb > 0) {
+                // (intera,strb) * ia(beta) -> (intera,interb)
+                FCIprog_b_t1(ci0b, t1, bcount, intera_id, interb_id,
+                             0, nb, nlinkb, clink_indexb);
+        }
         dgemm_(&TRANS_N, &TRANS_N, &bcount, &nnorb, &nnorb,
                &D1, t1, &bcount, eri, &nnorb, &D0, vt1, &bcount);
-        FCIspread_b_t1(ci1ab, vt1, bcount, stra_id, strb_id,
-                       0, nb_intermediate, nlinkb, clink_indexb);
-        spread_bufa_t1(ci1buf, vt1, bcount, bcount, stra_id, 0,
-                       0, ncol_ci1buf, nlinka, clink_indexa);
+
+        if (nb > 0) {
+                // (intera,interb) * ia(beta) -> (intera,strb)
+                FCIspread_b_t1(ci1b, vt1, bcount, intera_id, interb_id,
+                               0, nb, nlinkb, clink_indexb);
+        }
+        if (na > 0) {
+                // (intera,interb) * ia(alpha) -> (stra,interb)
+                spread_bufa_t1(ci1a, vt1, bcount, bcount, intera_id, 0,
+                               0, ncol_ci1buf, nlinka, clink_indexa);
+        }
 }
 
-static void loop_c2e_symm1(double *eri, double *ci0, double *ci1aa, double *ci1ab,
-                           int nnorb, int na_intermediate, int nb_intermediate,
-                           int na, int nb, int nlinka, int nlinkb,
-                           _LinkTrilT *clinka, _LinkTrilT *clinkb)
-{
-        double *ci1bufs[MAX_THREADS];
-#pragma omp parallel
+// ci0a and ci1a ~ (stra,interb)
+// ci0b and ci1b ~ (intera,strb)
+static void loop_c2e_symm(double *eri, double *ci0a, double *ci0b,
+                          double *ci1a, double *ci1b, double *t1buf, double **ci1bufs,
+                          int nnorb, int na, int nb, int na_intermediate, int nb_intermediate,
+                          int nlinka, int nlinkb, _LinkTrilT *clinka, _LinkTrilT *clinkb)
 {
         int strk, ib;
         size_t blen;
-        double *t1buf = malloc(sizeof(double) * (STRB_BLKSIZE*nnorb*2+2));
-        double *ci1buf = malloc(sizeof(double) * (na*STRB_BLKSIZE+2));
-        ci1bufs[omp_get_thread_num()] = ci1buf;
-        for (ib = 0; ib < nb; ib += STRB_BLKSIZE) {
-                blen = MIN(STRB_BLKSIZE, nb-ib);
-                NPdset0(ci1buf, ((size_t)na) * blen);
+        double *ci1buf = ci1bufs[omp_get_thread_num()];
+        if (na > 0) {
+                for (ib = 0; ib < nb_intermediate; ib += STRB_BLKSIZE) {
+                        blen = MIN(STRB_BLKSIZE, nb_intermediate-ib);
+                        NPdset0(ci1buf, ((size_t)na) * blen);
 #pragma omp for schedule(static)
-                for (strk = 0; strk < na_intermediate; strk++) {
-                        ctr_rhf2esym_kern1(eri, ci0, ci1ab, ci1buf, t1buf,
-                                           blen, blen, strk, ib,
-                                           nnorb, nb_intermediate, na, nb,
-                                           nlinka, nlinkb, clinka, clinkb);
-                }
-//                NPomp_dsum_reduce_inplace(ci1bufs, blen*na);
-//#pragma omp master
-//                FCIaxpy2d(ci1aa+ib, ci1buf, na, nb, blen);
+                        for (strk = 0; strk < na_intermediate; strk++) {
+                                ctr_rhf2esym_kern(eri, ci0a, ci0b, ci1buf, ci1b, t1buf,
+                                                  blen, blen, strk, ib,
+                                                  nnorb, nb_intermediate, na, nb,
+                                                  nlinka, nlinkb, clinka, clinkb);
+                        }
 #pragma omp barrier
-                _reduce(ci1aa+ib, ci1bufs, na, nb, blen);
+                        _reduce(ci1a+ib, ci1bufs, na, nb_intermediate, blen);
 // An explicit barrier to ensure ci1 is updated. Without barrier, there may
-// occur race condition between FCIaxpy2d and ctr_rhf2esym_kern1
+// occur race condition between FCIaxpy2d and ctr_rhf2esym_kern
 #pragma omp barrier
+                }
+        } else {
+                for (ib = 0; ib < nb_intermediate; ib += STRB_BLKSIZE) {
+                        blen = MIN(STRB_BLKSIZE, nb_intermediate-ib);
+#pragma omp for schedule(static)
+                        for (strk = 0; strk < na_intermediate; strk++) {
+                                ctr_rhf2esym_kern(eri, ci0a, ci0b, ci1buf, ci1b, t1buf,
+                                                  blen, blen, strk, ib,
+                                                  nnorb, nb_intermediate, na, nb,
+                                                  nlinka, nlinkb, clinka, clinkb);
+                        }
+                }
+        }
+}
+
+void FCIcontract_2e_symm1(double *eris, double *ci0, double *ci1,
+                          int *eris_ir_dims, int *ci_ir_size,
+                          int *nas, int *nbs, int *linka, int *linkb,
+                          int norb, int nlinka, int nlinkb, int nirreps, int wfnsym)
+{
+        int i;
+        int na = 0;
+        int nb = 0;
+        int *linka_loc = malloc(sizeof(int) * (nirreps*4+4));
+        int *linkb_loc = linka_loc + nirreps + 1;
+        int *eris_loc = linkb_loc + nirreps + 1;
+        int *ci_loc = eris_loc + nirreps + 1;
+        linka_loc[0] = 0;
+        linkb_loc[0] = 0;
+        eris_loc[0] = 0;
+        ci_loc[0] = 0;
+        for (i = 0; i < nirreps; i++) {
+                na = MAX(nas[i], na);
+                nb = MAX(nbs[i], nb);
+                linka_loc[i+1] = linka_loc[i] + nas[i] * nlinka * 4;
+                linkb_loc[i+1] = linkb_loc[i] + nbs[i] * nlinkb * 4;
+                eris_loc[i+1] = eris_loc[i] + eris_ir_dims[i]*eris_ir_dims[i];
+                ci_loc[i+1] = ci_loc[i] + ci_ir_size[i];
         }
+
+        double *ci1bufs[MAX_THREADS];
+#pragma omp parallel
+{
+        _LinkTrilT *clinka = malloc(sizeof(_LinkTrilT) * nlinka * na);
+        _LinkTrilT *clinkb = malloc(sizeof(_LinkTrilT) * nlinkb * nb);
+        double *t1buf = malloc(sizeof(double) * (STRB_BLKSIZE*norb*(norb+1)+2));
+        double *ci1buf = malloc(sizeof(double) * (na*STRB_BLKSIZE+2));
+        ci1bufs[omp_get_thread_num()] = ci1buf;
+
+        int ai_ir, t1_ir, intera_ir, interb_ir, stra_ir, strb_ir;
+        for (intera_ir = 0; intera_ir < nirreps; intera_ir++) {
+// TODO: pick_link_by_irrep to extract link_index for all nirreps in one pass
+        for (ai_ir = 0; ai_ir < nirreps; ai_ir++) {
+                if (eris_ir_dims[ai_ir] > 0) {
+                        t1_ir = wfnsym ^ ai_ir;
+                        interb_ir = t1_ir ^ intera_ir;
+                        stra_ir = ai_ir ^ intera_ir;
+                        strb_ir = ai_ir ^ interb_ir;
+                        if (nas[intera_ir] > 0 && nbs[interb_ir] > 0 &&
+                            (nas[stra_ir] > 0 || nbs[strb_ir] > 0)) {
+// clinka for intera_ir*ai_ir -> stra_ir
+pick_link_by_irrep(clinka, linka+linka_loc[intera_ir], nas[intera_ir], nlinka, ai_ir);
+// clinkb for interb_ir*ai_ir -> strb_ir
+pick_link_by_irrep(clinkb, linkb+linkb_loc[interb_ir], nbs[interb_ir], nlinkb, ai_ir);
+loop_c2e_symm(eris+eris_loc[ai_ir],
+              ci0+ci_loc[stra_ir], ci0+ci_loc[wfnsym^strb_ir],
+              ci1+ci_loc[stra_ir], ci1+ci_loc[wfnsym^strb_ir], t1buf, ci1bufs,
+              eris_ir_dims[ai_ir], nas[stra_ir], nbs[strb_ir],
+              nas[intera_ir], nbs[interb_ir], nlinka, nlinkb, clinka, clinkb);
+                        }
+                }
+        } }
         free(ci1buf);
         free(t1buf);
+        free(clinka);
+        free(clinkb);
 }
+        free(linka_loc);
 }
 
-#define TOTIRREPS       8
-void FCIcontract_2e_symm1(double **eris, double **ci0, double **ci1,
-                          int norb, int *nas, int *nbs, int nlinka, int nlinkb,
-                          int **linka, int **linkb, int *dimirrep, int wfnsym)
+#define IRREP_OF(l, g)  (l + max_momentum + (g) * ug_offsets)
+void FCIcontract_2e_cyl_sym(double *eris, double *ci0, double *ci1,
+                            int *eris_ir_dims, int *ci_ir_size,
+                            int *nas, int *nbs, int *linka, int *linkb,
+                            int norb, int nlinka, int nlinkb,
+                            int max_momentum, int max_gerades,
+                            int wfn_momentum, int wfn_ungerade)
 {
+        int nirreps = (max_momentum * 2 + 1) * max_gerades;
+        int ug_offsets = max_momentum * 2 + 1;
         int i;
         int na = 0;
         int nb = 0;
-        for (i = 0; i < TOTIRREPS; i++) {
+        int *linka_loc = malloc(sizeof(int) * (nirreps*4+4));
+        int *linkb_loc = linka_loc + nirreps + 1;
+        int *ci_loc = linkb_loc + nirreps + 1;
+        int *eris_loc = ci_loc + nirreps + 1;
+        linka_loc[0] = 0;
+        linkb_loc[0] = 0;
+        eris_loc[0] = 0;
+        ci_loc[0] = 0;
+        for (i = 0; i < nirreps; i++) {
                 na = MAX(nas[i], na);
                 nb = MAX(nbs[i], nb);
+                linka_loc[i+1] = linka_loc[i] + nas[i] * nlinka * 4;
+                linkb_loc[i+1] = linkb_loc[i] + nbs[i] * nlinkb * 4;
+                eris_loc[i+1] = eris_loc[i] + eris_ir_dims[i]*eris_ir_dims[i];
+                ci_loc[i+1] = ci_loc[i] + ci_ir_size[i];
         }
+
+        double *ci1bufs[MAX_THREADS];
+#pragma omp parallel
+{
         _LinkTrilT *clinka = malloc(sizeof(_LinkTrilT) * nlinka * na);
         _LinkTrilT *clinkb = malloc(sizeof(_LinkTrilT) * nlinkb * nb);
-        int ai_ir, stra_ir, strb_ir, intera_ir, interb_ir, ma, mb;
-        for (stra_ir = 0; stra_ir < TOTIRREPS; stra_ir++) {
-        for (ai_ir = 0; ai_ir < TOTIRREPS; ai_ir++) {
-                strb_ir = wfnsym^stra_ir;
-                ma = nas[stra_ir];
-                mb = nbs[strb_ir];
-                if (ma > 0 && mb > 0 && dimirrep[ai_ir] > 0) {
-                        intera_ir = ai_ir^stra_ir;
-                        interb_ir = ai_ir^strb_ir;
-                        // clinka for inter_ir*ai_ir -> stra_ir
-                        pick_link_by_irrep(clinka, linka[intera_ir],
-                                           nas[intera_ir], nlinka, ai_ir);
-                        // clinka for strb_ir*ai_ir -> inter_ir
-                        pick_link_by_irrep(clinkb, linkb[strb_ir],
-                                           nbs[strb_ir], nlinkb, ai_ir);
-                        loop_c2e_symm1(eris[ai_ir], ci0[stra_ir],
-                                       ci1[stra_ir], ci1[intera_ir],
-                                       dimirrep[ai_ir], nas[intera_ir],
-                                       nbs[interb_ir], ma, mb,
-                                       nlinka, nlinkb, clinka, clinkb);
+        double *t1buf = malloc(sizeof(double) * (STRB_BLKSIZE*norb*(norb+1)+2));
+        double *ci1buf = malloc(sizeof(double) * (na*STRB_BLKSIZE+2));
+        ci1bufs[omp_get_thread_num()] = ci1buf;
+
+        int stra_l, strb_l, stra_g, strb_g;
+        int stra_ir = 0;
+        int strb_ir = 0;
+        int intera_l, interb_l, intera_g, interb_g, intera_ir, interb_ir;
+        int ai_l, ai_g, ai_ir, t1_l, t1_g;
+        int eri_m0, eri_m1;
+        int ma, mb;
+
+        for (intera_g = 0; intera_g < max_gerades; intera_g++) {
+        for (intera_l = -max_momentum; intera_l <= max_momentum; intera_l++) {
+                // abs(ai_l) < max_momentum
+                // t1_l := wfn_momentum - ai_l
+                // abs(interb_l := t1_l-intera_l) < max_momentum
+                //      => range for ai_l
+                eri_m0 = MAX(0, wfn_momentum-intera_l) - max_momentum;;
+                eri_m1 = MIN(0, wfn_momentum-intera_l) + max_momentum;;
+// TODO: pick_link_by_irrep to extract link_index for all nirreps in one pass
+                for (ai_g = 0; ai_g < max_gerades; ai_g++) {
+                for (ai_l = eri_m0; ai_l <= eri_m1; ai_l++) {
+                        ai_ir = IRREP_OF(ai_l, ai_g);
+
+                        if (eris_ir_dims[ai_ir] > 0) {
+                                t1_l = wfn_momentum - ai_l;
+                                t1_g = wfn_ungerade ^ ai_g;
+                                interb_l = t1_l - intera_l;
+                                interb_g = t1_g ^ intera_g;
+                                intera_ir = IRREP_OF(intera_l, intera_g);
+                                interb_ir = IRREP_OF(interb_l, interb_g);
+
+                                stra_l = intera_l + ai_l;
+                                stra_g = intera_g ^ ai_g;
+                                strb_l = interb_l + ai_l; // = wfn_momentum-intera_l
+                                strb_g = interb_g ^ ai_g; // = wfn_ungerade^intera_g
+                                ma = 0;
+                                if (abs(stra_l) <= max_momentum) {
+                                        stra_ir = IRREP_OF(stra_l, stra_g);
+                                        ma = nas[stra_ir];
+                                }
+                                mb = 0;
+                                if (abs(strb_l) <= max_momentum) {
+                                        strb_ir = IRREP_OF(strb_l, strb_g);
+                                        mb = nbs[strb_ir];
+                                }
+                                if (nas[intera_ir] > 0 && nas[interb_ir] > 0 &&
+                                    (ma > 0 || mb > 0)) {
+// clinka for intera*ai -> stra.
+pick_link_by_irrep(clinka, linka+linka_loc[intera_ir], nas[intera_ir], nlinka, ai_ir);
+// clinkb for interb*ai -> strb
+pick_link_by_irrep(clinkb, linkb+linkb_loc[interb_ir], nbs[interb_ir], nlinkb, ai_ir);
+loop_c2e_symm(eris+eris_loc[ai_ir],
+              ci0+ci_loc[stra_ir], ci0+ci_loc[intera_ir],
+              ci1+ci_loc[stra_ir], ci1+ci_loc[intera_ir], t1buf, ci1bufs,
+              eris_ir_dims[ai_ir], ma, mb, nas[intera_ir], nbs[interb_ir],
+              nlinka, nlinkb, clinka, clinkb);
+                                }
+                        } }
                 }
         } }
+        free(ci1buf);
+        free(t1buf);
         free(clinka);
         free(clinkb);
 }
-
+        free(linka_loc);
+}
diff --git a/pyscf/lib/misc.py b/pyscf/lib/misc.py
index 834b06fae4..96ea5b2d4d 100644
--- a/pyscf/lib/misc.py
+++ b/pyscf/lib/misc.py
@@ -37,6 +37,41 @@
 except ImportError:
     ThreadPoolExecutor = None
 
+if sys.platform.startswith('linux'):
+    # Avoid too many threads being created in OMP loops.
+    # See issue https://github.com/pyscf/pyscf/issues/317
+    try:
+        from elftools.elf.elffile import ELFFile
+    except ImportError:
+        pass
+    else:
+        def _ldd(so_file):
+            libs = []
+            with open(so_file, 'rb') as f:
+                elf = ELFFile(f)
+                for seg in elf.iter_segments():
+                    if seg.header.p_type != 'PT_DYNAMIC':
+                        continue
+                    for t in seg.iter_tags():
+                        if t.entry.d_tag == 'DT_NEEDED':
+                            libs.append(t.needed)
+                    break
+            return libs
+
+        so_file = os.path.abspath(os.path.join(__file__, '..', 'libnp_helper.so'))
+        for p in _ldd(so_file):
+            if 'mkl' in p and 'thread' in p:
+                warnings.warn(f'PySCF C exteions are incompatible with {p}. '
+                              'MKL_NUM_THREADS is set to 1')
+                os.environ['MKL_NUM_THREADS'] = '1'
+                break
+            elif 'openblasp' in p or 'openblaso' in p:
+                warnings.warn(f'PySCF C exteions are incompatible with {p}. '
+                              'OPENBLAS_NUM_THREADS is set to 1')
+                os.environ['OPENBLAS_NUM_THREADS'] = '1'
+                break
+        del p, so_file, _ldd
+
 from pyscf.lib import param
 from pyscf import __config__
 
@@ -718,6 +753,12 @@ def fn1(self, a=None, b=None):
             if a is None: a = self.a
             if b is None: b = self.b
             return fn(a, b)
+
+    This function can be used to replace "staticmethod" when inserting a module
+    method into a class. In a child class, it allows one to call the method of a
+    base class with either "self.__class__.method_name(self, args)" or
+    "self.super().method_name(args)". For method created with "staticmethod",
+    calling "self.super().method_name(args)" is the only option.
     '''
     _locals = {}
     name = fn.__name__
diff --git a/pyscf/lib/numpy_helper.py b/pyscf/lib/numpy_helper.py
index b3ad2c0a9a..fe9b945b53 100644
--- a/pyscf/lib/numpy_helper.py
+++ b/pyscf/lib/numpy_helper.py
@@ -913,23 +913,7 @@ def frompointer(pointer, count, dtype=float):
     a = numpy.ndarray(count, dtype=numpy.int8, buffer=buf)
     return a.view(dtype)
 
-from distutils.version import LooseVersion
-if LooseVersion(numpy.__version__) <= LooseVersion('1.6.0'):
-    def norm(x, ord=None, axis=None):
-        '''numpy.linalg.norm for numpy 1.6.*
-        '''
-        if axis is None or ord is not None:
-            return numpy.linalg.norm(x, ord)
-        else:
-            x = numpy.asarray(x)
-            axes = string.ascii_lowercase[:x.ndim]
-            target = axes.replace(axes[axis], '')
-            descr = '%s,%s->%s' % (axes, axes, target)
-            xx = _numpy_einsum(descr, x.conj(), x)
-            return numpy.sqrt(xx.real)
-else:
-    norm = numpy.linalg.norm
-del (LooseVersion)
+norm = numpy.linalg.norm
 
 def cond(x, p=None):
     '''Compute the condition number'''
diff --git a/pyscf/lib/parameters.py b/pyscf/lib/parameters.py
index 6ed42d41ac..2730dab3a1 100644
--- a/pyscf/lib/parameters.py
+++ b/pyscf/lib/parameters.py
@@ -69,10 +69,10 @@
     ('',),
     ('x', 'y', 'z'),
     ('xy', 'yz', 'z^2', 'xz', 'x2-y2',),
-    ('-3', '-2', '-1', ' 0', ' 1', ' 2', ' 3'),
-    ('-4', '-3', '-2', '-1', ' 0', ' 1', ' 2', ' 3', ' 4'),
-    ('-5', '-4', '-3', '-2', '-1', ' 0', ' 1', ' 2', ' 3', ' 4', ' 5'),
-    ('-6', '-5', '-4', '-3', '-2', '-1', ' 0', ' 1', ' 2', ' 3', ' 4', ' 5', ' 6'),
+    ('-3', '-2', '-1', '+0', '+1', '+2', '+3'),
+    ('-4', '-3', '-2', '-1', '+0', '+1', '+2', '+3', '+4'),
+    ('-5', '-4', '-3', '-2', '-1', '+0', '+1', '+2', '+3', '+4', '+5'),
+    ('-6', '-5', '-4', '-3', '-2', '-1', '+0', '+1', '+2', '+3', '+4', '+5', '+6'),
 )
 
 VERBOSE_DEBUG  = 5
diff --git a/pyscf/lib/test/test_linalg_helper.py b/pyscf/lib/test/test_linalg_helper.py
index d25998987d..6cc7e463c4 100644
--- a/pyscf/lib/test/test_linalg_helper.py
+++ b/pyscf/lib/test/test_linalg_helper.py
@@ -22,6 +22,7 @@
 from pyscf import gto
 from pyscf import scf
 from pyscf import fci
+from pyscf.lib import linalg_helper
 
 class KnownValues(unittest.TestCase):
     def test_davidson(self):
@@ -44,8 +45,178 @@ def test_davidson_large_dx(self):
                     H 1.92 1.38 0
                     H -1.92 1.38 0''', verbose=0)
         ci = fci.FCI(mol.RHF().run()).run()
-        self.assertAlmostEqual(ci.e_tot, -74.74294263255416, 9)
+        self.assertAlmostEqual(ci.e_tot, -74.74294263255416, 8)
 
+    def test_linalg_qr(self):
+        a = numpy.random.random((9,5))+numpy.random.random((9,5))*1j
+        q, r = linalg_helper._qr(a.T, numpy.dot)
+        self.assertAlmostEqual(abs(r.T.dot(q)-a.T).max(), 0, 8)
+
+    def test_davidson1(self):
+        numpy.random.seed(12)
+        n = 100
+        a = numpy.random.rand(n,n)
+        a = a + a.conj().T + numpy.diag(numpy.random.random(n))*10
+        eref, u = scipy.linalg.eigh(a)
+
+        def aop(x):
+            return numpy.dot(a, x)
+        x0 = a[0]
+        e0, x0 = linalg_helper.dsyev(aop, x0, a.diagonal(), max_cycle=100,
+                                     nroots=3, follow_state=True)
+        self.assertAlmostEqual(abs(e0[:3] - eref[:3]).max(), 0, 8)
+        self.assertAlmostEqual(abs(numpy.abs(x0[:3]) - abs(u[:,:3].T)).max(), 0, 5)
+
+        x0 = a[0]
+        e0, x0 = linalg_helper.dsyev(aop, x0, a.diagonal(), max_cycle=100,
+                                     max_memory=1e-4, nroots=3, follow_state=True)
+        self.assertAlmostEqual(abs(e0[:3] - eref[:3]).max(), 0, 8)
+        self.assertAlmostEqual(abs(numpy.abs(x0[:3]) - abs(u[:,:3].T)).max(), 0, 5)
+
+    def test_davidson_diag_matrix(self):
+        numpy.random.seed(12)
+        n = 100
+        a = numpy.diag(numpy.random.random(n))
+        eref = numpy.sort(a.diagonal())
+
+        def aop(x):
+            return numpy.dot(a, x)
+        x0 = numpy.random.rand(n)
+        e0, x0 = linalg_helper.dsyev(aop, x0, a.diagonal(), nroots=3)
+        self.assertAlmostEqual(abs(e0 - eref[:3]).max(), 0, 8)
+
+        a = numpy.eye(n) * 2
+        def aop(x):
+            return numpy.dot(a, x)
+        x0 = numpy.random.rand(n)
+        e0, x0 = linalg_helper.dsyev(aop, x0, a.diagonal(), nroots=3)
+        self.assertEqual(e0.size, 1)
+        self.assertAlmostEqual(e0, 2, 8)
+
+    @unittest.skip('bad solver. experimental only')
+    def test_solve(self):
+        numpy.random.seed(12)
+        n = 100
+        a = numpy.random.rand(n,n)
+        a = a + a.conj().T
+        a += numpy.diag(numpy.random.random(n))* 10
+        b = numpy.random.random(n)
+        def aop(x):
+            return numpy.dot(a,x)
+        def precond(x, *args):
+            return x / a.diagonal()
+        xref = numpy.linalg.solve(a, b)
+        x1 = linalg_helper.dsolve(aop, b, precond, max_cycle=50)
+        self.assertAlmostEqual(abs(xref - x1).max(), 0, 6)
+        a_diag = a.diagonal()
+        aop = lambda x: numpy.dot(a-numpy.diag(a_diag), x.ravel())/a_diag
+        x1 = linalg_helper.krylov(aop, b/a_diag, max_cycle=50)
+        self.assertAlmostEqual(abs(xref - x1).max(), 0, 6)
+        x1 = linalg_helper.krylov(aop, b/a_diag, None, max_cycle=10)
+        x1 = linalg_helper.krylov(aop, b/a_diag, x1, max_cycle=30)
+        self.assertAlmostEqual(abs(xref - x1).max(), 0, 6)
+
+    def test_dgeev(self):
+        numpy.random.seed(12)
+        n = 100
+        a = numpy.random.rand(n,n)
+        a = a + a.conj().T
+        a += numpy.diag(numpy.random.random(n))* 10
+        b = numpy.random.random((n,n))
+        b = numpy.dot(b,b.T)
+
+        def abop(x):
+            return numpy.dot(numpy.asarray(x), a.T), numpy.dot(numpy.asarray(x), b.T)
+
+        eref, u = scipy.linalg.eigh(a, b)
+        x0 = a[0]
+        e0,x0 = linalg_helper.dgeev1(abop, x0, a.diagonal(), type=1,
+                                     max_cycle=100, nroots=3)[1:]
+        self.assertAlmostEqual(abs(e0 - eref[:3]).max(), 0, 8)
+        self.assertAlmostEqual(abs(numpy.abs(x0) - abs(u[:,:3].T)).max(), 0, 4)
+
+        eref, u = scipy.linalg.eigh(a, b, type=2)
+        x0 = a[0]
+        e0,x0 = linalg_helper.dgeev1(abop, x0, a.diagonal(), type=2,
+                                     max_cycle=100, nroots=3)[1:]
+        self.assertAlmostEqual(abs(e0 - eref[:3]).max(), 0, 8)
+        self.assertAlmostEqual(abs(numpy.abs(x0) - abs(u[:,:3].T)).max(), 0, 4)
+
+    def test_eig1(self):
+        numpy.random.seed(12)
+        n = 100
+        a = numpy.random.rand(n,n)
+        a = a + a.conj().T
+        a += numpy.diag(numpy.random.random(n))* 10
+        b = numpy.random.random((n,n))
+        b = numpy.dot(b,b.T) + numpy.eye(n)*5
+
+        def abop(x):
+            return numpy.dot(numpy.asarray(x), a.T), numpy.dot(numpy.asarray(x), b.T)
+
+        eref, u = scipy.linalg.eigh(a, b, type=2)
+        u /= numpy.linalg.norm(u, axis=0)
+        x0 = a[0]
+        def abop(x):
+            x = numpy.asarray(x).T
+            return numpy.dot(a, numpy.dot(b, x)).T.copy()
+        e0, x0 = linalg_helper.eig(abop, x0, a.diagonal(), max_cycle=100,
+                                   nroots=3, pick=linalg_helper.pick_real_eigs)
+        self.assertAlmostEqual(abs(e0 - eref[:3]).max(), 0, 7)
+
+    def test_eig2(self):
+        numpy.random.seed(12)
+        n = 100
+        a = numpy.random.rand(n,n)
+        a = a + a.conj().T
+        a += numpy.diag(numpy.random.random(n))* 10
+        b = numpy.random.random((n,n))
+        b = numpy.dot(b,b.T) + numpy.eye(n)*5
+
+        def abop(x):
+            x = numpy.asarray(x).T
+            return numpy.dot(a, numpy.dot(b, x)).T.copy()
+
+        e, ul, u = scipy.linalg.eig(numpy.dot(a, b), left=True)
+        idx = numpy.argsort(e)
+        e = e[idx]
+        ul = ul[:,idx]
+        u  = u [:,idx]
+        u  /= numpy.linalg.norm(u, axis=0)
+        ul /= numpy.linalg.norm(ul, axis=0)
+        x0 = a[0]
+        e0, vl, vr = linalg_helper.eig(abop, x0, a.diagonal(), max_cycle=100,
+                         nroots=3, pick=linalg_helper.pick_real_eigs, left=True)
+        self.assertAlmostEqual(abs(e0 - e[:3]).max(), 0, 7)
+        self.assertAlmostEqual(abs(numpy.abs(vr) - abs(u[:,:3].T)).max(), 0, 5)
+        # FIXME: left eigenvectors do not agree with scipy results
+        print((abs(vl[0]) - abs(ul[:,0])).max())
+        print((abs(vl[1]) - abs(ul[:,1])).max())
+        print((abs(vl[2]) - abs(ul[:,2])).max())
+
+    @unittest.skip('difficult to converge')
+    def test_eig_difficult_problem(self):
+        N = 40
+        neig = 4
+        A = numpy.zeros((N,N))
+        k = N/2
+        for ii in range(N):
+            i = ii+1
+            for jj in range(N):
+                j = jj+1
+                if j <= k:
+                    A[ii,jj] = i*(i==j)-(i-j-k**2)
+                else:
+                    A[ii,jj] = i*(i==j)+(i-j-k**2)
+        def matvec(x):
+            return numpy.dot(A,x)
+
+        def precond(r, e0, x0):
+            return (r+e0*x0) / A.diagonal()  # Converged
+            #return (r+e0*x0) / (A.diagonal()-e0)  # Does not converge
+            #return r / (A.diagonal()-e0)  # Does not converge
+        x0 = A[0]
+        e, c = linalg_helper.eig(matvec, x0, precond, nroots=2, tol=1e-6, verbose=5)
 
 if __name__ == "__main__":
     print("Full Tests for linalg_helper")
diff --git a/pyscf/lo/boys.py b/pyscf/lo/boys.py
index ea2e9ab5d9..c6449884a2 100644
--- a/pyscf/lo/boys.py
+++ b/pyscf/lo/boys.py
@@ -103,10 +103,12 @@ def kernel(localizer, mo_coeff=None, callback=None, verbose=None):
     return localizer.mo_coeff
 
 
-def dipole_integral(mol, mo_coeff):
+def dipole_integral(mol, mo_coeff, charge_center=None):
     # The gauge origin has no effects for maximization |<r>|^2
     # Set to charge center for physical significance of <r>
-    charge_center = numpy.einsum('z,zx->x', mol.atom_charges(), mol.atom_coords())
+    if charge_center is None:
+        charge_center = (numpy.einsum('z,zx->x', mol.atom_charges(), mol.atom_coords())
+                         / mol.atom_charges().sum())
     with mol.with_common_origin(charge_center):
         dip = numpy.asarray([reduce(lib.dot, (mo_coeff.conj().T, x, mo_coeff))
                              for x in mol.intor_symmetric('int1e_r', comp=3)])
@@ -291,10 +293,13 @@ def get_grad(self, u=None):
     def cost_function(self, u=None):
         if u is None: u = numpy.eye(self.mo_coeff.shape[1])
         mo_coeff = lib.dot(self.mo_coeff, u)
-        dip = dipole_integral(self.mol, mo_coeff)
-        r2 = self.mol.intor_symmetric('int1e_r2')
+        charge_center = (numpy.einsum('z,zx->x', self.mol.atom_charges(), self.mol.atom_coords())
+                         / self.mol.atom_charges().sum())
+        dip = dipole_integral(self.mol, mo_coeff, charge_center)
+        with self.mol.with_common_origin(charge_center):
+            r2 = self.mol.intor_symmetric('int1e_r2')
         r2 = numpy.einsum('pi,pi->', mo_coeff, lib.dot(r2, mo_coeff))
-        val = r2 - numpy.einsum('xii,xii->', dip, dip) * 2
+        val = r2 - numpy.einsum('xii,xii->', dip, dip)
         return val
 
     def get_init_guess(self, key='atomic'):
diff --git a/pyscf/lo/nao.py b/pyscf/lo/nao.py
index 6df3163e72..cc5ba2c48d 100644
--- a/pyscf/lo/nao.py
+++ b/pyscf/lo/nao.py
@@ -167,7 +167,7 @@ def _core_val_ryd_list(mol):
         nc = mol.bas_nctr(ib)
 
         nelec_ecp = mol.atom_nelec_core(ia)
-        ecpcore = core_configuration(nelec_ecp)
+        ecpcore = core_configuration(nelec_ecp, atom_symbol=mol.atom_pure_symbol(ia))
         coreshell = [int(x) for x in AOSHELL[nuc][0][::2]]
         cvshell = [int(x) for x in AOSHELL[nuc][1][::2]]
         if mol.cart:
diff --git a/pyscf/lo/orth.py b/pyscf/lo/orth.py
index 625169063a..70e5c894a6 100644
--- a/pyscf/lo/orth.py
+++ b/pyscf/lo/orth.py
@@ -181,7 +181,7 @@ def ecp_ano_det_ovlp(atm_ecp, atm_ano, ecpcore):
             if not PROJECT_ECP_BASIS:
                 continue
 
-            ecpcore = core_configuration(nelec_ecp_dic[symb])
+            ecpcore = core_configuration(nelec_ecp_dic[symb], atom_symbol=gto.mole._std_symbol(symb))
             # Comparing to ANO valence basis, to check whether the ECP basis set has
             # reasonable AO-character contraction.  The ANO valence AO should have
             # significant overlap to ECP basis if the ECP basis has AO-character.
diff --git a/pyscf/lo/test/test_localizer.py b/pyscf/lo/test/test_localizer.py
index 9bfc493b6a..5006519ba0 100644
--- a/pyscf/lo/test/test_localizer.py
+++ b/pyscf/lo/test/test_localizer.py
@@ -75,7 +75,7 @@ def test_edmiston(self):
         mo = loc.kernel(mf_h2o.mo_coeff[:,idx])
         dip = boys.dipole_integral(h2o, mo)
         z = numpy.einsum('xii,xii->', dip, dip)
-        self.assertAlmostEqual(z, 17.96309963411759, 4)
+        self.assertAlmostEqual(z, 1.1566988026, 4)
 
     def test_pipek(self):
         idx = numpy.array([17,20,21,22,23,30,36,41,42,47,48,49])-1
diff --git a/pyscf/mcscf/addons.py b/pyscf/mcscf/addons.py
index d696ad4686..9c9d8e20af 100644
--- a/pyscf/mcscf/addons.py
+++ b/pyscf/mcscf/addons.py
@@ -793,7 +793,7 @@ def map2hf(casscf, mf_mo=None, base=BASE, tol=MAP2HF_TOL):
     s = reduce(numpy.dot, (casscf.mo_coeff.T, s, mf_mo))
     idx = numpy.argwhere(abs(s) > tol)
     for i,j in idx:
-        logger.info(casscf, '<mo_coeff-mcscf|mo_coeff-hf>  %d  %d  %12.8f',
+        logger.info(casscf, '<mo_coeff-mcscf|mo_coeff-hf>  %-5d  %-5d  % 12.8f',
                     i+base, j+base, s[i,j])
     return idx
 
@@ -920,19 +920,20 @@ def kernel(self, h1, h2, norb, nelec, ci0=None, **kwargs):
             return numpy.einsum('i,i->', e, self.weights), c
 
         def approx_kernel(self, h1, h2, norb, nelec, ci0=None, **kwargs):
-            try:
+            if hasattr(fcibase_class, 'approx_kernel'):
                 e, c = fcibase_class.approx_kernel(self, h1, h2, norb, nelec,
                                                    ci0=ci0, nroots=self.nroots,
                                                    wfnsym=self.wfnsym,
                                                    **kwargs)
-            except AttributeError:
+            else:
                 e, c = fcibase_class.kernel(self, h1, h2, norb, nelec, ci0=ci0,
                                             nroots=self.nroots,
                                             wfnsym=self.wfnsym, **kwargs)
             return numpy.einsum('i,i->', e, self.weights), c
 
         def states_make_rdm1(self, ci0, norb, nelec, *args, **kwargs):
-            dm1 = [fcibase_class.make_rdm1(self, c, norb, nelec, *args, **kwargs) for c in ci0]
+            fcibase = super()
+            dm1 = [fcibase.make_rdm1(c, norb, nelec, *args, **kwargs) for c in ci0]
             return dm1
 
         def make_rdm1(self, ci0, norb, nelec, *args, **kwargs):
@@ -940,10 +941,11 @@ def make_rdm1(self, ci0, norb, nelec, *args, **kwargs):
                                                  self.states_make_rdm1(ci0, norb, nelec, *args, **kwargs))])
 
         def states_make_rdm1s(self, ci0, norb, nelec, *args, **kwargs):
+            fcibase = super()
             dm1a = []
             dm1b = []
             for c in ci0:
-                dm1s = fcibase_class.make_rdm1s(self, c, norb, nelec, *args, **kwargs)
+                dm1s = fcibase.make_rdm1s(c, norb, nelec, *args, **kwargs)
                 dm1a.append (dm1s[0])
                 dm1b.append (dm1s[1])
             return dm1a, dm1b
@@ -954,10 +956,11 @@ def make_rdm1s(self, ci0, norb, nelec, *args, **kwargs):
             return dm1s[0], dm1s[1]
 
         def states_make_rdm12(self, ci0, norb, nelec, *args, **kwargs):
+            fcibase = super()
             rdm1 = []
             rdm2 = []
             for c in ci0:
-                dm1, dm2 = fcibase_class.make_rdm12(self, c, norb, nelec, *args, **kwargs)
+                dm1, dm2 = fcibase.make_rdm12(c, norb, nelec, *args, **kwargs)
                 rdm1.append (dm1)
                 rdm2.append (dm2)
             return rdm1, rdm2
@@ -969,10 +972,11 @@ def make_rdm12(self, ci0, norb, nelec, *args, **kwargs):
             return rdm1, rdm2
 
         def states_make_rdm12s(self, ci0, norb, nelec, *args, **kwargs):
+            fcibase = super()
             dm1a, dm1b = [], []
             dm2aa, dm2ab, dm2bb = [], [], []
             for c in ci0:
-                dm1s, dm2s = fcibase_class.make_rdm12s(self, c, norb, nelec, *args, **kwargs)
+                dm1s, dm2s = fcibase.make_rdm12s(c, norb, nelec, *args, **kwargs)
                 dm1a.append(dm1s[0])
                 dm1b.append(dm1s[1])
                 dm2aa.append(dm2s[0])
@@ -987,10 +991,11 @@ def make_rdm12s(self, ci0, norb, nelec, *args, **kwargs):
             return rdm1s, rdm2s
 
         def states_trans_rdm12 (self, ci1, ci0, norb, nelec, *args, **kwargs):
+            fcibase = super()
             tdm1 = []
             tdm2 = []
             for c1, c0 in zip (ci1, ci0):
-                dm1, dm2 = fcibase_class.trans_rdm12 (self, c1, c0, norb, nelec)
+                dm1, dm2 = fcibase.trans_rdm12 (c1, c0, norb, nelec)
                 tdm1.append (dm1)
                 tdm2.append (dm2)
             return tdm1, tdm2
@@ -1008,7 +1013,8 @@ def spin_square(self, ci0, norb, nelec, *args, **kwargs):
                 return numpy.dot(ss, weights), numpy.dot(multip, weights)
 
             def states_spin_square(self, ci0, norb, nelec, *args, **kwargs):
-                s = [fcibase_class.spin_square(self, ci0[i], norb, nelec, *args, **kwargs)
+                fcibase = super()
+                s = [fcibase.spin_square(ci0[i], norb, nelec, *args, **kwargs)
                      for i, wi in enumerate(self.weights)]
                 return [x[0] for x in s], [x[1] for x in s]
 
@@ -1145,7 +1151,8 @@ def kernel(self, h1, h2, norb, nelec, ci0=None, **kwargs):
             log = logger.new_logger(self, kwargs.get('verbose'))
             if log.verbose >= logger.DEBUG:
                 if getattr(fcibase_class, 'spin_square', None):
-                    ss = fcibase_class.spin_square(self, c[state], norb, nelec)
+                    fcibase = super()
+                    ss = fcibase.spin_square(c[state], norb, nelec)
                     log.debug('state %d  E = %.15g S^2 = %.7f',
                               state, e[state], ss[0])
                 else:
@@ -1155,12 +1162,12 @@ def kernel(self, h1, h2, norb, nelec, ci0=None, **kwargs):
         def approx_kernel(self, h1, h2, norb, nelec, ci0=None, **kwargs):
             if self._civec is not None:
                 ci0 = self._civec
-            try:
+            if hasattr(fcibase_class, 'approx_kernel'):
                 e, c = fcibase_class.approx_kernel(self, h1, h2, norb, nelec,
                                                    ci0=ci0, nroots=self.nroots,
                                                    wfnsym=self.wfnsym,
                                                    **kwargs)
-            except AttributeError:
+            else:
                 e, c = fcibase_class.kernel(self, h1, h2, norb, nelec, ci0=ci0,
                                             nroots=self.nroots,
                                             wfnsym=self.wfnsym, **kwargs)
diff --git a/pyscf/mcscf/casci.py b/pyscf/mcscf/casci.py
index d8d2ccca7c..4ea25e6671 100644
--- a/pyscf/mcscf/casci.py
+++ b/pyscf/mcscf/casci.py
@@ -158,13 +158,13 @@ def analyze(casscf, mo_coeff=None, ci=None, verbose=None,
                 for i, civec in enumerate(ci):
                     log.info('  [alpha occ-orbitals] [beta occ-orbitals]  state %-3d CI coefficient', i)
                     for c,ia,ib in res[i]:
-                        log.info('  %-20s %-30s %.12f', ia, ib, c)
+                        log.info('  %-20s %-30s % .12f', ia, ib, c)
             else:
                 log.info('  [alpha occ-orbitals] [beta occ-orbitals]            CI coefficient')
                 res = casscf.fcisolver.large_ci(ci, casscf.ncas, casscf.nelecas,
                                                 large_ci_tol, return_strs=False)
                 for c,ia,ib in res:
-                    log.info('  %-20s %-30s %.12f', ia, ib, c)
+                    log.info('  %-20s %-30s % .12f', ia, ib, c)
 
         if with_meta_lowdin:
             casscf._scf.mulliken_meta(casscf.mol, dm1, s=ovlp_ao, verbose=log)
@@ -403,7 +403,7 @@ def _diag_subfock_(idx):
                                    mc._scf.get_ovlp(), mc._scf.mo_coeff))
             idx = numpy.argwhere(abs(s)>.4)
             for i,j in idx:
-                log.info('<CAS-nat-orb|mo-hf>  %d  %d  %12.8f',
+                log.info('<CAS-nat-orb|mo-hf>  %-5d  %-5d  % 12.8f',
                          ncore+i+1, j+1, s[i,j])
     return mo_coeff1, fcivec, mo_occ
 
@@ -991,27 +991,27 @@ def _finalize(self):
             if isinstance(self.e_cas, (float, numpy.number)):
                 try:
                     ss = self.fcisolver.spin_square(self.ci, self.ncas, self.nelecas)
-                    log.note('CASCI E = %.15g  E(CI) = %.15g  S^2 = %.7f',
+                    log.note('CASCI E = %#.15g  E(CI) = %#.15g  S^2 = %.7f',
                              self.e_tot, self.e_cas, ss[0])
                 except NotImplementedError:
-                    log.note('CASCI E = %.15g  E(CI) = %.15g',
+                    log.note('CASCI E = %#.15g  E(CI) = %#.15g',
                              self.e_tot, self.e_cas)
             else:
                 for i, e in enumerate(self.e_cas):
                     try:
                         ss = self.fcisolver.spin_square(self.ci[i], self.ncas, self.nelecas)
-                        log.note('CASCI state %d  E = %.15g  E(CI) = %.15g  S^2 = %.7f',
+                        log.note('CASCI state %3d  E = %#.15g  E(CI) = %#.15g  S^2 = %.7f',
                                  i, self.e_tot[i], e, ss[0])
                     except NotImplementedError:
-                        log.note('CASCI state %d  E = %.15g  E(CI) = %.15g',
+                        log.note('CASCI state %3d  E = %#.15g  E(CI) = %#.15g',
                                  i, self.e_tot[i], e)
 
         else:
             if isinstance(self.e_cas, (float, numpy.number)):
-                log.note('CASCI E = %.15g  E(CI) = %.15g', self.e_tot, self.e_cas)
+                log.note('CASCI E = %#.15g  E(CI) = %#.15g', self.e_tot, self.e_cas)
             else:
                 for i, e in enumerate(self.e_cas):
-                    log.note('CASCI state %d  E = %.15g  E(CI) = %.15g',
+                    log.note('CASCI state %3d  E = %#.15g  E(CI) = %#.15g',
                              i, self.e_tot[i], e)
         return self
 
diff --git a/pyscf/mcscf/casci_symm.py b/pyscf/mcscf/casci_symm.py
index 257ed595c7..0b43d9dd3d 100644
--- a/pyscf/mcscf/casci_symm.py
+++ b/pyscf/mcscf/casci_symm.py
@@ -37,6 +37,7 @@ def __init__(self, mf_or_mol, ncas, nelecas, ncore=None):
             self.fcisolver = fci.direct_spin0_symm.FCISolver(self.mol)
         else:
             self.fcisolver = fci.direct_spin1_symm.FCISolver(self.mol)
+        delattr(fcisolver, '_keys')
         self.fcisolver.__dict__.update(fcisolver.__dict__)
 
     @property
diff --git a/pyscf/mcscf/mc1step.py b/pyscf/mcscf/mc1step.py
index 561754386a..5d9ca567b2 100644
--- a/pyscf/mcscf/mc1step.py
+++ b/pyscf/mcscf/mc1step.py
@@ -266,7 +266,7 @@ def precond(x, e):
             norm_gorb = numpy.linalg.norm(g_orb)
             norm_dxi = numpy.linalg.norm(dxi)
             norm_dr = numpy.linalg.norm(dr)
-            log.debug('    imic %d(%d)  |g[o]|=%5.3g  |dxi|=%5.3g  '
+            log.debug('    imic %2d(%2d)  |g[o]|=%5.3g  |dxi|=%5.3g  '
                       'max(|x|)=%5.3g  |dr|=%5.3g  eig=%5.3g  seig=%5.3g',
                       imic, ihop, norm_gorb, norm_dxi,
                       dxmax, norm_dr, w, seig)
@@ -288,7 +288,7 @@ def precond(x, e):
                   norm_gorb < norm_gkf/casscf.kf_trust_region):
                 ikf = 0
                 u = casscf.update_rotate_matrix(dr, u)
-                t3m = log.timer('aug_hess in %d inner iters' % imic, *t3m)
+                t3m = log.timer('aug_hess in %2d inner iters' % imic, *t3m)
                 yield u, g_kf, ihop+jkcount, dxi
 
                 t3m = (logger.process_clock(), logger.perf_counter())
@@ -393,7 +393,7 @@ def kernel(casscf, mo_coeff, tol=1e-7, conv_tol_grad=None,
             norm_t = numpy.linalg.norm(u-numpy.eye(nmo))
             t3m = log.timer('orbital rotation', *t3m)
             if imicro >= max_cycle_micro:
-                log.debug('micro %d  |u-1|=%5.3g  |g[o]|=%5.3g',
+                log.debug('micro %2d  |u-1|=%5.3g  |g[o]|=%5.3g',
                           imicro, norm_t, norm_gorb)
                 break
 
@@ -405,17 +405,17 @@ def kernel(casscf, mo_coeff, tol=1e-7, conv_tol_grad=None,
             t3m = log.timer('update CAS DM', *t3m)
             if isinstance(gci, numpy.ndarray):
                 norm_gci = numpy.linalg.norm(gci)
-                log.debug('micro %d  |u-1|=%5.3g  |g[o]|=%5.3g  |g[c]|=%5.3g  |ddm|=%5.3g',
+                log.debug('micro %2d  |u-1|=%5.3g  |g[o]|=%5.3g  |g[c]|=%5.3g  |ddm|=%5.3g',
                           imicro, norm_t, norm_gorb, norm_gci, norm_ddm)
             else:
                 norm_gci = None
-                log.debug('micro %d  |u-1|=%5.3g  |g[o]|=%5.3g  |g[c]|=%s  |ddm|=%5.3g',
+                log.debug('micro %2d  |u-1|=%5.3g  |g[o]|=%5.3g  |g[c]|=%s  |ddm|=%5.3g',
                           imicro, norm_t, norm_gorb, norm_gci, norm_ddm)
 
             if callable(callback):
                 callback(locals())
 
-            t3m = log.timer('micro iter %d'%imicro, *t3m)
+            t3m = log.timer('micro iter %2d'%imicro, *t3m)
             if (norm_t < conv_tol_grad or
                 (norm_gorb < conv_tol_grad*.5 and
                  (norm_ddm < conv_tol_ddm*.4 or norm_ddm_micro < conv_tol_ddm*.4))):
@@ -459,7 +459,7 @@ def kernel(casscf, mo_coeff, tol=1e-7, conv_tol_grad=None,
         norm_ddm = numpy.linalg.norm(casdm1 - casdm1_last)
         casdm1_prev = casdm1_last = casdm1
         log.timer('CASCI solver', *t2m)
-        t3m = t2m = t1m = log.timer('macro iter %d'%imacro, *t1m)
+        t3m = t2m = t1m = log.timer('macro iter %2d'%imacro, *t1m)
 
         de, elast = e_tot - elast, e_tot
         if (abs(de) < tol and norm_gorb0 < conv_tol_grad and
@@ -474,10 +474,10 @@ def kernel(casscf, mo_coeff, tol=1e-7, conv_tol_grad=None,
             callback(locals())
 
     if conv:
-        log.info('1-step CASSCF converged in %d macro (%d JK %d micro) steps',
+        log.info('1-step CASSCF converged in %3d macro (%3d JK %3d micro) steps',
                  imacro, totinner, totmicro)
     else:
-        log.info('1-step CASSCF not converged, %d macro (%d JK %d micro) steps',
+        log.info('1-step CASSCF not converged, %3d macro (%3d JK %3d micro) steps',
                  imacro, totinner, totmicro)
 
     if casscf.canonicalization:
@@ -842,7 +842,7 @@ def kernel(self, mo_coeff=None, ci0=None, callback=None, _kern=kernel):
                 _kern(self, mo_coeff,
                       tol=self.conv_tol, conv_tol_grad=self.conv_tol_grad,
                       ci0=ci0, callback=callback, verbose=self.verbose)
-        logger.note(self, 'CASSCF energy = %.15g', self.e_tot)
+        logger.note(self, 'CASSCF energy = %#.15g', self.e_tot)
         self._finalize()
         return self.e_tot, self.e_cas, self.ci, self.mo_coeff, self.mo_energy
 
@@ -872,7 +872,7 @@ def casci(self, mo_coeff, ci0=None, eris=None, verbose=None, envs=None):
             e_cas = e_cas[0]
 
         if envs is not None and log.verbose >= logger.INFO:
-            log.debug('CAS space CI energy = %.15g', e_cas)
+            log.debug('CAS space CI energy = %#.15g', e_cas)
 
             if getattr(self.fcisolver, 'spin_square', None):
                 try:
@@ -884,18 +884,18 @@ def casci(self, mo_coeff, ci0=None, eris=None, verbose=None, envs=None):
 
             if 'imicro' in envs:  # Within CASSCF iteration
                 if ss is None:
-                    log.info('macro iter %d (%d JK  %d micro), '
-                             'CASSCF E = %.15g  dE = %.8g',
+                    log.info('macro iter %3d (%3d JK  %3d micro), '
+                             'CASSCF E = %#.15g  dE = % .8e',
                              envs['imacro'], envs['njk'], envs['imicro'],
                              e_tot, e_tot-envs['elast'])
                 else:
-                    log.info('macro iter %d (%d JK  %d micro), '
-                             'CASSCF E = %.15g  dE = %.8g  S^2 = %.7f',
+                    log.info('macro iter %3d (%3d JK  %3d micro), '
+                             'CASSCF E = %#.15g  dE = % .8e  S^2 = %.7f',
                              envs['imacro'], envs['njk'], envs['imicro'],
                              e_tot, e_tot-envs['elast'], ss[0])
                 if 'norm_gci' in envs and envs['norm_gci'] is not None:
                     log.info('               |grad[o]|=%5.3g  '
-                             '|grad[c]|= %s  |ddm|=%5.3g  |maxRot[o]|=%5.3g',
+                             '|grad[c]|=%5.3g  |ddm|=%5.3g  |maxRot[o]|=%5.3g',
                              envs['norm_gorb0'],
                              envs['norm_gci'], envs['norm_ddm'], envs['max_offdiag_u'])
                 else:
@@ -903,9 +903,9 @@ def casci(self, mo_coeff, ci0=None, eris=None, verbose=None, envs=None):
                              envs['norm_gorb0'], envs['norm_ddm'], envs['max_offdiag_u'])
             else:  # Initialization step
                 if ss is None:
-                    log.info('CASCI E = %.15g', e_tot)
+                    log.info('CASCI E = %#.15g', e_tot)
                 else:
-                    log.info('CASCI E = %.15g  S^2 = %.7f', e_tot, ss[0])
+                    log.info('CASCI E = %#.15g  S^2 = %.7f', e_tot, ss[0])
         return e_tot, e_cas, fcivec
 
     as_scanner = as_scanner
diff --git a/pyscf/mcscf/mc1step_symm.py b/pyscf/mcscf/mc1step_symm.py
index ff62e3c77c..add63790e7 100644
--- a/pyscf/mcscf/mc1step_symm.py
+++ b/pyscf/mcscf/mc1step_symm.py
@@ -38,6 +38,7 @@ def __init__(self, mf_or_mol, ncas, nelecas, ncore=None, frozen=None):
             self.fcisolver = fci.direct_spin0_symm.FCISolver(self.mol)
         else:
             self.fcisolver = fci.direct_spin1_symm.FCISolver(self.mol)
+        delattr(fcisolver, '_keys')
         self.fcisolver.__dict__.update(fcisolver.__dict__)
 
     @property
@@ -71,7 +72,7 @@ def kernel(self, mo_coeff=None, ci0=None, callback=None, _kern=None):
                 _kern(self, mo_coeff,
                       tol=self.conv_tol, conv_tol_grad=self.conv_tol_grad,
                       ci0=ci0, callback=callback, verbose=self.verbose)
-        log.note('CASSCF energy = %.15g', self.e_tot)
+        log.note('CASSCF energy = %#.15g', self.e_tot)
         self._finalize()
         return self.e_tot, self.e_cas, self.ci, self.mo_coeff, self.mo_energy
 
diff --git a/pyscf/mcscf/mc2step.py b/pyscf/mcscf/mc2step.py
index 041dd9aa63..736bedbc80 100644
--- a/pyscf/mcscf/mc2step.py
+++ b/pyscf/mcscf/mc2step.py
@@ -93,13 +93,13 @@ def kernel(casscf, mo_coeff, tol=1e-7, conv_tol_grad=None,
             eris = casscf.ao2mo(mo)
             t3m = log.timer('update eri', *t3m)
 
-            log.debug('micro %d  ~dE=%5.3g  |u-1|=%5.3g  |g[o]|=%5.3g  |dm1|=%5.3g',
+            log.debug('micro %2d  ~dE=%5.3g  |u-1|=%5.3g  |g[o]|=%5.3g  |dm1|=%5.3g',
                       imicro, de, norm_t, norm_gorb, norm_ddm)
 
             if callable(callback):
                 callback(locals())
 
-            t2m = log.timer('micro iter %d'%imicro, *t2m)
+            t2m = log.timer('micro iter %2d'%imicro, *t2m)
             if norm_t < 1e-4 or abs(de) < tol*.4 or norm_gorb < conv_tol_grad*.2:
                 break
 
@@ -127,7 +127,7 @@ def kernel(casscf, mo_coeff, tol=1e-7, conv_tol_grad=None,
 
         e_tot, e_cas, fcivec = casscf.casci(mo, fcivec, eris, log, locals())
         log.timer('CASCI solver', *t3m)
-        t2m = t1m = log.timer('macro iter %d'%imacro, *t1m)
+        t2m = t1m = log.timer('macro iter %2d'%imacro, *t1m)
 
         de, elast = e_tot - elast, e_tot
         if (abs(de) < tol and
@@ -144,10 +144,10 @@ def kernel(casscf, mo_coeff, tol=1e-7, conv_tol_grad=None,
             callback(locals())
 
     if conv:
-        log.info('2-step CASSCF converged in %d macro (%d JK %d micro) steps',
+        log.info('2-step CASSCF converged in %3d macro (%3d JK %3d micro) steps',
                  imacro, totinner, totmicro)
     else:
-        log.info('2-step CASSCF not converged, %d macro (%d JK %d micro) steps',
+        log.info('2-step CASSCF not converged, %3d macro (%3d JK %3d micro) steps',
                  imacro, totinner, totmicro)
 
     if casscf.canonicalization:
diff --git a/pyscf/mcscf/test/test_addons.py b/pyscf/mcscf/test/test_addons.py
index 7ea0c090d8..1241695a54 100644
--- a/pyscf/mcscf/test/test_addons.py
+++ b/pyscf/mcscf/test/test_addons.py
@@ -110,7 +110,7 @@ def test_canonicalize1(self):
         mo, ci, mo_e = mcr.canonicalize(mo1)
         e1 = numpy.einsum('ji,jk,ki', mo, f1, mo)
         self.assertAlmostEqual(e1, 44.2658681077, 7)
-        self.assertAlmostEqual(lib.fp(mo_e), 5.1364166175063097, 7)
+        self.assertAlmostEqual(lib.fp(mo_e), 5.1364166175063097, 5)
 
         mo, ci, mo_e = mcr.canonicalize(mo1, eris=mcr.ao2mo(mcr.mo_coeff))
         e1 = numpy.einsum('ji,jk,ki', mo, f1, mo)
diff --git a/pyscf/mcscf/test/test_h2o.py b/pyscf/mcscf/test/test_h2o.py
index 72b5fd8a61..fe22680521 100644
--- a/pyscf/mcscf/test/test_h2o.py
+++ b/pyscf/mcscf/test/test_h2o.py
@@ -50,7 +50,12 @@ def setUpModule():
     msym.scf()
 
     mc_ref = mcscf.CASSCF (m, 4, 4).state_average_([0.25,]*4)
-    mc_ref.kernel ()
+    # SA-CASSCF may be stuck at a local minimum e_tot = -75.75381945 with the
+    # default initial guess from HF orbitals. The initial guess below is closed
+    # to the single state CASSCF orbitals which can lead to a lower SA-CASSCF
+    # energy e_tot = -75.762754627
+    mo = mc_ref.sort_mo([4,5,6,10], base=1)
+    mc_ref.kernel (mo)
 
 def tearDownModule():
     global mol, molsym, m, msym, mc_ref
@@ -63,7 +68,8 @@ def tearDownModule():
 class KnownValues(unittest.TestCase):
     def test_nosymm_sa4_newton (self):
         mc = mcscf.CASSCF (m, 4, 4).state_average_([0.25,]*4).newton ()
-        mc.kernel ()
+        mo = mc.sort_mo([4,5,6,10], base=1)
+        mc.kernel(mo)
         self.assertAlmostEqual (mc.e_tot, mc_ref.e_tot, 8)
         for e1, e0 in zip (mc.e_states, mc_ref.e_states):
             self.assertAlmostEqual (e1, e0, 5)
@@ -72,7 +78,9 @@ def test_spin_sa4 (self):
         fcisolvers = [fci.solver (mol, singlet=not(bool(i)), symm=False) for i in range (2)]
         fcisolvers[0].nroots = fcisolvers[1].nroots = 2
         fcisolvers[1].spin = 2
-        mc = mcscf.addons.state_average_mix (mcscf.CASSCF (m, 4, 4), fcisolvers, [0.25,]*4).run ()
+        mc = mcscf.addons.state_average_mix (mcscf.CASSCF (m, 4, 4), fcisolvers, [0.25,]*4)
+        mo = mc.sort_mo([4,5,6,10], base=1)
+        mc.kernel(mo)
         self.assertAlmostEqual (mc.e_tot, mc_ref.e_tot, 8)
         for e1, e0 in zip (numpy.sort (mc.e_states), mc_ref.e_states):
             self.assertAlmostEqual (e1, e0, 5)
@@ -81,7 +89,9 @@ def test_spin_sa4_newton (self):
         fcisolvers = [fci.solver (mol, singlet=not(bool(i)), symm=False) for i in range (2)]
         fcisolvers[0].nroots = fcisolvers[1].nroots = 2
         fcisolvers[1].spin = 2
-        mc = mcscf.addons.state_average_mix (mcscf.CASSCF (m, 4, 4), fcisolvers, [0.25,]*4).newton ().run ()
+        mc = mcscf.addons.state_average_mix (mcscf.CASSCF (m, 4, 4), fcisolvers, [0.25,]*4).newton ()
+        mo = mc.sort_mo([4,5,6,10], base=1)
+        mc.kernel(mo)
         self.assertAlmostEqual (mc.e_tot, mc_ref.e_tot, 8)
         for e1, e0 in zip (numpy.sort (mc.e_states), mc_ref.e_states):
             self.assertAlmostEqual (e1, e0, 5)
@@ -91,7 +101,9 @@ def test_pointgroup_sa4 (self):
         fcisolvers[0].nroots = fcisolvers[1].nroots = 2
         fcisolvers[0].wfnsym = 'A1'
         fcisolvers[1].wfnsym = 'B1'
-        mc = mcscf.addons.state_average_mix (mcscf.CASSCF (msym, 4, 4), fcisolvers, [0.25,]*4).run ()
+        mc = mcscf.addons.state_average_mix (mcscf.CASSCF (msym, 4, 4), fcisolvers, [0.25,]*4)
+        mo = mc.sort_mo([4,5,6,10], base=1)
+        mc.kernel(mo)
         self.assertAlmostEqual (mc.e_tot, mc_ref.e_tot, 8)
         for e1, e0 in zip (numpy.sort (mc.e_states), mc_ref.e_states):
             self.assertAlmostEqual (e1, e0, 5)
@@ -101,7 +113,9 @@ def test_pointgroup_sa4_newton (self):
         fcisolvers[0].nroots = fcisolvers[1].nroots = 2
         fcisolvers[0].wfnsym = 'A1'
         fcisolvers[1].wfnsym = 'B1'
-        mc = mcscf.addons.state_average_mix (mcscf.CASSCF (msym, 4, 4), fcisolvers, [0.25,]*4).newton ().run ()
+        mc = mcscf.addons.state_average_mix (mcscf.CASSCF (msym, 4, 4), fcisolvers, [0.25,]*4).newton ()
+        mo = mc.sort_mo([4,5,6,10], base=1)
+        mc.kernel(mo)
         self.assertAlmostEqual (mc.e_tot, mc_ref.e_tot, 8)
         for e1, e0 in zip (numpy.sort (mc.e_states), mc_ref.e_states):
             self.assertAlmostEqual (e1, e0, 5)
@@ -111,7 +125,9 @@ def test_spin_and_pointgroup_sa4 (self):
         fcisolvers[0].wfnsym = fcisolvers[1].wfnsym = 'B1'
         fcisolvers[2].wfnsym = fcisolvers[3].wfnsym = 'A1'
         fcisolvers[1].spin = fcisolvers[3].spin = 2
-        mc = mcscf.addons.state_average_mix (mcscf.CASSCF (msym, 4, 4), fcisolvers, [0.25,]*4).run ()
+        mc = mcscf.addons.state_average_mix (mcscf.CASSCF (msym, 4, 4), fcisolvers, [0.25,]*4)
+        mo = mc.sort_mo([4,5,6,10], base=1)
+        mc.kernel(mo)
         self.assertAlmostEqual (mc.e_tot, mc_ref.e_tot, 8)
         for e1, e0 in zip (numpy.sort (mc.e_states), mc_ref.e_states):
             self.assertAlmostEqual (e1, e0, 5)
@@ -121,7 +137,9 @@ def test_spin_and_pointgroup_sa4_newton (self):
         fcisolvers[0].wfnsym = fcisolvers[1].wfnsym = 'B1'
         fcisolvers[2].wfnsym = fcisolvers[3].wfnsym = 'A1'
         fcisolvers[1].spin = fcisolvers[3].spin = 2
-        mc = mcscf.addons.state_average_mix (mcscf.CASSCF (msym, 4, 4), fcisolvers, [0.25,]*4).newton ().run ()
+        mc = mcscf.addons.state_average_mix (mcscf.CASSCF (msym, 4, 4), fcisolvers, [0.25,]*4).newton ()
+        mo = mc.sort_mo([4,5,6,10], base=1)
+        mc.kernel(mo)
         self.assertAlmostEqual (mc.e_tot, mc_ref.e_tot, 8)
         for e1, e0 in zip (numpy.sort (mc.e_states), mc_ref.e_states):
             self.assertAlmostEqual (e1, e0, 5)
@@ -129,4 +147,3 @@ def test_spin_and_pointgroup_sa4_newton (self):
 if __name__ == "__main__":
     print("Full Tests for H2O")
     unittest.main()
-
diff --git a/pyscf/mcscf/test/test_mc1step.py b/pyscf/mcscf/test/test_mc1step.py
index 0012c01dcf..d92bd98c4f 100644
--- a/pyscf/mcscf/test/test_mc1step.py
+++ b/pyscf/mcscf/test/test_mc1step.py
@@ -117,7 +117,7 @@ def test_cas_natorb(self):
         mc1.kernel(mo)
         mo0 = mc1.mo_coeff
         ci0 = mc1.ci
-        self.assertAlmostEqual(mc1.e_tot, -108.7288793597413, 8)
+        self.assertAlmostEqual(mc1.e_tot, -108.7288793597413, 7)
         casdm1 = mc1.fcisolver.make_rdm1(mc1.ci, 4, 4)
         mc1.ci = None  # Force cas_natorb_ to recompute CI coefficients
 
@@ -180,6 +180,7 @@ def test_dep4(self):
         mc1.with_dep4 = True
         mc1.max_cycle = 1
         mc1.max_cycle_micro = 6
+        mc1.fcisolver.pspace_size = 0
         mc1.kernel(mo)
         self.assertAlmostEqual(mc1.e_tot, -105.82840377848402, 6)
 
@@ -189,6 +190,7 @@ def test_dep4_df(self):
         mc1.with_dep4 = True
         mc1.max_cycle = 1
         mc1.max_cycle_micro = 6
+        mc1.fcisolver.pspace_size = 0
         mc1.kernel(mo)
         self.assertAlmostEqual(mc1.e_tot, -105.82833244029327, 6)
 
@@ -254,6 +256,7 @@ def test_trust_region(self):
         mo = mc1.sort_mo_by_irrep({'A1u':3, 'A1g':1})
         mc1.ah_grad_trust_region = 0.3
         mc1.conv_tol = 1e-7
+        mc1.fcisolver.pspace_size = 0
         tot_jk = []
         def count_jk(envs):
             tot_jk.append(envs.get('njk', 0))
diff --git a/pyscf/mcscf/test/test_newton_casscf.py b/pyscf/mcscf/test/test_newton_casscf.py
index c6eeb2bebe..db4e7c15a7 100644
--- a/pyscf/mcscf/test/test_newton_casscf.py
+++ b/pyscf/mcscf/test/test_newton_casscf.py
@@ -117,7 +117,7 @@ def test_sa_get_grad(self):
         self.assertAlmostEqual(sa.e_tot, -3.62638372957158, 7)
         # MRH 06/24/2020: convergence thresh of scf may not have consistent
         # meaning in SA problems
-        self.assertAlmostEqual(abs(sa.get_grad()).max(), 0, 5)
+        self.assertAlmostEqual(abs(sa.get_grad()).max(), 0, 4)
 
     def test_sa_mix(self):
         e = mc_N2.e_states
diff --git a/pyscf/mcscf/test/test_umc1step.py b/pyscf/mcscf/test/test_umc1step.py
index 6953974435..b4aeca34b2 100644
--- a/pyscf/mcscf/test/test_umc1step.py
+++ b/pyscf/mcscf/test/test_umc1step.py
@@ -150,4 +150,3 @@ def test_casci_in_casscf(self):
 if __name__ == "__main__":
     print("Full Tests for umc1step")
     unittest.main()
-
diff --git a/pyscf/mcscf/ucasci.py b/pyscf/mcscf/ucasci.py
index 90175ecd43..0db2e822f5 100644
--- a/pyscf/mcscf/ucasci.py
+++ b/pyscf/mcscf/ucasci.py
@@ -412,7 +412,7 @@ def analyze(self, mo_coeff=None, ci=None, verbose=None,
     def spin_square(self, fcivec=None, mo_coeff=None, ovlp=None):
         return addons.spin_square(self, mo_coeff, fcivec, ovlp)
 
-    fix_spin_ = fix_spin = None
+    fix_spin_ = fix_spin = lib.invalid_method('fix_spin')
 
     @lib.with_doc(addons.sort_mo.__doc__)
     def sort_mo(self, caslst, mo_coeff=None, base=1):
diff --git a/pyscf/mcscf/umc1step.py b/pyscf/mcscf/umc1step.py
index 5ee6c6711e..03fb1e5679 100644
--- a/pyscf/mcscf/umc1step.py
+++ b/pyscf/mcscf/umc1step.py
@@ -499,9 +499,9 @@ def casci(self, mo_coeff, ci0=None, eris=None, verbose=None, envs=None):
                          'UCASSCF E = %.15g  dE = %.8g',
                          envs['imacro'], envs['njk'], envs['imicro'],
                          e_tot, e_tot-envs['elast'])
-                if 'norm_gci' in envs:
+                if 'norm_gci' in envs and envs['norm_gci'] is not None:
                     log.info('               |grad[o]|=%5.3g  '
-                             '|grad[c]|= %s  |ddm|=%5.3g',
+                             '|grad[c]|=%5.3g  |ddm|=%5.3g',
                              envs['norm_gorb0'],
                              envs['norm_gci'], envs['norm_ddm'])
                 else:
diff --git a/pyscf/mp/dfmp2.py b/pyscf/mp/dfmp2.py
index 390834d6e8..f63d36b4a3 100644
--- a/pyscf/mp/dfmp2.py
+++ b/pyscf/mp/dfmp2.py
@@ -57,7 +57,7 @@ def kernel(mp, mo_energy=None, mo_coeff=None, eris=None, with_t2=WITH_T2,
         p0, p1 = p1, p1 + qov.shape[0]
         Lov[p0:p1] = qov
 
-    emp2 = 0
+    emp2_ss = emp2_os = 0
 
     for i in range(nocc):
         buf = numpy.dot(Lov[:,i*nvir:(i+1)*nvir].T,
@@ -65,10 +65,17 @@ def kernel(mp, mo_energy=None, mo_coeff=None, eris=None, with_t2=WITH_T2,
         gi = numpy.array(buf, copy=False)
         gi = gi.reshape(nvir,nocc,nvir).transpose(1,0,2)
         t2i = gi/lib.direct_sum('jb+a->jba', eia, eia[i])
-        emp2 += numpy.einsum('jab,jab', t2i, gi) * 2
-        emp2 -= numpy.einsum('jab,jba', t2i, gi)
+        edi = numpy.einsum('jab,jab', t2i, gi) * 2
+        exi = -numpy.einsum('jab,jba', t2i, gi)
+        emp2_ss += edi*0.5 + exi
+        emp2_os += edi*0.5
         if with_t2:
             t2[i] = t2i
+        buf = gi = t2i = None # free mem
+
+    emp2_ss = emp2_ss.real
+    emp2_os = emp2_os.real
+    emp2 = lib.tag_array(emp2_ss+emp2_os, e_corr_ss=emp2_ss, e_corr_os=emp2_os)
 
     return emp2, t2
 
diff --git a/pyscf/mp/mp2.py b/pyscf/mp/mp2.py
index f3cef08654..7b6cf52b66 100644
--- a/pyscf/mp/mp2.py
+++ b/pyscf/mp/mp2.py
@@ -52,7 +52,7 @@ def kernel(mp, mo_energy=None, mo_coeff=None, eris=None, with_t2=WITH_T2, verbos
     else:
         t2 = None
 
-    emp2 = 0
+    emp2_ss = emp2_os = 0
     for i in range(nocc):
         if isinstance(eris.ovov, numpy.ndarray) and eris.ovov.ndim == 4:
             # When mf._eri is a custom integrals wiht the shape (n,n,n,n), the
@@ -63,11 +63,17 @@ def kernel(mp, mo_energy=None, mo_coeff=None, eris=None, with_t2=WITH_T2, verbos
 
         gi = gi.reshape(nvir,nocc,nvir).transpose(1,0,2)
         t2i = gi.conj()/lib.direct_sum('jb+a->jba', eia, eia[i])
-        emp2 += numpy.einsum('jab,jab', t2i, gi) * 2
-        emp2 -= numpy.einsum('jab,jba', t2i, gi)
+        edi = numpy.einsum('jab,jab', t2i, gi) * 2
+        exi = -numpy.einsum('jab,jba', t2i, gi)
+        emp2_ss += edi*0.5 + exi
+        emp2_os += edi*0.5
         if with_t2:
             t2[i] = t2i
 
+    emp2_ss = emp2_ss.real
+    emp2_os = emp2_os.real
+    emp2 = lib.tag_array(emp2_ss+emp2_os, e_corr_ss=emp2_ss, e_corr_os=emp2_os)
+
     return emp2.real, t2
 
 
@@ -113,9 +119,12 @@ def energy(mp, t2, eris):
     '''MP2 energy'''
     nocc, nvir = t2.shape[1:3]
     eris_ovov = numpy.asarray(eris.ovov).reshape(nocc,nvir,nocc,nvir)
-    emp2  = numpy.einsum('ijab,iajb', t2, eris_ovov) * 2
-    emp2 -= numpy.einsum('ijab,ibja', t2, eris_ovov)
-    return emp2.real
+    ed = numpy.einsum('ijab,iajb', t2, eris_ovov) * 2
+    ex = -numpy.einsum('ijab,ibja', t2, eris_ovov)
+    emp2_ss = (ed*0.5 + ex).real
+    emp2_os = ed.real*0.5
+    emp2 = lib.tag_array(emp2_ss+emp2_os, e_corr_ss=emp2_ss, e_corr_os=emp2_os)
+    return emp2
 
 def update_amps(mp, t2, eris):
     '''Update non-canonical MP2 amplitudes'''
@@ -465,6 +474,8 @@ class MP2(lib.StreamObject):
 
         e_corr : float
             MP2 correlation correction
+        e_corr_ss/os : float
+            Same-spin and opposite-spin component of the MP2 correlation energy
         e_tot : float
             Total MP2 energy (HF + correlation)
         t2 :
@@ -500,6 +511,8 @@ def __init__(self, mf, frozen=None, mo_coeff=None, mo_occ=None):
         self._nmo = None
         self.e_hf = None
         self.e_corr = None
+        self.e_corr_ss = None
+        self.e_corr_os = None
         self.t2 = None
         self._keys = set(self.__dict__.keys())
 
@@ -549,10 +562,20 @@ def dump_flags(self, verbose=None):
     def emp2(self):
         return self.e_corr
 
+    @property
+    def emp2_scs(self):
+        # J. Chem. Phys. 118, 9095 (2003)
+        return self.e_corr_ss*1./3. + self.e_corr_os*1.2
+
     @property
     def e_tot(self):
         return self.e_hf + self.e_corr
 
+    @property
+    def e_tot_scs(self):
+        # J. Chem. Phys. 118, 9095 (2003)
+        return self.e_hf + self.emp2_scs
+
     def kernel(self, mo_energy=None, mo_coeff=None, eris=None, with_t2=WITH_T2):
         '''
         Args:
@@ -574,13 +597,22 @@ def kernel(self, mo_energy=None, mo_coeff=None, eris=None, with_t2=WITH_T2):
         else:
             self.converged, self.e_corr, self.t2 = _iterative_kernel(self, eris)
 
+        self.e_corr_ss = getattr(self.e_corr, 'e_corr_ss', 0)
+        self.e_corr_os = getattr(self.e_corr, 'e_corr_os', 0)
+        self.e_corr = float(self.e_corr)
+
         self._finalize()
         return self.e_corr, self.t2
 
     def _finalize(self):
         '''Hook for dumping results and clearing up the object.'''
-        logger.note(self, 'E(%s) = %.15g  E_corr = %.15g',
-                    self.__class__.__name__, self.e_tot, self.e_corr)
+        log = logger.new_logger(self)
+        log.note('E(%s) = %.15g  E_corr = %.15g',
+                 self.__class__.__name__, self.e_tot, self.e_corr)
+        log.note('E(SCS-%s) = %.15g  E_corr = %.15g',
+                 self.__class__.__name__, self.e_tot_scs, self.emp2_scs)
+        log.info('E_corr(same-spin) = %.15g', self.e_corr_ss)
+        log.info('E_corr(oppo-spin) = %.15g', self.e_corr_os)
         return self
 
     def ao2mo(self, mo_coeff=None):
diff --git a/pyscf/mp/test/test_gmp2.py b/pyscf/mp/test/test_gmp2.py
index abef7fd4e4..565ff39f20 100644
--- a/pyscf/mp/test/test_gmp2.py
+++ b/pyscf/mp/test/test_gmp2.py
@@ -35,10 +35,10 @@ def setUpModule():
     mol.spin = 2
     mol.build()
     mf = scf.UHF(mol)
-    mf.conv_tol = 1e-14
+    mf.conv_tol = 1e-12
     mf.scf()
     gmf = scf.GHF(mol)
-    gmf.conv_tol = 1e-14
+    gmf.conv_tol = 1e-12
     gmf.scf()
 
 def tearDownModule():
diff --git a/pyscf/mp/test/test_mp2.py b/pyscf/mp/test/test_mp2.py
index d8ca27af23..c126939fbf 100644
--- a/pyscf/mp/test/test_mp2.py
+++ b/pyscf/mp/test/test_mp2.py
@@ -61,12 +61,16 @@ def test_mp2(self):
         pt = mp.MP2(mf)
         emp2, t2 = pt.kernel(mf.mo_energy, mf.mo_coeff)
         self.assertAlmostEqual(emp2, -0.204019967288338, 8)
+        self.assertAlmostEqual(pt.e_corr_ss, -0.05153088565639835, 8)
+        self.assertAlmostEqual(pt.e_corr_os, -0.15248908163191538, 8)
         self.assertAlmostEqual(abs(t2 - t2ref0).max(), 0, 8)
 
         pt.max_memory = 1
         pt.frozen = None
         emp2, t2 = pt.kernel()
         self.assertAlmostEqual(emp2, -0.204019967288338, 8)
+        self.assertAlmostEqual(pt.e_corr_ss, -0.05153088565639835, 8)
+        self.assertAlmostEqual(pt.e_corr_os, -0.15248908163191538, 8)
         self.assertAlmostEqual(abs(t2 - t2ref0).max(), 0, 8)
 
     def test_mp2_outcore(self):
@@ -307,4 +311,3 @@ def test_non_canonical_mp2(self):
 if __name__ == "__main__":
     print("Full Tests for mp2")
     unittest.main()
-
diff --git a/pyscf/mp/test/test_ump2.py b/pyscf/mp/test/test_ump2.py
index 1f9c037101..527f81dea1 100644
--- a/pyscf/mp/test/test_ump2.py
+++ b/pyscf/mp/test/test_ump2.py
@@ -49,11 +49,15 @@ def test_ump2(self):
         pt = mp.MP2(mf)
         emp2, t2 = pt.kernel(mf.mo_energy, mf.mo_coeff)
         self.assertAlmostEqual(emp2, -0.16575150552336643, 8)
+        self.assertAlmostEqual(pt.e_corr_ss, -0.042627186675330754, 8)
+        self.assertAlmostEqual(pt.e_corr_os, -0.12312431898078077, 8)
 
         pt.max_memory = 1
         pt.frozen = None
         emp2, t2 = pt.kernel()
         self.assertAlmostEqual(emp2, -0.16575150552336643, 8)
+        self.assertAlmostEqual(pt.e_corr_ss, -0.042627186675330754, 8)
+        self.assertAlmostEqual(pt.e_corr_os, -0.12312431898078077, 8)
 
     def test_ump2_dm(self):
         pt = mp.MP2(mf)
@@ -232,7 +236,7 @@ def test_rdm_complex(self):
         vjb+= numpy.einsum('klij,lk->ij', eri_ab, dm[0])
         vka = numpy.einsum('ijkl,jk->il', eri_aa, dm[0])
         vkb = numpy.einsum('ijkl,jk->il', eri_bb, dm[1])
-        mf.get_veff = lambda *args: (vja - vka, vjb - vkb) 
+        mf.get_veff = lambda *args: (vja - vka, vjb - vkb)
         vhf = mf.get_veff()
         hcore = (numpy.diag(mo_energy[0]) - vhf[0],
                  numpy.diag(mo_energy[1]) - vhf[1])
diff --git a/pyscf/mp/ump2.py b/pyscf/mp/ump2.py
index 3512bd4622..ac1a9c1c45 100644
--- a/pyscf/mp/ump2.py
+++ b/pyscf/mp/ump2.py
@@ -60,7 +60,7 @@ def kernel(mp, mo_energy=None, mo_coeff=None, eris=None, with_t2=WITH_T2, verbos
     else:
         t2 = None
 
-    emp2 = 0.0
+    emp2_ss = emp2_os = 0.0
     for i in range(nocca):
         if isinstance(eris.ovov, numpy.ndarray) and eris.ovov.ndim == 4:
             # When mf._eri is a custom integrals wiht the shape (n,n,n,n), the
@@ -71,8 +71,8 @@ def kernel(mp, mo_energy=None, mo_coeff=None, eris=None, with_t2=WITH_T2, verbos
 
         eris_ovov = eris_ovov.reshape(nvira,nocca,nvira).transpose(1,0,2)
         t2i = eris_ovov.conj()/lib.direct_sum('a+jb->jab', eia_a[i], eia_a)
-        emp2 += numpy.einsum('jab,jab', t2i, eris_ovov) * .5
-        emp2 -= numpy.einsum('jab,jba', t2i, eris_ovov) * .5
+        emp2_ss += numpy.einsum('jab,jab', t2i, eris_ovov) * .5
+        emp2_ss -= numpy.einsum('jab,jba', t2i, eris_ovov) * .5
         if with_t2:
             t2aa[i] = t2i - t2i.transpose(0,2,1)
 
@@ -84,7 +84,7 @@ def kernel(mp, mo_energy=None, mo_coeff=None, eris=None, with_t2=WITH_T2, verbos
             eris_ovov = numpy.asarray(eris.ovOV[i*nvira:(i+1)*nvira])
         eris_ovov = eris_ovov.reshape(nvira,noccb,nvirb).transpose(1,0,2)
         t2i = eris_ovov.conj()/lib.direct_sum('a+jb->jab', eia_a[i], eia_b)
-        emp2 += numpy.einsum('JaB,JaB', t2i, eris_ovov)
+        emp2_os += numpy.einsum('JaB,JaB', t2i, eris_ovov)
         if with_t2:
             t2ab[i] = t2i
 
@@ -97,12 +97,16 @@ def kernel(mp, mo_energy=None, mo_coeff=None, eris=None, with_t2=WITH_T2, verbos
             eris_ovov = numpy.asarray(eris.OVOV[i*nvirb:(i+1)*nvirb])
         eris_ovov = eris_ovov.reshape(nvirb,noccb,nvirb).transpose(1,0,2)
         t2i = eris_ovov.conj()/lib.direct_sum('a+jb->jab', eia_b[i], eia_b)
-        emp2 += numpy.einsum('jab,jab', t2i, eris_ovov) * .5
-        emp2 -= numpy.einsum('jab,jba', t2i, eris_ovov) * .5
+        emp2_ss += numpy.einsum('jab,jab', t2i, eris_ovov) * .5
+        emp2_ss -= numpy.einsum('jab,jba', t2i, eris_ovov) * .5
         if with_t2:
             t2bb[i] = t2i - t2i.transpose(0,2,1)
 
-    return emp2.real, t2
+    emp2_ss = emp2_ss.real
+    emp2_os = emp2_os.real
+    emp2 = lib.tag_array(emp2_ss+emp2_os, e_corr_ss=emp2_ss, e_corr_os=emp2_os)
+
+    return emp2, t2
 
 def energy(mp, t2, eris):
     '''MP2 energy'''
@@ -111,14 +115,16 @@ def energy(mp, t2, eris):
     eris_ovov = numpy.asarray(eris.ovov).reshape(nocca,nvira,nocca,nvira)
     eris_OVOV = numpy.asarray(eris.OVOV).reshape(noccb,nvirb,noccb,nvirb)
     eris_ovOV = numpy.asarray(eris.ovOV).reshape(nocca,nvira,noccb,nvirb)
-    e  = 0.25 * numpy.einsum('ijab,iajb->', t2aa, eris_ovov)
-    e -= 0.25 * numpy.einsum('ijab,ibja->', t2aa, eris_ovov)
-    e += 0.25 * numpy.einsum('ijab,iajb->', t2bb, eris_OVOV)
-    e -= 0.25 * numpy.einsum('ijab,ibja->', t2bb, eris_OVOV)
-    e +=        numpy.einsum('iJaB,iaJB->', t2ab, eris_ovOV)
+    ess  = 0.25 * numpy.einsum('ijab,iajb->', t2aa, eris_ovov)
+    ess -= 0.25 * numpy.einsum('ijab,ibja->', t2aa, eris_ovov)
+    ess += 0.25 * numpy.einsum('ijab,iajb->', t2bb, eris_OVOV)
+    ess -= 0.25 * numpy.einsum('ijab,ibja->', t2bb, eris_OVOV)
+    eos  =        numpy.einsum('iJaB,iaJB->', t2ab, eris_ovOV)
+    e    = ess + eos
     if abs(e.imag) > 1e-4:
         logger.warn(mp, 'Non-zero imaginary part found in UMP2 energy %s', e)
-    return e.real
+    e = lib.tag_array(e.real, e_corr_ss=ess.real, e_corr_os=eos.real)
+    return e
 
 def update_amps(mp, t2, eris):
     '''Update non-canonical MP2 amplitudes'''
diff --git a/pyscf/pbc/cc/eom_kccsd_rhf.py b/pyscf/pbc/cc/eom_kccsd_rhf.py
index d8def6bffb..350244e29f 100644
--- a/pyscf/pbc/cc/eom_kccsd_rhf.py
+++ b/pyscf/pbc/cc/eom_kccsd_rhf.py
@@ -380,6 +380,7 @@ class EOMIP(eom_kgccsd.EOMIP):
     l_matvec = lipccsd_matvec
     get_diag = ipccsd_diag
     ccsd_star_contract = ipccsd_star_contract
+    mask_frozen = eom_kgccsd.mask_frozen_ip
 
     @property
     def nkpts(self):
@@ -778,6 +779,7 @@ class EOMEA(eom_kgccsd.EOMEA):
     l_matvec = leaccsd_matvec
     get_diag = eaccsd_diag
     ccsd_star_contract = eaccsd_star_contract
+    mask_frozen = eom_kgccsd.mask_frozen_ea
 
     @property
     def nkpts(self):
diff --git a/pyscf/pbc/df/aft.py b/pyscf/pbc/df/aft.py
index 6025b07fa7..9ccd48fd35 100644
--- a/pyscf/pbc/df/aft.py
+++ b/pyscf/pbc/df/aft.py
@@ -20,6 +20,7 @@
 
 
 import copy
+import contextlib
 import numpy
 from pyscf import lib
 from pyscf import gto
@@ -372,6 +373,39 @@ def ft_loop(self, mesh=None, q=numpy.zeros(3), kpts=None, shls_slice=None,
             dat = ft_kern(Gv[p0:p1], gxyz[p0:p1], Gvbase, q, kpts, shls_slice, out=buf)
             yield dat, p0, p1
 
+    @contextlib.contextmanager
+    def range_coulomb(self, omega):
+        '''Creates a temporary density fitting object for RSH-DF integrals.
+        In this context, only LR or SR integrals for mol and auxmol are computed.
+        '''
+        key = '%.6f' % omega
+        if key in self._rsh_df:
+            rsh_df = self._rsh_df[key]
+        else:
+            rsh_df = self._rsh_df[key] = copy.copy(self).reset()
+            logger.info(self, 'Create RSH-DF object %s for omega=%s', rsh_df, omega)
+
+        cell = self.cell
+        auxcell = getattr(self, 'auxcell', None)
+
+        cell_omega = cell.omega
+        cell.omega = omega
+        auxcell_omega = None
+        if auxcell is not None:
+            auxcell_omega = auxcell.omega
+            auxcell.omega = omega
+
+        assert rsh_df.cell.omega == omega
+        if getattr(rsh_df, 'auxcell', None) is not None:
+            assert rsh_df.auxcell.omega == omega
+
+        try:
+            yield rsh_df
+        finally:
+            cell.omega = cell_omega
+            if auxcell_omega is not None:
+                auxcell.omega = auxcell_omega
+
 
 class AFTDF(lib.StreamObject, AFTDFMixin):
     '''Density expansion on plane waves
@@ -476,8 +510,9 @@ def build(self):
     def get_jk(self, dm, hermi=1, kpts=None, kpts_band=None,
                with_j=True, with_k=True, omega=None, exxdiv=None):
         if omega is not None:  # J/K for RSH functionals
-            return _sub_df_jk_(self, dm, hermi, kpts, kpts_band,
-                               with_j, with_k, omega, exxdiv)
+            with self.range_coulomb(omega) as rsh_df:
+                return rsh_df.get_jk(dm, hermi, kpts, kpts_band, with_j, with_k,
+                                     omega=None, exxdiv=exxdiv)
 
         if kpts is None:
             if numpy.all(self.kpts == 0):
@@ -546,14 +581,7 @@ def get_naoaux(self):
 
 def _sub_df_jk_(dfobj, dm, hermi=1, kpts=None, kpts_band=None,
                 with_j=True, with_k=True, omega=None, exxdiv=None):
-    key = '%.6f' % omega
-    if key in dfobj._rsh_df:
-        rsh_df = dfobj._rsh_df[key]
-    else:
-        rsh_df = dfobj._rsh_df[key] = copy.copy(dfobj).reset()
-        logger.info(dfobj, 'Create RSH-%s object %s for omega=%s',
-                    dfobj.__class__.__name__, rsh_df, omega)
-    with rsh_df.cell.with_range_coulomb(omega):
+    with dfobj.range_coulomb(omega) as rsh_df:
         return rsh_df.get_jk(dm, hermi, kpts, kpts_band, with_j, with_k,
                              omega=None, exxdiv=exxdiv)
 
diff --git a/pyscf/pbc/df/df.py b/pyscf/pbc/df/df.py
index 772ac3bada..0aa7a18964 100644
--- a/pyscf/pbc/df/df.py
+++ b/pyscf/pbc/df/df.py
@@ -55,7 +55,6 @@
 from pyscf.pbc.df.df_jk import zdotCN
 from pyscf.pbc.lib.kpts_helper import (is_zero, gamma_point, member, unique,
                                        KPT_DIFF_TOL)
-from pyscf.pbc.df.aft import _sub_df_jk_
 from pyscf.pbc.df.gdf_builder import libpbc, _CCGDFBuilder, _guess_eta
 from pyscf.pbc.df.rsdf_builder import _RSGDFBuilder
 from pyscf import __config__
@@ -128,9 +127,13 @@ def make_modrho_basis(cell, auxbasis=None, drop_eta=None):
 class GDF(lib.StreamObject, aft.AFTDFMixin):
     '''Gaussian density fitting
     '''
+    blockdim = getattr(__config__, 'pbc_df_df_DF_blockdim', 240)
+    _dataname = 'j3c'
     # Call _CCGDFBuilder if applicable. _CCGDFBuilder is slower than
     # _RSGDFBuilder but numerically more close to previous versions
     _prefer_ccdf = False
+    # If True, force using denisty matrix-based K-build
+    force_dm_kbuild = False
 
     def __init__(self, cell, kpts=numpy.zeros((1,3))):
         self.cell = cell
@@ -157,7 +160,6 @@ def __init__(self, cell, kpts=numpy.zeros((1,3))):
         # The following attributes are not input options.
         self.exxdiv = None  # to mimic KRHF/KUHF object in function get_coulG
         self.auxcell = None
-        self.blockdim = getattr(__config__, 'pbc_df_df_DF_blockdim', 240)
         self.linear_dep_threshold = LINEAR_DEP_THR
         self._j_only = False
 # If _cderi_to_save is specified, the 3C-integral tensor will be saved in this file.
@@ -182,7 +184,6 @@ def reset(self, cell=None):
             self.cell = cell
         self.auxcell = None
         self._cderi = None
-        self._cderi_to_save = tempfile.NamedTemporaryFile(dir=lib.param.TMPDIR)
         self._rsh_df = {}
         return self
 
@@ -284,12 +285,14 @@ def _make_j3c(self, cell=None, auxcell=None, kptij_lst=None, cderi_file=None):
         dfbuilder.mesh = self.mesh
         dfbuilder.linear_dep_threshold = self.linear_dep_threshold
         j_only = self._j_only or len(kpts_union) == 1
-        dfbuilder.make_j3c(cderi_file, j_only=j_only)
+        dfbuilder.make_j3c(cderi_file, j_only=j_only, dataname=self._dataname)
 
-    def cderi_array(self, label='j3c'):
+    def cderi_array(self, label=None):
         '''
         Returns CDERIArray object which provides numpy APIs to access cderi tensor.
         '''
+        if label is None:
+            label = self._dataname
         if self._cderi is None:
             self.build()
         return CDERIArray(self._cderi, label)
@@ -347,7 +350,7 @@ def load(aux_slice):
                     LpqI = lib.unpack_tril(LpqI, lib.ANTIHERMI).reshape(naux,nao**2)
             return LpqR, LpqI
 
-        with _load3c(self._cderi, 'j3c', kpti_kptj) as j3c:
+        with _load3c(self._cderi, self._dataname, kpti_kptj) as j3c:
             slices = lib.prange(0, j3c.shape[0], blksize)
             for LpqR, LpqI in lib.map_with_prefetch(load, slices):
                 yield LpqR, LpqI, 1
@@ -356,7 +359,8 @@ def load(aux_slice):
         if cell.dimension == 2 and cell.low_dim_ft_type != 'inf_vacuum':
             # Truncated Coulomb operator is not postive definite. Load the
             # CDERI tensor of negative part.
-            with _load3c(self._cderi, 'j3c-', kpti_kptj, ignore_key_error=True) as j3c:
+            with _load3c(self._cderi, self._dataname+'-', kpti_kptj,
+                         ignore_key_error=True) as j3c:
                 slices = lib.prange(0, j3c.shape[0], blksize)
                 for LpqR, LpqI in lib.map_with_prefetch(load, slices):
                     yield LpqR, LpqI, -1
@@ -391,8 +395,9 @@ def get_jk(self, dm, hermi=1, kpts=None, kpts_band=None,
                 mydf.mesh = tools.cutoff_to_mesh(cell.lattice_vectors(), ke_cutoff)
             else:
                 mydf = self
-            return _sub_df_jk_(mydf, dm, hermi, kpts, kpts_band,
-                               with_j, with_k, omega, exxdiv)
+            with mydf.range_coulomb(omega) as rsh_df:
+                return rsh_df.get_jk(dm, hermi, kpts, kpts_band, with_j, with_k,
+                                     omega=None, exxdiv=exxdiv)
 
         if kpts is None:
             if numpy.all(self.kpts == 0):
@@ -458,7 +463,7 @@ def get_naoaux(self):
         if self._cderi is None:
             self.build()
         # self._cderi['j3c/k_id/seg_id']
-        with addons.load(self._cderi, 'j3c/0') as feri:
+        with addons.load(self._cderi, f'{self._dataname}/0') as feri:
             if isinstance(feri, h5py.Group):
                 naux = feri['0'].shape[0]
             else:
@@ -468,8 +473,8 @@ def get_naoaux(self):
         if (cell.dimension == 2 and cell.low_dim_ft_type != 'inf_vacuum' and
             not isinstance(self._cderi, numpy.ndarray)):
             with h5py.File(self._cderi, 'r') as feri:
-                if 'j3c-/0' in feri:
-                    dat = feri['j3c-/0']
+                if f'{self._dataname}-/0' in feri:
+                    dat = feri[f'{self._dataname}-/0']
                     if isinstance(dat, h5py.Group):
                         naux += dat['0'].shape[0]
                     else:
@@ -490,8 +495,18 @@ def __init__(self, data_group, label='j3c'):
             data_group = h5py.File(data_group, 'r')
         self.data_group = data_group
         if 'kpts' not in data_group:
-            raise RuntimeError('cderi data not generated or format incompatible')
-
+            # TODO: Deprecate the v1 data format
+            self._data_version = 'v1'
+            self._cderi = data_group.file.filename
+            self._label = label
+            self._kptij_lst = data_group['j3c-kptij'][()]
+            kpts = unique(self._kptij_lst[:,0])[0]
+            self.nkpts = nkpts = len(kpts)
+            if len(self._kptij_lst) not in (nkpts, nkpts**2, nkpts*(nkpts+1)//2):
+                raise RuntimeError(f'Dimension error for CDERI {self._cderi}')
+            return
+
+        self._data_version = 'v2'
         aosym = data_group['aosym'][()]
         if isinstance(aosym, bytes):
             aosym = aosym.decode()
@@ -543,6 +558,25 @@ def __getitem__(self, slices):
         return out[k_slices]
 
     def _load_one(self, ki, kj, slices):
+        if self._data_version == 'v1':
+            with _load3c(self._cderi, self._label) as fload:
+                if len(self._kptij_lst) == self.nkpts:
+                    # kptij_lst was generated with option j_only, leading to
+                    # only the diagonal terms
+                    kikj = ki
+                    kpti, kptj = self._kptij_lst[kikj]
+                elif len(self._kptij_lst) == self.nkpts**2:
+                    kikj = ki * self.nkpts + kj
+                    kpti, kptj = self._kptij_lst[kikj]
+                elif ki >= kj:
+                    kikj = ki*(ki+1)//2 + kj
+                    kpti, kptj = self._kptij_lst[kikj]
+                else:
+                    kikj = kj*(kj+1)//2 + ki
+                    kptj, kpti = self._kptij_lst[kikj]
+                out = fload(kpti, kptj)
+                return out[slices]
+
         kikj = ki * self.nkpts + kj
         kjki = kj * self.nkpts + ki
         if self.aosym == 's1' or kikj == kjki:
@@ -565,6 +599,10 @@ def _load_one(self, ki, kj, slices):
         return out
 
     def load(self, kpti, kptj):
+        if self._data_version == 'v1':
+            with _load3c(self._cderi, self._label) as fload:
+                return numpy.asarray(fload(kpti, kptj))
+
         ki = member(kpti, self.kpts)
         kj = member(kptj, self.kpts)
         if len(ki) == 0 or len(kj) == 0:
@@ -581,7 +619,7 @@ class _load3c:
     pyscf-2.0 or older, version 2 from pyscf-2.1 or newer). This function
     can read both data formats.
     '''
-    def __init__(self, cderi, label, kpti_kptj=None, kptij_label=None,
+    def __init__(self, cderi, label, kpti_kptj=None, kptij_label='j3c-kptij',
                  ignore_key_error=False):
         self.cderi = cderi
         self.label = label
@@ -615,11 +653,7 @@ def kptij_lst(self):
             if self.data_version == 'v2':
                 self._kptij_lst = self.feri['kpts'][()]
             else:
-                if self.kptij_label is None:
-                    kptij_label = self.label + '-kptij'
-                else:
-                    kptij_label = self.kptij_label
-                self._kptij_lst = self.feri[kptij_label][()]
+                self._kptij_lst = self.feri[self.kptij_label][()]
         return self._kptij_lst
 
     @property
diff --git a/pyscf/pbc/df/df_jk.py b/pyscf/pbc/df/df_jk.py
index 043ea194f4..0de1d11585 100644
--- a/pyscf/pbc/df/df_jk.py
+++ b/pyscf/pbc/df/df_jk.py
@@ -31,6 +31,9 @@
 from pyscf.pbc import tools
 from pyscf.pbc.lib.kpts import KPoints
 from pyscf.pbc.lib.kpts_helper import is_zero, gamma_point, member
+from pyscf import __config__
+
+DM2MO_PREC = getattr(__config__, 'pbc_gto_df_df_jk_dm2mo_prec', 1e-10)
 
 def density_fit(mf, auxbasis=None, mesh=None, with_df=None):
     '''Generte density-fitting SCF object
@@ -69,14 +72,14 @@ def density_fit(mf, auxbasis=None, mesh=None, with_df=None):
 
 def get_j_kpts(mydf, dm_kpts, hermi=1, kpts=numpy.zeros((1,3)), kpts_band=None):
     log = logger.Logger(mydf.stdout, mydf.verbose)
-    t1 = (logger.process_clock(), logger.perf_counter())
+    t0 = (logger.process_clock(), logger.perf_counter())
     if mydf._cderi is None or not mydf.has_kpts(kpts_band):
         if mydf._cderi is not None:
             log.warn('DF integrals for band k-points were not found %s. '
                      'DF integrals will be rebuilt to include band k-points.',
                      mydf._cderi)
         mydf.build(kpts_band=kpts_band)
-        t1 = log.timer_debug1('Init get_j_kpts', *t1)
+        t0 = log.timer_debug1('Init get_j_kpts', *t0)
 
     dm_kpts = lib.asarray(dm_kpts, order='C')
     dms = _format_dms(dm_kpts, kpts)
@@ -93,6 +96,7 @@ def get_j_kpts(mydf, dm_kpts, hermi=1, kpts=numpy.zeros((1,3)), kpts_band=None):
     nband = len(kpts_band)
     j_real = gamma_point(kpts_band) and not numpy.iscomplexobj(dms)
 
+    t1 = (logger.process_clock(), logger.perf_counter())
     dmsR = dms.real.transpose(0,1,3,2).reshape(nset,nkpts,nao**2)
     dmsI = dms.imag.transpose(0,1,3,2).reshape(nset,nkpts,nao**2)
     rhoR = numpy.zeros((nset,naux))
@@ -143,6 +147,8 @@ def get_j_kpts(mydf, dm_kpts, hermi=1, kpts=numpy.zeros((1,3)), kpts_band=None):
     vj_kpts = lib.unpack_tril(vj_kpts.reshape(-1,nao_pair))
     vj_kpts = vj_kpts.reshape(nset,nband,nao,nao)
 
+    log.timer('get_j', *t0)
+
     return _format_jks(vj_kpts, dm_kpts, input_band, kpts)
 
 
@@ -156,65 +162,335 @@ def get_k_kpts(mydf, dm_kpts, hermi=1, kpts=numpy.zeros((1,3)), kpts_band=None,
                  'exxdiv needs to be "ewald" or None', exxdiv)
         raise RuntimeError('GDF does not support exxdiv %s' % exxdiv)
 
-    t1 = (logger.process_clock(), logger.perf_counter())
+    t0 = (logger.process_clock(), logger.perf_counter())
     if mydf._cderi is None or not mydf.has_kpts(kpts_band):
         if mydf._cderi is not None:
             log.warn('DF integrals for band k-points were not found %s. '
                      'DF integrals will be rebuilt to include band k-points.',
                      mydf._cderi)
         mydf.build(kpts_band=kpts_band)
-        t1 = log.timer_debug1('Init get_k_kpts', *t1)
+        t0 = log.timer_debug1('Init get_k_kpts', *t0)
+
+    mo_coeff = getattr(dm_kpts, 'mo_coeff', None)
+    if mo_coeff is not None:
+        mo_occ = dm_kpts.mo_occ
 
     dm_kpts = lib.asarray(dm_kpts, order='C')
     dms = _format_dms(dm_kpts, kpts)
     nset, nkpts, nao = dms.shape[:3]
 
+    skmoR = skmo2R = None
+    if not mydf.force_dm_kbuild:
+        if mo_coeff is not None:
+            if isinstance(mo_coeff[0], (list, tuple)):
+                mo_coeff = [mo for mo1 in mo_coeff for mo in mo1]
+            if len(mo_coeff) != nset*nkpts: # wrong shape
+                log.warn('mo_coeff from dm tag has wrong shape. '
+                         'Calculating mo from dm instead.')
+                mo_coeff = None
+            elif isinstance(mo_occ[0], (list, tuple)):
+                mo_occ = [mo for mo1 in mo_occ for mo in mo1]
+        if mo_coeff is not None:
+            skmoR, skmoI = _format_mo(mo_coeff, mo_occ, shape=(nset,nkpts), order='F',
+                                      precision=cell.precision)
+        elif hermi == 1:
+            skmoR, skmoI = _mo_from_dm(dms.reshape(-1,nao,nao), method='eigh',
+                                       shape=(nset,nkpts), order='F',
+                                       precision=cell.precision)
+            if skmoR is None:
+                log.debug1('get_k_kpts: Eigh fails for input dm due to non-PSD. '
+                           'Try SVD instead.')
+        if skmoR is None:
+            skmoR, skmoI, skmo2R, skmo2I = _mo_from_dm(dms.reshape(-1,nao,nao),
+                                                   method='svd', shape=(nset,nkpts),
+                                                   order='F', precision=cell.precision)
+            if skmoR[0,0].shape[1] > nao//2:
+                log.debug1('get_k_kpts: rank(dm) = %d exceeds half of nao = %d. '
+                           'Fall back to DM-based build.', skmoR[0,0].shape[1], nao)
+                skmoR = skmo2R = None
+
     kpts_band, input_band = _format_kpts_band(kpts_band, kpts), kpts_band
     nband = len(kpts_band)
     vkR = numpy.zeros((nset,nband,nao,nao))
     vkI = numpy.zeros((nset,nband,nao,nao))
-    dmsR = numpy.asarray(dms.real, order='C')
-    dmsI = numpy.asarray(dms.imag, order='C')
 
+    tspans = numpy.zeros((7,2))
+    tspannames = ['buf1', 'ct11', 'ct12', 'buf2', 'ct21', 'ct22', 'load']
+
+    ''' math
+    K(p,q; k2 from k1)
+        = V(r k1, q k2, p k2, s k1) * D(s,r; k1)
+        = V(L, r k1, q k2) * V(L, s k1, p k2).conj() * D(s,r; k1)         eqn (1)
+    --> in case of Hermitian & PSD DM
+        = ( V(L, s k1, p k2) * C(s,i; k1).conj() ).conj()
+          * V(L, r k1, q k2) * C(r,i; k1).conj()                          eqn (2)
+        = W(L, i k1, p k2).conj() * W(L, i k1, q k2)                      eqn (3)
+    --> in case of non-Hermitian or non-PSD DM
+        = ( V(L, s k1, p k2) * A(s,i; k1).conj() ).conj()
+          * V(L, r k1, q k2) * B(r,i; k1).conj()                          eqn (4)
+        = X(L, i k1, p k2).conj() * Y(L, i k1, q k2)                      eqn (5)
+
+    if swap_2e:
+    K(p,q; k1 from k2)
+        = V(p k1, s k2, r k2, q k1) * D(s,r; k2)
+        = V(L, p k1, s k2) * V(L, q k1, r k2).conj() * D(s,r; k2)         eqn (1')
+    --> in case of Hermitian & PSD DM
+        = V(L, p k1, s k2) * C(s,i; k2)
+          * ( V(L, q k1, r k2) * C(r,i; k2) ).conj()                      eqn (2')
+        = W(L, p k1, i k2) * W(L, q k1, i k2).conj()                      eqn (3')
+    --> in case of non-Hermitian or non-PSD DM
+        = V(L, p k1, s k2) * A(s,i; k2)
+          * ( V(L, q k1, r k2) * B(r,i; k2) ).conj()                      eqn (4')
+        = X(L, p k1, i k2) * Y(L, q k1, i k2).conj()                      eqn (5')
+
+    Mode 1: DM-based K-build uses eqn (1) and eqn (1')
+    Mode 2: Symm MO-based K-build uses eqns (2,3) and eqns (2',3')
+    Mode 3: Asymm MO-based K-build uses eqns (4,5) and eqns (4',5')
+    '''
     # K_pq = ( p{k1} i{k2} | i{k2} q{k1} )
-    bufR = numpy.empty((mydf.blockdim*nao**2))
-    bufI = numpy.empty((mydf.blockdim*nao**2))
-    max_memory = max(2000, mydf.max_memory-lib.current_memory()[0])
-    def make_kpt(ki, kj, swap_2e, inverse_idx=None):
-        kpti = kpts[ki]
-        kptj = kpts_band[kj]
-
-        for LpqR, LpqI, sign in mydf.sr_loop((kpti,kptj), max_memory, False):
-            nrow = LpqR.shape[0]
-            pLqR = numpy.ndarray((nao,nrow,nao), buffer=bufR)
-            pLqI = numpy.ndarray((nao,nrow,nao), buffer=bufI)
-            tmpR = numpy.ndarray((nao,nrow*nao), buffer=LpqR)
-            tmpI = numpy.ndarray((nao,nrow*nao), buffer=LpqI)
-            pLqR[:] = LpqR.reshape(-1,nao,nao).transpose(1,0,2)
-            pLqI[:] = LpqI.reshape(-1,nao,nao).transpose(1,0,2)
-
-            for i in range(nset):
-                zdotNN(dmsR[i,ki], dmsI[i,ki], pLqR.reshape(nao,-1),
-                       pLqI.reshape(nao,-1), 1, tmpR, tmpI)
-                zdotCN(pLqR.reshape(-1,nao).T, pLqI.reshape(-1,nao).T,
-                       tmpR.reshape(-1,nao), tmpI.reshape(-1,nao),
-                       sign, vkR[i,kj], vkI[i,kj], 1)
-
-            if swap_2e:
-                tmpR = tmpR.reshape(nao*nrow,nao)
-                tmpI = tmpI.reshape(nao*nrow,nao)
-                ki_tmp = ki
-                kj_tmp = kj
-                if inverse_idx:
-                    ki_tmp = inverse_idx[0]
-                    kj_tmp = inverse_idx[1]
+    if skmoR is None: # input dm is not Hermitian/PSD --> build K from dm
+        log.debug2('get_k_kpts: build K from dm')
+        dmsR = numpy.asarray(dms.real, order='C')
+        dmsI = numpy.asarray(dms.imag, order='C')
+        bufR = numpy.empty((mydf.blockdim*nao**2))
+        bufI = numpy.empty((mydf.blockdim*nao**2))
+        max_memory = max(2000, mydf.max_memory-lib.current_memory()[0])
+        def make_kpt(ki, kj, swap_2e, inverse_idx=None):
+            kpti = kpts[ki]
+            kptj = kpts_band[kj]
+
+            tock = numpy.asarray((logger.process_clock(), logger.perf_counter()))
+
+            for LpqR, LpqI, sign in mydf.sr_loop((kpti,kptj), max_memory, False):
+                nrow = LpqR.shape[0]
+
+                tick = numpy.asarray((logger.process_clock(), logger.perf_counter()))
+                tspans[6] += tick - tock
+
+                pLqR = numpy.ndarray((nao,nrow,nao), buffer=bufR)
+                pLqI = numpy.ndarray((nao,nrow,nao), buffer=bufI)
+                tmpR = numpy.ndarray((nao,nrow*nao), buffer=LpqR)
+                tmpI = numpy.ndarray((nao,nrow*nao), buffer=LpqI)
+                pLqR[:] = LpqR.reshape(-1,nao,nao).transpose(1,0,2)
+                pLqI[:] = LpqI.reshape(-1,nao,nao).transpose(1,0,2)
+
+                tock = numpy.asarray((logger.process_clock(), logger.perf_counter()))
+                tspans[0] += tock - tick
+
                 for i in range(nset):
-                    zdotNN(pLqR.reshape(-1,nao), pLqI.reshape(-1,nao),
-                           dmsR[i,kj_tmp], dmsI[i,kj_tmp], 1, tmpR, tmpI)
-                    zdotNC(tmpR.reshape(nao,-1), tmpI.reshape(nao,-1),
-                           pLqR.reshape(nao,-1).T, pLqI.reshape(nao,-1).T,
-                           sign, vkR[i,ki_tmp], vkI[i,ki_tmp], 1)
+                    zdotNN(dmsR[i,ki], dmsI[i,ki], pLqR.reshape(nao,-1),
+                           pLqI.reshape(nao,-1), 1, tmpR, tmpI)
+                    tick = numpy.asarray((logger.process_clock(), logger.perf_counter()))
+                    tspans[1] += tick - tock
+                    zdotCN(pLqR.reshape(-1,nao).T, pLqI.reshape(-1,nao).T,
+                           tmpR.reshape(-1,nao), tmpI.reshape(-1,nao),
+                           sign, vkR[i,kj], vkI[i,kj], 1)
+                    tock = numpy.asarray((logger.process_clock(), logger.perf_counter()))
+                    tspans[2] += tock - tick
+
+                if swap_2e:
+                    tmpR = tmpR.reshape(nao*nrow,nao)
+                    tmpI = tmpI.reshape(nao*nrow,nao)
+                    tick = numpy.asarray((logger.process_clock(), logger.perf_counter()))
+                    tspans[3] += tick - tock
+                    ki_tmp = ki
+                    kj_tmp = kj
+                    if inverse_idx:
+                        ki_tmp = inverse_idx[0]
+                        kj_tmp = inverse_idx[1]
+                    for i in range(nset):
+                        zdotNN(pLqR.reshape(-1,nao), pLqI.reshape(-1,nao),
+                               dmsR[i,kj_tmp], dmsI[i,kj_tmp], 1, tmpR, tmpI)
+                        tock = numpy.asarray((logger.process_clock(), logger.perf_counter()))
+                        tspans[4] += tock - tick
+                        zdotNC(tmpR.reshape(nao,-1), tmpI.reshape(nao,-1),
+                               pLqR.reshape(nao,-1).T, pLqI.reshape(nao,-1).T,
+                               sign, vkR[i,ki_tmp], vkI[i,ki_tmp], 1)
+                        tick = numpy.asarray((logger.process_clock(), logger.perf_counter()))
+                        tspans[5] += tick - tock
+
+                tock = numpy.asarray((logger.process_clock(), logger.perf_counter()))
+
+                LpqR = LpqI = pLqR = pLqI = tmpR = tmpI = None
+    elif skmo2R is None:
+        log.debug2('get_k_kpts: build K from symm mo coeff')
+        nmo = skmoR[0,0].shape[1]
+        log.debug2('get_k_kpts: rank(dm) = %d / %d', nmo, nao)
+        skmoI_mask = numpy.asarray([[abs(skmoI[i,k]).max() > cell.precision
+                                     for k in range(nkpts)] for i in range(nset)])
+        bufR = numpy.empty((mydf.blockdim*nao**2))
+        bufI = numpy.empty((mydf.blockdim*nao**2))
+        max_memory = max(2000, mydf.max_memory-lib.current_memory()[0])
+        def make_kpt(ki, kj, swap_2e, inverse_idx=None):
+            kpti = kpts[ki]
+            kptj = kpts_band[kj]
+
+            tock = numpy.asarray((logger.process_clock(), logger.perf_counter()))
+
+            for LpqR, LpqI, sign in mydf.sr_loop((kpti,kptj), max_memory, False):
+                nrow = LpqR.shape[0]
+
+                tick = numpy.asarray((logger.process_clock(), logger.perf_counter()))
+                tspans[6] += tick - tock
+
+                pLqR = numpy.ndarray((nao,nrow,nao), buffer=bufR)
+                pLqI = numpy.ndarray((nao,nrow,nao), buffer=bufI)
+                tmpR = numpy.ndarray((nmo,nrow*nao), buffer=LpqR)
+                tmpI = numpy.ndarray((nmo,nrow*nao), buffer=LpqI)
+                pLqR[:] = LpqR.reshape(-1,nao,nao).transpose(1,0,2)
+                pLqI[:] = LpqI.reshape(-1,nao,nao).transpose(1,0,2)
+
+                tock = numpy.asarray((logger.process_clock(), logger.perf_counter()))
+                tspans[0] += tock - tick
 
+                for i in range(nset):
+                    moR = skmoR[i,ki]
+                    if skmoI_mask[i,ki]:
+                        moI = skmoI[i,ki]
+                        zdotCN(moR.T, moI.T, pLqR.reshape(nao,-1), pLqI.reshape(nao,-1),
+                               1, tmpR, tmpI)
+                    else:
+                        lib.ddot(moR.T, pLqR.reshape(nao,-1), 1, tmpR)
+                        lib.ddot(moR.T, pLqI.reshape(nao,-1), 1, tmpI)
+                    tick = numpy.asarray((logger.process_clock(), logger.perf_counter()))
+                    tspans[1] += tick - tock
+                    zdotCN(tmpR.reshape(-1,nao).T, tmpI.reshape(-1,nao).T,
+                           tmpR.reshape(-1,nao), tmpI.reshape(-1,nao),
+                           sign, vkR[i,kj], vkI[i,kj], 1)
+                    tock = numpy.asarray((logger.process_clock(), logger.perf_counter()))
+                    tspans[2] += tock - tick
+
+                if swap_2e:
+                    tmpR = tmpR.reshape(nrow*nao,nmo)
+                    tmpI = tmpI.reshape(nrow*nao,nmo)
+                    tick = numpy.asarray((logger.process_clock(), logger.perf_counter()))
+                    tspans[3] += tick - tock
+                    ki_tmp = ki
+                    kj_tmp = kj
+                    if inverse_idx:
+                        ki_tmp = inverse_idx[0]
+                        kj_tmp = inverse_idx[1]
+                    for i in range(nset):
+                        moR = skmoR[i,kj_tmp]
+                        if skmoI_mask[i,kj_tmp]:
+                            moI = skmoI[i,kj_tmp]
+                            zdotNN(pLqR.reshape(-1,nao), pLqI.reshape(-1,nao), moR, moI,
+                                   1, tmpR, tmpI)
+                        else:
+                            lib.ddot(pLqR.reshape(-1,nao), moR, 1, tmpR)
+                            lib.ddot(pLqI.reshape(-1,nao), moR, 1, tmpI)
+                        tock = numpy.asarray((logger.process_clock(), logger.perf_counter()))
+                        tspans[4] += tock - tick
+                        zdotNC(tmpR.reshape(nao,-1), tmpI.reshape(nao,-1),
+                               tmpR.reshape(nao,-1).T, tmpI.reshape(nao,-1).T,
+                               sign, vkR[i,ki_tmp], vkI[i,ki_tmp], 1)
+                        tick = numpy.asarray((logger.process_clock(), logger.perf_counter()))
+                        tspans[5] += tick - tock
+
+                tock = numpy.asarray((logger.process_clock(), logger.perf_counter()))
+
+                LpqR = LpqI = pLqR = pLqI = tmpR = tmpI = None
+    else:
+        log.debug2('get_k_kpts: build K from asymm mo coeff')
+        skmo1R = skmoR
+        skmo1I = skmoI
+        nmo = skmoR[0,0].shape[1]
+        log.debug2('get_k_kpts: rank(dm) = %d / %d', nmo, nao)
+        skmoI_mask = numpy.asarray([[max(abs(skmo1I[i,k]).max(),
+                                         abs(skmo2I[i,k]).max()) > cell.precision
+                                     for k in range(nkpts)] for i in range(nset)])
+        bufR = numpy.empty((mydf.blockdim*nao**2))
+        bufI = numpy.empty((mydf.blockdim*nao**2))
+        max_memory = max(2000, mydf.max_memory-lib.current_memory()[0])
+        def make_kpt(ki, kj, swap_2e, inverse_idx=None):
+            kpti = kpts[ki]
+            kptj = kpts_band[kj]
+
+            tock = numpy.asarray((logger.process_clock(), logger.perf_counter()))
+
+            for LpqR, LpqI, sign in mydf.sr_loop((kpti,kptj), max_memory, False):
+                nrow = LpqR.shape[0]
+
+                tick = numpy.asarray((logger.process_clock(), logger.perf_counter()))
+                tspans[6] += tick - tock
+
+                pLqR = numpy.ndarray((nao,nrow,nao), buffer=bufR)
+                pLqI = numpy.ndarray((nao,nrow,nao), buffer=bufI)
+                tmp1R = numpy.ndarray((nmo,nrow*nao), buffer=LpqR)
+                tmp1I = numpy.ndarray((nmo,nrow*nao), buffer=LpqI)
+                tmp2R = numpy.ndarray((nmo,nrow*nao),
+                                      buffer=LpqR.reshape(-1)[tmp1R.size:])
+                tmp2I = numpy.ndarray((nmo,nrow*nao),
+                                      buffer=LpqI.reshape(-1)[tmp1I.size:])
+                pLqR[:] = LpqR.reshape(-1,nao,nao).transpose(1,0,2)
+                pLqI[:] = LpqI.reshape(-1,nao,nao).transpose(1,0,2)
+
+                tock = numpy.asarray((logger.process_clock(), logger.perf_counter()))
+                tspans[0] += tock - tick
+
+                for i in range(nset):
+                    mo1R = skmo1R[i,ki]
+                    mo2R = skmo2R[i,ki]
+                    if skmoI_mask[i,ki]:
+                        mo1I = skmo1I[i,ki]
+                        mo2I = skmo2I[i,ki]
+                        zdotCN(mo1R.T, mo1I.T, pLqR.reshape(nao,-1), pLqI.reshape(nao,-1),
+                               1, tmp1R, tmp1I)
+                        zdotCN(mo2R.T, mo2I.T, pLqR.reshape(nao,-1), pLqI.reshape(nao,-1),
+                               1, tmp2R, tmp2I)
+                    else:
+                        lib.ddot(mo1R.T, pLqR.reshape(nao,-1), 1, tmp1R)
+                        lib.ddot(mo1R.T, pLqI.reshape(nao,-1), 1, tmp1I)
+                        lib.ddot(mo2R.T, pLqR.reshape(nao,-1), 1, tmp2R)
+                        lib.ddot(mo2R.T, pLqI.reshape(nao,-1), 1, tmp2I)
+                    tick = numpy.asarray((logger.process_clock(), logger.perf_counter()))
+                    tspans[1] += tick - tock
+                    zdotCN(tmp1R.reshape(-1,nao).T, tmp1I.reshape(-1,nao).T,
+                           tmp2R.reshape(-1,nao), tmp2I.reshape(-1,nao),
+                           sign, vkR[i,kj], vkI[i,kj], 1)
+                    tock = numpy.asarray((logger.process_clock(), logger.perf_counter()))
+                    tspans[2] += tock - tick
+
+                if swap_2e:
+                    tmp1R = tmp1R.reshape(nrow*nao,nmo)
+                    tmp1I = tmp1I.reshape(nrow*nao,nmo)
+                    tmp2R = tmp2R.reshape(nrow*nao,nmo)
+                    tmp2I = tmp2I.reshape(nrow*nao,nmo)
+                    tick = numpy.asarray((logger.process_clock(), logger.perf_counter()))
+                    tspans[3] += tick - tock
+                    ki_tmp = ki
+                    kj_tmp = kj
+                    if inverse_idx:
+                        ki_tmp = inverse_idx[0]
+                        kj_tmp = inverse_idx[1]
+                    for i in range(nset):
+                        mo1R = skmo1R[i,kj_tmp]
+                        mo2R = skmo2R[i,kj_tmp]
+                        if skmoI_mask[i,kj_tmp]:
+                            mo1I = skmo1I[i,kj_tmp]
+                            mo2I = skmo2I[i,kj_tmp]
+                            zdotNN(pLqR.reshape(-1,nao), pLqI.reshape(-1,nao), mo1R, mo1I,
+                                   1, tmp1R, tmp1I)
+                            zdotNN(pLqR.reshape(-1,nao), pLqI.reshape(-1,nao), mo2R, mo2I,
+                                   1, tmp2R, tmp2I)
+                        else:
+                            lib.ddot(pLqR.reshape(-1,nao), mo1R, 1, tmp1R)
+                            lib.ddot(pLqI.reshape(-1,nao), mo1R, 1, tmp1I)
+                            lib.ddot(pLqR.reshape(-1,nao), mo2R, 1, tmp2R)
+                            lib.ddot(pLqI.reshape(-1,nao), mo2R, 1, tmp2I)
+                        tock = numpy.asarray((logger.process_clock(), logger.perf_counter()))
+                        tspans[4] += tock - tick
+                        zdotNC(tmp1R.reshape(nao,-1), tmp1I.reshape(nao,-1),
+                               tmp2R.reshape(nao,-1).T, tmp2I.reshape(nao,-1).T,
+                               sign, vkR[i,ki_tmp], vkI[i,ki_tmp], 1)
+                        tick = numpy.asarray((logger.process_clock(), logger.perf_counter()))
+                        tspans[5] += tick - tock
+
+                tock = numpy.asarray((logger.process_clock(), logger.perf_counter()))
+
+                LpqR = LpqI = pLqR = pLqI = tmp1R = tmp1I = tmp2R = tmp2I = None
+
+    t1 = (logger.process_clock(), logger.perf_counter())
     if kpts_band is kpts:  # normal k-points HF/DFT
         for ki in range(nkpts):
             for kj in range(ki):
@@ -251,6 +527,10 @@ def make_kpt(ki, kj, swap_2e, inverse_idx=None):
                         make_kpt(ki, kj, False)
             t1 = log.timer_debug1('get_k_kpts: make_kpt (%d,*)'%ki, *t1)
 
+    for tspan, tspanname in zip(tspans,tspannames):
+        log.debug1('    CPU time for %s %10.2f sec, wall time %10.2f sec',
+                   tspanname, *tspan)
+
     if (gamma_point(kpts) and gamma_point(kpts_band) and
         not numpy.iscomplexobj(dm_kpts)):
         vk_kpts = vkR
@@ -261,6 +541,8 @@ def make_kpt(ki, kj, swap_2e, inverse_idx=None):
     if exxdiv == 'ewald':
         _ewald_exxdiv_for_G0(cell, kpts, dms, vk_kpts, kpts_band)
 
+    log.timer('get_k_kpts', *t0)
+
     return _format_jks(vk_kpts, dm_kpts, input_band, kpts)
 
 
@@ -284,21 +566,20 @@ def get_jk(mydf, dm, hermi=1, kpt=numpy.zeros(3),
 
     cell = mydf.cell
     log = logger.Logger(mydf.stdout, mydf.verbose)
-    t1 = (logger.process_clock(), logger.perf_counter())
+    t0 = (logger.process_clock(), logger.perf_counter())
     if mydf._cderi is None or not mydf.has_kpts(kpts_band):
         if mydf._cderi is not None:
             log.warn('DF integrals for band k-points were not found %s. '
                      'DF integrals will be rebuilt to include band k-points.',
                      mydf._cderi)
         mydf.build(kpts_band=kpts_band)
-        t1 = log.timer_debug1('Init get_jk', *t1)
+        t0 = log.timer_debug1('Init get_jk', *t0)
 
     dm = numpy.asarray(dm, order='C')
     dms = _format_dms(dm, [kpt])
     nset, _, nao = dms.shape[:3]
     dms = dms.reshape(nset,nao,nao)
     j_real = gamma_point(kpt)
-    k_real = gamma_point(kpt) and not numpy.iscomplexobj(dms)
     kptii = numpy.asarray((kpt,kpt))
     dmsR = dms.real.reshape(nset,nao,nao)
     dmsI = dms.imag.reshape(nset,nao,nao)
@@ -308,40 +589,166 @@ def get_jk(mydf, dm, hermi=1, kpt=numpy.zeros(3),
         vjR = numpy.zeros((nset,nao,nao))
         vjI = numpy.zeros((nset,nao,nao))
     if with_k:
+        ''' math
+        Mode 1: DM-based K-build:
+            K(p,q)
+                = V(r,q,p,s) * D(s,r)
+                = V(L,r,q) * V(L,s,p).conj() * D(s,r)    eqn (1)
+
+        Mode 2: Symm MO-based K-build:
+        In case of Hermitian & PSD DM, eqn (1) can be rewritten as
+            K(p,q)
+                = W(L,i,p).conj() * W(L,i,q)
+        where
+            W(L,i,p) = V(L,s,p) * C(s,i).conj()
+            D(s,r) = C(s,i) * C(r,i).conj()
+
+        Mode 3: Asymm MO-based K-build:
+        In case of non-Hermitian or Hermitian but non-PSD DM, eqn (1) can be rewritten as
+            K(p,q)
+                = X(L,i,p).conj() * Y(L,i,q)
+            where
+                X(L,i,p) = V(L,s,p) * A(s,i).conj()
+                Y(L,i,q) = V(L,r,q) * B(r,i).conj()
+                D(s,r) = A(s,i) * B(r,i).conj()
+        '''
+        smoR = smo2R = None
+        if not mydf.force_dm_kbuild:
+            if hermi == 1:
+                smoR, smoI = _mo_from_dm(dms.reshape(-1,nao,nao), method='eigh',
+                                           order='F', precision=cell.precision)
+                if smoR is None:
+                    log.debug1('get_jk: Eigh fails for input dm due to non-PSD. '
+                               'Try SVD instead.')
+            if smoR is None:
+                smoR, smoI, smo2R, smo2I = _mo_from_dm(dms.reshape(-1,nao,nao),
+                                                       method='svd', order='F',
+                                                       precision=cell.precision)
+                if smoR[0].shape[1] > nao//2:
+                    log.debug1('get_jk: rank(dm) = %d exceeds half of nao = %d. '
+                               'Fall back to DM-based build.', smoR[0].shape[1], nao)
+                    smoR = smo2R = None
+
         vkR = numpy.zeros((nset,nao,nao))
         vkI = numpy.zeros((nset,nao,nao))
         buf1R = numpy.empty((mydf.blockdim*nao**2))
-        buf2R = numpy.empty((mydf.blockdim*nao**2))
         buf1I = numpy.zeros((mydf.blockdim*nao**2))
-        buf2I = numpy.empty((mydf.blockdim*nao**2))
-        max_memory *= .5
-    log.debug1('max_memory = %d MB (%d in use)', max_memory, mem_now)
-    def contract_k(pLqR, pLqI, sign):
-        # K ~ 'iLj,lLk*,li->kj' + 'lLk*,iLj,li->kj'
-        #:pLq = (LpqR + LpqI.reshape(-1,nao,nao)*1j).transpose(1,0,2)
-        #:tmp = numpy.dot(dm, pLq.reshape(nao,-1))
-        #:vk += numpy.dot(pLq.reshape(-1,nao).conj().T, tmp.reshape(-1,nao))
-        nrow = pLqR.shape[1]
-        tmpR = numpy.ndarray((nao,nrow*nao), buffer=buf2R)
-        if k_real:
-            for i in range(nset):
-                lib.ddot(dmsR[i], pLqR.reshape(nao,-1), 1, tmpR)
-                lib.ddot(pLqR.reshape(-1,nao).T, tmpR.reshape(-1,nao), sign, vkR[i], 1)
+        if smoR is None:
+            # K ~ 'iLj,lLk*,li->kj' + 'lLk*,iLj,li->kj'
+            #:pLq = (LpqR + LpqI.reshape(-1,nao,nao)*1j).transpose(1,0,2)
+            #:tmp = numpy.dot(dm, pLq.reshape(nao,-1))
+            #:vk += numpy.dot(pLq.reshape(-1,nao).conj().T, tmp.reshape(-1,nao))
+            log.debug2('get_jk: build K from dm')
+            k_real = gamma_point(kpt) and not numpy.iscomplexobj(dms)
+            buf2R = numpy.empty((mydf.blockdim*nao**2))
+            buf2I = numpy.empty((mydf.blockdim*nao**2))
+            if k_real:
+                def contract_k(pLqR, pLqI, sign):
+                    nrow = pLqR.shape[1]
+                    tmpR = numpy.ndarray((nao,nrow*nao), buffer=buf2R)
+                    for i in range(nset):
+                        lib.ddot(dmsR[i], pLqR.reshape(nao,-1), 1, tmpR)
+                        lib.ddot(pLqR.reshape(-1,nao).T, tmpR.reshape(-1,nao),
+                                 sign, vkR[i], 1)
+            else:
+                buf2I = numpy.empty((mydf.blockdim*nao**2))
+                def contract_k(pLqR, pLqI, sign):
+                    nrow = pLqR.shape[1]
+                    tmpR = numpy.ndarray((nao,nrow*nao), buffer=buf2R)
+                    tmpI = numpy.ndarray((nao,nrow*nao), buffer=buf2I)
+                    for i in range(nset):
+                        zdotNN(dmsR[i], dmsI[i], pLqR.reshape(nao,-1),
+                               pLqI.reshape(nao,-1), 1, tmpR, tmpI, 0)
+                        zdotCN(pLqR.reshape(-1,nao).T, pLqI.reshape(-1,nao).T,
+                               tmpR.reshape(-1,nao), tmpI.reshape(-1,nao),
+                               sign, vkR[i], vkI[i], 1)
+        elif smo2R is None:
+            log.debug2('get_jk: build K from symm mo coeff')
+            nmo = smoR[0].shape[1]
+            log.debug2('get_jk: rank(dm) = %d / %d', nmo, nao)
+            smoI_mask = numpy.asarray([abs(moI).max() > cell.precision for moI in smoI])
+            k_real = gamma_point(kpt) and not numpy.any(smoI_mask)
+            buf2R = numpy.empty((mydf.blockdim*nao*nmo))
+            if k_real:
+                def contract_k(pLqR, pLqI, sign):
+                    nrow = pLqR.shape[1]
+                    tmpR = numpy.ndarray((nmo,nrow*nao), buffer=buf2R)
+                    for i in range(nset):
+                        lib.ddot(smoR[i].T, pLqR.reshape(nao,-1), 1, tmpR)
+                        lib.ddot(tmpR.reshape(-1,nao).T, tmpR.reshape(-1,nao),
+                                 sign, vkR[i], 1)
+                    tmpR = None
+            else:
+                buf2I = numpy.empty((mydf.blockdim*nao*nmo))
+                def contract_k(pLqR, pLqI, sign):
+                    nrow = pLqR.shape[1]
+                    tmpR = numpy.ndarray((nmo,nrow*nao), buffer=buf2R)
+                    tmpI = numpy.ndarray((nmo,nrow*nao), buffer=buf2I)
+                    for i in range(nset):
+                        zdotCN(smoR[i].T, smoI[i].T,
+                               pLqR.reshape(nao,-1), pLqI.reshape(nao,-1),
+                               1, tmpR, tmpI, 0)
+                        zdotCN(tmpR.reshape(-1,nao).T, tmpI.reshape(-1,nao).T,
+                               tmpR.reshape(-1,nao), tmpI.reshape(-1,nao),
+                               sign, vkR[i], vkI[i], 1)
+                    tmpR = tmpI = None
         else:
-            tmpI = numpy.ndarray((nao,nrow*nao), buffer=buf2I)
-            for i in range(nset):
-                zdotNN(dmsR[i], dmsI[i], pLqR.reshape(nao,-1),
-                       pLqI.reshape(nao,-1), 1, tmpR, tmpI, 0)
-                zdotCN(pLqR.reshape(-1,nao).T, pLqI.reshape(-1,nao).T,
-                       tmpR.reshape(-1,nao), tmpI.reshape(-1,nao),
-                       sign, vkR[i], vkI[i], 1)
+            log.debug2('get_jk: build K from asymm mo coeff')
+            smo1R = smoR
+            smo1I = smoI
+            nmo = smo1R[0].shape[1]
+            log.debug2('get_jk: rank(dm) = %d / %d', nmo, nao)
+            smoI_mask = numpy.asarray([max(abs(mo1I).max(),
+                                           abs(mo2I).max()) > cell.precision
+                                       for mo1I,mo2I in zip(smo1I,smo2I)])
+            k_real = gamma_point(kpt) and not numpy.any(smoI_mask)
+            buf2R = numpy.empty((mydf.blockdim*nao*nmo*2))
+            buf3R = buf2R[buf2R.size//2:]
+            if k_real:
+                def contract_k(pLqR, pLqI, sign):
+                    nrow = pLqR.shape[1]
+                    tmp1R = numpy.ndarray((nmo,nrow*nao), buffer=buf2R)
+                    tmp2R = numpy.ndarray((nmo,nrow*nao), buffer=buf3R)
+                    for i in range(nset):
+                        lib.ddot(smo1R[i].T, pLqR.reshape(nao,-1), 1, tmp1R)
+                        lib.ddot(smo2R[i].T, pLqR.reshape(nao,-1), 1, tmp2R)
+                        lib.ddot(tmp1R.reshape(-1,nao).T, tmp2R.reshape(-1,nao),
+                                 sign, vkR[i], 1)
+                    tmp1R = tmp2R = None
+            else:
+                buf2I = numpy.empty((mydf.blockdim*nao*nmo*2))
+                buf3I = buf2I[buf2I.size//2:]
+                def contract_k(pLqR, pLqI, sign):
+                    nrow = pLqR.shape[1]
+                    tmp1R = numpy.ndarray((nmo,nrow*nao), buffer=buf2R)
+                    tmp1I = numpy.ndarray((nmo,nrow*nao), buffer=buf2I)
+                    tmp2R = numpy.ndarray((nmo,nrow*nao), buffer=buf3R)
+                    tmp2I = numpy.ndarray((nmo,nrow*nao), buffer=buf3I)
+                    for i in range(nset):
+                        zdotCN(smo1R[i].T, smo1I[i].T,
+                               pLqR.reshape(nao,-1), pLqI.reshape(nao,-1),
+                               1, tmp1R, tmp1I, 0)
+                        zdotCN(smo2R[i].T, smo2I[i].T,
+                               pLqR.reshape(nao,-1), pLqI.reshape(nao,-1),
+                               1, tmp2R, tmp2I, 0)
+                        zdotCN(tmp1R.reshape(-1,nao).T, tmp1I.reshape(-1,nao).T,
+                               tmp2R.reshape(-1,nao), tmp2I.reshape(-1,nao),
+                               sign, vkR[i], vkI[i], 1)
+                    tmp1R = tmp1I = tmp2R = tmp2I = None
+        max_memory *= .5
+    log.debug1('get_jk: max_memory = %d MB (%d in use)', max_memory, mem_now)
+
+    tspans = numpy.zeros((3,2))
+    tspannames = ['  load', 'with_j', 'with_k']
+    tspanmasks = [True, with_j, with_k]
+
+    tick = numpy.asarray((logger.process_clock(), logger.perf_counter()))
     pLqI = None
     thread_k = None
     for LpqR, LpqI, sign in mydf.sr_loop(kptii, max_memory, False):
         LpqR = LpqR.reshape(-1,nao,nao)
-        t1 = log.timer_debug1('        load', *t1)
-        if thread_k is not None:
-            thread_k.join()
+        tock = numpy.asarray((logger.process_clock(), logger.perf_counter()))
+        tspans[0] += tock - tick
         if with_j:
             #:rho_coeff = numpy.einsum('Lpq,xqp->xL', Lpq, dms)
             #:vj += numpy.dot(rho_coeff, Lpq.reshape(-1,nao**2))
@@ -357,7 +764,11 @@ def contract_k(pLqR, pLqI, sign):
                 vjI += sign * numpy.einsum('xL,Lpq->xpq', rhoR, LpqI)
                 vjI += sign * numpy.einsum('xL,Lpq->xpq', rhoI, LpqR)
 
-        t1 = log.timer_debug1('        with_j', *t1)
+        tick = numpy.asarray((logger.process_clock(), logger.perf_counter()))
+        tspans[1] += tick - tock
+
+        if thread_k is not None:
+            thread_k.join()
         if with_k:
             nrow = LpqR.shape[0]
             pLqR = numpy.ndarray((nao,nrow,nao), buffer=buf1R)
@@ -368,12 +779,25 @@ def contract_k(pLqR, pLqI, sign):
                     pLqI[:] = LpqI.reshape(-1,nao,nao).transpose(1,0,2)
 
             thread_k = lib.background_thread(contract_k, pLqR, pLqI, sign)
-            t1 = log.timer_debug1('        with_k', *t1)
+
+        tock = numpy.asarray((logger.process_clock(), logger.perf_counter()))
+        tspans[2] += tock - tick
+
         LpqR = LpqI = pLqR = pLqI = None
+
+    tick = numpy.asarray((logger.process_clock(), logger.perf_counter()))
     if thread_k is not None:
         thread_k.join()
     thread_k = None
 
+    tock = numpy.asarray((logger.process_clock(), logger.perf_counter()))
+    tspans[2] += tock - tick
+
+    for tspan,tspanname,tspanmask in zip(tspans,tspannames,tspanmasks):
+        if tspanmask:
+            log.debug1('    CPU time for %s %9.2f sec, wall time %9.2f sec',
+                       tspanname, *tspan)
+
     if with_j:
         if j_real:
             vj = vjR
@@ -389,9 +813,88 @@ def contract_k(pLqR, pLqI, sign):
             _ewald_exxdiv_for_G0(cell, kpt, dms, vk)
         vk = vk.reshape(dm.shape)
 
-    t1 = log.timer('sr jk', *t1)
+    log.timer('sr jk', *t0)
     return vj, vk
 
+def _sep_real_imag(a, ncolmax, order):
+    nrow = a.shape[0]
+    aR = numpy.zeros((nrow,ncolmax), dtype=numpy.float64)
+    aI = numpy.zeros((nrow,ncolmax), dtype=numpy.float64)
+    ncol = a.shape[1]
+    aR[:,:ncol] = numpy.asarray(a.real, order=order)
+    aI[:,:ncol] = numpy.asarray(a.imag, order=order)
+    return aR, aI
+def _format_mo(mo_coeff, mo_occ, shape=None, order='F', precision=DM2MO_PREC):
+    mos = [mo[:,mocc>precision]*mocc[mocc>precision]**0.5
+           for mo,mocc in zip(mo_coeff,mo_occ)]
+    nkpts = len(mos)
+    nmomax = numpy.max([mo.shape[1] for mo in mos])
+    moRs = numpy.empty(nkpts, dtype=object)
+    moIs = numpy.empty(nkpts, dtype=object)
+    for k in range(nkpts):
+        moRs[k], moIs[k] = _sep_real_imag(mos[k], nmomax, order)
+    if shape is not None:
+        moRs = moRs.reshape(*shape)
+        moIs = moIs.reshape(*shape)
+    return moRs, moIs
+def _mo_from_dm(dms, method='eigh', shape=None, order='C', precision=DM2MO_PREC):
+    import scipy.linalg
+    nkpts = len(dms)
+    precision *= 1e-2
+
+    if method == 'eigh':
+        def feigh(dm):
+            e, u = scipy.linalg.eigh(dm)
+            if numpy.any(e < -precision): # PSD matrix
+                mo = None
+            else:
+                mask = e > precision
+                mo = u[:,mask] * e[mask]**0.5
+            return mo
+
+        mos = numpy.empty(nkpts, dtype=object)
+        for k,dm in enumerate(dms):
+            mo = feigh(dm)
+            if mo is None:
+                return None, None
+            mos[k] = mo
+
+        nmos = [mo.shape[1] for mo in mos]
+        nmomax = max(nmos)
+        moRs = numpy.empty(nkpts, dtype=object)
+        moIs = numpy.empty(nkpts, dtype=object)
+        for k,mo in enumerate(mos):
+            moRs[k], moIs[k] = _sep_real_imag(mo, nmomax, order)
+        if shape is not None:
+            moRs = moRs.reshape(*shape)
+            moIs = moIs.reshape(*shape)
+        return moRs, moIs
+    elif method == 'svd':
+        def fsvd(dm):
+            u, e, vt = scipy.linalg.svd(dm)
+            mask = e > precision
+            mo1 = u[:,mask] * e[mask]
+            mo2 = vt[mask].T.conj()
+            return mo1, mo2
+
+        mos = [fsvd(dm) for k,dm in enumerate(dms)]
+        nmos = [x[0].shape[1] for x in mos]
+        nmomax = max(nmos)
+        mo1Rs = numpy.empty(nkpts, dtype=object)
+        mo1Is = numpy.empty(nkpts, dtype=object)
+        mo2Rs = numpy.empty(nkpts, dtype=object)
+        mo2Is = numpy.empty(nkpts, dtype=object)
+        for k,(mo1,mo2) in enumerate(mos):
+            mo1Rs[k], mo1Is[k] = _sep_real_imag(mo1, nmomax, order)
+            mo2Rs[k], mo2Is[k] = _sep_real_imag(mo2, nmomax, order)
+        if shape is not None:
+            mo1Rs = mo1Rs.reshape(*shape)
+            mo1Is = mo1Is.reshape(*shape)
+            mo2Rs = mo2Rs.reshape(*shape)
+            mo2Is = mo2Is.reshape(*shape)
+        return mo1Rs, mo1Is, mo2Rs, mo2Is
+    else:
+        raise RuntimeError('Unknown method %s' % method)
 
 def _format_dms(dm_kpts, kpts):
     nkpts = len(kpts)
diff --git a/pyscf/pbc/df/fft.py b/pyscf/pbc/df/fft.py
index 7ac1063b47..e16d64cb07 100644
--- a/pyscf/pbc/df/fft.py
+++ b/pyscf/pbc/df/fft.py
@@ -26,7 +26,7 @@
 from pyscf.pbc.gto import pseudo, estimate_ke_cutoff, error_for_ke_cutoff
 from pyscf.pbc.df import ft_ao
 from pyscf.pbc.df import fft_ao2mo
-from pyscf.pbc.df.aft import _sub_df_jk_
+from pyscf.pbc.df import aft
 from pyscf.pbc.lib.kpts_helper import gamma_point
 from pyscf import __config__
 
@@ -316,8 +316,9 @@ def get_jk(self, dm, hermi=1, kpts=None, kpts_band=None,
                with_j=True, with_k=True, omega=None, exxdiv=None):
         from pyscf.pbc.df import fft_jk
         if omega is not None:  # J/K for RSH functionals
-            return _sub_df_jk_(self, dm, hermi, kpts, kpts_band,
-                               with_j, with_k, omega, exxdiv)
+            with self.range_coulomb(omega) as rsh_df:
+                return rsh_df.get_jk(dm, hermi, kpts, kpts_band, with_j, with_k,
+                                     omega=None, exxdiv=exxdiv)
 
         if kpts is None:
             if numpy.all(self.kpts == 0): # Gamma-point J/K by default
@@ -378,6 +379,8 @@ def get_naoaux(self):
         ngrids = numpy.prod(mesh)
         return ngrids * 2
 
+    range_coulomb = aft.AFTDF.range_coulomb
+
 
 if __name__ == '__main__':
     from pyscf.pbc import gto as pbcgto
diff --git a/pyscf/pbc/df/mdf.py b/pyscf/pbc/df/mdf.py
index c9961cf70a..155fe586e6 100644
--- a/pyscf/pbc/df/mdf.py
+++ b/pyscf/pbc/df/mdf.py
@@ -155,8 +155,9 @@ def get_jk(self, dm, hermi=1, kpts=None, kpts_band=None,
                 mydf.mesh = pbctools.cutoff_to_mesh(cell.lattice_vectors(), ke_cutoff)
             else:
                 mydf = self
-            return _sub_df_jk_(mydf, dm, hermi, kpts, kpts_band,
-                               with_j, with_k, omega, exxdiv)
+            with mydf.range_coulomb(omega) as rsh_df:
+                return rsh_df.get_jk(dm, hermi, kpts, kpts_band, with_j, with_k,
+                                     omega=None, exxdiv=exxdiv)
 
         if kpts is None:
             if np.all(self.kpts == 0):
diff --git a/pyscf/pbc/df/rsdf.py b/pyscf/pbc/df/rsdf.py
index 3eccad1239..790bbe8d15 100644
--- a/pyscf/pbc/df/rsdf.py
+++ b/pyscf/pbc/df/rsdf.py
@@ -433,7 +433,8 @@ def weighted_ft_ao(self, kpt):
         GauxI = np.asarray(Gaux.imag, order='C')
         return GauxR, GauxI
 
-    def gen_j3c_loader(self, h5group, kpt, kpt_ij_idx, ijlst_mapping, aosym):
+    def gen_j3c_loader(self, h5group, kpt, kpt_ij_idx, ijlst_mapping, aosym,
+                       dataname='j3c'):
         cell = self.cell
         kpts = self.kpts
         nkpts = len(self.kpts)
@@ -448,13 +449,13 @@ def gen_j3c_loader(self, h5group, kpt, kpt_ij_idx, ijlst_mapping, aosym):
             else:
                 ovlp = [s.ravel() for s in ovlp]
 
-        nsegs = len(h5group['j3c-junk/0'])
+        nsegs = len(h5group[f'{dataname}-junk/0'])
 
         def load_j3c(col0, col1):
             j3cR = []
             j3cI = []
             for kk in kpt_ij_idx:
-                v = np.hstack([h5group[f'j3c-junk/{ijlst_mapping[kk]}/{i}'][0,col0:col1]
+                v = np.hstack([h5group[f'{dataname}-junk/{ijlst_mapping[kk]}/{i}'][0,col0:col1]
                                for i in range(nsegs)])
                 vR = np.asarray(v.real, order='C')
                 kj = kk % nkpts
@@ -476,7 +477,7 @@ def load_j3c(col0, col1):
         return load_j3c
 
     def make_j3c(self, cderi_file, intor='int3c2e', aosym='s2', comp=None,
-                 j_only=False, shls_slice=None, kptij_lst=None):
+                 j_only=False, dataname='j3c', shls_slice=None, kptij_lst=None):
         if self.cell.omega != 0:
             raise RuntimeError('RSGDF cannot be used to evaluate the long-range '
                                'HF exchange in RSH functionals.')
@@ -521,9 +522,8 @@ def members(kptis, kpts):
             kptj_idx = members(kptij_lst[:,1], kpts)
             ijlst_mapping[kpti_idx * nkpts + kptj_idx] = np.arange(len(kptij_lst))
 
-        dataname = 'j3c'
         fswap = self.outcore_auxe2(cderi_file, intor, aosym, comp,
-                                   kptij_lst, j_only, 'j3c-junk', shls_slice)
+                                   kptij_lst, j_only, f'{dataname}-junk', shls_slice)
         cpu1 = log.timer_debug1('3c2e', *cpu1)
 
         supmol_ft = ft_ao._ExtendedMole.from_cell(self.rs_cell, self.bvk_kmesh, verbose=log)
diff --git a/pyscf/pbc/df/rsdf_builder.py b/pyscf/pbc/df/rsdf_builder.py
index e3cdc2ce1c..5288f98fbd 100644
--- a/pyscf/pbc/df/rsdf_builder.py
+++ b/pyscf/pbc/df/rsdf_builder.py
@@ -941,7 +941,7 @@ def gen_uniq_kpts_groups(self, j_only, h5swap):
                 yield -uniq_kpts[k], kpt_ji_idx, _conj_j2c(cd_j2c)
 
     def make_j3c(self, cderi_file, intor='int3c2e', aosym='s2', comp=None,
-                 j_only=False, shls_slice=None):
+                 j_only=False, dataname='j3c', shls_slice=None):
         if self.rs_cell is None:
             self.build()
         log = logger.new_logger(self)
@@ -957,7 +957,6 @@ def make_j3c(self, cderi_file, intor='int3c2e', aosym='s2', comp=None,
         else:
             ish0, ish1 = shls_slice[:2]
 
-        dataname = 'j3c'
         fswap = self.outcore_auxe2(cderi_file, intor, aosym, comp, j_only,
                                    dataname, shls_slice)
         cpu1 = log.timer('pass1: real space int3c2e', *cpu0)
diff --git a/pyscf/pbc/dft/gks.py b/pyscf/pbc/dft/gks.py
index 6ade248edc..c2af146e70 100644
--- a/pyscf/pbc/dft/gks.py
+++ b/pyscf/pbc/dft/gks.py
@@ -43,9 +43,13 @@ def get_veff(ks, cell=None, dm=None, dm_last=0, vhf_last=0, hermi=1,
     if kpt is None: kpt = ks.kpt
     t0 = (logger.process_clock(), logger.perf_counter())
 
+    ni = ks._numint
+    if ks.nlc or ni.libxc.is_nlc(ks.xc):
+        raise NotImplementedError(f'NLC functional {ks.xc} + {ks.nlc}')
+
+    hybrid = ni.libxc.is_hybrid_xc(ks.xc)
+
     # TODO GKS with hybrid functional
-    omega, alpha, hyb = ks._numint.rsh_and_hybrid_coeff(ks.xc, spin=cell.spin)
-    hybrid = abs(hyb) > 1e-10 or abs(alpha) > 1e-10
     if hybrid:
         raise NotImplementedError
 
@@ -82,9 +86,10 @@ def get_veff(ks, cell=None, dm=None, dm_last=0, vhf_last=0, hermi=1,
         vj = ks.get_j(cell, dm, hermi, kpt, kpts_band)
         vxc += vj
     else:
+        omega, alpha, hyb = ks._numint.rsh_and_hybrid_coeff(ks.xc, spin=cell.spin)
         vj, vk = ks.get_jk(cell, dm, hermi, kpt, kpts_band)
         vk *= hyb
-        if abs(omega) > 1e-10:
+        if omega != 0:
             vklr = ks.get_k(cell, dm, hermi, kpt, kpts_band, omega=omega)
             vklr *= (alpha - hyb)
             vk += vklr
diff --git a/pyscf/pbc/dft/kgks.py b/pyscf/pbc/dft/kgks.py
index 69a7bf21c7..b4fe385359 100644
--- a/pyscf/pbc/dft/kgks.py
+++ b/pyscf/pbc/dft/kgks.py
@@ -55,9 +55,14 @@ def get_veff(ks, cell=None, dm_kpts=None, dm_last=0, vhf_last=0, hermi=1,
     if kpts is None: kpts = ks.kpts
     t0 = (logger.process_clock(), logger.perf_counter())
 
+    ni = ks._numint
+    if ks.nlc or ni.libxc.is_nlc(ks.xc):
+        raise NotImplementedError(f'NLC functional {ks.xc} + {ks.nlc}')
+
+    hybrid = ni.libxc.is_hybrid_xc(ks.xc)
+
     # TODO GKS with hybrid functional
-    omega, alpha, hyb = ks._numint.rsh_and_hybrid_coeff(ks.xc, spin=cell.spin)
-    hybrid = abs(hyb) > 1e-10 or abs(alpha) > 1e-10
+    hybrid = ks._numint.libxc.is_hybrid_xc(ks.xc)
     if hybrid:
         raise NotImplementedError
 
@@ -97,18 +102,22 @@ def get_veff(ks, cell=None, dm_kpts=None, dm_last=0, vhf_last=0, hermi=1,
         vxc.append(vxc_k)
     vxc = lib.asarray(vxc)
 
-    weight = 1./len(kpts)
+    nkpts = len(kpts)
+    weight = 1. / nkpts
     if not hybrid:
-        ks.with_df._j_only = False
         vj = ks.get_j(cell, dm_kpts, hermi, kpts, kpts_band)
         vxc += vj
     else:
-        if getattr(ks.with_df, '_j_only', False):  # for GDF and MDF
-            logger.warn(ks, 'df.j_only cannot be used with hybrid functional')
+        omega, alpha, hyb = ks._numint.rsh_and_hybrid_coeff(ks.xc, spin=cell.spin)
+        if getattr(ks.with_df, '_j_only', False) and nkpts > 1: # for GDF and MDF:
             ks.with_df._j_only = False
+            if ks.with_df._cderi is not None:
+                logger.warn(ks, 'df.j_only cannot be used with hybrid '
+                            'functional. Rebuild cderi')
+                ks.with_df.build()
         vj, vk = ks.get_jk(cell, dm_kpts, hermi, kpts, kpts_band)
         vk *= hyb
-        if abs(omega) > 1e-10:
+        if omega != 0:
             vklr = ks.get_k(cell, dm_kpts, hermi, kpts, kpts_band, omega=omega)
             vklr *= (alpha - hyb)
             vk += vklr
diff --git a/pyscf/pbc/dft/krks.py b/pyscf/pbc/dft/krks.py
index 8cfb330c6a..883bce8be9 100644
--- a/pyscf/pbc/dft/krks.py
+++ b/pyscf/pbc/dft/krks.py
@@ -60,8 +60,11 @@ def get_veff(ks, cell=None, dm=None, dm_last=0, vhf_last=0, hermi=1,
     if kpts is None: kpts = ks.kpts
     t0 = (logger.process_clock(), logger.perf_counter())
 
-    omega, alpha, hyb = ks._numint.rsh_and_hybrid_coeff(ks.xc, spin=cell.spin)
-    hybrid = abs(hyb) > 1e-10 or abs(alpha) > 1e-10
+    ni = ks._numint
+    if ks.nlc or ni.libxc.is_nlc(ks.xc):
+        raise NotImplementedError(f'NLC functional {ks.xc} + {ks.nlc}')
+
+    hybrid = ni.libxc.is_hybrid_xc(ks.xc)
 
     if not hybrid and isinstance(ks.with_df, multigrid.MultiGridFFTDF):
         n, exc, vxc = multigrid.nr_rks(ks.with_df, ks.xc, dm, hermi,
@@ -92,18 +95,22 @@ def get_veff(ks, cell=None, dm=None, dm_last=0, vhf_last=0, hermi=1,
         logger.debug(ks, 'nelec by numeric integration = %s', n)
         t0 = logger.timer(ks, 'vxc', *t0)
 
-    weight = 1./len(kpts)
+    nkpts = len(kpts)
+    weight = 1. / nkpts
     if not hybrid:
-        ks.with_df._j_only = False
         vj = ks.get_j(cell, dm, hermi, kpts, kpts_band)
         vxc += vj
     else:
-        if getattr(ks.with_df, '_j_only', False):  # for GDF and MDF
-            logger.warn(ks, 'df.j_only cannot be used with hybrid functional')
+        omega, alpha, hyb = ks._numint.rsh_and_hybrid_coeff(ks.xc, spin=cell.spin)
+        if getattr(ks.with_df, '_j_only', False) and nkpts > 1: # for GDF and MDF
             ks.with_df._j_only = False
+            if ks.with_df._cderi is not None:
+                logger.warn(ks, 'df.j_only cannot be used with hybrid '
+                            'functional. Rebuild cderi')
+                ks.with_df.build()
         vj, vk = ks.get_jk(cell, dm, hermi, kpts, kpts_band)
         vk *= hyb
-        if abs(omega) > 1e-10:
+        if omega != 0:
             vklr = ks.get_k(cell, dm, hermi, kpts, kpts_band, omega=omega)
             vklr *= (alpha - hyb)
             vk += vklr
diff --git a/pyscf/pbc/dft/krks_ksymm.py b/pyscf/pbc/dft/krks_ksymm.py
index d3d63bacff..b8098581a6 100644
--- a/pyscf/pbc/dft/krks_ksymm.py
+++ b/pyscf/pbc/dft/krks_ksymm.py
@@ -48,8 +48,7 @@ def get_veff(ks, cell=None, dm=None, dm_last=0, vhf_last=0, hermi=1,
                                % (len(dm), kpts.nkpts_ibz))
         dm_bz = kpts.transform_dm(dm)
 
-    omega, alpha, hyb = ks._numint.rsh_and_hybrid_coeff(ks.xc, spin=cell.spin)
-    hybrid = abs(hyb) > 1e-10 or abs(alpha) > 1e-10
+    hybrid = ks._numint.libxc.is_hybrid_xc(ks.xc)
 
     if not hybrid and isinstance(ks.with_df, multigrid.MultiGridFFTDF):
         n, exc, vxc = multigrid.nr_rks(ks.with_df, ks.xc, dm_bz, hermi,
@@ -82,11 +81,12 @@ def get_veff(ks, cell=None, dm=None, dm_last=0, vhf_last=0, hermi=1,
         vj = ks.get_j(cell, dm, hermi, kpts, kpts_band)
         vxc += vj
     else:
+        omega, alpha, hyb = ks._numint.rsh_and_hybrid_coeff(ks.xc, spin=cell.spin)
         if getattr(ks.with_df, '_j_only', False):  # for GDF and MDF
             ks.with_df._j_only = False
         vj, vk = ks.get_jk(cell, dm, hermi, kpts, kpts_band)
         vk *= hyb
-        if abs(omega) > 1e-10:
+        if omega != 0:
             vklr = ks.get_k(cell, dm, hermi, kpts, kpts_band, omega=omega)
             vklr *= (alpha - hyb)
             vk += vklr
diff --git a/pyscf/pbc/dft/kuks.py b/pyscf/pbc/dft/kuks.py
index 23dfba2487..d49fc853e4 100644
--- a/pyscf/pbc/dft/kuks.py
+++ b/pyscf/pbc/dft/kuks.py
@@ -45,8 +45,11 @@ def get_veff(ks, cell=None, dm=None, dm_last=0, vhf_last=0, hermi=1,
     if kpts is None: kpts = ks.kpts
     t0 = (logger.process_clock(), logger.perf_counter())
 
-    omega, alpha, hyb = ks._numint.rsh_and_hybrid_coeff(ks.xc, spin=cell.spin)
-    hybrid = abs(hyb) > 1e-10
+    ni = ks._numint
+    if ks.nlc or ni.libxc.is_nlc(ks.xc):
+        raise NotImplementedError(f'NLC functional {ks.xc} + {ks.nlc}')
+
+    hybrid = ni.libxc.is_hybrid_xc(ks.xc)
 
     if not hybrid and isinstance(ks.with_df, multigrid.MultiGridFFTDF):
         n, exc, vxc = multigrid.nr_uks(ks.with_df, ks.xc, dm, hermi,
@@ -75,20 +78,24 @@ def get_veff(ks, cell=None, dm=None, dm_last=0, vhf_last=0, hermi=1,
         logger.debug(ks, 'nelec by numeric integration = %s', n)
         t0 = logger.timer(ks, 'vxc', *t0)
 
-    weight = 1./len(kpts)
+    nkpts = len(kpts)
+    weight = 1. / nkpts
 
     if not hybrid:
-        ks.with_df._j_only = False
         vj = ks.get_j(cell, dm[0]+dm[1], hermi, kpts, kpts_band)
         vxc += vj
     else:
-        if getattr(ks.with_df, '_j_only', False):  # for GDF and MDF
-            logger.warn(ks, 'df.j_only cannot be used with hybrid functional')
+        omega, alpha, hyb = ks._numint.rsh_and_hybrid_coeff(ks.xc, spin=cell.spin)
+        if getattr(ks.with_df, '_j_only', False) and nkpts > 1: # for GDF and MDF
             ks.with_df._j_only = False
+            if ks.with_df._cderi is not None:
+                logger.warn(ks, 'df.j_only cannot be used with hybrid '
+                            'functional. Rebuild cderi')
+                ks.with_df.build()
         vj, vk = ks.get_jk(cell, dm, hermi, kpts, kpts_band)
         vj = vj[0] + vj[1]
         vk *= hyb
-        if abs(omega) > 1e-10:
+        if omega != 0:
             vklr = ks.get_k(cell, dm, hermi, kpts, kpts_band, omega=omega)
             vklr *= (alpha - hyb)
             vk += vklr
diff --git a/pyscf/pbc/dft/kuks_ksymm.py b/pyscf/pbc/dft/kuks_ksymm.py
index a842305c55..183a6e2613 100644
--- a/pyscf/pbc/dft/kuks_ksymm.py
+++ b/pyscf/pbc/dft/kuks_ksymm.py
@@ -47,9 +47,7 @@ def get_veff(ks, cell=None, dm=None, dm_last=0, vhf_last=0, hermi=1,
                                % (len(dm[0]), kpts.nkpts_ibz))
         dm_bz = kpts.transform_dm(dm)
 
-
-    omega, alpha, hyb = ks._numint.rsh_and_hybrid_coeff(ks.xc, spin=cell.spin)
-    hybrid = abs(hyb) > 1e-10
+    hybrid = ks._numint.libxc.is_hybrid_xc(ks.xc)
 
     if not hybrid and isinstance(ks.with_df, multigrid.MultiGridFFTDF):
         n, exc, vxc = multigrid.nr_uks(ks.with_df, ks.xc, dm_bz, hermi,
@@ -82,12 +80,13 @@ def get_veff(ks, cell=None, dm=None, dm_last=0, vhf_last=0, hermi=1,
         vj = ks.get_j(cell, dm[0]+dm[1], hermi, kpts, kpts_band)
         vxc += vj
     else:
+        omega, alpha, hyb = ks._numint.rsh_and_hybrid_coeff(ks.xc, spin=cell.spin)
         if getattr(ks.with_df, '_j_only', False):  # for GDF and MDF
             ks.with_df._j_only = False
         vj, vk = ks.get_jk(cell, dm, hermi, kpts, kpts_band)
         vj = vj[0] + vj[1]
         vk *= hyb
-        if abs(omega) > 1e-10:
+        if omega != 0:
             vklr = ks.get_k(cell, dm, hermi, kpts, kpts_band, omega=omega)
             vklr *= (alpha - hyb)
             vk += vklr
diff --git a/pyscf/pbc/dft/multigrid.py b/pyscf/pbc/dft/multigrid.py
index de9c5bebd4..845aebc669 100644
--- a/pyscf/pbc/dft/multigrid.py
+++ b/pyscf/pbc/dft/multigrid.py
@@ -1096,6 +1096,10 @@ def nr_rks(mydf, xc_code, dm_kpts, hermi=1, kpts=None,
     else:
         vj = None
 
+    shape = list(dm_kpts.shape)
+    if len(shape) == 3 and shape[0] != kpts_band.shape[0]:
+        shape[0] = kpts_band.shape[0]
+    veff = veff.reshape(shape)
     veff = lib.tag_array(veff, ecoul=ecoul, exc=excsum, vj=vj, vk=None)
     return nelec, excsum, veff
 
@@ -1201,6 +1205,10 @@ def nr_uks(mydf, xc_code, dm_kpts, hermi=1, kpts=None,
     else:
         vj = None
 
+    shape = list(dm_kpts.shape)
+    if len(shape) == 4 and shape[1] != kpts_band.shape[0]:
+        shape[1] = kpts_band.shape[0]
+    veff = veff.reshape(shape)
     veff = lib.tag_array(veff, ecoul=ecoul, exc=excsum, vj=vj, vk=None)
     return nelec, excsum, veff
 
diff --git a/pyscf/pbc/dft/numint.py b/pyscf/pbc/dft/numint.py
index 74ba8349fe..59c7ac0b90 100644
--- a/pyscf/pbc/dft/numint.py
+++ b/pyscf/pbc/dft/numint.py
@@ -115,11 +115,7 @@ def eval_rho(cell, ao, dm, non0tab=None, xctype='LDA', hermi=0, with_lapl=True,
         pyscf.dft.numint.eval_rho
 
     '''
-
-    if xctype == 'LDA' or xctype == 'HF':
-        ngrids, nao = ao.shape
-    else:
-        ngrids, nao = ao[0].shape
+    ngrids, nao = ao.shape[-2:]
 
     # complex orbitals or density matrix
     if numpy.iscomplexobj(ao) or numpy.iscomplexobj(dm):
@@ -196,10 +192,7 @@ def eval_rho2(cell, ao, mo_coeff, mo_occ, non0tab=None, xctype='LDA',
     '''Refer to `pyscf.dft.numint.eval_rho2` for full documentation.
     '''
     xctype = xctype.upper()
-    if xctype == 'LDA' or xctype == 'HF':
-        ngrids, nao = ao.shape
-    else:
-        ngrids, nao = ao[0].shape
+    ngrids, nao = ao.shape[-2:]
 
     # complex orbitals or density matrix
     if numpy.iscomplexobj(ao) or numpy.iscomplexobj(mo_coeff):
@@ -247,7 +240,7 @@ def eval_rho2(cell, ao, mo_coeff, mo_occ, non0tab=None, xctype='LDA',
                 rho = numpy.zeros(ngrids)
             elif xctype == 'GGA':
                 rho = numpy.zeros((4,ngrids))
-            if with_lapl:
+            elif with_lapl:
                 # rho[4] = \nabla^2 rho, rho[5] = 1/2 |nabla f|^2
                 rho = numpy.zeros((6,ngrids))
                 tau_idx = 5
@@ -860,14 +853,24 @@ def cache_xc_kernel(ni, cell, grids, xc_code, mo_coeff, mo_occ, spin=0,
     else:
         ao_deriv = 0
 
+    if isinstance(ni, KNumInt):
+        # mo_coeff of KRHF has [mo_k1, mo_k2, ...]
+        is_rhf = mo_coeff[0][0].ndim == 1
+    else:
+        is_rhf = mo_coeff[0].ndim == 1
+
     nao = cell.nao_nr()
-    if spin == 0:
+    if is_rhf:
         rho = []
         for ao_k1, ao_k2, mask, weight, coords \
                 in ni.block_loop(cell, grids, nao, ao_deriv, kpts, None, max_memory):
             rho.append(ni.eval_rho2(cell, ao_k1, mo_coeff, mo_occ, mask, xctype))
         rho = numpy.hstack(rho)
+        if spin == 1:
+            rho *= .5
+            rho = numpy.repeat(rho[numpy.newaxis], 2, axis=0)
     else:
+        assert spin == 1
         rhoa = []
         rhob = []
         for ao_k1, ao_k2, mask, weight, coords \
diff --git a/pyscf/pbc/dft/rks.py b/pyscf/pbc/dft/rks.py
index 15c9fc6069..38cc054c40 100644
--- a/pyscf/pbc/dft/rks.py
+++ b/pyscf/pbc/dft/rks.py
@@ -64,8 +64,11 @@ def get_veff(ks, cell=None, dm=None, dm_last=0, vhf_last=0, hermi=1,
     if kpt is None: kpt = ks.kpt
     t0 = (logger.process_clock(), logger.perf_counter())
 
-    omega, alpha, hyb = ks._numint.rsh_and_hybrid_coeff(ks.xc, spin=cell.spin)
-    hybrid = abs(hyb) > 1e-10 or abs(alpha) > 1e-10
+    ni = ks._numint
+    if ks.nlc or ni.libxc.is_nlc(ks.xc):
+        raise NotImplementedError(f'NLC functional {ks.xc} + {ks.nlc}')
+
+    hybrid = ni.libxc.is_hybrid_xc(ks.xc)
 
     if not hybrid and isinstance(ks.with_df, multigrid.MultiGridFFTDF):
         n, exc, vxc = multigrid.nr_rks(ks.with_df, ks.xc, dm, hermi,
@@ -101,9 +104,10 @@ def get_veff(ks, cell=None, dm=None, dm_last=0, vhf_last=0, hermi=1,
         vj = ks.get_j(cell, dm, hermi, kpt, kpts_band)
         vxc += vj
     else:
+        omega, alpha, hyb = ks._numint.rsh_and_hybrid_coeff(ks.xc, spin=cell.spin)
         vj, vk = ks.get_jk(cell, dm, hermi, kpt, kpts_band)
         vk *= hyb
-        if abs(omega) > 1e-10:
+        if omega != 0:
             vklr = ks.get_k(cell, dm, hermi, kpt, kpts_band, omega=omega)
             vklr *= (alpha - hyb)
             vk += vklr
@@ -123,7 +127,7 @@ def get_veff(ks, cell=None, dm=None, dm_last=0, vhf_last=0, hermi=1,
 def _patch_df_beckegrids(density_fit):
     def new_df(self, auxbasis=None, with_df=None, *args, **kwargs):
         mf = density_fit(self, auxbasis, with_df, *args, **kwargs)
-        mf.with_df._j_only = True
+        mf.with_df._j_only = not self._numint.libxc.is_hybrid_xc(self.xc)
         mf.grids = gen_grid.BeckeGrids(self.cell)
         mf.grids.level = getattr(__config__, 'pbc_dft_rks_RKS_grids_level',
                                  mf.grids.level)
@@ -158,7 +162,9 @@ def get_rho(mf, dm=None, grids=None, kpt=None):
 
 def _dft_common_init_(mf, xc='LDA,VWN'):
     mf.xc = xc
+    mf.nlc = ''
     mf.grids = gen_grid.UniformGrids(mf.cell)
+    mf.nlcgrids = None
     # Use rho to filter grids
     mf.small_rho_cutoff = getattr(__config__,
                                   'pbc_dft_rks_RKS_small_rho_cutoff', 1e-7)
diff --git a/pyscf/pbc/dft/test/test_krks.py b/pyscf/pbc/dft/test/test_krks.py
index 7f95c24a4e..e91cdabeb0 100644
--- a/pyscf/pbc/dft/test/test_krks.py
+++ b/pyscf/pbc/dft/test/test_krks.py
@@ -80,6 +80,13 @@ def tearDownModule():
 
 
 class KnownValues(unittest.TestCase):
+    def test_klda(self):
+        cell = pbcgto.M(atom='H 0 0 0; H 1 0 0', a=np.eye(3)*2, basis=[[0, [1, 1]]])
+        cell.build()
+        mf = cell.KRKS(kpts=cell.make_kpts([2,2,1]))
+        mf.run()
+        self.assertAlmostEqual(mf.e_tot, -0.3846075202893169, 7)
+
     def test_klda8_cubic_gamma(self):
         cell = build_cell([17]*3)
         mf = pbcdft.RKS(cell)
diff --git a/pyscf/pbc/dft/test/test_kuks.py b/pyscf/pbc/dft/test/test_kuks.py
index a4a9417a26..85093713f6 100644
--- a/pyscf/pbc/dft/test/test_kuks.py
+++ b/pyscf/pbc/dft/test/test_kuks.py
@@ -38,6 +38,13 @@ def tearDownModule():
 
 
 class KnownValues(unittest.TestCase):
+    def test_klda(self):
+        cell = pbcgto.M(atom='H 0 0 0; H 1 0 0', a=np.eye(3)*2, basis=[[0, [1, 1]]])
+        cell.build()
+        mf = cell.KUKS(kpts=cell.make_kpts([2,2,1]))
+        mf.run()
+        self.assertAlmostEqual(mf.e_tot, -0.3846075202893169, 7)
+
     def test_klda8_cubic_kpt_222_high_cost(self):
         cell = pbcgto.Cell()
         cell.unit = 'A'
diff --git a/pyscf/pbc/dft/test/test_multigrid.py b/pyscf/pbc/dft/test/test_multigrid.py
index 01614eb7e2..391bf91413 100644
--- a/pyscf/pbc/dft/test/test_multigrid.py
+++ b/pyscf/pbc/dft/test/test_multigrid.py
@@ -82,21 +82,25 @@ class KnownValues(unittest.TestCase):
     def test_orth_get_pp(self):
         ref = df.FFTDF(cell_orth).get_pp()
         out = multigrid.MultiGridFFTDF(cell_orth).get_pp()
+        self.assertEqual(out.shape, ref.shape)
         self.assertAlmostEqual(abs(ref-out).max(), 0, 8)
 
     def test_nonorth_get_pp(self):
         ref = df.FFTDF(cell_nonorth).get_pp()
         out = multigrid.MultiGridFFTDF(cell_nonorth).get_pp()
+        self.assertEqual(out.shape, ref.shape)
         self.assertAlmostEqual(abs(ref-out).max(), 0, 8)
 
     def test_orth_get_nuc_kpts(self):
         ref = df.FFTDF(cell_orth).get_nuc(kpts)
         out = multigrid.MultiGridFFTDF(cell_orth).get_nuc(kpts)
+        self.assertEqual(out.shape, ref.shape)
         self.assertAlmostEqual(abs(ref-out).max(), 0, 8)
 
     def test_orth_get_j_kpts(self):
         ref = df.FFTDF(cell_orth).get_jk(dm, kpts=kpts, with_k=False)[0]
         out = multigrid.MultiGridFFTDF(cell_orth).get_jk(dm, kpts=kpts)[0]
+        self.assertEqual(out.shape, ref.shape)
         self.assertAlmostEqual(abs(ref-out).max(), 0, 8)
 
 #        mydf = multigrid.MultiGridFFTDF(cell_orth)
@@ -105,11 +109,13 @@ def test_orth_get_j_kpts(self):
     def test_nonorth_get_j_kpts(self):
         ref = df.FFTDF(cell_nonorth).get_jk(dm, kpts=kpts, with_k=False)[0]
         out = multigrid.MultiGridFFTDF(cell_nonorth, kpts=kpts).get_jk(dm)[0]
+        self.assertEqual(out.shape, ref.shape)
         self.assertAlmostEqual(abs(ref-out).max(), 0, 8)
 
     def test_nonorth_get_j(self):
         ref = df.FFTDF(cell_nonorth).get_jk(dm[0], with_k=False)[0]
         out = multigrid.MultiGridFFTDF(cell_nonorth).get_jk(dm)[0]
+        self.assertEqual(out.shape, ref.shape)
         self.assertAlmostEqual(abs(ref-out).max(), 0, 8)
 
     def test_orth_rks_lda_kpts(self):
@@ -119,7 +125,8 @@ def test_orth_rks_lda_kpts(self):
         n, exc0, ref = ni.nr_rks(cell_orth, mydf.grids, xc, dm, 1, kpts=kpts)
         mydf = multigrid.MultiGridFFTDF(cell_orth)
         n, exc1, vxc = multigrid.nr_rks(mydf, xc, dm, kpts=kpts)
-        self.assertAlmostEqual(float(abs(ref-vxc).max()), 0, 7)
+        self.assertEqual(vxc.shape, ref.shape)
+        self.assertAlmostEqual(abs(ref-vxc).max(), 0, 7)
         self.assertAlmostEqual(abs(exc0-exc1).max(), 0, 7)
 
     def test_multigrid_kuks(self):
@@ -127,7 +134,8 @@ def test_multigrid_kuks(self):
         mf.xc = 'lda,'
         ref = mf.get_veff(cell_he, numpy.array((dm_he,dm_he)), kpts=kpts)
         out = multigrid.multigrid(mf).get_veff(cell_he, (dm_he,dm_he), kpts=kpts)
-        self.assertAlmostEqual(float(abs(ref-out).max()), 0, 8)
+        self.assertEqual(out.shape, ref.shape)
+        self.assertAlmostEqual(abs(ref-out).max(), 0, 8)
         self.assertAlmostEqual(abs(ref.exc-out.exc).max(), 0, 8)
         self.assertAlmostEqual(abs(ref.ecoul-out.ecoul).max(), 0, 8)
 
@@ -136,7 +144,8 @@ def test_multigrid_krks(self):
         mf.xc = 'lda,'
         ref = mf.get_veff(cell_he, dm_he, kpts=kpts)
         out = multigrid.multigrid(mf).get_veff(cell_he, dm_he, kpts=kpts)
-        self.assertAlmostEqual(float(abs(ref-out).max()), 0, 8)
+        self.assertEqual(out.shape, ref.shape)
+        self.assertAlmostEqual(abs(ref-out).max(), 0, 8)
         self.assertAlmostEqual(abs(ref.exc-out.exc).max(), 0, 8)
         self.assertAlmostEqual(abs(ref.ecoul-out.ecoul).max(), 0, 8)
 
@@ -151,7 +160,8 @@ def test_multigrid_kroks(self):
                             mo_occ=mo_occ*2)
         ref = mf.get_veff(cell_he, dm1, kpts=kpts)
         out = multigrid.multigrid(mf).get_veff(cell_he, dm1, kpts=kpts)
-        self.assertAlmostEqual(float(abs(ref-out).max()), 0, 7)
+        self.assertEqual(out.shape, ref.shape)
+        self.assertAlmostEqual(abs(ref-out).max(), 0, 7)
         self.assertAlmostEqual(abs(ref.exc-out.exc).max(), 0, 7)
         self.assertAlmostEqual(abs(ref.ecoul-out.ecoul).max(), 0, 7)
 
@@ -160,7 +170,8 @@ def test_multigrid_uks(self):
         mf.xc = 'lda,'
         ref = mf.get_veff(cell_he, numpy.array((dm_he[0],dm_he[0])))
         out = multigrid.multigrid(mf).get_veff(cell_he, (dm_he[0], dm_he[0]))
-        self.assertAlmostEqual(float(abs(ref-out).max()), 0, 7)
+        self.assertEqual(out.shape, ref.shape)
+        self.assertAlmostEqual(abs(ref-out).max(), 0, 7)
         self.assertAlmostEqual(abs(ref.exc-out.exc).max(), 0, 7)
         self.assertAlmostEqual(abs(ref.ecoul-out.ecoul).max(), 0, 7)
 
@@ -169,7 +180,8 @@ def test_multigrid_rks(self):
         mf.xc = 'lda,'
         ref = mf.get_veff(cell_he, dm_he[0])
         out = multigrid.multigrid(mf).get_veff(cell_he, dm_he[0])
-        self.assertAlmostEqual(float(abs(ref-out).max()), 0, 7)
+        self.assertEqual(out.shape, ref.shape)
+        self.assertAlmostEqual(abs(ref-out).max(), 0, 7)
         self.assertAlmostEqual(abs(ref.exc-out.exc).max(), 0, 7)
         self.assertAlmostEqual(abs(ref.ecoul-out.ecoul).max(), 0, 7)
 
@@ -184,7 +196,8 @@ def test_multigrid_roks(self):
                             mo_occ=mo_occ*2)
         ref = mf.get_veff(cell_he, dm1)
         out = multigrid.multigrid(mf).get_veff(cell_he, dm1)
-        self.assertAlmostEqual(float(abs(ref-out).max()), 0, 7)
+        self.assertEqual(out.shape, ref.shape)
+        self.assertAlmostEqual(abs(ref-out).max(), 0, 7)
         self.assertAlmostEqual(abs(ref.exc-out.exc).max(), 0, 7)
         self.assertAlmostEqual(abs(ref.ecoul-out.ecoul).max(), 0, 7)
 
@@ -196,6 +209,7 @@ def test_orth_rks_gga_kpts(self):
         ref += mydf.get_jk(dm, hermi=1, with_k=False, kpts=kpts)[0]
         mydf = multigrid.MultiGridFFTDF(cell_orth)
         n, exc1, vxc = multigrid.nr_rks(mydf, xc, dm, hermi=1, kpts=kpts, with_j=True)
+        self.assertEqual(vxc.shape, ref.shape)
         self.assertAlmostEqual(abs(ref-vxc).max(), 0, 7)
         self.assertAlmostEqual(abs(exc0-exc1).max(), 0, 7)
         self.assertAlmostEqual(lib.fp(ref), -0.05697304864467462+0.6990367789096609j, 7)
diff --git a/pyscf/pbc/dft/uks.py b/pyscf/pbc/dft/uks.py
index dcdd87a661..fef67160f2 100644
--- a/pyscf/pbc/dft/uks.py
+++ b/pyscf/pbc/dft/uks.py
@@ -46,8 +46,11 @@ def get_veff(ks, cell=None, dm=None, dm_last=0, vhf_last=0, hermi=1,
     if kpt is None: kpt = ks.kpt
     t0 = (logger.process_clock(), logger.perf_counter())
 
-    omega, alpha, hyb = ks._numint.rsh_and_hybrid_coeff(ks.xc, spin=cell.spin)
-    hybrid = abs(hyb) > 1e-10 or abs(alpha) > 1e-10
+    ni = ks._numint
+    if ks.nlc or ni.libxc.is_nlc(ks.xc):
+        raise NotImplementedError(f'NLC functional {ks.xc} + {ks.nlc}')
+
+    hybrid = ni.libxc.is_hybrid_xc(ks.xc)
 
     if not hybrid and isinstance(ks.with_df, multigrid.MultiGridFFTDF):
         n, exc, vxc = multigrid.nr_uks(ks.with_df, ks.xc, dm, hermi,
@@ -85,10 +88,11 @@ def get_veff(ks, cell=None, dm=None, dm_last=0, vhf_last=0, hermi=1,
         vj = ks.get_j(cell, dm[0]+dm[1], hermi, kpt, kpts_band)
         vxc += vj
     else:
+        omega, alpha, hyb = ks._numint.rsh_and_hybrid_coeff(ks.xc, spin=cell.spin)
         vj, vk = ks.get_jk(cell, dm, hermi, kpt, kpts_band)
         vj = vj[0] + vj[1]
         vk *= hyb
-        if abs(omega) > 1e-10:
+        if omega != 0:
             vklr = ks.get_k(cell, dm, hermi, kpt, kpts_band, omega=omega)
             vklr *= (alpha - hyb)
             vk += vklr
diff --git a/pyscf/pbc/grad/krks.py b/pyscf/pbc/grad/krks.py
index 226e6ae8c6..ddfe7c6e30 100644
--- a/pyscf/pbc/grad/krks.py
+++ b/pyscf/pbc/grad/krks.py
@@ -43,8 +43,6 @@ def get_veff(ks_grad, dm=None, kpts=None):
     if grids.coords is None:
         grids.build(with_non0tab=True)
 
-    omega, alpha, hyb = ni.rsh_and_hybrid_coeff(mf.xc, spin=cell.spin)
-
     mem_now = lib.current_memory()[0]
     max_memory = max(2000, ks_grad.max_memory*.9-mem_now)
     if ks_grad.grid_response:
@@ -53,13 +51,14 @@ def get_veff(ks_grad, dm=None, kpts=None):
         vxc = get_vxc(ni, cell, grids, mf.xc, dm, kpts,
                            max_memory=max_memory, verbose=ks_grad.verbose)
     t0 = logger.timer(ks_grad, 'vxc', *t0)
-    if abs(hyb) < 1e-10 and abs(alpha) < 1e-10:
+    if not ni.libxc.is_hybrid_xc(mf.xc):
         vj = ks_grad.get_j(dm, kpts)
         vxc += vj
     else:
+        omega, alpha, hyb = ni.rsh_and_hybrid_coeff(mf.xc, spin=cell.spin)
         vj, vk = ks_grad.get_jk(dm, kpts)
         vk *= hyb
-        if abs(omega) > 1e-10:  # For range separated Coulomb operator
+        if omega != 0:
             with cell.with_range_coulomb(omega):
                 vk += ks_grad.get_k(dm, kpts) * (alpha - hyb)
         vxc += vj - vk * .5
diff --git a/pyscf/pbc/grad/kuks.py b/pyscf/pbc/grad/kuks.py
index 8e06dbfcea..d73aa3990f 100644
--- a/pyscf/pbc/grad/kuks.py
+++ b/pyscf/pbc/grad/kuks.py
@@ -44,24 +44,23 @@ def get_veff(ks_grad, dm=None, kpts=None):
     if grids.coords is None:
         grids.build(with_non0tab=True)
 
-    omega, alpha, hyb = ni.rsh_and_hybrid_coeff(mf.xc, spin=cell.spin)
-
     mem_now = lib.current_memory()[0]
     max_memory = max(2000, ks_grad.max_memory*.9-mem_now)
     if ks_grad.grid_response:
         raise NotImplementedError
     else:
         vxc =  get_vxc(ni, cell, grids, mf.xc, dm, kpts,
-                           max_memory=max_memory, verbose=ks_grad.verbose)
+                       max_memory=max_memory, verbose=ks_grad.verbose)
     t0 = logger.timer(ks_grad, 'vxc', *t0)
 
-    if abs(hyb) < 1e-10 and abs(alpha) < 1e-10:
+    if not ni.libxc.is_hybrid_xc(mf.xc):
         vj = ks_grad.get_j(dm, kpts)
         vxc += vj[:,0][:,None] + vj[:,1][:,None]
     else:
+        omega, alpha, hyb = ni.rsh_and_hybrid_coeff(mf.xc, spin=cell.spin)
         vj, vk = ks_grad.get_jk(dm, kpts)
         vk *= hyb
-        if abs(omega) > 1e-10:  # For range separated Coulomb operator
+        if omega != 0:
             with cell.with_range_coulomb(omega):
                 vk += ks_grad.get_k(dm, kpts) * (alpha - hyb)
         vxc += vj[:,0][:,None] + vj[:,1][:,None] - vk
diff --git a/pyscf/pbc/mp/kmp2.py b/pyscf/pbc/mp/kmp2.py
index f90f81ba68..a3bc9ecf84 100644
--- a/pyscf/pbc/mp/kmp2.py
+++ b/pyscf/pbc/mp/kmp2.py
@@ -90,7 +90,6 @@ def kernel(mp, mo_energy, mo_coeff, verbose=logger.NOTE, with_t2=WITH_T2):
 
     fao2mo = mp._scf.with_df.ao2mo
     kconserv = mp.khelper.kconserv
-    emp2 = 0.
     oovv_ij = np.zeros((nkpts,nocc,nocc,nvir,nvir), dtype=mo_coeff[0].dtype)
 
     mo_e_o = [mo_energy[k][:nocc] for k in range(nkpts)]
@@ -108,6 +107,7 @@ def kernel(mp, mo_energy, mo_coeff, verbose=logger.NOTE, with_t2=WITH_T2):
     if with_df_ints:
         Lov = _init_mp_df_eris(mp)
 
+    emp2_ss = emp2_os = 0.
     for ki in range(nkpts):
         for kj in range(nkpts):
             for ka in range(nkpts):
@@ -139,12 +139,16 @@ def kernel(mp, mo_energy, mo_coeff, verbose=logger.NOTE, with_t2=WITH_T2):
                 t2_ijab = np.conj(oovv_ij[ka]/eijab)
                 if with_t2:
                     t2[ki, kj, ka] = t2_ijab
-                woovv = 2*oovv_ij[ka] - oovv_ij[kb].transpose(0,1,3,2)
-                emp2 += einsum('ijab,ijab', t2_ijab, woovv).real
+                edi = einsum('ijab,ijab', t2_ijab, oovv_ij[ka]).real * 2
+                exi = -einsum('ijab,ijba', t2_ijab, oovv_ij[kb]).real
+                emp2_ss += edi*0.5 + exi
+                emp2_os += edi*0.5
 
     log.timer("KMP2", *cput0)
 
-    emp2 /= nkpts
+    emp2_ss /= nkpts
+    emp2_os /= nkpts
+    emp2 = lib.tag_array(emp2_ss+emp2_os, e_corr_ss=emp2_ss, e_corr_os=emp2_os)
 
     return emp2, t2
 
@@ -725,6 +729,8 @@ def __init__(self, mf, frozen=None, mo_coeff=None, mo_occ=None):
         self._nmo = None
         self.e_hf = None
         self.e_corr = None
+        self.e_corr_ss = None
+        self.e_corr_os = None
         self.t2 = None
         self._keys = set(self.__dict__.keys())
 
@@ -769,7 +775,13 @@ def kernel(self, mo_energy=None, mo_coeff=None, with_t2=WITH_T2):
 
         self.e_corr, self.t2 = \
                 kernel(self, mo_energy, mo_coeff, verbose=self.verbose, with_t2=with_t2)
-        logger.log(self, 'KMP2 energy = %.15g', self.e_corr)
+
+        self.e_corr_ss = getattr(self.e_corr, 'e_corr_ss', 0)
+        self.e_corr_os = getattr(self.e_corr, 'e_corr_os', 0)
+        self.e_corr = float(self.e_corr)
+
+        self._finalize()
+
         return self.e_corr, self.t2
 
 KRMP2 = KMP2
@@ -806,4 +818,3 @@ def kernel(self, mo_energy=None, mo_coeff=None, with_t2=WITH_T2):
     mymp = mp.KMP2(kmf)
     emp2, t2 = mymp.kernel()
     print(emp2 - -0.204721432828996)
-
diff --git a/pyscf/pbc/mp/kmp2_ksymm.py b/pyscf/pbc/mp/kmp2_ksymm.py
index c2b5399b16..22ec1f6322 100644
--- a/pyscf/pbc/mp/kmp2_ksymm.py
+++ b/pyscf/pbc/mp/kmp2_ksymm.py
@@ -44,7 +44,6 @@ def kernel(mp, mo_energy, mo_coeff, verbose=logger.NOTE, with_t2=WITH_T2):
     eijab = np.zeros((nocc,nocc,nvir,nvir))
 
     fao2mo = mp._scf.with_df.ao2mo
-    emp2 = 0.
     oovv_ij = np.zeros((nkpts,nocc,nocc,nvir,nvir), dtype=mo_coeff[0].dtype)
 
     mo_e_o = [mo_energy[k][:nocc] for k in range(nkpts)]
@@ -57,6 +56,7 @@ def kernel(mp, mo_energy, mo_coeff, verbose=logger.NOTE, with_t2=WITH_T2):
     _, igroup = np.unique(kijab[:,:2], axis=0, return_index=True)
     igroup = list(igroup) + [len(kijab)]
 
+    emp2_ss = emp2_os = 0.
     nao2mo = 0
     icount = 0
     for i in range(len(igroup)-1):
@@ -102,11 +102,15 @@ def kernel(mp, mo_energy, mo_coeff, verbose=logger.NOTE, with_t2=WITH_T2):
             t2_ijab = np.conj(oovv_ij[ka]/eijab)
             idx_ibz = k4_bz2ibz[ki*nkpts**2 + kj*nkpts + ka]
             assert(icount == idx_ibz)
-            woovv = 2*oovv_ij[ka] - oovv_ij[kb].transpose(0,1,3,2)
-            emp2 += np.einsum('ijab,ijab', t2_ijab, woovv).real * weight[idx_ibz] * nkpts**3
+            edi = einsum('ijab,ijab', t2_ijab, oovv_ij[ka]).real * 2
+            exi = -einsum('ijab,ijba', t2_ijab, oovv_ij[kb]).real
+            emp2_ss += (edi*0.5 + exi) * weight[idx_ibz] * nkpts**3
+            emp2_os += edi*0.5 * weight[idx_ibz] * nkpts**3
             icount += 1
 
-    emp2 /= nkpts
+    emp2_ss /= nkpts
+    emp2_os /= nkpts
+    emp2 = lib.tag_array(emp2_ss+emp2_os, e_corr_ss=emp2_ss, e_corr_os=emp2_os)
     assert(icount == len(kijab))
     logger.debug(mp, "Number of ao2mo transformations performed in KMP2: %d", nao2mo)
     logger.timer(mp, 'KMP2', *t0)
@@ -234,7 +238,13 @@ def kernel(self, mo_energy=None, mo_coeff=None, with_t2=WITH_T2):
 
         self.e_corr, self.t2 = \
                 kernel(self, mo_energy, mo_coeff, verbose=self.verbose, with_t2=with_t2)
-        logger.log(self, 'KMP2 energy = %.15g', self.e_corr)
+
+        self.e_corr_ss = getattr(self.e_corr, 'e_corr_ss', 0)
+        self.e_corr_os = getattr(self.e_corr, 'e_corr_os', 0)
+        self.e_corr = float(self.e_corr)
+
+        self._finalize()
+
         return self.e_corr, self.t2
 
     make_rdm1 = make_rdm1
diff --git a/pyscf/pbc/mp/test/test_scs.py b/pyscf/pbc/mp/test/test_scs.py
new file mode 100644
index 0000000000..8fb023f40e
--- /dev/null
+++ b/pyscf/pbc/mp/test/test_scs.py
@@ -0,0 +1,94 @@
+#!/usr/bin/env python
+# Copyright 2014-2018 The PySCF Developers. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import unittest
+import numpy as np
+
+from pyscf.pbc import gto as pbcgto
+from pyscf.pbc import scf as pbcscf
+import pyscf.pbc.mp
+import pyscf.pbc.mp.kmp2
+
+
+def build_cell(space_group_symmetry=False):
+    atom = 'C 0 0 0'
+    a = np.eye(3) * 5
+    basis = 'cc-pvdz'
+    if space_group_symmetry:
+        return pbcgto.M(atom=atom, basis=basis, a=a, output='/dev/null',
+                        space_group_symmetry=True,
+                        symmorphic=False).set(precision=1e-12, verbose=4)
+    return pbcgto.M(atom=atom, basis=basis, a=a, output='/dev/null').set(precision=1e-12, verbose=4)
+
+
+class KnownValues(unittest.TestCase):
+    def test_mp2(self):
+        cell = build_cell()
+        mf = pbcscf.RHF(cell).density_fit()
+        mf.conv_tol = 1e-10
+        mf.kernel()
+        pt = pyscf.pbc.mp.mp2.RMP2(mf).run()
+
+        self.assertAlmostEqual(pt.e_corr, -0.0634551885557889, 7)
+        self.assertAlmostEqual(pt.e_corr_ss, -0.00561754117341521, 7)
+        self.assertAlmostEqual(pt.e_corr_os, -0.0578376473823737, 7)
+
+    def test_kmp2(self):
+        def run_k(cell, kmesh):
+            kpts = cell.make_kpts(kmesh)
+            mf = pbcscf.KRHF(cell, kpts).density_fit()
+            mf.conv_tol = 1e-10
+            mf.kernel()
+            pt = pyscf.pbc.mp.kmp2.KMP2(mf).run()
+            return pt
+
+        cell = build_cell()
+
+        pt = run_k(cell, (1,1,1))
+        self.assertAlmostEqual(pt.e_corr, -0.0634551885557889, 7)
+        self.assertAlmostEqual(pt.e_corr_ss, -0.00561754117341521, 7)
+        self.assertAlmostEqual(pt.e_corr_os, -0.0578376473823737, 7)
+
+        pt = run_k(cell, (2,1,1))
+        self.assertAlmostEqual(pt.e_corr, -0.0640728626841088, 7)
+        self.assertAlmostEqual(pt.e_corr_ss, -0.00558491559563941, 7)
+        self.assertAlmostEqual(pt.e_corr_os, -0.0584879470884693, 7)
+
+    def test_ksymm(self):
+        def run_k(cell, kmesh):
+            kpts = cell.make_kpts(kmesh, space_group_symmetry=True)
+            mf = pbcscf.KRHF(cell, kpts).density_fit()
+            mf.conv_tol = 1e-10
+            mf.kernel()
+            pt = pyscf.pbc.mp.kmp2_ksymm.KMP2(mf).run()
+            return pt
+
+        cell = build_cell(space_group_symmetry=True)
+
+        pt = run_k(cell, (1,1,1))
+        self.assertAlmostEqual(pt.e_corr, -0.0634551885557889, 7)
+        self.assertAlmostEqual(pt.e_corr_ss, -0.00561754117341521, 7)
+        self.assertAlmostEqual(pt.e_corr_os, -0.0578376473823737, 7)
+
+        pt = run_k(cell, (2,1,1))
+        self.assertAlmostEqual(pt.e_corr, -0.0640728626841088, 7)
+        self.assertAlmostEqual(pt.e_corr_ss, -0.00558491559563941, 7)
+        self.assertAlmostEqual(pt.e_corr_os, -0.0584879470884693, 7)
+
+
+if __name__ == '__main__':
+    print("Full kpoint test")
+    unittest.main()
diff --git a/pyscf/pbc/mpicc/kccsd_rhf.py b/pyscf/pbc/mpicc/kccsd_rhf.py
index 0937dc92fd..7741ae0e9f 100644
--- a/pyscf/pbc/mpicc/kccsd_rhf.py
+++ b/pyscf/pbc/mpicc/kccsd_rhf.py
@@ -42,8 +42,8 @@
 from pyscf.pbc.tools.tril import tril_index, unpack_tril
 from pyscf.pbc.lib import kpts_helper
 import pyscf.pbc.cc.kccsd_rhf
-from pyscf.pbc.cc.eom_kccsd_rhf_ea import mask_frozen as mask_frozen_ea
-from pyscf.pbc.cc.eom_kccsd_rhf_ip import mask_frozen as mask_frozen_ip
+from pyscf.pbc.cc.eom_kccsd_ghf import mask_frozen_ea
+from pyscf.pbc.cc.eom_kccsd_ghf import mask_frozen_ip
 
 from mpi4py import MPI
 
diff --git a/pyscf/pbc/scf/_response_functions.py b/pyscf/pbc/scf/_response_functions.py
index c7ef53e53a..2926d72753 100644
--- a/pyscf/pbc/scf/_response_functions.py
+++ b/pyscf/pbc/scf/_response_functions.py
@@ -38,8 +38,8 @@ def _gen_rhf_response(mf, mo_coeff=None, mo_occ=None,
         ni.libxc.test_deriv_order(mf.xc, 2, raise_error=True)
 
         omega, alpha, hyb = ni.rsh_and_hybrid_coeff(mf.xc, spin=cell.spin)
-        hybrid = abs(hyb) > 1e-10
-        if abs(omega) > 1e-10:  # For range separated Coulomb
+        hybrid = ni.libxc.is_hybrid_xc(mf.xc)
+        if omega != 0:  # For range separated Coulomb
             raise NotImplementedError
 
         if not hybrid and isinstance(mf.with_df, multigrid.MultiGridFFTDF):
@@ -50,13 +50,8 @@ def _gen_rhf_response(mf, mo_coeff=None, mo_occ=None,
             rho0, vxc, fxc = ni.cache_xc_kernel(cell, mf.grids, mf.xc, mo_coeff,
                                                 mo_occ, 0, kpts)
         else:
-            if isinstance(mo_occ, numpy.ndarray):
-                mo_occ = mo_occ*.5
-            else:
-                mo_occ = [x*.5 for x in mo_occ]
-            rho0, vxc, fxc = ni.cache_xc_kernel(cell, mf.grids, mf.xc,
-                                                [mo_coeff]*2, [mo_occ]*2,
-                                                spin=1, kpts=kpts)
+            rho0, vxc, fxc = ni.cache_xc_kernel(cell, mf.grids, mf.xc, mo_coeff,
+                                                mo_occ, 1, kpts)
         dm0 = None #mf.make_rdm1(mo_coeff, mo_occ)
 
         if max_memory is None:
@@ -139,8 +134,8 @@ def _gen_uhf_response(mf, mo_coeff=None, mo_occ=None,
         ni.libxc.test_deriv_order(mf.xc, 2, raise_error=True)
 
         omega, alpha, hyb = ni.rsh_and_hybrid_coeff(mf.xc, spin=cell.spin)
-        hybrid = abs(hyb) > 1e-10
-        if abs(omega) > 1e-10:  # For range separated Coulomb
+        hybrid = ni.libxc.is_hybrid_xc(mf.xc)
+        if omega != 0:  # For range separated Coulomb
             raise NotImplementedError
 
         if not hybrid and isinstance(mf.with_df, multigrid.MultiGridFFTDF):
@@ -216,8 +211,8 @@ def _gen_rhf_response_gam(mf, mo_coeff=None, mo_occ=None,
         ni.libxc.test_deriv_order(mf.xc, 2, raise_error=True)
 
         omega, alpha, hyb = ni.rsh_and_hybrid_coeff(mf.xc, spin=cell.spin)
-        hybrid = abs(hyb) > 1e-10
-        if abs(omega) > 1e-10:  # For range separated Coulomb
+        hybrid = ni.libxc.is_hybrid_xc(mf.xc)
+        if omega != 0:  # For range separated Coulomb
             raise NotImplementedError
 
         if not hybrid and isinstance(mf.with_df, multigrid.MultiGridFFTDF):
@@ -317,8 +312,8 @@ def _gen_uhf_response_gam(mf, mo_coeff=None, mo_occ=None,
         ni.libxc.test_deriv_order(mf.xc, 2, raise_error=True)
 
         omega, alpha, hyb = ni.rsh_and_hybrid_coeff(mf.xc, spin=cell.spin)
-        hybrid = abs(hyb) > 1e-10
-        if abs(omega) > 1e-10:  # For range separated Coulomb
+        hybrid = ni.libxc.is_hybrid_xc(mf.xc)
+        if omega != 0:  # For range separated Coulomb
             raise NotImplementedError
 
         if not hybrid and isinstance(mf.with_df, multigrid.MultiGridFFTDF):
diff --git a/pyscf/pbc/tdscf/krhf.py b/pyscf/pbc/tdscf/krhf.py
index ea4c6dd583..e7740a6652 100644
--- a/pyscf/pbc/tdscf/krhf.py
+++ b/pyscf/pbc/tdscf/krhf.py
@@ -26,23 +26,26 @@
 from pyscf.lib import linalg_helper
 from pyscf.lib import logger
 from pyscf.tdscf import rhf
+from pyscf.pbc import scf
+from pyscf.pbc.tdscf.rhf import TDMixin
 from pyscf.pbc.scf import _response_functions  # noqa
 from pyscf.pbc.lib.kpts_helper import gamma_point
+from pyscf.pbc.df.df_ao2mo import warn_pbc2d_eri
 from pyscf import __config__
 
 REAL_EIG_THRESHOLD = getattr(__config__, 'pbc_tdscf_rhf_TDDFT_pick_eig_threshold', 1e-3)
 
-class TDA(rhf.TDA):
-    conv_tol = getattr(__config__, 'pbc_tdscf_rhf_TDA_conv_tol', 1e-6)
-
+class KTDMixin(TDMixin):
     def __init__(self, mf):
-        from pyscf.pbc import scf
-        assert (isinstance(mf, scf.khf.KSCF))
-        self.cell = mf.cell
-        rhf.TDA.__init__(self, mf)
-        from pyscf.pbc.df.df_ao2mo import warn_pbc2d_eri
+        assert isinstance(mf, scf.khf.KSCF)
+        TDMixin.__init__(self, mf)
         warn_pbc2d_eri(mf)
 
+    get_nto = lib.invalid_method('get_nto')
+
+class TDA(KTDMixin):
+    conv_tol = getattr(__config__, 'pbc_tdscf_rhf_TDA_conv_tol', 1e-6)
+
     def gen_vind(self, mf):
         # exxdiv corrections are kept in hdiag while excluding them when calling
         # the contractions between two-electron integrals and X/Y amplitudes.
@@ -87,9 +90,6 @@ def vind(zs):
             return lib.asarray(v1s).reshape(nz,-1)
         return vind, hdiag
 
-    def get_ab(self, mf=None):
-        raise NotImplementedError
-
     def init_guess(self, mf, nstates=None):
         if nstates is None: nstates = self.nstates
 
@@ -291,7 +291,6 @@ def fill_heff(heff, xs, ax, xt, axt, dot):
     return fill_heff
 
 
-from pyscf.pbc import scf
 scf.khf.KRHF.TDA  = lib.class_as_method(KTDA)
 scf.khf.KRHF.TDHF = lib.class_as_method(KTDHF)
 scf.krohf.KROHF.TDA  = None
diff --git a/pyscf/pbc/tdscf/kuhf.py b/pyscf/pbc/tdscf/kuhf.py
index 2d0f8a22ea..e6f88ba03e 100644
--- a/pyscf/pbc/tdscf/kuhf.py
+++ b/pyscf/pbc/tdscf/kuhf.py
@@ -22,24 +22,16 @@
 from pyscf.lib import logger
 from pyscf.tdscf import uhf
 from pyscf.pbc import scf
-from pyscf.pbc.tdscf.krhf import _get_e_ia, purify_krlyov_heff
+from pyscf.pbc.tdscf.krhf import KTDMixin, _get_e_ia, purify_krlyov_heff
 from pyscf.pbc.lib.kpts_helper import gamma_point
 from pyscf.pbc.scf import _response_functions  # noqa
 from pyscf import __config__
 
 REAL_EIG_THRESHOLD = getattr(__config__, 'pbc_tdscf_uhf_TDDFT_pick_eig_threshold', 1e-3)
 
-class TDA(uhf.TDA):
-
+class TDA(KTDMixin):
     conv_tol = getattr(__config__, 'pbc_tdscf_rhf_TDA_conv_tol', 1e-6)
 
-    def __init__(self, mf):
-        from pyscf.pbc.df.df_ao2mo import warn_pbc2d_eri
-        assert (isinstance(mf, scf.khf.KSCF))
-        self.cell = mf.cell
-        uhf.TDA.__init__(self, mf)
-        warn_pbc2d_eri(mf)
-
     def gen_vind(self, mf):
         '''Compute Ax'''
         mo_coeff = mf.mo_coeff
@@ -96,9 +88,6 @@ def vind(zs):
 
         return vind, hdiag
 
-    def get_ab(self, mf=None):
-        raise NotImplementedError
-
     def init_guess(self, mf, nstates=None):
         if nstates is None: nstates = self.nstates
 
diff --git a/pyscf/pbc/tdscf/rhf.py b/pyscf/pbc/tdscf/rhf.py
index 163dcdc525..a20846e2b0 100644
--- a/pyscf/pbc/tdscf/rhf.py
+++ b/pyscf/pbc/tdscf/rhf.py
@@ -22,9 +22,39 @@
 
 from pyscf import lib
 from pyscf.tdscf import rhf
+from pyscf import __config__
 
+class TDMixin(rhf.TDMixin):
+    def __init__(self, mf):
+        rhf.TDMixin.__init__(self, mf)
+        self.cell = mf.cell
+        self._keys = self._keys.union(['cell'])
+
+    def get_ab(self, mf=None):
+        raise NotImplementedError
+
+    def nuc_grad_method(self):
+        raise NotImplementedError
+
+    get_nto = rhf.TDMixin.get_nto
+    analyze = lib.invalid_method('analyze')
+    oscillator_strength = lib.invalid_method('oscillator_strength')
+    transition_dipole              = lib.invalid_method('transition_dipole')
+    transition_quadrupole          = lib.invalid_method('transition_quadrupole')
+    transition_octupole            = lib.invalid_method('transition_octupole')
+    transition_velocity_dipole     = lib.invalid_method('transition_velocity_dipole')
+    transition_velocity_quadrupole = lib.invalid_method('transition_velocity_quadrupole')
+    transition_velocity_octupole   = lib.invalid_method('transition_velocity_octupole')
+    transition_magnetic_dipole     = lib.invalid_method('transition_magnetic_dipole')
+    transition_magnetic_quadrupole = lib.invalid_method('transition_magnetic_quadrupole')
+
+
+class TDA(TDMixin):
+
+    init_guess = rhf.TDA.init_guess
+    kernel = rhf.TDA.kernel
+    _gen_vind = rhf.TDA.gen_vind
 
-class TDA(rhf.TDA):
     def gen_vind(self, mf):
         # gen_vind calls get_jk functions to compute the contraction between
         # two-electron integrals and X,Y amplitudes. There are two choices for
@@ -46,34 +76,21 @@ def gen_vind(self, mf):
         #
         # See also issue https://github.com/pyscf/pyscf/issues/1187
 
-        vind, hdiag = rhf.TDA.gen_vind(self, mf)
+        vind, hdiag = self._gen_vind(mf)
         def vindp(x):
             with lib.temporary_env(mf, exxdiv=None):
                 return vind(x)
         return vindp, hdiag
 
-    def get_ab(self, mf=None):
-        raise NotImplementedError
-
-    def nuc_grad_method(self):
-        raise NotImplementedError
-
 CIS = TDA
 
 
-class TDHF(rhf.TDHF):
-    def gen_vind(self, mf):
-        vind, hdiag = rhf.TDHF.gen_vind(self, mf)
-        def vindp(x):
-            with lib.temporary_env(mf, exxdiv=None):
-                return vind(x)
-        return vindp, hdiag
-
-    def get_ab(self, mf=None):
-        raise NotImplementedError
+class TDHF(TDA):
 
-    def nuc_grad_method(self):
-        raise NotImplementedError
+    init_guess = rhf.TDHF.init_guess
+    kernel = rhf.TDHF.kernel
+    _gen_vind = rhf.TDHF.gen_vind
+    gen_vind = TDA.gen_vind
 
 RPA = TDRHF = TDHF
 
diff --git a/pyscf/pbc/tdscf/rks.py b/pyscf/pbc/tdscf/rks.py
index 941fb3c9a5..9995b45bd1 100644
--- a/pyscf/pbc/tdscf/rks.py
+++ b/pyscf/pbc/tdscf/rks.py
@@ -24,16 +24,9 @@
 
 RPA = TDRKS = TDDFT
 
-class CasidaTDDFT(rks.CasidaTDDFT):
-    def gen_vind(self, mf):
-        vind, hdiag = rks.TDDFTNoHybrid.gen_vind(self, mf)
-        def vindp(x):
-            with lib.temporary_env(mf, exxdiv=None):
-                return vind(x)
-        return vindp, hdiag
-
-    def nuc_grad_method(self):
-        raise NotImplementedError
+class CasidaTDDFT(TDA):
+    _gen_vind = rks.TDDFTNoHybrid.gen_vind
+    gen_vind = TDA.gen_vind
 
 TDDFTNoHybrid = CasidaTDDFT
 
diff --git a/pyscf/pbc/tdscf/uhf.py b/pyscf/pbc/tdscf/uhf.py
index ed762b0f10..55ad2a484c 100644
--- a/pyscf/pbc/tdscf/uhf.py
+++ b/pyscf/pbc/tdscf/uhf.py
@@ -18,38 +18,30 @@
 
 from pyscf import lib
 from pyscf.tdscf import uhf
+from pyscf.pbc.tdscf import rhf as td_rhf
+from pyscf.pbc.tdscf.rhf import TDMixin
 
 
-class TDA(uhf.TDA):
-    def gen_vind(self, mf):
-        vind, hdiag = uhf.TDA.gen_vind(self, mf)
-        def vindp(x):
-            with lib.temporary_env(mf, exxdiv=None):
-                return vind(x)
-        return vindp, hdiag
+class TDA(TDMixin):
 
-    def get_ab(self, mf=None):
-        raise NotImplementedError
+    singlet = None
 
-    def nuc_grad_method(self):
-        raise NotImplementedError
+    init_guess = uhf.TDA.init_guess
+    kernel = uhf.TDA.kernel
+    _gen_vind = uhf.TDA.gen_vind
+    gen_vind = td_rhf.TDA.gen_vind
 
 CIS = TDA
 
 
-class TDHF(uhf.TDHF):
-    def gen_vind(self, mf):
-        vind, hdiag = uhf.TDHF.gen_vind(self, mf)
-        def vindp(x):
-            with lib.temporary_env(mf, exxdiv=None):
-                return vind(x)
-        return vindp, hdiag
+class TDHF(TDA):
 
-    def get_ab(self, mf=None):
-        raise NotImplementedError
+    singlet = None
 
-    def nuc_grad_method(self):
-        raise NotImplementedError
+    init_guess = uhf.TDHF.init_guess
+    kernel = uhf.TDHF.kernel
+    _gen_vind = uhf.TDHF.gen_vind
+    gen_vind = td_rhf.TDA.gen_vind
 
 RPA = TDUHF = TDHF
 
diff --git a/pyscf/pbc/tdscf/uks.py b/pyscf/pbc/tdscf/uks.py
index 158706800a..05a518ea06 100644
--- a/pyscf/pbc/tdscf/uks.py
+++ b/pyscf/pbc/tdscf/uks.py
@@ -25,16 +25,9 @@
 RPA = TDUKS = TDDFT
 
 
-class CasidaTDDFT(uks.CasidaTDDFT):
-    def gen_vind(self, mf):
-        vind, hdiag = uks.CasidaTDDFT.gen_vind(self, mf)
-        def vindp(x):
-            with lib.temporary_env(mf, exxdiv=None):
-                return vind(x)
-        return vindp, hdiag
-
-    def nuc_grad_method(self):
-        raise NotImplementedError
+class CasidaTDDFT(TDA):
+    _gen_vind = uks.TDDFTNoHybrid.gen_vind
+    gen_vind = TDA.gen_vind
 
 TDDFTNoHybrid = CasidaTDDFT
 
diff --git a/pyscf/qmmm/itrf.py b/pyscf/qmmm/itrf.py
index 9b8bd83102..1833150078 100644
--- a/pyscf/qmmm/itrf.py
+++ b/pyscf/qmmm/itrf.py
@@ -131,33 +131,13 @@ def get_hcore(self, mol=None):
 
             coords = self.mm_mol.atom_coords()
             charges = self.mm_mol.atom_charges()
-            if pyscf.DEBUG:
-                v = 0
-                for i,q in enumerate(charges):
-                    mol.set_rinv_origin(coords[i])
-                    v += mol.intor('int1e_rinv') * -q
-            else:
-                if mol.cart:
-                    intor = 'int3c2e_cart'
-                else:
-                    intor = 'int3c2e_sph'
-                nao = mol.nao
-                max_memory = self.max_memory - lib.current_memory()[0]
-                blksize = int(min(max_memory*1e6/8/nao**2, 200))
-                if max_memory <= 0:
-                    blksize = 1
-                    logger.warn(self, 'Memory estimate for reading point charges is negative. '
-                                'Trying to read point charges one by one.')
-                cintopt = gto.moleintor.make_cintopt(mol._atm, mol._bas,
-                                                     mol._env, intor)
-                v = 0
-                for i0, i1 in lib.prange(0, charges.size, blksize):
-                    fakemol = gto.fakemol_for_charges(coords[i0:i1])
-                    j3c = df.incore.aux_e2(mol, fakemol, intor=intor,
-                                           aosym='s2ij', cintopt=cintopt)
-                    v += numpy.einsum('xk,k->x', j3c, -charges[i0:i1])
-                v = lib.unpack_tril(v)
-            return h1e + v
+            nao = mol.nao
+            max_memory = self.max_memory - lib.current_memory()[0]
+            blksize = int(min(max_memory*1e6/8/nao**2, 200))
+            for i0, i1 in lib.prange(0, charges.size, blksize):
+                j3c = mol.intor('int1e_grids', hermi=1, grids=coords[i0:i1])
+                h1e += numpy.einsum('kpq,k->pq', j3c, -charges[i0:i1])
+            return h1e
 
         def energy_nuc(self):
             # interactions between QM nuclei and MM particles
@@ -262,30 +242,14 @@ def get_hcore(self, mol=None):
             coords = self.base.mm_mol.atom_coords()
             charges = self.base.mm_mol.atom_charges()
 
+            nao = mol.nao
+            max_memory = self.max_memory - lib.current_memory()[0]
+            blksize = int(min(max_memory*1e6/8/nao**2/3, 200))
             g_qm = grad_class.get_hcore(self, mol)
-            nao = g_qm.shape[1]
-            if pyscf.DEBUG:
-                v = 0
-                for i,q in enumerate(charges):
-                    mol.set_rinv_origin(coords[i])
-                    v += mol.intor('int1e_iprinv', comp=3) * q
-            else:
-                if mol.cart:
-                    intor = 'int3c2e_ip1_cart'
-                else:
-                    intor = 'int3c2e_ip1_sph'
-                nao = mol.nao
-                max_memory = self.max_memory - lib.current_memory()[0]
-                blksize = int(min(max_memory*1e6/8/nao**2, 200))
-                cintopt = gto.moleintor.make_cintopt(mol._atm, mol._bas,
-                                                     mol._env, intor)
-                v = 0
-                for i0, i1 in lib.prange(0, charges.size, blksize):
-                    fakemol = gto.fakemol_for_charges(coords[i0:i1])
-                    j3c = df.incore.aux_e2(mol, fakemol, intor, aosym='s1',
-                                           comp=3, cintopt=cintopt)
-                    v += numpy.einsum('ipqk,k->ipq', j3c, charges[i0:i1])
-            return g_qm + v
+            for i0, i1 in lib.prange(0, charges.size, blksize):
+                j3c = mol.intor('int1e_grids_ip', grids=coords[i0:i1])
+                g_qm += numpy.einsum('ikpq,k->ipq', j3c, charges[i0:i1])
+            return g_qm
 
         def grad_nuc(self, mol=None, atmlst=None):
             if mol is None: mol = self.mol
diff --git a/pyscf/scf/_response_functions.py b/pyscf/scf/_response_functions.py
index a916e40ef0..213770e2d6 100644
--- a/pyscf/scf/_response_functions.py
+++ b/pyscf/scf/_response_functions.py
@@ -45,12 +45,12 @@ def _gen_rhf_response(mf, mo_coeff=None, mo_occ=None,
         from pyscf.dft import numint
         ni = mf._numint
         ni.libxc.test_deriv_order(mf.xc, 2, raise_error=True)
-        if getattr(mf, 'nlc', '') != '':
+        if mf.nlc or ni.libxc.is_nlc(mf.xc):
             logger.warn(mf, 'NLC functional found in DFT object.  Its second '
                         'deriviative is not available. Its contribution is '
                         'not included in the response function.')
         omega, alpha, hyb = ni.rsh_and_hybrid_coeff(mf.xc, mol.spin)
-        hybrid = abs(hyb) > 1e-10
+        hybrid = ni.libxc.is_hybrid_xc(mf.xc)
 
         # mf can be pbc.dft.RKS object with multigrid
         if (not hybrid and
@@ -62,10 +62,10 @@ def _gen_rhf_response(mf, mo_coeff=None, mo_occ=None,
         if singlet is None:
             # for ground state orbital hessian
             rho0, vxc, fxc = ni.cache_xc_kernel(mol, mf.grids, mf.xc,
-                                                mo_coeff, mo_occ, 0)
+                                                mo_coeff, mo_occ, spin=0)
         else:
             rho0, vxc, fxc = ni.cache_xc_kernel(mol, mf.grids, mf.xc,
-                                                [mo_coeff]*2, [mo_occ*.5]*2, spin=1)
+                                                mo_coeff, mo_occ, spin=1)
         dm0 = None  #mf.make_rdm1(mo_coeff, mo_occ)
 
         if max_memory is None:
@@ -85,7 +85,7 @@ def vind(dm1):
                     if hermi != 2:
                         vj, vk = mf.get_jk(mol, dm1, hermi=hermi)
                         vk *= hyb
-                        if omega > 1e-10:  # For range separated Coulomb
+                        if abs(omega) > 1e-10:  # For range separated Coulomb
                             vk += mf.get_k(mol, dm1, hermi, omega) * (alpha-hyb)
                         v1 += vj - .5 * vk
                     else:
@@ -107,7 +107,7 @@ def vind(dm1):
                     if hermi != 2:
                         vj, vk = mf.get_jk(mol, dm1, hermi=hermi)
                         vk *= hyb
-                        if omega > 1e-10:  # For range separated Coulomb
+                        if abs(omega) > 1e-10:  # For range separated Coulomb
                             vk += mf.get_k(mol, dm1, hermi, omega) * (alpha-hyb)
                         v1 += vj - .5 * vk
                     else:
@@ -127,7 +127,7 @@ def vind(dm1):
                 if hybrid:
                     vk = mf.get_k(mol, dm1, hermi=hermi)
                     vk *= hyb
-                    if omega > 1e-10:  # For range separated Coulomb
+                    if abs(omega) > 1e-10:  # For range separated Coulomb
                         vk += mf.get_k(mol, dm1, hermi, omega) * (alpha-hyb)
                     v1 += -.5 * vk
                 return v1
@@ -155,12 +155,12 @@ def _gen_uhf_response(mf, mo_coeff=None, mo_occ=None,
     if isinstance(mf, hf.KohnShamDFT):
         ni = mf._numint
         ni.libxc.test_deriv_order(mf.xc, 2, raise_error=True)
-        if getattr(mf, 'nlc', '') != '':
+        if mf.nlc or ni.libxc.is_nlc(mf.xc):
             logger.warn(mf, 'NLC functional found in DFT object.  Its second '
                         'deriviative is not available. Its contribution is '
                         'not included in the response function.')
         omega, alpha, hyb = ni.rsh_and_hybrid_coeff(mf.xc, mol.spin)
-        hybrid = abs(hyb) > 1e-10
+        hybrid = ni.libxc.is_hybrid_xc(mf.xc)
 
         # mf can be pbc.dft.UKS object with multigrid
         if (not hybrid and
@@ -228,10 +228,10 @@ def _gen_ghf_response(mf, mo_coeff=None, mo_occ=None,
         ni = mf._numint
         assert isinstance(ni, (numint2c.NumInt2C, r_numint.RNumInt))
         ni.libxc.test_deriv_order(mf.xc, 2, raise_error=True)
-        if getattr(mf, 'nlc', '') != '':
+        if mf.nlc or ni.libxc.is_nlc(mf.xc):
             raise NotImplementedError('NLC')
         omega, alpha, hyb = ni.rsh_and_hybrid_coeff(mf.xc, mol.spin)
-        hybrid = abs(hyb) > 1e-10
+        hybrid = ni.libxc.is_hybrid_xc(mf.xc)
 
         # mf can be pbc.dft.UKS object with multigrid
         if (not hybrid and
diff --git a/pyscf/scf/addons.py b/pyscf/scf/addons.py
index 01af1fcce5..81b2e32fa5 100644
--- a/pyscf/scf/addons.py
+++ b/pyscf/scf/addons.py
@@ -268,9 +268,9 @@ def get_occ(mo_energy, mo_coeff=None):
 
 def mom_occ_(mf, occorb, setocc):
     '''Use maximum overlap method to determine occupation number for each orbital in every
-    iteration. It can be applied to unrestricted HF/KS and restricted open-shell
-    HF/KS.'''
+    iteration.'''
     from pyscf.scf import uhf, rohf
+    log = logger.Logger(mf.stdout, mf.verbose)
     if isinstance(mf, uhf.UHF):
         coef_occ_a = occorb[0][:, setocc[0]>0]
         coef_occ_b = occorb[1][:, setocc[1]>0]
@@ -279,44 +279,59 @@ def mom_occ_(mf, occorb, setocc):
             raise ValueError('Wrong occupation setting for restricted open-shell calculation.')
         coef_occ_a = occorb[:, setocc[0]>0]
         coef_occ_b = occorb[:, setocc[1]>0]
+    else: # GHF, and DHF
+        assert setocc.ndim == 1
+
+    if isinstance(mf, (uhf.UHF, rohf.ROHF)):
+        def get_occ(mo_energy=None, mo_coeff=None):
+            if mo_energy is None: mo_energy = mf.mo_energy
+            if mo_coeff is None: mo_coeff = mf.mo_coeff
+            if isinstance(mf, rohf.ROHF):
+                mo_coeff = numpy.array([mo_coeff, mo_coeff])
+            mo_occ = numpy.zeros_like(setocc)
+            nocc_a = int(numpy.sum(setocc[0]))
+            nocc_b = int(numpy.sum(setocc[1]))
+            s_a = reduce(numpy.dot, (coef_occ_a.conj().T, mf.get_ovlp(), mo_coeff[0]))
+            s_b = reduce(numpy.dot, (coef_occ_b.conj().T, mf.get_ovlp(), mo_coeff[1]))
+            #choose a subset of mo_coeff, which maximizes <old|now>
+            idx_a = numpy.argsort(numpy.einsum('ij,ij->j', s_a, s_a))[::-1]
+            idx_b = numpy.argsort(numpy.einsum('ij,ij->j', s_b, s_b))[::-1]
+            mo_occ[0,idx_a[:nocc_a]] = 1.
+            mo_occ[1,idx_b[:nocc_b]] = 1.
+
+            log.debug(' New alpha occ pattern: %s', mo_occ[0])
+            log.debug(' New beta occ pattern: %s', mo_occ[1])
+            if isinstance(mf.mo_energy, numpy.ndarray) and mf.mo_energy.ndim == 1:
+                log.debug1(' Current mo_energy(sorted) = %s', mo_energy)
+            else:
+                log.debug1(' Current alpha mo_energy(sorted) = %s', mo_energy[0])
+                log.debug1(' Current beta mo_energy(sorted) = %s', mo_energy[1])
+
+            if (int(numpy.sum(mo_occ[0])) != nocc_a):
+                log.error('mom alpha electron occupation numbers do not match: %d, %d',
+                          nocc_a, int(numpy.sum(mo_occ[0])))
+            if (int(numpy.sum(mo_occ[1])) != nocc_b):
+                log.error('mom beta electron occupation numbers do not match: %d, %d',
+                          nocc_b, int(numpy.sum(mo_occ[1])))
+
+            #output 1-dimension occupation number for restricted open-shell
+            if isinstance(mf, rohf.ROHF): mo_occ = mo_occ[0, :] + mo_occ[1, :]
+            return mo_occ
     else:
-        raise RuntimeError('Cannot support this class of instance %s' % mf)
-    log = logger.Logger(mf.stdout, mf.verbose)
-    def get_occ(mo_energy=None, mo_coeff=None):
-        if mo_energy is None: mo_energy = mf.mo_energy
-        if mo_coeff is None: mo_coeff = mf.mo_coeff
-        if isinstance(mf, rohf.ROHF): mo_coeff = numpy.array([mo_coeff, mo_coeff])
-        mo_occ = numpy.zeros_like(setocc)
-        nocc_a = int(numpy.sum(setocc[0]))
-        nocc_b = int(numpy.sum(setocc[1]))
-        s_a = reduce(numpy.dot, (coef_occ_a.T, mf.get_ovlp(), mo_coeff[0]))
-        s_b = reduce(numpy.dot, (coef_occ_b.T, mf.get_ovlp(), mo_coeff[1]))
-        #choose a subset of mo_coeff, which maximizes <old|now>
-        idx_a = numpy.argsort(numpy.einsum('ij,ij->j', s_a, s_a))[::-1]
-        idx_b = numpy.argsort(numpy.einsum('ij,ij->j', s_b, s_b))[::-1]
-        mo_occ[0][idx_a[:nocc_a]] = 1.
-        mo_occ[1][idx_b[:nocc_b]] = 1.
-
-        log.debug(' New alpha occ pattern: %s', mo_occ[0])
-        log.debug(' New beta occ pattern: %s', mo_occ[1])
-        if isinstance(mf.mo_energy, numpy.ndarray) and mf.mo_energy.ndim == 1:
-            log.debug1(' Current mo_energy(sorted) = %s', mo_energy)
-        else:
-            log.debug1(' Current alpha mo_energy(sorted) = %s', mo_energy[0])
-            log.debug1(' Current beta mo_energy(sorted) = %s', mo_energy[1])
-
-        if (int(numpy.sum(mo_occ[0])) != nocc_a):
-            log.error('mom alpha electron occupation numbers do not match: %d, %d',
-                      nocc_a, int(numpy.sum(mo_occ[0])))
-        if (int(numpy.sum(mo_occ[1])) != nocc_b):
-            log.error('mom beta electron occupation numbers do not match: %d, %d',
-                      nocc_b, int(numpy.sum(mo_occ[1])))
-
-        #output 1-dimension occupation number for restricted open-shell
-        if isinstance(mf, rohf.ROHF): mo_occ = mo_occ[0, :] + mo_occ[1, :]
-        return mo_occ
+        def get_occ(mo_energy=None, mo_coeff=None):
+            if mo_energy is None: mo_energy = mf.mo_energy
+            if mo_coeff is None: mo_coeff = mf.mo_coeff
+            mo_occ = numpy.zeros_like(setocc)
+            nocc = int(setocc.sum())
+            s = occorb[:,setocc>0].conj().T.dot(mf.get_ovlp()).dot(mo_coeff)
+            #choose a subset of mo_coeff, which maximizes <old|now>
+            idx = numpy.argsort(numpy.einsum('ij,ij->j', s, s))[::-1]
+            mo_occ[idx[:nocc]] = 1.
+            return mo_occ
+
     mf.get_occ = get_occ
     return mf
+
 mom_occ = mom_occ_
 
 def project_mo_nr2nr(mol1, mo1, mol2):
diff --git a/pyscf/scf/atom_hf.py b/pyscf/scf/atom_hf.py
index 94cc2883ff..94d303f8ca 100644
--- a/pyscf/scf/atom_hf.py
+++ b/pyscf/scf/atom_hf.py
@@ -133,7 +133,7 @@ def get_occ(self, mo_energy=None, mo_coeff=None):
         symb = mol.atom_symbol(0)
 
         nelec_ecp = mol.atom_nelec_core(0)
-        coreshl = gto.ecp.core_configuration(nelec_ecp)
+        coreshl = gto.ecp.core_configuration(nelec_ecp, atom_symbol=gto.mole._std_symbol(symb))
 
         occ = []
         for l in range(param.L_MAX):
@@ -147,11 +147,12 @@ def get_occ(self, mo_energy=None, mo_coeff=None):
 
                 logger.debug1(self, 'l = %d  occ = %d + %.4g', l, n2occ, frac)
 
-                occ_l = numpy.zeros(nbas_l)
-                occ_l[:n2occ] = 2
-                if frac > 0:
-                    occ_l[n2occ] = frac
-                occ.append(numpy.repeat(occ_l, degen))
+                if nbas_l > 0:
+                    occ_l = numpy.zeros(nbas_l)
+                    occ_l[:n2occ] = 2
+                    if frac > 0:
+                        occ_l[n2occ] = frac
+                    occ.append(numpy.repeat(occ_l, degen))
             else:
                 occ.append(numpy.zeros(nbas_l * degen))
 
@@ -166,10 +167,10 @@ def scf(self, *args, **kwargs):
 
     def _finalize(self):
         if self.converged:
-            logger.info(self, 'Atomic HF for atom  %s  converged. SCF energy = %.15g',
+            logger.info(self, 'Atomic HF for atom  %s  converged. SCF energy = %.15g\n',
                         self.mol.atom_symbol(0), self.e_tot)
         else:
-            logger.info(self, 'Atomic HF for atom  %s  not converged. SCF energy = %.15g',
+            logger.info(self, 'Atomic HF for atom  %s  not converged. SCF energy = %.15g\n',
                         self.mol.atom_symbol(0), self.e_tot)
         return self
 
diff --git a/pyscf/scf/diis.py b/pyscf/scf/diis.py
index 21dba408b6..27e6e8cc0a 100644
--- a/pyscf/scf/diis.py
+++ b/pyscf/scf/diis.py
@@ -43,6 +43,10 @@ def __init__(self, mf=None, filename=None, Corth=None):
         self.rollback = 0
         self.space = 8
         self.Corth = Corth
+        #?self._scf = mf
+        #?if hasattr(self._scf, 'get_orbsym'): # Symmetry adapted SCF objects
+        #?    self.orbsym = mf.get_orbsym(Corth)
+        #?    sym_forbid = self.orbsym[:,None] != self.orbsym
 
     def update(self, s, d, f, *args, **kwargs):
         errvec = get_err_vec(s, d, f, self.Corth)
@@ -83,16 +87,24 @@ def get_err_vec_orig(s, d, f):
 
 def get_err_vec_orth(s, d, f, Corth):
     '''error vector in orthonormal basis = C.T.conj() (SDF - FDS) C'''
+    # Symmetry information to reduce numerical error in DIIS (issue #1524)
+    orbsym = getattr(Corth, 'orbsym', None)
+    if orbsym is not None:
+        sym_forbid = orbsym[:,None] != orbsym
+
     if isinstance(f, numpy.ndarray) and f.ndim == 2:
-        sdf = reduce(numpy.dot, (s,d,f))
-        errvec = Corth.conj().T.dot(sdf.conj().T - sdf).dot(Corth).ravel()
+        sdf = reduce(numpy.dot, (Corth.conj().T, s, d, f, Corth))
+        if orbsym is not None:
+            sdf[sym_forbid] = 0
+        errvec = (sdf.conj().T - sdf).ravel()
 
     elif isinstance(f, numpy.ndarray) and f.ndim == 3 and s.ndim == 3:
         errvec = []
         for i in range(f.shape[0]):
-            sdf = reduce(numpy.dot, (s[i], d[i], f[i]))
-            errvec.append(
-                Corth[i].conj().T.dot(sdf.conj().T - sdf).dot(Corth[i]).ravel())
+            sdf = reduce(numpy.dot, (Corth[i].conj().T, s[i], d[i], f[i], Corth[i]))
+            if orbsym is not None:
+                sdf[sym_forbid] = 0
+            errvec.append((sdf.conj().T - sdf).ravel())
         errvec = numpy.vstack(errvec).ravel()
 
     elif f.ndim == s.ndim+1 and f.shape[0] == 2:  # for UHF
diff --git a/pyscf/scf/ghf.py b/pyscf/scf/ghf.py
index 7da7185921..2a48428def 100644
--- a/pyscf/scf/ghf.py
+++ b/pyscf/scf/ghf.py
@@ -442,20 +442,15 @@ def get_jk(self, mol=None, dm=None, hermi=0, with_j=True, with_k=True,
         if dm is None: dm = self.make_rdm1()
         nao = mol.nao
         dm = numpy.asarray(dm)
+        # nao = 0 for HF with custom Hamiltonian
+        if dm.shape[-1] != nao * 2 and nao != 0:
+            raise ValueError('Dimension inconsistent '
+                             f'dm.shape = {dm.shape}, mol.nao = {nao}')
 
         def jkbuild(mol, dm, hermi, with_j, with_k, omega=None):
-            if (not omega and
-                (self._eri is not None or mol.incore_anyway or self._is_mem_enough())):
-                if self._eri is None:
-                    self._eri = mol.intor('int2e', aosym='s8')
-                return hf.dot_eri_dm(self._eri, dm, hermi, with_j, with_k)
-            else:
-                return hf.SCF.get_jk(self, mol, dm, hermi, with_j, with_k, omega)
+            return hf.RHF.get_jk(self, mol, dm, hermi, with_j, with_k, omega)
 
-        if nao == dm.shape[-1]:
-            vj, vk = jkbuild(mol, dm, hermi, with_j, with_k, omega)
-        else:  # GHF density matrix, shape (2N,2N)
-            vj, vk = get_jk(mol, dm, hermi, with_j, with_k, jkbuild, omega)
+        vj, vk = get_jk(mol, dm, hermi, with_j, with_k, jkbuild, omega)
         return vj, vk
 
     def get_veff(self, mol=None, dm=None, dm_last=0, vhf_last=0, hermi=1):
@@ -554,29 +549,3 @@ def _from_rhf_init_dm(dm, breaksym=True):
 
 class HF1e(GHF):
     scf = hf._hf1e_scf
-
-
-del (PRE_ORTH_METHOD)
-
-
-if __name__ == '__main__':
-    mol = gto.Mole()
-    mol.verbose = 3
-    mol.atom = 'H 0 0 0; H 0 0 1; O .5 .6 .2'
-    mol.basis = 'ccpvdz'
-    mol.build()
-
-    mf = GHF(mol)
-    mf.kernel()
-
-    dm = mf.init_guess_by_1e(mol)
-    dm = dm + 0j
-    nao = mol.nao_nr()
-    numpy.random.seed(12)
-    dm[:nao,nao:] = numpy.random.random((nao,nao)) * .1j
-    dm[nao:,:nao] = dm[:nao,nao:].T.conj()
-    mf.kernel(dm)
-    mf.canonicalize(mf.mo_coeff, mf.mo_occ)
-    mf.analyze()
-    print(mf.spin_square())
-    print(mf.e_tot - -75.9125824421352)
diff --git a/pyscf/scf/ghf_symm.py b/pyscf/scf/ghf_symm.py
index d0a5709e4b..8ddc0a6c11 100644
--- a/pyscf/scf/ghf_symm.py
+++ b/pyscf/scf/ghf_symm.py
@@ -39,9 +39,6 @@
 def analyze(mf, verbose=logger.DEBUG, with_meta_lowdin=WITH_META_LOWDIN,
             **kwargs):
     mol = mf.mol
-    if not mol.symmetry:
-        return ghf.analyze(mf, verbose, **kwargs)
-
     mo_energy = mf.mo_energy
     mo_occ = mf.mo_occ
     mo_coeff = mf.mo_coeff
@@ -83,7 +80,7 @@ def canonicalize(mf, mo_coeff, mo_occ, fock=None):
     '''
     mol = mf.mol
     if not mol.symmetry:
-        return ghf.canonicalize(mf, mo_coeff, mo_occ, fock)
+        raise RuntimeError('mol.symmetry not enabled')
 
     if getattr(mo_coeff, 'orbsym', None) is not None:
         return hf_symm.canonicalize(mf, mo_coeff, mo_occ, fock)
@@ -113,42 +110,42 @@ def dump_flags(self, verbose=None):
 
     def build(self, mol=None):
         if mol is None: mol = self.mol
-        if mol.symmetry:
-            for irname in self.irrep_nelec:
-                if irname not in mol.irrep_name:
-                    logger.warn(self, 'Molecule does not have irrep %s', irname)
-
-            nelec_fix = self.irrep_nelec.values()
-            if any(isinstance(x, (tuple, list)) for x in nelec_fix):
-                msg =('Number of alpha/beta electrons cannot be assigned '
-                      'separately in GHF.  irrep_nelec = %s' % self.irrep_nelec)
-                raise ValueError(msg)
-            nelec_fix = sum(nelec_fix)
-            float_irname = set(mol.irrep_name) - set(self.irrep_nelec)
-            if nelec_fix > mol.nelectron:
-                msg =('More electrons defined by irrep_nelec than total num electrons. '
-                      'mol.nelectron = %d  irrep_nelec = %s' %
-                      (mol.nelectron, self.irrep_nelec))
-                raise ValueError(msg)
-            else:
-                logger.info(mol, 'Freeze %d electrons in irreps %s',
-                            nelec_fix, self.irrep_nelec.keys())
-
-            if len(float_irname) == 0 and nelec_fix != mol.nelectron:
-                msg =('Num electrons defined by irrep_nelec != total num electrons. '
-                      'mol.nelectron = %d  irrep_nelec = %s' %
-                      (mol.nelectron, self.irrep_nelec))
-                raise ValueError(msg)
-            else:
-                logger.info(mol, '    %d free electrons in irreps %s',
-                            mol.nelectron-nelec_fix, ' '.join(float_irname))
+        if not mol.symmetry:
+            raise RuntimeError('mol.symmetry not enabled')
+
+        for irname in self.irrep_nelec:
+            if irname not in mol.irrep_name:
+                logger.warn(self, 'Molecule does not have irrep %s', irname)
+
+        nelec_fix = self.irrep_nelec.values()
+        if any(isinstance(x, (tuple, list)) for x in nelec_fix):
+            msg =('Number of alpha/beta electrons cannot be assigned '
+                  'separately in GHF.  irrep_nelec = %s' % self.irrep_nelec)
+            raise ValueError(msg)
+        nelec_fix = sum(nelec_fix)
+        float_irname = set(mol.irrep_name) - set(self.irrep_nelec)
+        if nelec_fix > mol.nelectron:
+            msg =('More electrons defined by irrep_nelec than total num electrons. '
+                  'mol.nelectron = %d  irrep_nelec = %s' %
+                  (mol.nelectron, self.irrep_nelec))
+            raise ValueError(msg)
+        else:
+            logger.info(mol, 'Freeze %d electrons in irreps %s',
+                        nelec_fix, self.irrep_nelec.keys())
+
+        if len(float_irname) == 0 and nelec_fix != mol.nelectron:
+            msg =('Num electrons defined by irrep_nelec != total num electrons. '
+                  'mol.nelectron = %d  irrep_nelec = %s' %
+                  (mol.nelectron, self.irrep_nelec))
+            raise ValueError(msg)
+        else:
+            logger.info(mol, '    %d free electrons in irreps %s',
+                        mol.nelectron-nelec_fix, ' '.join(float_irname))
         return ghf.GHF.build(self, mol)
 
     def eig(self, h, s, symm_orb=None, irrep_id=None):
         if symm_orb is None or irrep_id is None:
             mol = self.mol
-            if not mol.symmetry:
-                return self._eigh(h, s)
             symm_orb = mol.symm_orb
             irrep_id = mol.irrep_id
 
@@ -171,12 +168,11 @@ def eig(self, h, s, symm_orb=None, irrep_id=None):
 
     def get_grad(self, mo_coeff, mo_occ, fock=None):
         g = ghf.GHF.get_grad(self, mo_coeff, mo_occ, fock)
-        if self.mol.symmetry:
-            occidx = mo_occ > 0
-            viridx = ~occidx
-            orbsym = self.get_orbsym(mo_coeff, self.get_ovlp())
-            sym_forbid = orbsym[viridx].reshape(-1,1) != orbsym[occidx]
-            g[sym_forbid.ravel()] = 0
+        occidx = mo_occ > 0
+        viridx = ~occidx
+        orbsym = self.get_orbsym(mo_coeff)
+        sym_forbid = orbsym[viridx].reshape(-1,1) != orbsym[occidx]
+        g[sym_forbid.ravel()] = 0
         return g
 
     def get_occ(self, mo_energy=None, mo_coeff=None):
@@ -186,9 +182,9 @@ def get_occ(self, mo_energy=None, mo_coeff=None):
         if mo_energy is None: mo_energy = self.mo_energy
         mol = self.mol
         if not mol.symmetry:
-            return ghf.GHF.get_occ(self, mo_energy, mo_coeff)
+            raise RuntimeError('mol.symmetry not enabled')
 
-        orbsym = self.get_orbsym(mo_coeff, self.get_ovlp())
+        orbsym = self.get_orbsym(mo_coeff)
         mo_occ = numpy.zeros_like(mo_energy)
         rest_idx = numpy.ones(mo_occ.size, dtype=bool)
         nelec_fix = 0
@@ -243,7 +239,7 @@ def _finalize(self):
         # ordering of the symmetry labels when two orbitals are degenerated.
         o_sort = numpy.argsort(self.mo_energy[self.mo_occ> 0].round(9), kind='mergesort')
         v_sort = numpy.argsort(self.mo_energy[self.mo_occ==0].round(9), kind='mergesort')
-        orbsym = self.get_orbsym(self.mo_coeff, self.get_ovlp())
+        orbsym = self.get_orbsym(self.mo_coeff)
         self.mo_energy = numpy.hstack((self.mo_energy[self.mo_occ> 0][o_sort],
                                        self.mo_energy[self.mo_occ==0][v_sort]))
         self.mo_coeff = numpy.hstack((self.mo_coeff[:,self.mo_occ> 0].take(o_sort, axis=1),
@@ -276,6 +272,8 @@ def get_irrep_nelec(self, mol=None, mo_coeff=None, mo_occ=None, s=None):
     def get_orbsym(self, mo_coeff=None, s=None):
         if mo_coeff is None:
             mo_coeff = self.mo_coeff
+        if getattr(mo_coeff, 'orbsym', None) is not None:
+            return mo_coeff.orbsym
         if s is None:
             s = self.get_ovlp()
         return numpy.asarray(get_orbsym(self.mol, mo_coeff, s))
diff --git a/pyscf/scf/hf.py b/pyscf/scf/hf.py
index b584dd8548..a14fa57aaf 100644
--- a/pyscf/scf/hf.py
+++ b/pyscf/scf/hf.py
@@ -367,16 +367,19 @@ def minao_basis(symb, nelec_ecp):
         stdsymb = gto.mole._std_symbol(symb)
         basis_add = gto.basis.load('ano', stdsymb)
 # coreshl defines the core shells to be removed in the initial guess
-        coreshl = gto.ecp.core_configuration(nelec_ecp)
-        #coreshl = (0,0,0,0)  # it keeps all core electrons in the initial guess
+        coreshl = gto.ecp.core_configuration(nelec_ecp, atom_symbol=stdsymb)
+        # coreshl = (0,0,0,0)  # it keeps all core electrons in the initial guess
         for l in range(4):
             ndocc, frac = atom_hf.frac_occ(stdsymb, l)
-            assert ndocc >= coreshl[l]
-            degen = l * 2 + 1
-            occ_l = [2,]*(ndocc-coreshl[l]) + [frac,]
-            occ.append(numpy.repeat(occ_l, degen))
-            basis_ano.append([l] + [b[:1] + b[1+coreshl[l]:ndocc+2]
-                                    for b in basis_add[l][1:]])
+            if ndocc >= coreshl[l]:
+                degen = l * 2 + 1
+                occ_l = [2, ]*(ndocc-coreshl[l]) + [frac, ]
+                occ.append(numpy.repeat(occ_l, degen))
+                basis_ano.append([l] + [b[:1] + b[1+coreshl[l]:ndocc+2]
+                                        for b in basis_add[l][1:]])
+            else:
+                logger.debug(mol, '*** ECP incorporates partially occupied '
+                             'shell of l = %d for atom %s ***', l, symb)
         occ = numpy.hstack(occ)
 
         if nelec_ecp > 0:
@@ -400,11 +403,12 @@ def minao_basis(symb, nelec_ecp):
                 ndocc -= coreshl[l]
                 assert ndocc <= nbas_l
 
-                occ_l = numpy.zeros(nbas_l)
-                occ_l[:ndocc] = 2
-                if frac > 0:
-                    occ_l[ndocc] = frac
-                occ4ecp.append(numpy.repeat(occ_l, l * 2 + 1))
+                if nbas_l > 0:
+                    occ_l = numpy.zeros(nbas_l)
+                    occ_l[:ndocc] = 2
+                    if frac > 0:
+                        occ_l[ndocc] = frac
+                    occ4ecp.append(numpy.repeat(occ_l, l * 2 + 1))
 
             occ4ecp = numpy.hstack(occ4ecp)
             basis4ecp = lib.flatten(basis4ecp)
@@ -1105,7 +1109,7 @@ def mulliken_pop(mol, dm, s=None, verbose=logger.DEBUG):
 
     log.info(' ** Mulliken pop  **')
     for i, s in enumerate(mol.ao_labels()):
-        log.info('pop of  %s %10.5f', s, pop[i])
+        log.info('pop of  %-14s %10.5f', s, pop[i])
 
     log.note(' ** Mulliken atomic charges  **')
     chg = numpy.zeros(mol.natm)
@@ -1114,7 +1118,7 @@ def mulliken_pop(mol, dm, s=None, verbose=logger.DEBUG):
     chg = mol.atom_charges() - chg
     for ia in range(mol.natm):
         symb = mol.atom_symbol(ia)
-        log.note('charge of  %d%s =   %10.5f', ia, symb, chg[ia])
+        log.note('charge of  %3d%s =   %10.5f', ia, symb, chg[ia])
     return pop, chg
 
 
@@ -1309,6 +1313,7 @@ def as_scanner(mf):
     class SCF_Scanner(mf.__class__, lib.SinglePointScanner):
         def __init__(self, mf_obj):
             self.__dict__.update(mf_obj.__dict__)
+            self._last_mol_fp = mf.mol.ao_loc
 
         def __call__(self, mol_or_geom, **kwargs):
             if isinstance(mol_or_geom, gto.Mole):
@@ -1326,7 +1331,7 @@ def __call__(self, mol_or_geom, **kwargs):
             elif self.chkfile and h5py.is_hdf5(self.chkfile):
                 dm0 = self.from_chk(self.chkfile)
             else:
-                dm0 = self.make_rdm1()
+                dm0 = None
                 # dm0 form last calculation cannot be used in the current
                 # calculation if a completely different system is given.
                 # Obviously, the systems are very different if the number of
@@ -1334,18 +1339,11 @@ def __call__(self, mol_or_geom, **kwargs):
                 # TODO: A robust check should include more comparison on
                 # various attributes between current `mol` and the `mol` in
                 # last calculation.
-                if dm0.shape[-1] != mol.nao:
-                    #TODO:
-                    #from pyscf.scf import addons
-                    #if numpy.any(last_mol.atom_charges() != mol.atom_charges()):
-                    #    dm0 = None
-                    #elif non-relativistic:
-                    #    addons.project_dm_nr2nr(last_mol, dm0, last_mol)
-                    #else:
-                    #    addons.project_dm_r2r(last_mol, dm0, last_mol)
-                    dm0 = None
+                if numpy.array_equal(self._last_mol_fp, mol.ao_loc):
+                    dm0 = self.make_rdm1()
             self.mo_coeff = None  # To avoid last mo_coeff being used by SOSCF
             e_tot = self.kernel(dm0=dm0, **kwargs)
+            self._last_mol_fp = mol.ao_loc
             return e_tot
 
     return SCF_Scanner(mf)
@@ -1570,6 +1568,7 @@ def dump_chk(self, envs):
     @lib.with_doc(init_guess_by_minao.__doc__)
     def init_guess_by_minao(self, mol=None):
         if mol is None: mol = self.mol
+        logger.info(self, 'Initial guess from minao.')
         return init_guess_by_minao(mol)
 
     @lib.with_doc(init_guess_by_atom.__doc__)
@@ -1849,7 +1848,19 @@ def update_(self, chkfile=None):
         '''
         from pyscf.scf import chkfile as chkmod
         if chkfile is None: chkfile = self.chkfile
-        self.__dict__.update(chkmod.load(chkfile, 'scf'))
+        chk_scf = chkmod.load(chkfile, 'scf')
+        nao = self.mol.nao
+        mo = chk_scf['mo_coeff']
+        if isinstance(mo, numpy.ndarray): # RHF
+            mo_nao = mo.shape[-2]
+        elif isinstance(mo[0], numpy.ndarray): # UHF
+            mo_nao = mo[0].shape[-2]
+        else: # KUHF
+            mo_nao = mo[0][0].shape[-2]
+        if mo_nao not in (nao, nao*2):
+            logger.warn(self, 'Current mol is inconsistent with SCF object in '
+                        'chkfile %s', chkfile)
+        self.__dict__.update(chk_scf)
         return self
     update_from_chk = update_from_chk_ = update = update_
 
diff --git a/pyscf/scf/hf_symm.py b/pyscf/scf/hf_symm.py
index 2e5d577bd5..40eab05874 100644
--- a/pyscf/scf/hf_symm.py
+++ b/pyscf/scf/hf_symm.py
@@ -53,9 +53,6 @@ def analyze(mf, verbose=logger.DEBUG, with_meta_lowdin=WITH_META_LOWDIN,
     from pyscf.lo import orth
     from pyscf.tools import dump_mat
     mol = mf.mol
-    if not mol.symmetry:
-        return hf.analyze(mf, verbose, with_meta_lowdin, **kwargs)
-
     mo_energy = mf.mo_energy
     mo_occ = mf.mo_occ
     mo_coeff = mf.mo_coeff
@@ -153,8 +150,7 @@ def canonicalize(mf, mo_coeff, mo_occ, fock=None):
     '''
     mol = mf.mol
     if not mol.symmetry:
-        return hf.canonicalize(mf, mo_coeff, mo_occ, fock)
-
+        raise RuntimeError('mol.symmetry not enabled')
     if fock is None:
         dm = mf.make_rdm1(mo_coeff, mo_occ)
         fock = mf.get_hcore() + mf.get_veff(mf.mol, dm)
@@ -305,7 +301,7 @@ def eig(mf, h, s, symm_orb=None, irrep_id=None):
     mol = mf.mol
     if symm_orb is None or irrep_id is None:
         if not mol.symmetry:
-            return mf._eigh(h, s)
+            raise RuntimeError('mol.symmetry not enabled')
         symm_orb = mol.symm_orb
         irrep_id = mol.irrep_id
 
@@ -344,14 +340,15 @@ def eig(mf, h, s, symm_orb=None, irrep_id=None):
     return e, c
 
 def get_orbsym(mol, mo_coeff, s=None, check=False, symm_orb=None, irrep_id=None):
+    if getattr(mo_coeff, 'orbsym', None) is not None:
+        return mo_coeff.orbsym
+
     if symm_orb is None or irrep_id is None:
         symm_orb = mol.symm_orb
         irrep_id = mol.irrep_id
     if mo_coeff is None:
         orbsym = numpy.hstack([[ir] * symm_orb[i].shape[1]
                                for i, ir in enumerate(irrep_id)])
-    elif getattr(mo_coeff, 'orbsym', None) is not None:
-        orbsym = mo_coeff.orbsym
     else:
         orbsym = symm.label_orb_symm(mol, irrep_id, symm_orb,
                                      mo_coeff, s, check)
@@ -445,33 +442,32 @@ def __init__(self, mol):
 
     def build(self, mol=None):
         if mol is None: mol = self.mol
-        if mol.symmetry:
-            check_irrep_nelec(mol, self.irrep_nelec, self.mol.nelectron)
+        if not mol.symmetry:
+            raise RuntimeError('mol.symmetry not enabled')
+        check_irrep_nelec(mol, self.irrep_nelec, self.mol.nelectron)
         return hf.RHF.build(self, mol)
 
     eig = eig
 
     def get_grad(self, mo_coeff, mo_occ, fock=None):
         g = hf.RHF.get_grad(self, mo_coeff, mo_occ, fock)
-        if self.mol.symmetry:
-            occidx = mo_occ > 0
-            viridx = ~occidx
-            orbsym = self.get_orbsym(mo_coeff, self.get_ovlp())
-            sym_forbid = orbsym[viridx].reshape(-1,1) != orbsym[occidx]
-            g[sym_forbid.ravel()] = 0
+        occidx = mo_occ > 0
+        viridx = ~occidx
+        orbsym = self.get_orbsym(mo_coeff)
+        sym_forbid = orbsym[viridx].reshape(-1,1) != orbsym[occidx]
+        g[sym_forbid.ravel()] = 0
         return g
 
     def get_occ(self, mo_energy=None, mo_coeff=None):
         ''' We assumed mo_energy are grouped by symmetry irreps, (see function
         self.eig). The orbitals are sorted after SCF.
         '''
-
         if mo_energy is None: mo_energy = self.mo_energy
         mol = self.mol
         if not mol.symmetry:
-            return hf.RHF.get_occ(self, mo_energy, mo_coeff)
+            raise RuntimeError('mol.symmetry not enabled')
 
-        orbsym = self.get_orbsym(mo_coeff, self.get_ovlp())
+        orbsym = self.get_orbsym(mo_coeff)
         mo_occ = numpy.zeros_like(mo_energy)
         rest_idx = numpy.ones(mo_occ.size, dtype=bool)
         nelec_fix = 0
@@ -528,7 +524,7 @@ def _finalize(self):
                             idx[self.mo_occ==0][v_sort]))
         self.mo_energy = self.mo_energy[idx]
         self.mo_occ = self.mo_occ[idx]
-        orbsym = self.get_orbsym(self.mo_coeff, self.get_ovlp())
+        orbsym = self.get_orbsym(self.mo_coeff)
         orbsym = orbsym[idx]
         degen_mapping = None
         if self.mol.groupname in ('Dooh', 'Coov'):
@@ -563,6 +559,8 @@ def get_irrep_nelec(self, mol=None, mo_coeff=None, mo_occ=None, s=None):
     def get_orbsym(self, mo_coeff=None, s=None):
         if mo_coeff is None:
             mo_coeff = self.mo_coeff
+        if getattr(mo_coeff, 'orbsym', None) is not None:
+            return mo_coeff.orbsym
         if s is None:
             s = self.get_ovlp()
         return numpy.asarray(get_orbsym(self.mol, mo_coeff, s))
@@ -615,21 +613,22 @@ def dump_flags(self, verbose=None):
 
     def build(self, mol=None):
         if mol is None: mol = self.mol
-        if mol.symmetry:
-            fix_na, fix_nb = check_irrep_nelec(mol, self.irrep_nelec, self.nelec)[:2]
-            alpha_open = beta_open = False
-            for ne in self.irrep_nelec.values():
-                if not isinstance(ne, (int, numpy.integer)):
-                    alpha_open |= ne[0] > ne[1]
-                    beta_open  |= ne[0] < ne[1]
-
-            frozen_spin = fix_na - fix_nb
-            if ((alpha_open and beta_open) or
-                (0 < mol.spin < frozen_spin) or (frozen_spin < 0 < mol.spin) or
-                (frozen_spin < mol.spin < 0) or (mol.spin < 0 < frozen_spin)):
-                raise ValueError('Low-spin configuration was found in '
-                                 'the irrep_nelec input. ROHF does not '
-                                 'support low-spin configuration.')
+        if not mol.symmetry:
+            raise RuntimeError('mol.symmetry not enabled')
+        fix_na, fix_nb = check_irrep_nelec(mol, self.irrep_nelec, self.nelec)[:2]
+        alpha_open = beta_open = False
+        for ne in self.irrep_nelec.values():
+            if not isinstance(ne, (int, numpy.integer)):
+                alpha_open |= ne[0] > ne[1]
+                beta_open  |= ne[0] < ne[1]
+
+        frozen_spin = fix_na - fix_nb
+        if ((alpha_open and beta_open) or
+            (0 < mol.spin < frozen_spin) or (frozen_spin < 0 < mol.spin) or
+            (frozen_spin < mol.spin < 0) or (mol.spin < 0 < frozen_spin)):
+            raise ValueError('Low-spin configuration was found in '
+                             'the irrep_nelec input. ROHF does not '
+                             'support low-spin configuration.')
         return hf.RHF.build(self, mol)
 
     @lib.with_doc(eig.__doc__)
@@ -643,25 +642,24 @@ def eig(self, fock, s):
 
     def get_grad(self, mo_coeff, mo_occ, fock=None):
         g = rohf.ROHF.get_grad(self, mo_coeff, mo_occ, fock)
-        if self.mol.symmetry:
-            occidxa = mo_occ > 0
-            occidxb = mo_occ == 2
-            viridxa = ~occidxa
-            viridxb = ~occidxb
-            uniq_var_a = viridxa.reshape(-1,1) & occidxa
-            uniq_var_b = viridxb.reshape(-1,1) & occidxb
-
-            orbsym = self.get_orbsym(mo_coeff, self.get_ovlp())
-            sym_forbid = orbsym.reshape(-1,1) != orbsym
-            sym_forbid = sym_forbid[uniq_var_a | uniq_var_b]
-            g[sym_forbid.ravel()] = 0
+        occidxa = mo_occ > 0
+        occidxb = mo_occ == 2
+        viridxa = ~occidxa
+        viridxb = ~occidxb
+        uniq_var_a = viridxa.reshape(-1,1) & occidxa
+        uniq_var_b = viridxb.reshape(-1,1) & occidxb
+
+        orbsym = self.get_orbsym(mo_coeff)
+        sym_forbid = orbsym.reshape(-1,1) != orbsym
+        sym_forbid = sym_forbid[uniq_var_a | uniq_var_b]
+        g[sym_forbid.ravel()] = 0
         return g
 
     def get_occ(self, mo_energy=None, mo_coeff=None):
         if mo_energy is None: mo_energy = self.mo_energy
         mol = self.mol
         if not mol.symmetry:
-            return rohf.ROHF.get_occ(self, mo_energy, mo_coeff)
+            raise RuntimeError('mol.symmetry not enabled')
 
         if getattr(mo_energy, 'mo_ea', None) is not None:
             mo_ea = mo_energy.mo_ea
@@ -670,8 +668,7 @@ def get_occ(self, mo_energy=None, mo_coeff=None):
             mo_ea = mo_eb = mo_energy
         nmo = mo_ea.size
         mo_occ = numpy.zeros(nmo)
-
-        orbsym = self.get_orbsym(mo_coeff, self.get_ovlp())
+        orbsym = self.get_orbsym(mo_coeff)
 
         rest_idx = numpy.ones(mo_occ.size, dtype=bool)
         neleca_fix = 0
@@ -790,7 +787,7 @@ def _finalize(self):
         else:
             self.mo_energy = self.mo_energy[idx]
         self.mo_occ = self.mo_occ[idx]
-        orbsym = self.get_orbsym(self.mo_coeff, self.get_ovlp())
+        orbsym = self.get_orbsym(self.mo_coeff)
         orbsym = orbsym[idx]
         degen_mapping = None
         if self.mol.groupname in ('Dooh', 'Coov'):
@@ -813,9 +810,6 @@ def analyze(self, verbose=None, with_meta_lowdin=WITH_META_LOWDIN,
         if verbose is None: verbose = self.verbose
         from pyscf.lo import orth
         from pyscf.tools import dump_mat
-        if not self.mol.symmetry:
-            return rohf.ROHF.analyze(self, verbose, with_meta_lowdin, **kwargs)
-
         mol = self.mol
         mo_energy = self.mo_energy
         mo_occ = self.mo_occ
@@ -826,7 +820,7 @@ def analyze(self, verbose=None, with_meta_lowdin=WITH_META_LOWDIN,
             self.dump_scf_summary(log)
 
             nirrep = len(mol.irrep_id)
-            orbsym = self.get_orbsym(mo_coeff, self.get_ovlp())
+            orbsym = self.get_orbsym(mo_coeff)
             irreps = numpy.asarray(mol.irrep_id)
             ndoccs = numpy.count_nonzero(irreps[:,None] == orbsym[mo_occ==2], axis=1)
             nsoccs = numpy.count_nonzero(irreps[:,None] == orbsym[mo_occ==1], axis=1)
@@ -926,12 +920,7 @@ def canonicalize(self, mo_coeff, mo_occ, fock=None):
         mo_e = lib.tag_array(mo_e, mo_ea=mo_ea, mo_eb=mo_eb)
         return mo_e, mo_coeff
 
-    def get_orbsym(self, mo_coeff=None, s=None):
-        if mo_coeff is None:
-            mo_coeff = self.mo_coeff
-        if s is None:
-            s = self.get_ovlp()
-        return numpy.asarray(get_orbsym(self.mol, mo_coeff, s))
+    get_orbsym = SymAdaptedRHF.get_orbsym
     orbsym = property(get_orbsym)
 
     get_wfnsym = get_wfnsym
diff --git a/pyscf/scf/stability.py b/pyscf/scf/stability.py
index 9c5a4fbe1c..574c429028 100644
--- a/pyscf/scf/stability.py
+++ b/pyscf/scf/stability.py
@@ -384,7 +384,8 @@ def precond(dx, e, x0):
         hdiagd = hdiag2 - e
         hdiagd[abs(hdiagd)<1e-8] = 1e-8
         return dx/hdiagd
-    x0 = v1
+    x0 = numpy.zeros_like(hdiag2)
+    x0[hdiag2>1e-5] = 1. / hdiag2[hdiag2>1e-5]
     e3, v3 = lib.davidson(hop2, x0, precond, tol=1e-4, verbose=log)
     if e3 < -1e-5:
         log.note(f'{mf.__class__} wavefunction has a RHF/RKS -> UHF/UKS instability.')
diff --git a/pyscf/scf/test/test_diis.py b/pyscf/scf/test/test_diis.py
index 442c5a716a..afa53d989b 100644
--- a/pyscf/scf/test/test_diis.py
+++ b/pyscf/scf/test/test_diis.py
@@ -94,6 +94,17 @@ def test_diis_restart(self):
         e = mf.kernel()
         self.assertAlmostEqual(e, eref, 9)
 
+    # issue 1524
+    def test_diis_for_symmetry_adapted_scf(self):
+        mol = gto.M(atom='O', spin=2, basis='ccpvdz', symmetry=True)
+        mf = mol.ROHF()
+        mf.init_guess = '1e'
+        mf.irrep_nelec = {'s+0' : (2,2), 'p-1' : (1,0), 'p+0' : (1,0), 'p+1' : (1,1) }
+        mf.max_cycle = 9
+        mf.kernel()
+        self.assertTrue(mf.converged)
+        self.assertAlmostEqual(mf.e_tot, -74.7874921601008, 9)
+
 
 if __name__ == "__main__":
     print("Full Tests for DIIS")
diff --git a/pyscf/scf/test/test_ghf.py b/pyscf/scf/test/test_ghf.py
index 5509e93043..39929492e1 100644
--- a/pyscf/scf/test/test_ghf.py
+++ b/pyscf/scf/test/test_ghf.py
@@ -140,6 +140,7 @@ def test_ghf_complex(self):
         dm[nao:,:nao] = dm[:nao,nao:].T.conj()
         mf1.kernel(dm)
         self.assertAlmostEqual(mf1.e_tot, mf.e_tot, 9)
+        self.assertAlmostEqual(mf1.e_tot, -76.02676567312, 9)
 
     def test_get_veff(self):
         nao = mol.nao_nr()*2
@@ -192,17 +193,6 @@ def test_get_jk(self):
         self.assertAlmostEqual(lib.fp(vj), 254.68614111766146+0j, 9)
         self.assertAlmostEqual(lib.fp(vk), 62.08832587927003-8.640597547171135j, 9)
 
-        nao = mol.nao_nr()
-        numpy.random.seed(1)
-        d1 = numpy.random.random((nao,nao))
-        d2 = numpy.random.random((nao,nao))
-        d = (d1+d1.T, d2+d2.T)
-        vj, vk = mf.get_jk(mol, d)
-        self.assertEqual(vj.shape, (2,nao,nao))
-        self.assertEqual(vk.shape, (2,nao,nao))
-        self.assertAlmostEqual(lib.fp(vj), -388.17756605981504, 9)
-        self.assertAlmostEqual(lib.fp(vk), -84.276190743451622, 9)
-
     def test_spin_square(self):
         nao = mol.nao_nr()
         s = mol.intor('int1e_ovlp')
@@ -230,6 +220,9 @@ def test_spin_square(self):
         ssref = spin_square(mol, mo)
         self.assertAlmostEqual(ssref, ss, 9)
 
+        # Test the call to spin_square() in ghf.analyze()
+        mf.analyze()
+
     def test_canonicalize(self):
         mo = mf.mo_coeff + 0j
         nocc = numpy.count_nonzero(mf.mo_occ > 0)
diff --git a/pyscf/scf/test/test_h2o.py b/pyscf/scf/test/test_h2o.py
index 9104bef8db..bca61250c8 100644
--- a/pyscf/scf/test/test_h2o.py
+++ b/pyscf/scf/test/test_h2o.py
@@ -106,6 +106,11 @@ def test_nr_df_uhf(self):
         uhf.conv_tol = 1e-11
         self.assertAlmostEqual(uhf.scf(), -75.983210886950, 9)
 
+    def test_nr_df_ghf(self):
+        mf = mol.GHF().density_fit(auxbasis='weigend')
+        mf.conv_tol = 1e-11
+        self.assertAlmostEqual(mf.scf(), -75.983210886950, 9)
+
     def test_nr_rhf_no_mem(self):
         rhf = scf.RHF(mol)
         rhf.conv_tol = 1e-11
diff --git a/pyscf/scf/test/test_rhf.py b/pyscf/scf/test/test_rhf.py
index d6e7ad9dd0..ffd4876fef 100644
--- a/pyscf/scf/test/test_rhf.py
+++ b/pyscf/scf/test/test_rhf.py
@@ -590,8 +590,7 @@ def test_rohf_symm_get_occ(self):
                 [0, 2, 0, 0, 0, 1, 2, 0, 1, 2, 0, 0, 2, 0, 2, 0, 0, 0, 0, 2]))
 
         mf1 = scf.RHF(mol).set(verbose=0).view(scf.hf_symm.ROHF)
-        self.assertTrue(numpy.allclose(mf1.get_occ(energy, mo_coeff),
-                [0 ,2 ,0 ,0 ,0 ,0 ,2 ,0 ,0 ,0 ,0 ,0 ,2 ,0 ,2 ,0 ,0 ,0 ,0 ,2]))
+        self.assertRaises(RuntimeError, mf1.get_occ, energy, mo_coeff)
 
     def test_get_occ_extreme_case(self):
         mol = gto.M(atom='He', verbose=7, output='/dev/null')
@@ -906,14 +905,17 @@ def test_schwarz_condition(self):
         q = opt.q_cond
         self.assertTrue(mol.intor_by_shell('int2e', shls).ravel()[0] < q[i,j] * q[k,l])
 
+    @unittest.skip('Numerical accuracy issue in libcint 5.2')
+    def test_schwarz_condition_numerical_error(self):
         mol = gto.M(atom='''
                     H    0   0   0
                     H    0   0   6
                     ''', unit='B',
                     basis = [[0, (.6, 1)], [0, (1e3, 1)]])
         omega = 5.
-        with mol.with_short_range_coulomb(5.):
+        with mol.with_short_range_coulomb(omega):
             mf = scf.RHF(mol)
+            # sr eri cannot reach the accuracy 1e-18
             mf.direct_scf_tol = 1e-18
             opt = mf.init_direct_scf()
             shls = i, j, k, l = 2, 0, 1, 1
diff --git a/pyscf/scf/uhf.py b/pyscf/scf/uhf.py
index 9db9e6a13d..3e691b30d2 100644
--- a/pyscf/scf/uhf.py
+++ b/pyscf/scf/uhf.py
@@ -694,14 +694,13 @@ def det_ovlp(mo1, mo2, occ1, occ2, ovlp):
             :math:`\mathbf{U} \mathbf{\Lambda}^{-1} \mathbf{V}^\dagger`
             They are used to calculate asymmetric density matrix
     '''
-
-    if not numpy.array_equal(occ1, occ2):
-        raise RuntimeError('Electron numbers are not equal. Electronic coupling does not exist.')
-
     c1_a = mo1[0][:, occ1[0]>0]
     c1_b = mo1[1][:, occ1[1]>0]
     c2_a = mo2[0][:, occ2[0]>0]
     c2_b = mo2[1][:, occ2[1]>0]
+    if c1_a.shape[1] != c2_a.shape[1] or c1_b.shape[1] != c2_b.shape[1]:
+        raise RuntimeError('Electron numbers are not equal. Electronic coupling does not exist.')
+
     o_a = reduce(numpy.dot, (c1_a.conj().T, ovlp, c2_a))
     o_b = reduce(numpy.dot, (c1_b.conj().T, ovlp, c2_b))
     u_a, s_a, vt_a = numpy.linalg.svd(o_a)
@@ -975,8 +974,11 @@ def make_asym_dm(self, mo1, mo2, occ1, occ2, x):
         return make_asym_dm(mo1, mo2, occ1, occ2, x)
 
     def _finalize(self):
-        ss, s = self.spin_square()
+        if self.mo_coeff is None or self.mo_occ is None:
+            # Skip spin_square (issue #1574)
+            return hf.SCF._finalize(self)
 
+        ss, s = self.spin_square()
         if self.converged:
             logger.note(self, 'converged SCF energy = %.15g  '
                         '<S^2> = %.8g  2S+1 = %.8g', self.e_tot, ss, s)
diff --git a/pyscf/scf/uhf_symm.py b/pyscf/scf/uhf_symm.py
index 447457d371..5acbe23d62 100644
--- a/pyscf/scf/uhf_symm.py
+++ b/pyscf/scf/uhf_symm.py
@@ -41,9 +41,6 @@ def analyze(mf, verbose=logger.DEBUG, with_meta_lowdin=WITH_META_LOWDIN,
     from pyscf.lo import orth
     from pyscf.tools import dump_mat
     mol = mf.mol
-    if not mol.symmetry:
-        return uhf.analyze(mf, verbose, with_meta_lowdin, **kwargs)
-
     mo_energy = mf.mo_energy
     mo_occ = mf.mo_occ
     mo_coeff = mf.mo_coeff
@@ -193,8 +190,7 @@ def canonicalize(mf, mo_coeff, mo_occ, fock=None):
     '''
     mol = mf.mol
     if not mol.symmetry:
-        return uhf.canonicalize(mf, mo_coeff, mo_occ, fock)
-
+        raise RuntimeError('mol.symmetry not enabled')
     mo_occ = numpy.asarray(mo_occ)
     assert (mo_occ.ndim == 2)
     if fock is None:
@@ -326,15 +322,13 @@ def dump_flags(self, verbose=None):
 
     def build(self, mol=None):
         if mol is None: mol = self.mol
-        if mol.symmetry:
-            hf_symm.check_irrep_nelec(mol, self.irrep_nelec, self.nelec)
+        if not mol.symmetry:
+            raise RuntimeError('mol.symmetry not enabled')
+        hf_symm.check_irrep_nelec(mol, self.irrep_nelec, self.nelec)
         return uhf.UHF.build(self, mol)
 
     def eig(self, h, s):
         mol = self.mol
-        if not mol.symmetry:
-            return self._eigh(h, s)
-
         nirrep = mol.symm_orb.__len__()
         s = symm.symmetrize_matrix(s, mol.symm_orb)
         ha = symm.symmetrize_matrix(h[0], mol.symm_orb)
@@ -391,17 +385,16 @@ def eig(self, h, s):
 
     def get_grad(self, mo_coeff, mo_occ, fock=None):
         g = uhf.UHF.get_grad(self, mo_coeff, mo_occ, fock)
-        if self.mol.symmetry:
-            occidxa = mo_occ[0] > 0
-            occidxb = mo_occ[1] > 0
-            viridxa = ~occidxa
-            viridxb = ~occidxb
-            orbsyma, orbsymb = self.get_orbsym(mo_coeff, self.get_ovlp())
-            sym_forbida = orbsyma[viridxa].reshape(-1,1) != orbsyma[occidxa]
-            sym_forbidb = orbsymb[viridxb].reshape(-1,1) != orbsymb[occidxb]
-            sym_forbid = numpy.hstack((sym_forbida.ravel(),
-                                       sym_forbidb.ravel()))
-            g[sym_forbid] = 0
+        occidxa = mo_occ[0] > 0
+        occidxb = mo_occ[1] > 0
+        viridxa = ~occidxa
+        viridxb = ~occidxb
+        orbsyma, orbsymb = self.get_orbsym(mo_coeff)
+        sym_forbida = orbsyma[viridxa].reshape(-1,1) != orbsyma[occidxa]
+        sym_forbidb = orbsymb[viridxb].reshape(-1,1) != orbsymb[occidxb]
+        sym_forbid = numpy.hstack((sym_forbida.ravel(),
+                                   sym_forbidb.ravel()))
+        g[sym_forbid] = 0
         return g
 
     def get_occ(self, mo_energy=None, mo_coeff=None):
@@ -411,9 +404,9 @@ def get_occ(self, mo_energy=None, mo_coeff=None):
         if mo_energy is None: mo_energy = self.mo_energy
         mol = self.mol
         if not mol.symmetry:
-            return uhf.UHF.get_occ(self, mo_energy, mo_coeff)
+            raise RuntimeError('mol.symmetry not enabled')
 
-        orbsyma, orbsymb = self.get_orbsym(mo_coeff, self.get_ovlp())
+        orbsyma, orbsymb = self.get_orbsym(mo_coeff)
         mo_occ = numpy.zeros_like(mo_energy)
         idx_ea_left = []
         idx_eb_left = []
@@ -516,7 +509,7 @@ def _finalize(self):
         idxb = numpy.hstack((idxb[self.mo_occ[1]> 0][ob_sort],
                              idxb[self.mo_occ[1]==0][vb_sort]))
         self.mo_energy = (ea[idxa], eb[idxb])
-        orbsyma, orbsymb = self.get_orbsym(self.mo_coeff, self.get_ovlp())
+        orbsyma, orbsymb = self.get_orbsym(self.mo_coeff)
         orbsyma = orbsyma[idxa]
         orbsymb = orbsymb[idxb]
         degen_a = degen_b = None
@@ -558,6 +551,8 @@ def get_irrep_nelec(self, mol=None, mo_coeff=None, mo_occ=None, s=None):
     def get_orbsym(self, mo_coeff=None, s=None):
         if mo_coeff is None:
             mo_coeff = self.mo_coeff
+        if getattr(mo_coeff, 'orbsym', None) is not None:
+            return mo_coeff.orbsym
         if s is None:
             s = self.get_ovlp()
         return get_orbsym(self.mol, mo_coeff, s)
diff --git a/pyscf/solvent/_ddcosmo_tdscf_grad.py b/pyscf/solvent/_ddcosmo_tdscf_grad.py
index 6a6c0cea03..3f9067ce6f 100644
--- a/pyscf/solvent/_ddcosmo_tdscf_grad.py
+++ b/pyscf/solvent/_ddcosmo_tdscf_grad.py
@@ -280,7 +280,7 @@ def tdrks_grad_elec(td_grad, x_y, singlet=True, atmlst=None,
             tdrks_grad._contract_xc_kernel(td_grad, mf.xc, dmxpy,
                                            dmzoo, True, True, singlet, max_memory)
 
-    if abs(hyb) > 1e-10:
+    if ni.libxc.is_hybrid_xc(mf.xc):
         dm = (dmzoo, dmxpy+dmxpy.T, dmxmy-dmxmy.T)
         vj, vk = mf.get_jk(mol, dm, hermi=0)
         if with_solvent.equilibrium_solvation:
@@ -289,7 +289,7 @@ def tdrks_grad_elec(td_grad, x_y, singlet=True, atmlst=None,
             vj[0] += mf.with_solvent._B_dot_x(dmzoo)
 
         vk *= hyb
-        if abs(omega) > 1e-10:
+        if omega != 0:
             vk += mf.get_k(mol, dm, hermi=0, omega=omega) * (alpha-hyb)
         veff0doo = vj[0] * 2 - vk[0] + f1oo[0] + k1ao[0] * 2
         wvo = reduce(numpy.dot, (orbv.T, veff0doo, orbo)) * 2
@@ -365,11 +365,11 @@ def fvind(x):
 
     dmz1doo = z1ao + dmzoo
     oo0 = reduce(numpy.dot, (orbo, orbo.T))
-    if abs(hyb) > 1e-10:
+    if ni.libxc.is_hybrid_xc(mf.xc):
         dm = (oo0, dmz1doo+dmz1doo.T, dmxpy+dmxpy.T, dmxmy-dmxmy.T)
         vj, vk = td_grad.get_jk(mol, dm)
         vk *= hyb
-        if abs(omega) > 1e-10:
+        if omega != 0:
             with mol.with_range_coulomb(omega):
                 vk += td_grad.get_k(mol, dm) * (alpha-hyb)
         vj = vj.reshape(-1,3,nao,nao)
@@ -688,12 +688,12 @@ def tduks_grad_elec(td_grad, x_y, atmlst=None, max_memory=2000, verbose=logger.I
             tduks_grad._contract_xc_kernel(td_grad, mf.xc, (dmxpya,dmxpyb),
                                            (dmzooa,dmzoob), True, True, max_memory)
 
-    if abs(hyb) > 1e-10:
+    if ni.libxc.is_hybrid_xc(mf.xc):
         dm = (dmzooa, dmxpya+dmxpya.T, dmxmya-dmxmya.T,
               dmzoob, dmxpyb+dmxpyb.T, dmxmyb-dmxmyb.T)
         vj, vk = mf.get_jk(mol, dm, hermi=0)
         vk *= hyb
-        if abs(omega) > 1e-10:
+        if omega != 0:
             vk += mf.get_k(mol, dm, hermi=0, omega=omega) * (alpha-hyb)
         vj = vj.reshape(2,3,nao,nao)
         vk = vk.reshape(2,3,nao,nao)
@@ -818,13 +818,13 @@ def fvind(x):
     oo0b = reduce(numpy.dot, (orbob, orbob.T))
     as_dm1 = oo0a + oo0b + (dmz1dooa + dmz1doob) * .5
 
-    if abs(hyb) > 1e-10:
+    if ni.libxc.is_hybrid_xc(mf.xc):
         dm = (oo0a, dmz1dooa+dmz1dooa.T, dmxpya+dmxpya.T, dmxmya-dmxmya.T,
               oo0b, dmz1doob+dmz1doob.T, dmxpyb+dmxpyb.T, dmxmyb-dmxmyb.T)
         vj, vk = td_grad.get_jk(mol, dm)
         vj = vj.reshape(2,4,3,nao,nao)
         vk = vk.reshape(2,4,3,nao,nao) * hyb
-        if abs(omega) > 1e-10:
+        if omega != 0:
             with mol.with_range_coulomb(omega):
                 vk += td_grad.get_k(mol, dm).reshape(2,4,3,nao,nao) * (alpha-hyb)
         veff1 = vj[0] + vj[1] - vk
diff --git a/pyscf/solvent/ddcosmo.py b/pyscf/solvent/ddcosmo.py
index 492b1192a6..d532cc4cf7 100644
--- a/pyscf/solvent/ddcosmo.py
+++ b/pyscf/solvent/ddcosmo.py
@@ -384,7 +384,7 @@ def make_L(pcmobj, r_vdw, ylm_1sph, fi):
         for ka in atoms_with_vdw_overlap(ja, atom_coords, r_vdw):
             vjk = r_vdw[ja] * coords_1sph + atom_coords[ja] - atom_coords[ka]
             tjk = lib.norm(vjk, axis=1) / r_vdw[ka]
-            wjk = pcmobj.regularize_xt(tjk, eta, r_vdw[ka])
+            wjk = pcmobj.regularize_xt(tjk, eta)
             wjk *= part_weights
             pol = sph.multipoles(vjk, lmax)
             p1 = 0
@@ -408,7 +408,7 @@ def make_fi(pcmobj, r_vdw):
             v = r_vdw[ia]*coords_1sph + atom_coords[ia] - atom_coords[ja]
             rv = lib.norm(v, axis=1)
             t = rv / r_vdw[ja]
-            xt = pcmobj.regularize_xt(t, eta, r_vdw[ja])
+            xt = pcmobj.regularize_xt(t, eta)
             fi[ia] += xt
     fi[fi < 1e-20] = 0
     return fi
@@ -851,8 +851,7 @@ def _B_dot_x(self, dm):
     get_atomic_radii = get_atomic_radii
 
     def regularize_xt(self, t, eta, scale=1):
-        # scale = eta*scale, is it correct?
-        return regularize_xt(t, eta*scale)
+        return regularize_xt(t, eta)
 
     def nuc_grad_method(self, grad_method):
         '''For grad_method in vacuum, add nuclear gradients of solvent
diff --git a/pyscf/solvent/ddcosmo_grad.py b/pyscf/solvent/ddcosmo_grad.py
index 2b3026a935..91cb49f43d 100644
--- a/pyscf/solvent/ddcosmo_grad.py
+++ b/pyscf/solvent/ddcosmo_grad.py
@@ -164,8 +164,8 @@ def make_L1(pcmobj, r_vdw, ylm_1sph, fi):
             vjk = r_vdw[ja] * coords_1sph + atom_coords[ja] - atom_coords[ka]
             rv = lib.norm(vjk, axis=1)
             tjk = rv / r_vdw[ka]
-            wjk0 = pcmobj.regularize_xt(tjk, eta, r_vdw[ka])
-            wjk1 = regularize_xt1(tjk, eta*r_vdw[ka])
+            wjk0 = pcmobj.regularize_xt(tjk, eta)
+            wjk1 = regularize_xt1(tjk, eta)
             sjk = vjk.T / rv
             wjk1 = 1./r_vdw[ka] * wjk1 * sjk
 
@@ -243,7 +243,7 @@ def make_fi1(pcmobj, r_vdw):
             v = r_vdw[ia]*coords_1sph + atom_coords[ia] - atom_coords[ja]
             rv = lib.norm(v, axis=1)
             t = rv / r_vdw[ja]
-            xt1 = regularize_xt1(t, eta*r_vdw[ja])
+            xt1 = regularize_xt1(t, eta)
             s_ij = v.T / rv
             xt1 = 1./r_vdw[ja] * xt1 * s_ij
             fi1[ia,:,ia] += xt1
diff --git a/pyscf/solvent/ddpcm.py b/pyscf/solvent/ddpcm.py
index d4fbe594cb..8c9d2eee7e 100644
--- a/pyscf/solvent/ddpcm.py
+++ b/pyscf/solvent/ddpcm.py
@@ -149,8 +149,7 @@ def energy(pcmobj, dm):
     epcm = gen_ddpcm_solver(pcmobj, pcmobj.verbose)(dm)[0]
     return epcm
 
-def regularize_xt(t, eta, scale=1):
-    eta *= scale
+def regularize_xt(t, eta):
     xt = numpy.zeros_like(t)
     inner = t <= 1-eta
     on_shell = (1-eta < t) & (t < 1)
@@ -330,7 +329,7 @@ def _B_dot_x(self, dm):
     gen_solver = as_solver = gen_ddpcm_solver
 
     def regularize_xt(self, t, eta, scale=1):
-        return regularize_xt(t, eta, scale)
+        return regularize_xt(t, eta)
 
     def nuc_grad_method(self, grad_method):
         raise NotImplementedError
diff --git a/pyscf/solvent/test/test_ddcosmo.py b/pyscf/solvent/test/test_ddcosmo.py
index 7c7f84bf0f..be13e22f20 100644
--- a/pyscf/solvent/test/test_ddcosmo.py
+++ b/pyscf/solvent/test/test_ddcosmo.py
@@ -70,7 +70,7 @@ def make_L(pcmobj, r_vdw, lebedev_order, lmax, eta=0.1):
             sjk = vjk / v.reshape(-1,1)
             Ys = sph.real_sph_vec(sjk, lmax, True)
             # scale the weight, see JCTC 9, 3637, Eq (16)
-            wjk = pcmobj.regularize_xt(tjk, eta, r_vdw[ka])
+            wjk = pcmobj.regularize_xt(tjk, eta)
             wjk[fi[ja]>1] /= fi[ja,fi[ja]>1]
             tt = numpy.ones_like(wjk)
             p1 = 0
@@ -249,24 +249,24 @@ def test_ddcosmo_scf(self):
         pcm.lmax = 6
         pcm.lebedev_order = 17
         mf = ddcosmo.ddcosmo_for_scf(scf.RHF(mol), pcm).run()
-        self.assertAlmostEqual(mf.e_tot, -112.35450855007909, 9)
+        self.assertAlmostEqual(mf.e_tot, -112.35463433688, 9)
 
     def test_ddcosmo_scf_with_overwritten_attributes(self):
         mf = ddcosmo.ddcosmo_for_scf(scf.RHF(mol))
         mf.kernel()
-        self.assertAlmostEqual(mf.e_tot, -75.57036436805902, 9)
+        self.assertAlmostEqual(mf.e_tot, -75.57006258287, 9)
 
         mf.with_solvent.lebedev_order = 15
         mf.with_solvent.lmax = 5
         mf.with_solvent.eps = .5
         mf.with_solvent.conv_tol = 1e-8
         mf.kernel()
-        self.assertAlmostEqual(mf.e_tot, -75.55326109712902, 9)
+        self.assertAlmostEqual(mf.e_tot, -75.55351392557, 9)
 
         mf.with_solvent.grids.radi_method = dft.mura_knowles
         mf.with_solvent.grids.atom_grid = {"H": (8, 50), "O": (8, 50),}
         mf.kernel()
-        self.assertAlmostEqual(mf.e_tot, -75.55216799624262, 9)
+        self.assertAlmostEqual(mf.e_tot, -75.55237426980, 9)
 
     def test_make_ylm(self):
         numpy.random.seed(1)
@@ -414,7 +414,6 @@ def test_B_dot_x(self):
         vmat = pcm._B_dot_x(dm)
         self.assertEqual(vmat.shape, (2,nao,nao))
         self.assertAlmostEqual(abs(vmat-vref*.5).max(), 0, 12)
-        self.assertAlmostEqual(lib.fp(vmat), -17.383712106418606, 12)
 
     def test_vmat(self):
         mol = gto.M(atom='H 0 0 0; H 0 1 1.2; H 1. .1 0; H .5 .5 1', verbose=0)
@@ -464,11 +463,11 @@ def test_newton_rohf(self):
         mf = mol.ROHF(max_memory=0).ddCOSMO()
         mf = mf.newton()
         e = mf.kernel()
-        self.assertAlmostEqual(e, -75.570364368046086, 9)
+        self.assertAlmostEqual(e, -75.57006258287, 9)
 
         mf = mol.RHF().ddCOSMO()
         e = mf.kernel()
-        self.assertAlmostEqual(e, -75.570364368046086, 9)
+        self.assertAlmostEqual(e, -75.57006258287, 9)
 
     def test_convert_scf(self):
         mf = mol.RHF().ddCOSMO()
@@ -490,13 +489,14 @@ def test_rhf_tda(self):
         # TDA with equilibrium_solvation
         mf = mol.RHF().ddCOSMO().run(conv_tol=1e-10)
         td = mf.TDA().ddCOSMO().run(equilibrium_solvation=True)
-        ref = numpy.array([0.3014315117408341, 0.358844688787903, 0.3951664712235241])
+        ref = numpy.array([0.30124900879, 0.358722766464, 0.3950184783571])
         self.assertAlmostEqual(abs(ref - td.e).max(), 0, 7)
+        
 
         # TDA without equilibrium_solvation
         mf = mol.RHF().ddCOSMO().run(conv_tol=1e-10)
         td = mf.TDA().ddCOSMO().run()
-        ref = numpy.array([0.3016104587222408, 0.358896882513815, 0.4004977667270891])
+        ref = numpy.array([0.301421953639, 0.358782851661, 0.400409174628])
         self.assertAlmostEqual(abs(ref - td.e).max(), 0, 7)
 
 # TODO: add tests for direct-scf, ROHF, ROKS, .newton(), and their mixes
diff --git a/pyscf/solvent/test/test_ddcosmo_grad.py b/pyscf/solvent/test/test_ddcosmo_grad.py
index eab2df6767..08ccc9fd8d 100644
--- a/pyscf/solvent/test/test_ddcosmo_grad.py
+++ b/pyscf/solvent/test/test_ddcosmo_grad.py
@@ -734,7 +734,7 @@ def get_phi1(pcmojb):
         pcmobj = ddcosmo.DDCOSMO(mol2)
         L_S2, phi = get_phi1(pcmobj)[:2]
         e2 = numpy.einsum('jx,jx', phi, L_S)
-        self.assertAlmostEqual(abs((e2-e1)/dx - phi1[0,2]).max(), 0, 7)
+        self.assertAlmostEqual(abs((e2-e1)/dx - phi1[0,2]).max(), 0, 6)
 
     def test_fi(self):
         pcmobj = ddcosmo.DDCOSMO(mol0)
@@ -753,8 +753,8 @@ def test_fi(self):
         fi_2 = ddcosmo.make_fi(pcmobj, pcmobj.get_atomic_radii())
         ui_2 = 1 - fi_2
         ui_2[ui_2<0] = 0
-        self.assertAlmostEqual(abs((fi_2-fi_1)/dx - fi1[0,2]).max(), 0, 6)
-        self.assertAlmostEqual(abs((ui_2-ui_1)/dx - ui1[0,2]).max(), 0, 6)
+        self.assertAlmostEqual(abs((fi_2-fi_1)/dx - fi1[0,2]).max(), 0, 5)
+        self.assertAlmostEqual(abs((ui_2-ui_1)/dx - ui1[0,2]).max(), 0, 5)
 
     def test_L1(self):
         pcmobj = ddcosmo.DDCOSMO(mol0)
@@ -772,7 +772,7 @@ def test_L1(self):
         pcmobj = ddcosmo.DDCOSMO(mol2)
         fi = ddcosmo.make_fi(pcmobj, r_vdw)
         L_2 = ddcosmo.make_L(pcmobj, r_vdw, ylm_1sph, fi)
-        self.assertAlmostEqual(abs((L_2-L_1)/dx - L1[0,2]).max(), 0, 7)
+        self.assertAlmostEqual(abs((L_2-L_1)/dx - L1[0,2]).max(), 0, 6)
 
     def test_e_cosmo_grad(self):
         pcmobj = ddcosmo.DDCOSMO(mol0)
@@ -781,16 +781,16 @@ def test_e_cosmo_grad(self):
         e1 = pcmobj.energy(dm)
         pcmobj = ddcosmo.DDCOSMO(mol2)
         e2 = pcmobj.energy(dm)
-        self.assertAlmostEqual(abs((e2-e1)/dx - de[0,2]).max(), 0, 7)
+        self.assertAlmostEqual(abs((e2-e1)/dx - de[0,2]).max(), 0, 6)
 
     def test_scf_grad(self):
         mf = ddcosmo.ddcosmo_for_scf(scf.RHF(mol0)).run()
         # solvent only
         de_cosmo = ddcosmo_grad.kernel(mf.with_solvent, mf.make_rdm1())
-        self.assertAlmostEqual(lib.fp(de_cosmo), 0.000770107393352652, 6)
+        self.assertAlmostEqual(lib.fp(de_cosmo), 0.000902640319, 6)
         # solvent + solute
         de = mf.nuc_grad_method().kernel()
-        self.assertAlmostEqual(lib.fp(de), -0.1920179073822721, 6)
+        self.assertAlmostEqual(lib.fp(de), -0.191856565, 6)
 
         dm1 = mf.make_rdm1()
 
@@ -806,20 +806,20 @@ def test_scf_grad(self):
 
         sc = mf.nuc_grad_method().as_scanner()
         e, g = sc('H 0 1 0; H 0 1 1.2; H 1. 0 0; H .5 .5 0')
-        self.assertAlmostEqual(e, -0.8317337703056022, 8)
-        self.assertAlmostEqual(lib.fp(g), 0.06804297145388238, 6)
+        self.assertAlmostEqual(e, -0.83152362, 8)
+        self.assertAlmostEqual(lib.fp(g), 0.068317954, 6)
 
         mol3 = gto.M(atom='H 0 1 0; H 0 1 1.2; H 1. 0 0; H .5 .5 0', unit='B')
         mf = ddcosmo.ddcosmo_for_scf(scf.RHF(mol3)).run()
         de = mf.nuc_grad_method().kernel()
-        self.assertAlmostEqual(lib.fp(de), 0.06804297145388238, 6)
+        self.assertAlmostEqual(lib.fp(de), 0.0683179013, 6)
 
     def test_casci_grad(self):
         mf = scf.RHF(mol0).ddCOSMO().run()
         mc = solvent.ddCOSMO(mcscf.CASCI(mf, 2, 2))
         e, de = mc.nuc_grad_method().as_scanner()(mol0)
-        self.assertAlmostEqual(e, -1.1844606066401635, 7)
-        self.assertAlmostEqual(lib.fp(de), -0.18558925270492277, 5)
+        self.assertAlmostEqual(e, -1.18433554, 7)
+        self.assertAlmostEqual(lib.fp(de), -0.18543118, 5)
 
         mf = scf.RHF(mol1).run()
         mc1 = solvent.ddCOSMO(mcscf.CASCI(mf, 2, 2)).run()
@@ -850,8 +850,8 @@ def test_casscf_grad(self):
         mc = solvent.ddCOSMO(mcscf.CASSCF(mf, 2, 2)).set(conv_tol=1e-9)
         mc_g = mc.nuc_grad_method().as_scanner()
         e, de = mc_g(mol0)
-        self.assertAlmostEqual(e, -1.1964048498155815, 5)
-        self.assertAlmostEqual(lib.fp(de), -0.18331022006442843, 4)
+        self.assertAlmostEqual(e, -1.19627418, 5)
+        self.assertAlmostEqual(lib.fp(de), -0.1831184, 4)
 
         mf = scf.RHF(mol1).run()
         mc1 = solvent.ddCOSMO(mcscf.CASSCF(mf, 2, 2)).run(conv_tol=1e-9)
@@ -867,8 +867,8 @@ def test_ccsd_grad(self):
         mf = scf.RHF(mol0).ddCOSMO().run()
         mycc = cc.CCSD(mf).ddCOSMO()
         e, de = mycc.nuc_grad_method().as_scanner()(mol0)
-        self.assertAlmostEqual(e, -1.206178782599439, 7)
-        self.assertAlmostEqual(lib.fp(de), -0.17959270231901459, 5)
+        self.assertAlmostEqual(e, -1.2060391657, 7)
+        self.assertAlmostEqual(lib.fp(de), -0.1794318433, 5)
 
         mf = scf.RHF(mol1).run()
         mycc1 = solvent.ddCOSMO(cc.CCSD(mf)).run()
@@ -1017,6 +1017,16 @@ def getB(mol):
         de *= .5 * f_epsilon
         self.assertAlmostEqual(abs(de-ref).max(), 0, 12)
 
+    def test_regularize_xt(self):
+        pcmobj = ddcosmo.DDCOSMO(mol0)
+        numpy.random.seed(2)
+        t = numpy.random.rand(4)
+        eta = 0.8
+        L1 = ddcosmo_grad.regularize_xt1(t, eta)
+        L_1 = ddcosmo.regularize_xt(t-1e-4, eta)
+        L_2 = ddcosmo.regularize_xt(t+1e-4, eta)
+        self.assertAlmostEqual(abs((L_2-L_1)/2e-4 - L1).max(), 0, 6)
+
 
 if __name__ == "__main__":
     print("Full Tests for ddcosmo gradients")
diff --git a/pyscf/solvent/test/test_ddpcm.py b/pyscf/solvent/test/test_ddpcm.py
index eb2eddbb50..4496bc2bff 100644
--- a/pyscf/solvent/test/test_ddpcm.py
+++ b/pyscf/solvent/test/test_ddpcm.py
@@ -31,7 +31,7 @@ def test_ddpcm_scf(self):
         pcm.lmax = 6
         pcm.lebedev_order = 17
         mf = scf.RHF(mol).ddPCM(pcm).run()
-        self.assertAlmostEqual(mf.e_tot, -112.35457046199065, 9)
+        self.assertAlmostEqual(mf.e_tot, -112.3544929827, 8)
 
 
 if __name__ == "__main__":
diff --git a/pyscf/soscf/ciah.py b/pyscf/soscf/ciah.py
index 27d18f6054..d730eb4b2e 100644
--- a/pyscf/soscf/ciah.py
+++ b/pyscf/soscf/ciah.py
@@ -281,13 +281,6 @@ def davidson_cc(h_op, g_op, precond, x0, tol=1e-10, xs=[], ax=[],
 
 
 def _regular_step(heff, ovlp, xs, lindep, log):
-    try:
-        e, c = scipy.linalg.eigh(heff[1:,1:], ovlp[1:,1:])
-    except scipy.linalg.LinAlgError:
-        e, c = lib.safe_eigh(heff[1:,1:], ovlp[1:,1:], lindep)[:2]
-    if numpy.any(e < -1e-5):
-        log.debug('Negative hessians found %s', e[e<0])
-
     w, v, seig = lib.safe_eigh(heff, ovlp, lindep)
     if log.verbose >= logger.DEBUG3:
         numpy.set_printoptions(3, linewidth=1000)
@@ -303,8 +296,16 @@ def _regular_step(heff, ovlp, xs, lindep, log):
     idx = numpy.where(abs(v[0]) > 0.1)[0]
     sel = idx[0]
     log.debug1('CIAH eigen-sel %s', sel)
-
     w_t = w[sel]
+
+    if w_t < 1e-4:
+        try:
+            e, c = scipy.linalg.eigh(heff[1:,1:], ovlp[1:,1:])
+        except scipy.linalg.LinAlgError:
+            e, c = lib.safe_eigh(heff[1:,1:], ovlp[1:,1:], lindep)[:2]
+        if numpy.any(e < -1e-5):
+            log.debug('Negative hessians found %s', e[e<0])
+
     xtrial = _dgemv(v[1:,sel]/v[0,sel], xs)
     return xtrial, w_t, v[:,sel], sel, seig
 
diff --git a/pyscf/symm/basis.py b/pyscf/symm/basis.py
index 374d8ec956..87183eb1ea 100644
--- a/pyscf/symm/basis.py
+++ b/pyscf/symm/basis.py
@@ -439,22 +439,30 @@ def linearmole_irrep_symb2id(gpname, symb):
         if symb in DOOH_IRREP_ID_TABLE:
             return DOOH_IRREP_ID_TABLE[symb]
         else:
-            n = int(''.join([i for i in symb if i.isdigit()]))
-            if n % 2:
-                return (n//2)*10 + DOOH_IRREP_ID_TABLE['_odd'+symb[-2:]]
-            else:
-                return (n//2)*10 + DOOH_IRREP_ID_TABLE['_even'+symb[-2:]]
+            try:
+                n = int(''.join([i for i in symb if i.isdigit()]))
+                if n % 2:
+                    return (n//2)*10 + DOOH_IRREP_ID_TABLE['_odd'+symb[-2:]]
+                else:
+                    return (n//2)*10 + DOOH_IRREP_ID_TABLE['_even'+symb[-2:]]
+            except (KeyError, ValueError):
+                raise PointGroupSymmetryError(f'Incorrect Dooh irrep {symb}')
     elif gpname == 'Coov':
         if symb in COOV_IRREP_ID_TABLE:
             return COOV_IRREP_ID_TABLE[symb]
         else:
-            n = int(''.join([i for i in symb if i.isdigit()]))
-            if n % 2:
-                return (n//2)*10 + COOV_IRREP_ID_TABLE['_odd'+symb[-1]]
-            else:
-                return (n//2)*10 + COOV_IRREP_ID_TABLE['_even'+symb[-1]]
+            if 'g' in symb or 'u' in symb:
+                raise PointGroupSymmetryError(f'Incorrect Coov irrep {symb}')
+            try:
+                n = int(''.join([i for i in symb if i.isdigit()]))
+                if n % 2:
+                    return (n//2)*10 + COOV_IRREP_ID_TABLE['_odd'+symb[-1]]
+                else:
+                    return (n//2)*10 + COOV_IRREP_ID_TABLE['_even'+symb[-1]]
+            except (KeyError, ValueError):
+                raise PointGroupSymmetryError(f'Incorrect Coov irrep {symb}')
     else:
-        raise PointGroupSymmetryError('%s is not proper for linear molecule.' % gpname)
+        raise PointGroupSymmetryError(f'Incorrect cylindrical symmetry group {gpname}')
 
 DOOH_IRREP_SYMBS = ('A1g' , 'A2g' , 'E1gx', 'E1gy' , 'A2u', 'A1u' , 'E1uy', 'E1ux')
 DOOH_IRREP_SYMBS_EXT = ('gx' , 'gy' , 'gx', 'gy' , 'uy', 'ux' , 'uy', 'ux')
@@ -473,13 +481,15 @@ def linearmole_irrep_id2symb(gpname, irrep_id):
         else:
             l = abs(linearmole_irrep2momentum(irrep_id))
             n = irrep_id % 10
+            if n >= 4:
+                raise PointGroupSymmetryError(f'Incorrect Coov irrep {irrep_id}')
             if n % 2:
                 xy = 'y'
             else:
                 xy = 'x'
             return 'E%d%s' % (l, xy)
     else:
-        raise PointGroupSymmetryError('%s is not proper for linear molecule.' % gpname)
+        raise PointGroupSymmetryError(f'Incorrect cylindrical symmetry group {gpname}')
 
 def linearmole_irrep2momentum(irrep_id):
     if irrep_id % 10 in (0, 1, 5, 4):
diff --git a/pyscf/symm/geom.py b/pyscf/symm/geom.py
index 1915e84048..2ee075e63a 100644
--- a/pyscf/symm/geom.py
+++ b/pyscf/symm/geom.py
@@ -118,34 +118,32 @@ def alias_axes(axes, ref):
         new_axes = axes[[y_id,x_id,z_id]]
     return new_axes
 
-def _adjust_planar_c2v(atom_coords, center, axes):
+def _adjust_planar_c2v(atom_coords, axes):
     '''Adjust axes for planar molecules'''
     # Following http://iopenshell.usc.edu/resources/howto/symmetry/
     # See also dicussions in issue #1201
     # * planar C2v molecules should be oriented such that the X axis is perpendicular
     # to the plane of the molecule, and the Z axis is the axis of symmetry;
-    r = atom_coords - center
     natm = len(atom_coords)
     tol = TOLERANCE / numpy.sqrt(1+natm)
-    atoms_on_xz = abs(r.dot(axes[1])) < tol
+    atoms_on_xz = abs(atom_coords.dot(axes[1])) < tol
     if all(atoms_on_xz):
         # rotate xy
         axes = numpy.array([-axes[1], axes[0], axes[2]])
     return axes
 
-def _adjust_planar_d2h(atom_coords, center, axes):
+def _adjust_planar_d2h(atom_coords, axes):
     '''Adjust axes for planar molecules'''
     # Following http://iopenshell.usc.edu/resources/howto/symmetry/
     # See also dicussions in issue #1201
     # * planar D2h molecules should be oriented such that the X axis is
     # perpendicular to the plane of the molecule, and the Z axis passes through
     # the greatest number of atoms.
-    r = atom_coords - center
     natm = len(atom_coords)
     tol = TOLERANCE / numpy.sqrt(1+natm)
-    natm_with_x = numpy.count_nonzero(abs(r.dot(axes[0])) > tol)
-    natm_with_y = numpy.count_nonzero(abs(r.dot(axes[1])) > tol)
-    natm_with_z = numpy.count_nonzero(abs(r.dot(axes[2])) > tol)
+    natm_with_x = numpy.count_nonzero(abs(atom_coords.dot(axes[0])) > tol)
+    natm_with_y = numpy.count_nonzero(abs(atom_coords.dot(axes[1])) > tol)
+    natm_with_z = numpy.count_nonzero(abs(atom_coords.dot(axes[2])) > tol)
     if natm_with_z == 0:  # atoms on xy plane
         if natm_with_x >= natm_with_y:  # atoms-on-y >= atoms-on-x
             # rotate xz
@@ -292,7 +290,7 @@ def detect_symm(atoms, basis=None, verbose=logger.WARN):
                 if rawsys.has_icenter():
                     gpname = 'D2h'
                     # _adjust_planar_d2h is unlikely to be called
-                    axes = _adjust_planar_d2h(rawsys.atom_coords, charge_center, axes)
+                    axes = _adjust_planar_d2h(rawsys.atom_coords, axes)
                 else:
                     gpname = 'D2'
                 axes = alias_axes(axes, numpy.eye(3))
@@ -305,7 +303,7 @@ def detect_symm(atoms, basis=None, verbose=logger.WARN):
                     gpname = 'C2h'
                 elif rawsys.has_mirror(axes[0]):
                     gpname = 'C2v'
-                    axes = _adjust_planar_c2v(rawsys.atom_coords, charge_center, axes)
+                    axes = _adjust_planar_c2v(rawsys.atom_coords, axes)
                 else:
                     gpname = 'C2'
             else:
@@ -449,8 +447,8 @@ def symm_ops(gpname, axes=None):
              'C2x': opc2x,
              'C2y': opc2y,
              'i'  : opi,
-             'sz' : opcsz,
-             'sx' : opcsx,
+             'sz' : opcsz,  # the mirror perpendicular to z
+             'sx' : opcsx,  # the mirror perpendicular to x
              'sy' : opcsy,}
     return opdic
 
diff --git a/pyscf/symm/param.py b/pyscf/symm/param.py
index 6ab62d2089..1b555d0d6e 100644
--- a/pyscf/symm/param.py
+++ b/pyscf/symm/param.py
@@ -18,6 +18,10 @@
 
 import numpy
 
+
+# C2x, C2y, C2z are rotation axis parallel to x, y, z axes
+# sx, sy, sz are mirrors perpendicular to x, y, z axes
+#
 # D2h   C2h   C2v   D2   Cs   Ci   C2   C1
 # E     E     E     E    E    E    E    E
 # C2x               C2x
diff --git a/pyscf/symm/test/test_geom.py b/pyscf/symm/test/test_geom.py
index dd8c8475e4..4fd7c8fadd 100644
--- a/pyscf/symm/test/test_geom.py
+++ b/pyscf/symm/test/test_geom.py
@@ -876,6 +876,16 @@ def test_sort_coords(self):
         idx = symm.argsort_coords(c)
         self.assertAlmostEqual(abs(c[idx] - c0).max(), 0, 9)
 
+    def test_c2v_shifted(self):
+        atoms = [
+            ["C", [1.0000000, 0.0000000, 0.1238210]],
+            ["H", [1.0000000, 0.9620540, -0.3714630]],
+            ["H", [1.0000000, -0.9620540, -0.3714630]],
+        ]
+        l, orig, axes = geom.detect_symm(atoms)
+        self.assertEqual(l, 'C2v')
+        self.assertAlmostEqual(abs(axes - numpy.diag(axes.diagonal())).max(), 0, 9)
+
 
 def ring(n, start=0):
     r = 1. / numpy.sin(numpy.pi/n)
diff --git a/pyscf/tdscf/dhf.py b/pyscf/tdscf/dhf.py
index b420d667de..6d9946e39f 100644
--- a/pyscf/tdscf/dhf.py
+++ b/pyscf/tdscf/dhf.py
@@ -123,8 +123,8 @@ def add_hf_(a, b, hyb=1):
         from pyscf.dft import xc_deriv
         ni = mf._numint
         ni.libxc.test_deriv_order(mf.xc, 2, raise_error=True)
-        if getattr(mf, 'nlc', '') != '':
-            raise NotImplementedError
+        if mf.nlc or ni.libxc.is_nlc(mf.xc):
+            raise NotImplementedError('DKS-TDDFT for NLC functionals')
 
         omega, alpha, hyb = ni.rsh_and_hybrid_coeff(mf.xc, mol.spin)
 
diff --git a/pyscf/tdscf/ghf.py b/pyscf/tdscf/ghf.py
index 8b204012b2..00cc0a672b 100644
--- a/pyscf/tdscf/ghf.py
+++ b/pyscf/tdscf/ghf.py
@@ -155,8 +155,8 @@ def add_hf_(a, b, hyb=1):
         from pyscf.dft import xc_deriv
         ni = mf._numint
         ni.libxc.test_deriv_order(mf.xc, 2, raise_error=True)
-        if getattr(mf, 'nlc', '') != '':
-            raise NotImplementedError
+        if mf.nlc or ni.libxc.is_nlc(mf.xc):
+            raise NotImplementedError('DKS-TDDFT NLC functional')
 
         if not mf.collinear:
             raise NotImplementedError
diff --git a/pyscf/tdscf/rhf.py b/pyscf/tdscf/rhf.py
index d6d034640d..e4ae1cc9bb 100644
--- a/pyscf/tdscf/rhf.py
+++ b/pyscf/tdscf/rhf.py
@@ -144,7 +144,7 @@ def add_hf_(a, b, hyb=1):
         from pyscf.dft import xc_deriv
         ni = mf._numint
         ni.libxc.test_deriv_order(mf.xc, 2, raise_error=True)
-        if getattr(mf, 'nlc', '') != '':
+        if mf.nlc or ni.libxc.is_nlc(mf.xc):
             logger.warn(mf, 'NLC functional found in DFT object.  Its second '
                         'deriviative is not available. Its contribution is '
                         'not included in the response function.')
diff --git a/pyscf/tdscf/test/test_tdrks.py b/pyscf/tdscf/test/test_tdrks.py
index 154678994a..c9d309d8c7 100644
--- a/pyscf/tdscf/test/test_tdrks.py
+++ b/pyscf/tdscf/test/test_tdrks.py
@@ -190,6 +190,17 @@ def test_tddft_b88p86_triplet(self):
         ref = numpy.sort(e[e.real>0])[[0,1,4,6,7]] * 27.2114
         self.assertAlmostEqual(abs(es - ref).max(), 0, 4)
 
+    def test_tda_rsh(self):
+        mol = gto.M(atom='H 0 0 0.6; H 0 0 0', basis = "6-31g")
+        mf = dft.RKS(mol)
+        mf.xc = 'wb97'
+        e = mf.kernel()
+        self.assertAlmostEqual(e, -1.14670613191817, 8)
+
+        e_td = mf.TDA().kernel()[0]
+        ref = [16.25021865, 27.93720198, 49.4665691]
+        self.assertAlmostEqual(abs(e_td*nist.HARTREE2EV - ref).max(), 0, 4)
+
     def test_tda_m06l_singlet(self):
         td = mf_m06l.TDA()
         es = td.kernel(nstates=5)[0] * 27.2114
diff --git a/pyscf/tdscf/uhf.py b/pyscf/tdscf/uhf.py
index a2a1c00586..d9e3d2c127 100644
--- a/pyscf/tdscf/uhf.py
+++ b/pyscf/tdscf/uhf.py
@@ -181,7 +181,7 @@ def add_hf_(a, b, hyb=1):
         from pyscf.dft import xc_deriv
         ni = mf._numint
         ni.libxc.test_deriv_order(mf.xc, 2, raise_error=True)
-        if getattr(mf, 'nlc', '') != '':
+        if mf.nlc or ni.libxc.is_nlc(mf.xc):
             logger.warn(mf, 'NLC functional found in DFT object.  Its second '
                         'deriviative is not available. Its contribution is '
                         'not included in the response function.')
@@ -782,7 +782,7 @@ def vind(xys):
     return vind, hdiag
 
 
-class TDHF(TDMixin):
+class TDHF(TDA):
 
     singlet = None
 
diff --git a/pyscf/tools/cubegen.py b/pyscf/tools/cubegen.py
index 589e5708ca..97a428b138 100644
--- a/pyscf/tools/cubegen.py
+++ b/pyscf/tools/cubegen.py
@@ -327,11 +327,22 @@ def read(self, cube_file):
             natm = int(data[0])
             self.boxorig = numpy.array([float(x) for x in data[1:]])
             def parse_nx(data):
+                from pyscf.pbc.gto import Cell
                 d = data.split()
-                return int(d[0]), numpy.array([float(x) for x in d[1:]])
-            self.nx, self.xs = parse_nx(f.readline())
-            self.ny, self.ys = parse_nx(f.readline())
-            self.nz, self.zs = parse_nx(f.readline())
+                nx = int(d[0])
+                x_vec = numpy.array([float(x) for x in d[1:]]) * nx
+                if isinstance(self.mol, Cell):
+                    # Use an asymmetric mesh for tiling unit cells
+                    xs = numpy.linspace(0, 1, nx, endpoint=False)
+                else:
+                    # Use endpoint=True to get a symmetric mesh
+                    # see also the discussion https://github.com/sunqm/pyscf/issues/154
+                    xs = numpy.linspace(0, 1, nx, endpoint=True)
+                return x_vec, nx, xs
+            self.box = numpy.zeros((3,3))
+            self.box[0], self.nx, self.xs = parse_nx(f.readline())
+            self.box[1], self.ny, self.ys = parse_nx(f.readline())
+            self.box[2], self.nz, self.zs = parse_nx(f.readline())
             atoms = []
             for ia in range(natm):
                 d = f.readline().split()
diff --git a/pyscf/tools/dump_mat.py b/pyscf/tools/dump_mat.py
index aa469e7be8..f7ad160461 100644
--- a/pyscf/tools/dump_mat.py
+++ b/pyscf/tools/dump_mat.py
@@ -152,7 +152,7 @@ def dump_rec(stdout, c, label=None, label2=None,
         else:
             stdout.write(((' '*(digits+10))+'%s\n') % ' '.join(label2[ic:ic+m]))
             for k, v in enumerate(dc):
-                stdout.write(('%12s' % label[k]) + (fmt % tuple(v)))
+                stdout.write(('%-14s' % label[k]) + (fmt % tuple(v)))
 
 def dump_mo(mol, c, label=None,
             ncol=OUTPUT_COLS, digits=OUTPUT_DIGITS, start=BASE):
diff --git a/pyscf/tools/test/test_cubegen.py b/pyscf/tools/test/test_cubegen.py
index 0733991acb..3272571621 100644
--- a/pyscf/tools/test/test_cubegen.py
+++ b/pyscf/tools/test/test_cubegen.py
@@ -36,46 +36,78 @@ def tearDownModule():
 
 class KnownValues(unittest.TestCase):
     def test_mep(self):
-        ftmp = tempfile.NamedTemporaryFile()
-        mep = cubegen.mep(mol, ftmp.name, mf.make_rdm1(),
-                          nx=10, ny=10, nz=10)
-        self.assertEqual(mep.shape, (10,10,10))
-        self.assertAlmostEqual(lib.finger(mep), -0.3198103636180436, 5)
+        with tempfile.NamedTemporaryFile() as ftmp:
+            mep = cubegen.mep(mol, ftmp.name, mf.make_rdm1(),
+                              nx=10, ny=10, nz=10)
+            self.assertEqual(mep.shape, (10,10,10))
+            self.assertAlmostEqual(lib.fp(mep), -0.3198103636180436, 5)
 
-        mep = cubegen.mep(mol, ftmp.name, mf.make_rdm1(),
-                          nx=10, ny=10, nz=10, resolution=0.5)
-        self.assertEqual(mep.shape, (12,18,15))
-        self.assertAlmostEqual(lib.finger(mep), -4.653995909548524, 5)
+            mep = cubegen.mep(mol, ftmp.name, mf.make_rdm1(),
+                              nx=10, ny=10, nz=10, resolution=0.5)
+            self.assertEqual(mep.shape, (12,18,15))
+            self.assertAlmostEqual(lib.fp(mep), -4.653995909548524, 5)
 
     def test_orb(self):
-        ftmp = tempfile.NamedTemporaryFile()
-        orb = cubegen.orbital(mol, ftmp.name, mf.mo_coeff[:,0],
-                              nx=10, ny=10, nz=10)
-        self.assertEqual(orb.shape, (10,10,10))
-        self.assertAlmostEqual(lib.finger(orb), -0.11804191128016768, 5)
+        with tempfile.NamedTemporaryFile() as ftmp:
+            orb = cubegen.orbital(mol, ftmp.name, mf.mo_coeff[:,0],
+                                  nx=10, ny=10, nz=10)
+            self.assertEqual(orb.shape, (10,10,10))
+            self.assertAlmostEqual(lib.fp(orb), -0.11804191128016768, 5)
 
-        orb = cubegen.orbital(mol, ftmp.name, mf.mo_coeff[:,0],
-                              nx=10, ny=10, nz=10, resolution=0.5)
-        self.assertEqual(orb.shape, (12,18,15))
-        self.assertAlmostEqual(lib.finger(orb), -0.8591778390706646, 5)
+            orb = cubegen.orbital(mol, ftmp.name, mf.mo_coeff[:,0],
+                                  nx=10, ny=10, nz=10, resolution=0.5)
+            self.assertEqual(orb.shape, (12,18,15))
+            self.assertAlmostEqual(lib.fp(orb), -0.8591778390706646, 5)
 
-        orb = cubegen.orbital(mol, ftmp.name, mf.mo_coeff[:,0],
-                              nx=10, ny=1, nz=1)
-        self.assertEqual(orb.shape, (10,1,1))
-        self.assertAlmostEqual(lib.finger(orb), 6.921008881822988e-09, 5)
+            orb = cubegen.orbital(mol, ftmp.name, mf.mo_coeff[:,0],
+                                  nx=10, ny=1, nz=1)
+            self.assertEqual(orb.shape, (10,1,1))
+            self.assertAlmostEqual(lib.fp(orb), 6.921008881822988e-09, 5)
 
 
     def test_rho(self):
-        ftmp = tempfile.NamedTemporaryFile()
-        rho = cubegen.density(mol, ftmp.name, mf.make_rdm1(),
-                              nx=10, ny=10, nz=10)
-        self.assertEqual(rho.shape, (10,10,10))
-        self.assertAlmostEqual(lib.finger(rho), -0.3740462814001553, 5)
+        with tempfile.NamedTemporaryFile() as ftmp:
+            rho = cubegen.density(mol, ftmp.name, mf.make_rdm1(),
+                                  nx=10, ny=10, nz=10)
+            self.assertEqual(rho.shape, (10,10,10))
+            self.assertAlmostEqual(lib.fp(rho), -0.3740462814001553, 5)
+
+            rho = cubegen.density(mol, ftmp.name, mf.make_rdm1(),
+                                  nx=10, ny=10, nz=10, resolution=0.5)
+            self.assertEqual(rho.shape, (12,18,15))
+            self.assertAlmostEqual(lib.fp(rho), -1.007950007160415, 5)
+
+    def test_rho_with_pbc(self):
+        from pyscf.pbc.gto import Cell
+        cell = Cell()
+        cell.unit = 'B'
+        cell.atom = '''
+        C  0.          0.          0.
+        C  1.68506879  1.68506879  1.68506879
+        '''
+        cell.a = '''
+        0.          3.37013758  3.37013758
+        3.37013758  0.          3.37013758
+        3.37013758  3.37013758  0.
+        '''
+        cell.basis = 'gth-szv'
+        cell.pseudo = 'gth-pade'
+        cell.mesh = [11]*3
+        cell.verbose = 5
+        cell.output = '/dev/null'
+        cell.build()
+        mf = cell.RHF().run()
+        with tempfile.NamedTemporaryFile() as ftmp:
+            rho = cubegen.density(cell, ftmp.name, mf.make_rdm1(),
+                                  nx=10, ny=10, nz=10)
+            cc = cubegen.Cube(cell)
+            self.assertEqual(rho.shape, (10,10,10))
+            self.assertAlmostEqual(lib.fp(rho), -0.253781345652853, 5)
+
+            rho1 = cc.read(ftmp.name)
+            self.assertAlmostEqual(abs(rho1 - rho).max(), 0, 5)
+            self.assertAlmostEqual(abs(cc.box - cell.lattice_vectors()).max(), 0, 5)
 
-        rho = cubegen.density(mol, ftmp.name, mf.make_rdm1(),
-                              nx=10, ny=10, nz=10, resolution=0.5)
-        self.assertEqual(rho.shape, (12,18,15))
-        self.assertAlmostEqual(lib.finger(rho), -1.007950007160415, 5)
 
 if __name__ == "__main__":
     print("Full Tests for molden")
diff --git a/pyscf/x2c/tdscf.py b/pyscf/x2c/tdscf.py
index 4cf296121c..ca3cdc8ab5 100644
--- a/pyscf/x2c/tdscf.py
+++ b/pyscf/x2c/tdscf.py
@@ -74,8 +74,8 @@ def add_hf_(a, b, hyb=1):
         from pyscf.dft import xc_deriv
         ni = mf._numint
         ni.libxc.test_deriv_order(mf.xc, 2, raise_error=True)
-        if getattr(mf, 'nlc', '') != '':
-            raise NotImplementedError
+        if mf.nlc or ni.libxc.is_nlc(mf.xc):
+            raise NotImplementedError('X2C-TDDFT for NLC functionals')
 
         if not mf.collinear:
             raise NotImplementedError
diff --git a/setup.py b/setup.py
index ae1a622a79..d4bc19e66f 100755
--- a/setup.py
+++ b/setup.py
@@ -28,6 +28,8 @@
 'Programming Language :: Python :: 3.7',
 'Programming Language :: Python :: 3.8',
 'Programming Language :: Python :: 3.9',
+'Programming Language :: Python :: 3.10',
+'Programming Language :: Python :: 3.11',
 'Topic :: Software Development',
 'Topic :: Scientific/Engineering',
 'Operating System :: POSIX',
@@ -58,7 +60,6 @@ def get_version():
 
 EXTRAS = {
     'geomopt': ['pyberny>=0.6.2', 'geometric>=0.9.7.2', 'pyscf-qsdopt'],
-    'dftd3': ['pyscf-dftd3'],
     #'dmrgscf': ['pyscf-dmrgscf'],
     'doci': ['pyscf-doci'],
     'icmpspt': ['pyscf-icmpspt'],
@@ -68,6 +69,7 @@ def get_version():
     'cppe': ['cppe'],
     'pyqmc': ['pyqmc'],
     'mcfun': ['mcfun>=0.2.1'],
+    'bse': ['basis-set-exchange'],
 }
 EXTRAS['all'] = [p for extras in EXTRAS.values() for p in extras]
 # extras which should not be installed by "all" components