diff --git a/CMakeLists.txt b/CMakeLists.txt
index 0e7210bdc6e1c5b3774871ed7b9604acaecdbe91..7d5f34d2f45f1261a9e5db83b227a3ac218abe42 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -110,6 +110,10 @@ IF(CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT)
     EXECUTE_PROCESS(COMMAND ${PYTHON_EXECUTABLE} -m site --user-site OUTPUT_VARIABLE PY_SITE OUTPUT_STRIP_TRAILING_WHITESPACE)
     SET(CMAKE_INSTALL_PREFIX "${PY_SITE}/dartflo" CACHE STRING "Install location" FORCE)
     SET(CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT FALSE)
+ELSE()
+    IF(NOT(CMAKE_INSTALL_PREFIX MATCHES "dartflo$"))
+        SET(CMAKE_INSTALL_PREFIX "${CMAKE_INSTALL_PREFIX}/dartflo" CACHE STRING "Install location" FORCE)
+    ENDIF()
 ENDIF()
 IF(UNIX AND NOT APPLE)
     SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}")
diff --git a/dart/src/wAdjoint.cpp b/dart/src/wAdjoint.cpp
index 23fe93fbda6af18f0484d7a5c37276ab51d8fff1..cb48a3a68a918be3c752f0d3751dc6dfeb67ba95 100644
--- a/dart/src/wAdjoint.cpp
+++ b/dart/src/wAdjoint.cpp
@@ -332,18 +332,7 @@ std::vector<std::vector<double>> Adjoint::computeGradientCoefficientsFlow()
     // Compute gradients of lift and drag
     Eigen::RowVectorXd dCl = Eigen::RowVectorXd::Zero(sol->pbl->msh->nodes.size()), dCd = Eigen::RowVectorXd::Zero(sol->pbl->msh->nodes.size());
     for (auto bnd : sol->pbl->bodies)
-    {
-        // gradients of loads
-        Eigen::SparseMatrix<double, Eigen::RowMajor> dL = Eigen::SparseMatrix<double, Eigen::RowMajor>(3 * bnd->nodes.size(), sol->pbl->msh->nodes.size());
-        this->buildGradientLoadsFlow(dL, *bnd);
-        // project to lift and drag directions
-        Eigen::RowVector3d dirL = sol->pbl->dirL.eval().transpose(), dirD = sol->pbl->dirD.eval().transpose();
-        for (size_t i = 0; i < bnd->nodes.size(); ++i)
-        {
-            dCl += dirL * dL.middleRows(i * 3, 3);
-            dCd += dirD * dL.middleRows(i * 3, 3);
-        }
-    }
+        this->buildGradientCoefficientsFlow(dCl, dCd, *bnd);
     return std::vector<std::vector<double>>{std::vector<double>(dCl.data(), dCl.data() + dCl.size()), std::vector<double>(dCd.data(), dCd.data() + dCd.size())};
 }
 
@@ -369,23 +358,12 @@ std::vector<std::vector<double>> Adjoint::computeGradientCoefficientsMesh()
     // Compute the gradients of the lift and drag
     Eigen::RowVectorXd dCl = Eigen::RowVectorXd::Zero(sol->pbl->nDim * sol->pbl->msh->nodes.size()), dCd = Eigen::RowVectorXd::Zero(sol->pbl->nDim * sol->pbl->msh->nodes.size());
     for (auto bnd : sol->pbl->bodies)
-    {
-        // gradients of loads
-        Eigen::SparseMatrix<double, Eigen::RowMajor> dL = Eigen::SparseMatrix<double, Eigen::RowMajor>(3 * bnd->nodes.size(), sol->pbl->nDim * sol->pbl->msh->nodes.size());
-        this->buildGradientLoadsMesh(dL, *bnd);
-        // project to lift and drag directions
-        Eigen::RowVector3d dirL = sol->pbl->dirL.eval().transpose(), dirD = sol->pbl->dirD.eval().transpose();
-        for (size_t i = 0; i < bnd->nodes.size(); ++i)
-        {
-            dCl += dirL * dL.middleRows(i * 3, 3);
-            dCd += dirD * dL.middleRows(i * 3, 3);
-        }
-    }
+        this->buildGradientCoefficientsMesh(dCl, dCd, *bnd);
     return std::vector<std::vector<double>>{std::vector<double>(dCl.data(), dCl.data() + dCl.size()), std::vector<double>(dCd.data(), dCd.data() + dCd.size())};
 }
 
 /**
- * @brief Build the gradients of the loads with respect to the flow variables one a given body
+ * @brief Build gradients of the loads with respect to the flow variables on a given body
  */
 void Adjoint::buildGradientLoadsFlow(Eigen::SparseMatrix<double, Eigen::RowMajor> &dL, Body const &bnd)
 {
@@ -421,7 +399,43 @@ void Adjoint::buildGradientLoadsFlow(Eigen::SparseMatrix<double, Eigen::RowMajor
 }
 
 /**
- * @brief Compute gradients of the residual with respect to angle of attack
+ * @brief Build gradients of the load coefficients with respect to the flow variables on a given body
+ * 
+ * Note: this is essentially the same as buildGradientLoadsFlow, but with an additional projection performed inside the loop for performance
+ */
+void Adjoint::buildGradientCoefficientsFlow(Eigen::RowVectorXd &dCl, Eigen::RowVectorXd &dCd, Body const &bnd)
+{
+    // Multithread
+    tbb::global_control control(tbb::global_control::max_allowed_parallelism, nthreads);
+    tbb::spin_mutex mutex;
+
+    // Build gradients
+    Eigen::RowVector3d dirL = sol->pbl->dirL.eval().transpose(), dirD = sol->pbl->dirD.eval().transpose();
+    tbb::parallel_for_each(bnd.groups[0]->tag->elems.begin(), bnd.groups[0]->tag->elems.end(), [&](Element *e) {
+        // Associated volume element
+        Element *eV = bnd.svMap.at(e);
+        // Build
+        Eigen::MatrixXd Ae = LoadFunctional::buildGradientFlow(*e, *eV, sol->phi, *sol->pbl->fluid->cP);
+        // Project
+        Eigen::RowVectorXd dCle = Eigen::RowVectorXd::Zero(eV->nodes.size()), dCde = Eigen::RowVectorXd::Zero(eV->nodes.size());
+        for (size_t i = 0; i < e->nodes.size(); ++i)
+        {
+            dCle += dirL * Ae.middleRows(i * 3, 3);
+            dCde += dirD * Ae.middleRows(i * 3, 3);
+        }
+        // Assembly
+        tbb::spin_mutex::scoped_lock lock(mutex);
+        for (size_t j = 0; j < eV->nodes.size(); ++j)
+        {
+            int rowj = eV->nodes[j]->row;
+            dCl(rowj) += dCle(j);
+            dCd(rowj) += dCde(j);
+        }
+    });
+}
+
+/**
+ * @brief Build gradients of the residual with respect to angle of attack
  */
 void Adjoint::buildGradientResidualAoa(Eigen::VectorXd &dR)
 {
@@ -451,7 +465,7 @@ void Adjoint::buildGradientResidualAoa(Eigen::VectorXd &dR)
 }
 
 /**
- * @brief Compute gradients of the lift and drag with respect to angle of attack
+ * @brief Build gradients of the lift and drag with respect to angle of attack
  */
 void Adjoint::buildGradientLoadsAoa(double &dCl, double &dCd)
 {
@@ -466,7 +480,7 @@ void Adjoint::buildGradientLoadsAoa(double &dCl, double &dCd)
 }
 
 /**
- * @brief Compute gradients of the residual with respect to mesh coordinates
+ * @brief Build gradients of the residual with respect to mesh coordinates
  */
 void Adjoint::buildGradientResidualMesh(Eigen::SparseMatrix<double, Eigen::RowMajor> &dR)
 {
@@ -585,7 +599,7 @@ void Adjoint::buildGradientResidualMesh(Eigen::SparseMatrix<double, Eigen::RowMa
 }
 
 /**
- * @brief Compute gradients of the loads with respect to mesh coordinates, on a given body
+ * @brief Build gradients of the loads with respect to mesh coordinates on a given body
  */
 void Adjoint::buildGradientLoadsMesh(Eigen::SparseMatrix<double, Eigen::RowMajor> &dL, Body const &bnd)
 {
@@ -630,6 +644,60 @@ void Adjoint::buildGradientLoadsMesh(Eigen::SparseMatrix<double, Eigen::RowMajor
     dL.makeCompressed();
 }
 
+/**
+ * @brief Build gradients of the loads with respect to mesh coordinates on a given body
+ * 
+ * Note: this is essentially the same as buildGradientLoadsMesh, but with an additional projection performed inside the loop for performance
+ */
+void Adjoint::buildGradientCoefficientsMesh(Eigen::RowVectorXd &dCl, Eigen::RowVectorXd &dCd, Body const &bnd)
+{
+    // Multithread
+    tbb::global_control control(tbb::global_control::max_allowed_parallelism, nthreads);
+    tbb::spin_mutex mutex;
+
+    // Build gradients
+    Eigen::RowVector3d dirL = sol->pbl->dirL.eval().transpose(), dirD = sol->pbl->dirD.eval().transpose();
+    tbb::parallel_for_each(bnd.groups[0]->tag->elems.begin(), bnd.groups[0]->tag->elems.end(), [&](Element *e) {
+        // Associated volume element
+        Element *eV = bnd.svMap.at(e);
+        // Build
+        Eigen::MatrixXd Aes, Aev;
+        std::tie(Aes, Aev) = LoadFunctional::buildGradientMesh(*e, *eV, sol->phi, *sol->pbl->fluid->cP, sol->pbl->nDim);
+        // Project
+        Eigen::RowVectorXd dCles = Eigen::RowVectorXd::Zero(sol->pbl->nDim * e->nodes.size()), dCdes = Eigen::RowVectorXd::Zero(sol->pbl->nDim * e->nodes.size());
+        Eigen::RowVectorXd dClev = Eigen::RowVectorXd::Zero(sol->pbl->nDim * eV->nodes.size()), dCdev = Eigen::RowVectorXd::Zero(sol->pbl->nDim * eV->nodes.size());
+        for (size_t i = 0; i < e->nodes.size(); ++i)
+        {
+            dCles += dirL * Aes.middleRows(i * 3, 3);
+            dCdes += dirD * Aes.middleRows(i * 3, 3);
+            dClev += dirL * Aev.middleRows(i * 3, 3);
+            dCdev += dirD * Aev.middleRows(i * 3, 3);
+        }
+        // Assembly
+        tbb::spin_mutex::scoped_lock lock(mutex);
+        // surface
+        for (size_t j = 0; j < e->nodes.size(); ++j)
+        {
+            int rowj = e->nodes[j]->row;
+            for (int n = 0; n < sol->pbl->nDim; ++n)
+            {
+                dCl(sol->pbl->nDim * rowj + n) += dCles(sol->pbl->nDim * j + n);
+                dCd(sol->pbl->nDim * rowj + n) += dCdes(sol->pbl->nDim * j + n);
+            }
+        }
+        // volume
+        for (size_t j = 0; j < eV->nodes.size(); ++j)
+        {
+            int rowj = eV->nodes[j]->row;
+            for (int n = 0; n < sol->pbl->nDim; ++n)
+            {
+                dCl(sol->pbl->nDim * rowj + n) += dClev(sol->pbl->nDim * j + n);
+                dCd(sol->pbl->nDim * rowj + n) += dCdev(sol->pbl->nDim * j + n);
+            }
+        }
+    });
+}
+
 /**
  * @brief Write the results
  */
diff --git a/dart/src/wAdjoint.h b/dart/src/wAdjoint.h
index a38c2706e9cac8a4a7af8acb20eda63c095f78c2..308a438271a57291ab5d3dcc0bc91ee0389555af 100644
--- a/dart/src/wAdjoint.h
+++ b/dart/src/wAdjoint.h
@@ -93,12 +93,14 @@ public:
 private:
     // Partial gradients wrt with respect to flow variables
     void buildGradientLoadsFlow(Eigen::SparseMatrix<double, Eigen::RowMajor> &dL, Body const &bnd);
+    void buildGradientCoefficientsFlow(Eigen::RowVectorXd &dCl, Eigen::RowVectorXd &dCd, Body const &bnd);
     // Partial gradients with respect to angle of attack
     void buildGradientResidualAoa(Eigen::VectorXd &dR);
     void buildGradientLoadsAoa(double &dCl, double &dCd);
     // Partial gradients with respect to mesh coordinates
     void buildGradientResidualMesh(Eigen::SparseMatrix<double, Eigen::RowMajor> &dR);
     void buildGradientLoadsMesh(Eigen::SparseMatrix<double, Eigen::RowMajor> &dL, Body const &bnd);
+    void buildGradientCoefficientsMesh(Eigen::RowVectorXd &dCl, Eigen::RowVectorXd &dCd, Body const &bnd);
 };
 
 } // namespace dart
diff --git a/dart/src/wNewton.cpp b/dart/src/wNewton.cpp
index 79bcd1375a1487f7125872cfbfcd3c47e284405e..bd6560eabe63a2d19a7d4c9f93a15fb8479d2871 100644
--- a/dart/src/wNewton.cpp
+++ b/dart/src/wNewton.cpp
@@ -41,6 +41,7 @@
 #include <tbb/global_control.h>
 #include <tbb/parallel_for_each.h>
 #include <tbb/spin_mutex.h>
+#include <tbb/combinable.h>
 
 #include <iomanip>
 #include <deque>
@@ -271,11 +272,8 @@ STATUS Newton::run()
  */
 void Newton::buildJac(Eigen::SparseMatrix<double, Eigen::RowMajor> &J)
 {
-    // Multithread
-    tbb::spin_mutex mutex;
-
-    // List of triplets to build Jacobian matrix
-    std::deque<Eigen::Triplet<double>> T;
+    // List of (thead-safe) triplets to build Jacobian matrix
+    tbb::combinable<std::deque<Eigen::Triplet<double>>> cmbT;
 
     // Full Potential Equation with upwind bias: analytical derivatives
     tms["0-Jbase"].start();
@@ -290,14 +288,14 @@ void Newton::buildJac(Eigen::SparseMatrix<double, Eigen::RowMajor> &J)
         std::tie(Ae1, Ae2) = PotentialResidual::buildGradientFlow(*e, *eU, phi, *fluid, muCv, mCOv);
 
         // Assembly (subsonic)
-        tbb::spin_mutex::scoped_lock lock(mutex);
+        std::deque<Eigen::Triplet<double>> &locT = cmbT.local();
         for (size_t ii = 0; ii < e->nodes.size(); ++ii)
         {
             Node *nodi = e->nodes[ii];
             for (size_t jj = 0; jj < e->nodes.size(); ++jj)
             {
                 Node *nodj = e->nodes[jj];
-                T.push_back(Eigen::Triplet<double>(rows[nodi->row], nodj->row, Ae1(ii, jj)));
+                locT.push_back(Eigen::Triplet<double>(rows[nodi->row], nodj->row, Ae1(ii, jj)));
             }
         }
         // Assembly (supersonic)
@@ -310,7 +308,7 @@ void Newton::buildJac(Eigen::SparseMatrix<double, Eigen::RowMajor> &J)
                 for (size_t jj = 0; jj < eU->nodes.size(); ++jj)
                 {
                     Node *nodj = eU->nodes[jj];
-                    T.push_back(Eigen::Triplet<double>(rows[nodi->row], nodj->row, Ae2(ii, jj)));
+                    locT.push_back(Eigen::Triplet<double>(rows[nodi->row], nodj->row, Ae2(ii, jj)));
                 }
             }
         }
@@ -328,19 +326,19 @@ void Newton::buildJac(Eigen::SparseMatrix<double, Eigen::RowMajor> &J)
             Eigen::MatrixXd Aue, Ale;
             std::tie(Aue, Ale) = WakeResidual::buildGradientFlow(*we, phi);
             // Assembly
-            tbb::spin_mutex::scoped_lock lock(mutex);
+            std::deque<Eigen::Triplet<double>> &locT = cmbT.local();
             for (size_t ii = 0; ii < we->nRow; ++ii)
             {
                 Node *nodi = we->wkN[ii].second;
                 for (size_t jj = 0; jj < we->nColUp; ++jj)
                 {
                     Node *nodj = we->volUpE->nodes[jj];
-                    T.push_back(Eigen::Triplet<double>(nodi->row, nodj->row, Aue(ii, jj)));
+                    locT.push_back(Eigen::Triplet<double>(nodi->row, nodj->row, Aue(ii, jj)));
                 }
                 for (size_t jj = 0; jj < we->nColLw; ++jj)
                 {
                     Node *nodj = we->volLwE->nodes[jj];
-                    T.push_back(Eigen::Triplet<double>(nodi->row, nodj->row, -Ale(ii, jj)));
+                    locT.push_back(Eigen::Triplet<double>(nodi->row, nodj->row, -Ale(ii, jj)));
                 }
             }
         });
@@ -354,14 +352,14 @@ void Newton::buildJac(Eigen::SparseMatrix<double, Eigen::RowMajor> &J)
             // Build K
             Eigen::MatrixXd Ae = KuttaResidual::buildGradientFlow(*ke, phi);
             // Assembly
-            tbb::spin_mutex::scoped_lock lock(mutex);
+            std::deque<Eigen::Triplet<double>> &locT = cmbT.local();
             for (size_t ii = 0; ii < ke->nRow; ++ii)
             {
                 Node *nodi = ke->teN[ii].second;
                 for (size_t jj = 0; jj < ke->nCol; ++jj)
                 {
                     Node *nodj = ke->volE->nodes[jj];
-                    T.push_back(Eigen::Triplet<double>(nodi->row, nodj->row, Ae(ii, jj)));
+                    locT.push_back(Eigen::Triplet<double>(nodi->row, nodj->row, Ae(ii, jj)));
                 }
             }
         });
@@ -369,6 +367,10 @@ void Newton::buildJac(Eigen::SparseMatrix<double, Eigen::RowMajor> &J)
     tms["1-Jkutta"].stop();
 
     // Build Jacobian matrix without BCs
+    std::deque<Eigen::Triplet<double>> T = cmbT.combine([&](std::deque<Eigen::Triplet<double>> x, std::deque<Eigen::Triplet<double>> const &y) {
+        x.insert(x.begin(), y.begin(), y.end());
+        return x;
+    });
     J.setFromTriplets(T.begin(), T.end());
 
     // Apply Dirichlet BCs