From 306ef085e2d6b588db9ff1853effa806e1f109a9 Mon Sep 17 00:00:00 2001
From: acrovato <170-acrovato@users.noreply.gitlab.uliege.be>
Date: Tue, 24 Mar 2020 14:59:06 +0100
Subject: [PATCH] Changed assembly strategy for Kutta. Added internal Timers to
 flow. Changed verbosity (bool to int)).

---
 flow/default.py          |   2 +
 flow/scripts/config.py   |   1 +
 flow/src/wAdjoint.cpp    |  18 ++--
 flow/src/wNewton.cpp     | 119 ++++++++++++-----------
 flow/src/wPicard.cpp     | 199 ++++++++++++++++++++-------------------
 flow/src/wSolver.cpp     |   2 +-
 flow/src/wSolver.h       |   6 +-
 tbox/src/wLinesearch.cpp |  28 +++---
 tbox/src/wLinesearch.h   |   2 +-
 tbox/src/wMshDeform.cpp  |  47 +++++----
 10 files changed, 224 insertions(+), 200 deletions(-)

diff --git a/flow/default.py b/flow/default.py
index 5c5df956..74d9ba10 100644
--- a/flow/default.py
+++ b/flow/default.py
@@ -28,6 +28,7 @@ from fwk.wutils import parseargs
 def initPicard(_solver):
     args = parseargs()
     _solver.nthreads = args.k
+    _solver.verbose = args.verb
     _solver.relTol = 1e-6
     _solver.absTol = 1e-8
     _solver.maxIt = 25
@@ -39,6 +40,7 @@ def initPicard(_solver):
 def initNewton(_solver):
     args = parseargs()
     _solver.nthreads = args.k
+    _solver.verbose = args.verb
     _solver.relTol = 1e-6
     _solver.absTol = 1e-8
     _solver.maxIt = 25
diff --git a/flow/scripts/config.py b/flow/scripts/config.py
index d9aabb04..6a5bd844 100644
--- a/flow/scripts/config.py
+++ b/flow/scripts/config.py
@@ -125,6 +125,7 @@ class Config(object):
             raise RuntimeError('Available nonlinear solver type: Picard or Newton, but ' + p['NSolver'] + ' was given!\n')
         args = parseargs()
         self.solver.nthreads = args.k
+        self.solver.verbose = args.verb
         self.solver.relTol = p['Rel_tol']
         self.solver.absTol = p['Abs_tol']
         self.solver.maxIt = p['Max_it']
diff --git a/flow/src/wAdjoint.cpp b/flow/src/wAdjoint.cpp
index 30413670..21171fdf 100644
--- a/flow/src/wAdjoint.cpp
+++ b/flow/src/wAdjoint.cpp
@@ -130,7 +130,7 @@ void Adjoint::buildDR(Eigen::SparseMatrix<double, Eigen::RowMajor> &dR)
 
     // List of triplets to build Jacobian matrix
     std::vector<Eigen::Triplet<double>> T;
-    T.reserve(sol->pbl->msh->nodes.size());
+    T.reserve(sol->pbl->msh->nodes.size() * sol->pbl->msh->nodes.size() / 1000);
 
     // Full Potential Equation with upwind bias: analytical derivatives
     auto fluid = sol->pbl->medium;
@@ -208,8 +208,6 @@ void Adjoint::buildDR(Eigen::SparseMatrix<double, Eigen::RowMajor> &dR)
             }
         }
     });
-    // Build Jacobian matrix without BCs
-    dR.setFromTriplets(T.begin(), T.end());
     // Apply wake BCs for adjoint problem
     for (auto wake : sol->pbl->wBCs)
     {
@@ -227,9 +225,11 @@ void Adjoint::buildDR(Eigen::SparseMatrix<double, Eigen::RowMajor> &dR)
                 for (size_t jj = 0; jj < we->nRow; ++jj)
                 {
                     Node *nodj = we->surUpE->nodes[jj];
-                    dR.coeffRef(nodi->row, nodj->row) += 2 * Kupup(ii, jj);
+                    T.push_back(Eigen::Triplet<double>(nodi->row, nodj->row, 2 * Kupup(ii, jj)));
+                    //dR.coeffRef(nodi->row, nodj->row) += 2 * Kupup(ii, jj);
                     nodj = we->surLwE->nodes[jj];
-                    dR.coeffRef(nodi->row, nodj->row) += Kuplw(ii, jj);
+                    T.push_back(Eigen::Triplet<double>(nodi->row, nodj->row, Kuplw(ii, jj)));
+                    //dR.coeffRef(nodi->row, nodj->row) += Kuplw(ii, jj);
                 }
             }
             for (size_t ii = 0; ii < we->nColLw; ++ii)
@@ -238,13 +238,17 @@ void Adjoint::buildDR(Eigen::SparseMatrix<double, Eigen::RowMajor> &dR)
                 for (size_t jj = 0; jj < we->nRow; ++jj)
                 {
                     Node *nodj = we->surUpE->nodes[jj];
-                    dR.coeffRef(nodi->row, nodj->row) -= 2 * Klwup(ii, jj);
+                    //dR.coeffRef(nodi->row, nodj->row) -= 2 * Klwup(ii, jj);
+                    T.push_back(Eigen::Triplet<double>(nodi->row, nodj->row, -2 * Klwup(ii, jj)));
                     nodj = we->surLwE->nodes[jj];
-                    dR.coeffRef(nodi->row, nodj->row) -= Klwlw(ii, jj);
+                    //dR.coeffRef(nodi->row, nodj->row) -= Klwlw(ii, jj);
+                    T.push_back(Eigen::Triplet<double>(nodi->row, nodj->row, -Klwlw(ii, jj)));
                 }
             }
         });
     }
+    // Build Jacobian matrix without BCs
+    dR.setFromTriplets(T.begin(), T.end());
     // Apply Dirichlet BCs
     for (auto dBC : sol->pbl->dBCs)
     {
diff --git a/flow/src/wNewton.cpp b/flow/src/wNewton.cpp
index c29c0a0e..5bf51037 100644
--- a/flow/src/wNewton.cpp
+++ b/flow/src/wNewton.cpp
@@ -173,11 +173,14 @@ bool Newton::run()
         buildJac(J);
 
         // Solve linear SoE J \DeltaPhi = R
+        tms["3-Slinsol"].start();
         mumps.setSoE(J, rPhi_);
         mumps.compute(dPhi_);
+        tms["3-Slinsol"].stop();
         nIt++;
 
         // Compute new solution and residual with linesearch
+        tms["4-Rlnsrch"].start();
         fpe.update(phi, dPhi_);
         try
         {
@@ -186,6 +189,7 @@ bool Newton::run()
         catch (...)
         {
         }
+        tms["4-Rlnsrch"].stop();
         absRes = rPhi_.norm();
         relRes = absRes / resInit;
         // Re-find best upwind element
@@ -226,20 +230,29 @@ bool Newton::run()
     // Check the solution
     if (relRes < relTol || absRes < absTol)
     {
-        std::cout << ANSI_COLOR_GREEN << "Newton solver converged!" << ANSI_COLOR_RESET << std::endl
-                  << std::endl;
+        std::cout << ANSI_COLOR_GREEN << "Newton solver converged!" << ANSI_COLOR_RESET << std::endl;
+        if (verbose > 0)
+            std::cout << "Newton solver CPU" << std::endl
+                      << tms;
+        std::cout << std::endl;
         return true;
     }
     else if (std::isnan(relRes))
     {
-        std::cout << ANSI_COLOR_RED << "Newton solver diverged!" << ANSI_COLOR_RESET << std::endl
-                  << std::endl;
+        std::cout << ANSI_COLOR_RED << "Newton solver diverged!" << ANSI_COLOR_RESET << std::endl;
+        if (verbose > 0)
+            std::cout << "Newton solver CPU" << std::endl
+                      << tms;
+        std::cout << std::endl;
         return false;
     }
     else
     {
-        std::cout << ANSI_COLOR_YELLOW << "Newton solver not fully converged!" << ANSI_COLOR_RESET << std::endl
-                  << std::endl;
+        std::cout << ANSI_COLOR_YELLOW << "Newton solver not fully converged!" << ANSI_COLOR_RESET << std::endl;
+        if (verbose > 0)
+            std::cout << "Newton solver CPU" << std::endl
+                      << tms;
+        std::cout << std::endl;
         return true;
     }
 }
@@ -253,11 +266,26 @@ void Newton::buildJac(Eigen::SparseMatrix<double, Eigen::RowMajor> &J)
     // Multithread
     tbb::spin_mutex mutex;
 
+    // List of rows to build Jacobian matrix
+    std::vector<int> rows(pbl->msh->nodes.size());
+    for (size_t i = 0; i < rows.size(); ++i)
+        rows[i] = pbl->msh->nodes[i]->row;
+    // Kutta condition step 1: equality of normal flux through the wake
+    // -> add the upper wake node rows to lower wake nodes rows and zero the upper wake node rows
+    // Nishida Thesis, pp. 29, Eq 2.17
+    // -> directly assemble upper rows on lower rows
+    // Galbraith paper
+    for (auto wake : pbl->wBCs)
+        tbb::parallel_do(wake->nodMap.begin(), wake->nodMap.end(), [&](std::pair<Node *, Node *> p) {
+            rows[p.first->row] = p.second->row;
+        });
+
     // List of triplets to build Jacobian matrix
     std::vector<Eigen::Triplet<double>> T;
-    T.reserve(pbl->msh->nodes.size());
+    T.reserve(pbl->msh->nodes.size() * pbl->msh->nodes.size() / 1000);
 
     // Full Potential Equation with upwind bias: analytical derivatives
+    tms["0-Jbase"].start();
     auto fluid = pbl->medium;
     tbb::parallel_do(fluid->adjMap.begin(), fluid->adjMap.end(), [&](std::pair<Element *, std::vector<Element *>> p) {
         // Current element
@@ -312,12 +340,12 @@ void Newton::buildJac(Eigen::SparseMatrix<double, Eigen::RowMajor> &J)
                 for (size_t jj = 0; jj < e->nodes.size(); ++jj)
                 {
                     Node *nodj = e->nodes[jj];
-                    T.push_back(Eigen::Triplet<double>(nodi->row, nodj->row, Ae4(ii, jj) + Ae5(ii, jj)));
+                    T.push_back(Eigen::Triplet<double>(rows[nodi->row], nodj->row, Ae4(ii, jj) + Ae5(ii, jj)));
                 }
                 for (size_t jj = 0; jj < eU->nodes.size(); ++jj)
                 {
                     Node *nodj = eU->nodes[jj];
-                    T.push_back(Eigen::Triplet<double>(nodi->row, nodj->row, Ae3(ii, jj)));
+                    T.push_back(Eigen::Triplet<double>(rows[nodi->row], nodj->row, Ae3(ii, jj)));
                 }
             }
         }
@@ -329,39 +357,16 @@ void Newton::buildJac(Eigen::SparseMatrix<double, Eigen::RowMajor> &J)
             for (size_t jj = 0; jj < e->nodes.size(); ++jj)
             {
                 Node *nodj = e->nodes[jj];
-                T.push_back(Eigen::Triplet<double>(nodi->row, nodj->row, Ae1(ii, jj) + Ae2(ii, jj)));
+                T.push_back(Eigen::Triplet<double>(rows[nodi->row], nodj->row, Ae1(ii, jj) + Ae2(ii, jj)));
             }
         }
     });
-    // Build Jacobian matrix without BCs
-    J.setFromTriplets(T.begin(), T.end());
+    tms["0-Jbase"].stop();
 
-    // Kutta condition (3 steps): analytical derivatives
-    // Egality of normal flux through the wake
-    // -> add the upper wake node lines to lower wake nodes line and zero the upper wake node lines
-    // Nishida Thesis, pp. 29, Eq 2.17
-    for (auto wake : pbl->wBCs)
-    {
-        for (auto p : wake->nodMap)
-        {
-            int idxUp = p.first->row;
-            int idxLw = p.second->row;
-            // first grab upper row values...
-            std::vector<Eigen::Triplet<double>> tUp;
-            tUp.reserve(pbl->msh->nodes.size() / 10);
-            for (Eigen::SparseMatrix<double, Eigen::RowMajor>::InnerIterator it(J, idxUp); it; ++it)
-            {
-                tUp.push_back(Eigen::Triplet<double>(it.row(), it.col(), it.value()));
-                it.valueRef() = 0.;
-            }
-            // ...then add them to lower row values
-            for (auto t : tUp)
-                J.coeffRef(idxLw, t.col()) += t.value();
-        }
-    }
-    // Egality of velocity norm through the wake
-    // -> add the Kutta equation on the upper wake node lines
+    // Kutta condition step 2: equality of velocity norm through the wake
+    // -> add the Kutta equation on the upper wake node rows
     // Nishida Thesis, pp. 29-30, Eq. 2.18
+    tms["1-Jkutta"].start();
     for (auto wake : pbl->wBCs)
     {
         tbb::parallel_do(wake->wEle.begin(), wake->wEle.end(), [&](WakeElement *we) {
@@ -376,18 +381,18 @@ void Newton::buildJac(Eigen::SparseMatrix<double, Eigen::RowMajor> &J)
                 for (size_t jj = 0; jj < we->nColUp; ++jj)
                 {
                     Node *nodj = we->volUpE->nodes[jj];
-                    J.coeffRef(nodi->row, nodj->row) += Aue(ii, jj);
+                    T.push_back(Eigen::Triplet<double>(nodi->row, nodj->row, Aue(ii, jj)));
                 }
                 for (size_t jj = 0; jj < we->nColLw; ++jj)
                 {
                     Node *nodj = we->volLwE->nodes[jj];
-                    J.coeffRef(nodi->row, nodj->row) -= Ale(ii, jj);
+                    T.push_back(Eigen::Triplet<double>(nodi->row, nodj->row, -Ale(ii, jj)));
                 }
             }
         });
     }
-    // Egality of velocity norm on the Trailing edge (Kutta)
-    // -> add the Kutta equation on the upper TE node lines
+    // Kutta condition step 3: equality of velocity norm on the Trailing edge (Kutta)
+    // -> add the Kutta equation on the upper TE node rows (only works for 2D cases)
     // Galbraith paper, pp. 7, Eq. 34
     for (auto kutta : pbl->kSCs)
     {
@@ -403,13 +408,18 @@ void Newton::buildJac(Eigen::SparseMatrix<double, Eigen::RowMajor> &J)
                 for (size_t jj = 0; jj < ke->nCol; ++jj)
                 {
                     Node *nodj = ke->volE->nodes[jj];
-                    J.coeffRef(nodi->row, nodj->row) += Ae(ii, jj);
+                    T.push_back(Eigen::Triplet<double>(nodi->row, nodj->row, Ae(ii, jj)));
                 }
             }
         });
     }
+    tms["1-Jkutta"].stop();
+
+    // Build Jacobian matrix without BCs
+    J.setFromTriplets(T.begin(), T.end());
 
     // Apply Dirichlet BCs
+    tms["2-Jdbc"].start();
     for (auto dBC : pbl->dBCs)
     {
         for (auto nod : dBC->nodes)
@@ -423,12 +433,13 @@ void Newton::buildJac(Eigen::SparseMatrix<double, Eigen::RowMajor> &J)
             }
         }
     }
+    tms["2-Jdbc"].stop();
 
     // Clean matrix and turn to compressed row format
     J.prune(0.);
     J.makeCompressed();
 
-    if (verbose)
+    if (verbose > 1)
         std::cout << "J (" << J.rows() << "," << J.cols() << ") nnz=" << J.nonZeros() << "\n";
 }
 
@@ -520,20 +531,19 @@ void Newton::buildRes(Eigen::Map<Eigen::VectorXd> &R)
     // Slip BCs are naturally enforced in FEM, hence not applied explicitely
 
     // Kutta condition (3 steps)
-    // Egality of normal flux through the wake
-    // -> add the upper wake node lines to lower wake nodes line and zero the upper wake node lines
+    // Equality of normal flux through the wake
+    // -> add the upper wake node rows to lower wake nodes rows and zero the upper wake node rows
     // Nishida Thesis, pp. 29, Eq 2.17
+    // here we do not use Galbraith method (as in J) as it would be slower (rows[] needs to be rebuilt!)
     for (auto wake : pbl->wBCs)
     {
         tbb::parallel_do(wake->nodMap.begin(), wake->nodMap.end(), [&](std::pair<Node *, Node *> p) {
-            int idxUp = p.first->row;
-            int idxLw = p.second->row;
-            R(idxLw) += R[idxUp];
-            R(idxUp) = 0.;
+            R(p.second->row) += R[p.first->row];
+            R(p.first->row) = 0.;
         });
     }
-    // Egality of velocity norm through the wake
-    // -> add the Kutta equation on the upper wake node lines
+    // Equality of velocity norm through the wake
+    // -> add the Kutta equation on the upper wake node rows
     // Nishida Thesis, pp. 29-30, Eq. 2.18
     for (auto wake : pbl->wBCs)
     {
@@ -551,10 +561,9 @@ void Newton::buildRes(Eigen::Map<Eigen::VectorXd> &R)
             }
         });
     }
-    // Egality of velocity norm on the Trailing edge (Kutta)
-    // -> add the Kutta equation on the upper TE node lines
+    // Equality of velocity norm on the Trailing edge (Kutta)
+    // -> add the Kutta equation on the upper TE node rows (only works for 2D cases)
     // Galbraith paper, pp. 7, Eq. 34
-    // Currently work only for Line2 element
     for (auto kutta : pbl->kSCs)
     {
         tbb::parallel_do(kutta->kEle.begin(), kutta->kEle.end(), [&](KuttaElement *ke) {
@@ -578,7 +587,7 @@ void Newton::buildRes(Eigen::Map<Eigen::VectorXd> &R)
             R(nod->row) = 0.;
     }
 
-    if (verbose)
+    if (verbose > 1)
         std::cout << "R (" << R.size() << ")\n";
 }
 
diff --git a/flow/src/wPicard.cpp b/flow/src/wPicard.cpp
index 3bf73578..1ff686b9 100644
--- a/flow/src/wPicard.cpp
+++ b/flow/src/wPicard.cpp
@@ -143,8 +143,10 @@ bool Picard::run()
         phiOld = phi_;
 
         // Solve linear SoE
+        tms["4-Slinsol"].start();
         mumps.setSoE(A, b_);
         mumps.compute(phi_);
+        tms["4-Slinsol"].start();
 
         // Relaxation
         phi_ = (1 - relax) * phiOld + relax * phi_;
@@ -159,20 +161,29 @@ bool Picard::run()
     // Check the solution
     if (relRes < relTol || absRes < absTol)
     {
-        std::cout << ANSI_COLOR_GREEN << "Picard solver converged!" << ANSI_COLOR_RESET << std::endl
-                  << std::endl;
+        std::cout << ANSI_COLOR_GREEN << "Picard solver converged!" << ANSI_COLOR_RESET << std::endl;
+        if (verbose > 0)
+            std::cout << "Picard solver CPU" << std::endl
+                      << tms;
+        std::cout << std::endl;
         return true;
     }
     else if (std::isnan(relRes))
     {
-        std::cout << ANSI_COLOR_RED << "Picard sovler diverged!" << ANSI_COLOR_RESET << std::endl
-                  << std::endl;
+        std::cout << ANSI_COLOR_RED << "Picard sovler diverged!" << ANSI_COLOR_RESET << std::endl;
+        if (verbose > 0)
+            std::cout << "Picard solver CPU" << std::endl
+                      << tms;
+        std::cout << std::endl;
         return false;
     }
     else
     {
-        std::cout << ANSI_COLOR_YELLOW << "Picard solver not fully converged!" << ANSI_COLOR_RESET << std::endl
-                  << std::endl;
+        std::cout << ANSI_COLOR_YELLOW << "Picard solver not fully converged!" << ANSI_COLOR_RESET << std::endl;
+        if (verbose > 0)
+            std::cout << "Picard solver CPU" << std::endl
+                      << tms;
+        std::cout << std::endl;
         return true;
     }
 }
@@ -186,10 +197,26 @@ void Picard::build(Eigen::SparseMatrix<double, Eigen::RowMajor> &A, std::vector<
     // Multithread
     tbb::spin_mutex mutex;
 
+    // List of rows to build Jacobian matrix
+    std::vector<int> rows(pbl->msh->nodes.size());
+    for (size_t i = 0; i < rows.size(); ++i)
+        rows[i] = pbl->msh->nodes[i]->row;
+    // Kutta condition step 1: equality of normal flux through the wake
+    // -> add the upper wake node rows to lower wake nodes rows and zero the upper wake node rows
+    // Nishida Thesis, pp. 29, Eq 2.17
+    // -> directly assemble upper rows on lower rows
+    // Galbraith paper
+    for (auto wake : pbl->wBCs)
+        tbb::parallel_do(wake->nodMap.begin(), wake->nodMap.end(), [&](std::pair<Node *, Node *> p) {
+            rows[p.first->row] = p.second->row;
+        });
+
     // List of triplets to build matrix
     std::vector<Eigen::Triplet<double>> T;
-    T.reserve(pbl->msh->nodes.size());
+    T.reserve(pbl->msh->nodes.size() * pbl->msh->nodes.size() / 1000);
 
+    // Full Potential Equation: analytical derivatives
+    tms["0-Abase"].start();
     auto fluid = pbl->medium;
     tbb::parallel_do(fluid->adjMap.begin(), fluid->adjMap.end(), [&](std::pair<Element *, std::vector<Element *>> p) {
         // Current element
@@ -207,29 +234,66 @@ void Picard::build(Eigen::SparseMatrix<double, Eigen::RowMajor> &A, std::vector<
             for (size_t jj = 0; jj < e->nodes.size(); ++jj)
             {
                 Node *nodj = e->nodes[jj];
-                T.push_back(Eigen::Triplet<double>(nodi->row, nodj->row, Ae(ii, jj)));
+                T.push_back(Eigen::Triplet<double>(rows[nodi->row], nodj->row, Ae(ii, jj)));
             }
         }
     });
-    // Build matrix without BCs
-    A.setFromTriplets(T.begin(), T.end());
+    tms["0-Abase"].stop();
 
-    // Apply Dirichlet BCs to A matrix
-    for (auto dBC : pbl->dBCs)
+    // Kutta condition step 2: equality of velocity norm through the wake
+    // -> add the Kutta equation on the upper wake node lines
+    // Nishida Thesis, pp. 29-30, Eq. 2.18
+    tms["1-Akutta"].start();
+    for (auto wake : pbl->wBCs)
     {
-        for (auto nod : dBC->nodes)
-        {
-            for (Eigen::SparseMatrix<double, Eigen::RowMajor>::InnerIterator it(A, nod->row); it; ++it)
+        tbb::parallel_do(wake->wEle.begin(), wake->wEle.end(), [&](WakeElement *we) {
+            // Build Ku, Kl
+            Eigen::MatrixXd Aue, Ale;
+            we->buildK(Aue, Ale, phi, pbl->alpha);
+            // Assembly
+            tbb::spin_mutex::scoped_lock lock(mutex);
+            for (size_t ii = 0; ii < we->nRow; ++ii)
             {
-                if (it.row() == it.col())
-                    it.valueRef() = 1.;
-                else
-                    it.valueRef() = 0.;
+                Node *nodi = we->wkN[ii].second;
+                for (size_t jj = 0; jj < we->nColUp; ++jj)
+                {
+                    Node *nodj = we->volUpE->nodes[jj];
+                    T.push_back(Eigen::Triplet<double>(nodi->row, nodj->row, Aue(ii, jj)));
+                }
+                for (size_t jj = 0; jj < we->nColLw; ++jj)
+                {
+                    Node *nodj = we->volLwE->nodes[jj];
+                    T.push_back(Eigen::Triplet<double>(nodi->row, nodj->row, -Ale(ii, jj)));
+                }
             }
-        }
+        });
+    }
+    // Kutta condition step 3: equality of velocity norm on the Trailing edge (Kutta)
+    // -> add the Kutta equation on the upper TE node lines (only works for 2D cases)
+    // Galbraith paper, pp. 7, Eq. 34
+    for (auto kutta : pbl->kSCs)
+    {
+        tbb::parallel_do(kutta->kEle.begin(), kutta->kEle.end(), [&](KuttaElement *ke) {
+            // Build K
+            Eigen::MatrixXd Ae;
+            ke->buildK(Ae, phi, pbl->alpha);
+            // Assembly
+            tbb::spin_mutex::scoped_lock lock(mutex);
+            for (size_t ii = 0; ii < ke->nRow; ++ii)
+            {
+                Node *nodi = ke->teN[ii].second;
+                for (size_t jj = 0; jj < ke->nCol; ++jj)
+                {
+                    Node *nodj = ke->volE->nodes[jj];
+                    T.push_back(Eigen::Triplet<double>(nodi->row, nodj->row, Ae(ii, jj)));
+                }
+            }
+        });
     }
+    tms["1-Akutta"].stop();
 
     // Fill vector b: loop on all Neumann boundaries (except slip condition which are naturally enforced)
+    tms["2-bnbc"].start();
     for (auto nBC : pbl->fBCs)
     {
         tbb::parallel_do(nBC->tag->elems.begin(), nBC->tag->elems.end(), [&](Element *e) {
@@ -242,7 +306,7 @@ void Picard::build(Eigen::SparseMatrix<double, Eigen::RowMajor> &A, std::vector<
             for (size_t ii = 0; ii < e->nodes.size(); ++ii)
             {
                 Node *nodi = e->nodes[ii];
-                b[nodi->row] += be(ii);
+                b[rows[nodi->row]] += be(ii);
             }
         });
     }
@@ -259,95 +323,38 @@ void Picard::build(Eigen::SparseMatrix<double, Eigen::RowMajor> &A, std::vector<
             for (size_t ii = 0; ii < p.first->nodes.size(); ++ii)
             {
                 Node *nodi = p.first->nodes[ii];
-                b[nodi->row] += be(ii);
+                b[rows[nodi->row]] += be(ii);
             }
         });
     }
+    tms["2-bnbc"].stop();
 
-    // Apply Dirichlet BCs to b vector
-    for (auto dBC : pbl->dBCs)
-        dBC->apply(b);
+    // Build matrix without BCs
+    A.setFromTriplets(T.begin(), T.end());
 
-    // Kutta condition (3 steps)
-    // Egality of normal flux through the wake
-    // -> add the upper wake node lines to lower wake nodes line and zero the upper wake node lines
-    // Nishida Thesis, pp. 29, Eq 2.17
-    for (auto wake : pbl->wBCs)
-    {
-        tbb::parallel_do(wake->nodMap.begin(), wake->nodMap.end(), [&](std::pair<Node *, Node *> p) {
-            int idxUp = p.first->row;
-            int idxLw = p.second->row;
-            // first grab upper row values...
-            std::vector<Eigen::Triplet<double>> tUp;
-            tUp.reserve(pbl->msh->nodes.size() / 10);
-            for (Eigen::SparseMatrix<double, Eigen::RowMajor>::InnerIterator it(A, idxUp); it; ++it)
-            {
-                tUp.push_back(Eigen::Triplet<double>(it.row(), it.col(), it.value()));
-                it.valueRef() = 0.;
-            }
-            // ...then add them to lower row values
-            for (auto t : tUp)
-                A.coeffRef(idxLw, t.col()) += t.value();
-            // same for rhs
-            b[idxLw] += b[idxUp];
-            b[idxUp] = 0.;
-        });
-    }
-    // Egality of velocity norm through the wake
-    // -> add the Kutta equation on the upper wake node lines
-    // Nishida Thesis, pp. 29-30, Eq. 2.18
-    for (auto wake : pbl->wBCs)
-    {
-        tbb::parallel_do(wake->wEle.begin(), wake->wEle.end(), [&](WakeElement *we) {
-            // Build Ku, Kl
-            Eigen::MatrixXd Aue, Ale;
-            we->buildK(Aue, Ale, phi, pbl->alpha);
-            // Assembly
-            tbb::spin_mutex::scoped_lock lock(mutex);
-            for (size_t ii = 0; ii < we->nRow; ++ii)
-            {
-                Node *nodi = we->wkN[ii].second;
-                for (size_t jj = 0; jj < we->volUpE->nodes.size(); ++jj)
-                {
-                    Node *nodj = we->volUpE->nodes[jj];
-                    A.coeffRef(nodi->row, nodj->row) += Aue(ii, jj);
-                }
-                for (size_t jj = 0; jj < we->volLwE->nodes.size(); ++jj)
-                {
-                    Node *nodj = we->volLwE->nodes[jj];
-                    A.coeffRef(nodi->row, nodj->row) -= Ale(ii, jj);
-                }
-            }
-        });
-    }
-    // Egality of velocity norm on the Trailing edge (Kutta)
-    // -> add the Kutta equation on the upper TE node lines
-    // Galbraith paper, pp. 7, Eq. 34
-    for (auto kutta : pbl->kSCs)
+    // Apply Dirichlet BCs to A matrix and b vector
+    tms["3-Abdbc"].start();
+    for (auto dBC : pbl->dBCs)
     {
-        tbb::parallel_do(kutta->kEle.begin(), kutta->kEle.end(), [&](KuttaElement *ke) {
-            // Build K
-            Eigen::MatrixXd Ae;
-            ke->buildK(Ae, phi, pbl->alpha);
-            // Assembly
-            tbb::spin_mutex::scoped_lock lock(mutex);
-            for (size_t ii = 0; ii < ke->nRow; ++ii)
+        for (auto nod : dBC->nodes)
+        {
+            for (Eigen::SparseMatrix<double, Eigen::RowMajor>::InnerIterator it(A, nod->row); it; ++it)
             {
-                Node *nodi = ke->teN[ii].second;
-                for (size_t jj = 0; jj < ke->nCol; ++jj)
-                {
-                    Node *nodj = ke->volE->nodes[jj];
-                    A.coeffRef(nodi->row, nodj->row) += Ae(ii, jj);
-                }
+                if (it.row() == it.col())
+                    it.valueRef() = 1.;
+                else
+                    it.valueRef() = 0.;
             }
-        });
+        }
+        dBC->apply(b);
     }
+    tms["3-Abdbc"].stop();
 
     // Clean matrix and turn to compressed row format
     A.prune(0.);
     A.makeCompressed();
 
-    if (verbose)
+    if (verbose > 1)
     {
         std::cout << "A (" << A.rows() << "," << A.cols() << ") nnz=" << A.nonZeros() << "\n";
         std::cout << "b (" << b.size() << ")\n";
diff --git a/flow/src/wSolver.cpp b/flow/src/wSolver.cpp
index daaa6d37..4791a1ad 100644
--- a/flow/src/wSolver.cpp
+++ b/flow/src/wSolver.cpp
@@ -64,7 +64,7 @@ Solver::Solver(std::shared_ptr<Problem> _pbl) : pbl(_pbl)
 
     // Default parameters
     nthreads = 1;
-    verbose = false;
+    verbose = 0;
     maxIt = 50;
     relTol = 1e-6;
     absTol = 1e-8;
diff --git a/flow/src/wSolver.h b/flow/src/wSolver.h
index d7415b3a..430b274c 100644
--- a/flow/src/wSolver.h
+++ b/flow/src/wSolver.h
@@ -19,6 +19,7 @@
 
 #include "flow.h"
 #include "wObject.h"
+#include "wTimers.h"
 #include "wProblem.h"
 
 #include <iostream>
@@ -41,7 +42,7 @@ public:
     double relTol; ///< relative tolerance on the residual
     double absTol; ///< absolute tolerance on the residual
     int maxIt;     ///< max number of iterations
-    bool verbose;  ///< display more info
+    int verbose;   ///< display more info
 
     int nIt;                  ///< number of iterations
     std::vector<double> phi;  ///< full potential
@@ -55,6 +56,9 @@ public:
     double Cd;                ///< drag coefficient
     double Cm;                ///< pitch moment coefficient (positive nose-up)
 
+protected:
+    fwk::Timers tms; ///< internal timers
+
 public:
     Solver(std::shared_ptr<Problem> _pbl);
     ~Solver();
diff --git a/tbox/src/wLinesearch.cpp b/tbox/src/wLinesearch.cpp
index 09d7d2a7..78f9a169 100644
--- a/tbox/src/wLinesearch.cpp
+++ b/tbox/src/wLinesearch.cpp
@@ -44,7 +44,7 @@ double RevFunction::eval(double alpha)
 
 Linesearch::Linesearch(LSFunction &_fct) : fct(_fct)
 {
-    verbose = false;
+    verbose = 0;
     maxLsIt = 1000;
     tol = 1e-10;
     fevalIt = 0;
@@ -114,7 +114,7 @@ double FleuryLS::run()
     double phit = fct.eval(h);
     fevalIt = 2;
 
-    if (verbose)
+    if (verbose > 2)
     {
         std::cout << "Starting Backeting \n";
         std::cout << "\t" << std::setprecision(16) << "phi1(" << 0 << ") = " << phi1 << '\n';
@@ -131,14 +131,14 @@ double FleuryLS::run()
         {
             if (h > hU)
             {
-                if (verbose)
+                if (verbose > 2)
                 {
                     std::cout << "Positive slope detected (in ascending bracketing #1)!\n";
                 }
 
                 if (revAllowed)
                 {
-                    if (verbose)
+                    if (verbose > 2)
                     {
                         std::cout << "Reverse Direction of Line Search (in ascending bracketing #1)!"
                                   << "\n";
@@ -149,7 +149,7 @@ double FleuryLS::run()
                 }
                 else
                 {
-                    if (verbose)
+                    if (verbose > 2)
                     {
                         std::cout << "Error in ascending bracketing #1:\n"
                                   << std::setprecision(16) << "a1=" << 0 << " a2=" << h << " a3 =" << 2.0 * h << '\n'
@@ -167,7 +167,7 @@ double FleuryLS::run()
             phit = fct.eval(2 * h);
             fevalIt++;
 
-            if (verbose)
+            if (verbose > 2)
             {
                 std::cout << "Ascending bracketing iteration # " << nB << ": \n";
                 std::cout << "\t"
@@ -188,14 +188,14 @@ double FleuryLS::run()
         {
             if (h < hL)
             {
-                if (verbose)
+                if (verbose > 2)
                 {
                     std::cout << "Positive slope detected (in descending bracketing #2)!\n";
                 }
 
                 if (revAllowed)
                 {
-                    if (verbose)
+                    if (verbose > 2)
                     {
                         std::cout << "Reverse Direction of Line Search (in descending bracketing #2) !"
                                   << "\n";
@@ -206,7 +206,7 @@ double FleuryLS::run()
                 }
                 else
                 {
-                    if (verbose)
+                    if (verbose > 2)
                     {
                         std::cout << "Error in descending bracketing #2:\n"
                                   << std::setprecision(16) << "a1=" << 0 << " a2=" << h << " a3 =" << 2.0 * h << '\n'
@@ -223,7 +223,7 @@ double FleuryLS::run()
             phi3 = phit;
             phit = fct.eval(h / 2);
             fevalIt++;
-            if (verbose)
+            if (verbose > 2)
             {
                 std::cout << "Descending bracketing iteration # " << nB << ": \n";
                 std::cout << "\t"
@@ -244,7 +244,7 @@ double FleuryLS::run()
     double a3 = 2 * h;
     double a4 = h * (4 * phi2 - 3 * phi1 - phi3) / (4 * phi2 - 2 * phi1 - 2 * phi3);
 
-    if (verbose)
+    if (verbose > 2)
     {
         std::cout << "Backeting succeeds in " << nB << " iterations : \n";
         std::cout << "\t" << std::setprecision(16) << "phi1(" << a1 << ") = " << phi1 << '\n';
@@ -279,7 +279,7 @@ double FleuryLS::run()
             throw std::runtime_error(msg.str());
         }
 
-        if (verbose)
+        if (verbose > 2)
         {
             std::cout << "Line search iteration # " << nLS << ": \n";
             std::cout << "\t" << std::setprecision(16) << "phi1(" << a1 << ") = " << phi1 << '\n';
@@ -322,7 +322,7 @@ double FleuryLS::run()
 
         A = (a2 - a3) * phi1 + (a3 - a1) * phi2 + (a1 - a2) * phi3;
 
-        if (verbose)
+        if (verbose > 2)
         {
             std::cout << "\t" << std::setprecision(16) << "|A| = " << fabs(A) << "><" << tol << '\n';
         }
@@ -340,7 +340,7 @@ double FleuryLS::run()
              (A);
     }
 
-    if (verbose)
+    if (verbose > 2)
     {
         std::cout << "Line search succeeds in " << nLS << " iterations : \n";
         std::cout << "\t" << std::setprecision(16) << "|A| = " << fabs(A) << '\n';
diff --git a/tbox/src/wLinesearch.h b/tbox/src/wLinesearch.h
index 86daeab3..729fc26f 100644
--- a/tbox/src/wLinesearch.h
+++ b/tbox/src/wLinesearch.h
@@ -56,7 +56,7 @@ protected:
     LSFunction &fct; ///< function
     size_t maxLsIt;  ///< max. number of iterations for linesearch
     double tol;      ///< tolerance for linesearch
-    bool verbose;    ///< display more info
+    int verbose;     ///< display more info
 
 public:
     size_t fevalIt; ///< function evaluation count
diff --git a/tbox/src/wMshDeform.cpp b/tbox/src/wMshDeform.cpp
index e0b70a36..dbc7a81a 100644
--- a/tbox/src/wMshDeform.cpp
+++ b/tbox/src/wMshDeform.cpp
@@ -236,13 +236,26 @@ void MshDeform::deform()
     //    tbb::task_scheduler_init init(nthreads);
     //    tbb::spin_mutex mutex;
 
+    // List of rows to build stiffness matrix
+    std::vector<int> rows(nDim * mshSize);
+    for (size_t i = 0; i < mshSize; ++i)
+        for (size_t m = 0; m < nDim; ++m)
+            rows[nDim * i + m] = nDim * msh->nodes[i]->row + m;
+    // Periodic boundary condition step 1
+    // -> for each node pair(a, b): node a contributions will be added to node b rows
+    //    tbb::parallel_do(wake->nodMap.begin(), wake->nodMap.end(), [&](std::pair<Node *, Node *> p) {
+    for (auto p : intBndPair)
+        for (size_t m = 0; m < nDim; ++m)
+            rows[nDim * (p.first->row) + m] = nDim * (p.second->row) + m;
+    //    });
+
     // Stiffness matrix
     Eigen::SparseMatrix<double, Eigen::RowMajor> K(nDim * mshSize, nDim * mshSize);
     // List of triplets to build stifness matrix
     std::vector<Eigen::Triplet<double>> T;
-    T.reserve(nDim * mshSize);
+    T.reserve(nDim * mshSize * nDim * mshSize / 1000);
     // force (RHS) and displacement (UKN) vectors
-    Eigen::VectorXd f = Eigen::VectorXd::Zero(nDim * mshSize), q= Eigen::VectorXd::Zero(nDim * mshSize);
+    Eigen::VectorXd f = Eigen::VectorXd::Zero(nDim * mshSize), q = Eigen::VectorXd::Zero(nDim * mshSize);
 
     // Build stiffness matrix
     std::cout << "--- Deforming mesh using linear elasticiy equations ---\n"
@@ -301,39 +314,23 @@ void MshDeform::deform()
                     for (int n = 0; n < nDim; ++n)
                     {
                         size_t rowj = nDim * (e->nodes[j]->row) + n;
-                        T.push_back(Eigen::Triplet<double>(rowi, rowj, Ke(nDim * i + m, nDim * j + n)));
+                        T.push_back(Eigen::Triplet<double>(rows[rowi], rowj, Ke(nDim * i + m, nDim * j + n)));
                     }
                 }
             }
         }
     }
     //    });
-    // Build matrix without BCs
-    K.setFromTriplets(T.begin(), T.end());
-    // Apply "periodic condition" on internal boundaries
-    // -> for each node pair (a,b)
+    // Periodic boundary condition step 2
+    // -> for each node pair(a, b): write "pos_a - pos_b" = 0 on row a
     for (auto p : intBndPair)
-    {
         for (int m = 0; m < nDim; ++m)
         {
-            size_t row0 = nDim * (p.first->row) + m;
-            size_t row1 = nDim * (p.second->row) + m;
-            // first grab "a" row values...
-            std::vector<Eigen::Triplet<double>> tUp;
-            tUp.reserve(nDim * mshSize / 10);
-            for (Eigen::SparseMatrix<double, Eigen::RowMajor>::InnerIterator it(K, row0); it; ++it)
-            {
-                tUp.push_back(Eigen::Triplet<double>(it.row(), it.col(), it.value()));
-                it.valueRef() = 0.;
-            }
-            // ...then add them to "b" row values
-            for (auto t : tUp)
-                K.coeffRef(row1, t.col()) += t.value();
-            // write equation on "a" line (pos"a" = pos"b")
-            K.coeffRef(row0, row0) = 1.0;
-            K.coeffRef(row0, row1) = -1.0;
+            T.push_back(Eigen::Triplet<double>(nDim * (p.first->row) + m, nDim * (p.first->row) + m, 1.));
+            T.push_back(Eigen::Triplet<double>(nDim * (p.first->row) + m, nDim * (p.second->row) + m, -1.));
         }
-    }
+    // Build matrix without BCs
+    K.setFromTriplets(T.begin(), T.end());
     // Apply Dirichlet condition on fixed boundaries
     for (size_t i = 0; i < fxdBndNodes.size(); i++)
     {
-- 
GitLab