diff --git a/src/core/common/include/context.h b/src/core/common/include/context.h
index f2377bde..bde85fa0 100644
--- a/src/core/common/include/context.h
+++ b/src/core/common/include/context.h
@@ -1,70 +1,143 @@
 #pragma once
 
+#include <vector_types.h>
+
+#include <array>
+#include <map>
 #include <memory>
 #include <string>
 #include <vector>
-#include <array>
-#include <vector_types.h>
 
 #include "common/include/md_types.h"
 #include "common/include/nonbonded_14_mode.h"
+#include "common/include/vdw_rules.h"
+#include "host_device_buffer.h"
 
 class Context {
    public:
-
     static Context& instance() {
         static Context ctx;
         return ctx;
     }
-
     /* =============================================
-     * == GENERAL
+     * == CONFIG
      * =============================================
      */
 
-    int n_atoms = 0;
-    int n_atoms_solute = 0;
+    std::string base_folder;
+    bool run_gpu = false;
+
+    int n_atoms = 0;         // the total number of atoms
+    int n_atoms_solute = 0;  // the total number of solute number, in our system [0, n_atoms_solute) are solute, [n_atoms_solute, n_atoms) are water atoms
     int n_patoms = 0;
     int n_qatoms = 0;
     int n_waters = 0;
     int n_molecules = 0;
-
-    std::string base_folder;
     double dt = 0.0;
     double tau_T = 0.0;
+    md_t md;
 
-    bool run_gpu = false;
+    /*
+    
+    */
+    std::unique_ptr<HostDeviceBuffer<coord_t>> coords;
+    std::unique_ptr<HostDeviceBuffer<vel_t>> velocities;
+    std::unique_ptr<HostDeviceBuffer<dvel_t>> dvelocities;
 
-    /* =============================================
-     * == FROM MD FILE
-     * =============================================
-     */
 
-    md_t md;
-    bool separate_scaling = false;
-
-    /* =============================================
-     * == FROM TOPOLOGY FILE
-     * =============================================
-     */
+    /*
+    */
+    int n_angles = 0;
+    int n_angles_solute = 0;
+    int n_cangles = 0;
+    std::unique_ptr<HostDeviceBuffer<angle_t>> angles;
+    std::unique_ptr<HostDeviceBuffer<cangle_t>> cangles;
 
-    int n_coords = 0;
     int n_bonds = 0;
     int n_bonds_solute = 0;
     int n_cbonds = 0;
-    int n_angles = 0;
-    int n_angles_solute = 0;
-    int n_cangles = 0;
+    std::unique_ptr<HostDeviceBuffer<bond_t>> bonds;
+    std::unique_ptr<HostDeviceBuffer<cbond_t>> cbonds;
     int n_torsions = 0;
     int n_torsions_solute = 0;
     int n_ctorsions = 0;
+    std::unique_ptr<HostDeviceBuffer<torsion_t>> torsions;
+    std::unique_ptr<HostDeviceBuffer<ctorsion_t>> ctorsions;
     int n_impropers = 0;
     int n_impropers_solute = 0;
     int n_cimpropers = 0;
+    std::unique_ptr<HostDeviceBuffer<improper_t>> impropers;
+    std::unique_ptr<HostDeviceBuffer<cimproper_t>> cimpropers;
+
+    int n_restrspos = 0;
+    std::unique_ptr<HostDeviceBuffer<restrpos_t>> restrspos;
+
+    int n_restrangs = 0;
+    std::unique_ptr<HostDeviceBuffer<restrang_t>> restrangs;
+    
+
+    int n_restrdists = 0;
+    std::unique_ptr<HostDeviceBuffer<restrdis_t>> restrdists;
+
+    int n_restrseqs = 0;
+    std::unique_ptr<HostDeviceBuffer<restrseq_t>> restrseqs;
+    int n_restrwalls = 0;
+    std::unique_ptr<HostDeviceBuffer<restrwall_t>> restrwalls;
+
+    /*
+    Atom Info
+    */
     int n_charges = 0;
     int n_ccharges = 0;
+    std::unique_ptr<HostDeviceBuffer<charge_t>> charges;
+    std::unique_ptr<HostDeviceBuffer<ccharge_t>> ccharges;
     int n_atypes = 0;
     int n_catypes = 0;
+    std::unique_ptr<HostDeviceBuffer<atype_t>> atypes;
+    std::unique_ptr<HostDeviceBuffer<catype_t>> catypes;
+
+    std::unique_ptr<HostDeviceBuffer<bool>> heavy;
+    std::unique_ptr<HostDeviceBuffer<coord_t>> coords_init;
+
+    std::unique_ptr<HostDeviceBuffer<bool>> excluded;
+
+    std::unique_ptr<HostDeviceBuffer<double>> winv;
+
+    std::unique_ptr<HostDeviceBuffer<bool>> shell;
+    /*
+    Pair
+    */
+    std::unique_ptr<HostDeviceBuffer<int>> LJ_matrix;
+
+    /*
+    Shake
+    */
+    int n_shake_constraints = 0;
+    std::unique_ptr<HostDeviceBuffer<int>> mol_n_shakes;
+    std::unique_ptr<HostDeviceBuffer<shake_bond_t>> shake_bonds;
+    std::unique_ptr<HostDeviceBuffer<coord_t>> xcoords; // todo: It's just a temporary variables...
+    /*
+    Water
+    */
+
+    std::unique_ptr<HostDeviceBuffer<shell_t>> wshells;
+
+
+
+    /*
+    FEP
+    */
+    std::unique_ptr<HostDeviceBuffer<double>> lambdas; // Actually length is only 2..
+
+    /*
+    Energy
+    */
+
+    std::unique_ptr<HostDeviceBuffer<E_restraint_t>> EQ_restraint;
+
+    /*
+    */
+
     int n_ngbrs23 = 0;
     int n_ngbrs14 = 0;
     int n_excluded = 0;
@@ -72,30 +145,35 @@ class Context {
     int n_cgrps_solvent = 0;
     int iuse_switch_atom = 0;
 
-    std::vector<coord_t> coords_top;
-    std::vector<bond_t> bonds;
-    std::vector<cbond_t> cbonds;
-    std::vector<angle_t> angles;
-    std::vector<cangle_t> cangles;
-    std::vector<torsion_t> torsions;
-    std::vector<ctorsion_t> ctorsions;
-    std::vector<improper_t> impropers;
-    std::vector<cimproper_t> cimpropers;
-    std::vector<charge_t> charges;
-    std::vector<ccharge_t> ccharges;
-    std::vector<atype_t> atypes;
-    std::vector<catype_t> catypes;
     std::vector<int> atom_to_qi;
-    std::vector<ccharge_t> unified_ccharges;
-    std::vector<catype_t> unified_catypes;
-    std::vector<int> LJ_matrix;
+    std::unique_ptr<HostDeviceBuffer<ccharge_t>> unified_ccharges;
+    std::unique_ptr<HostDeviceBuffer<catype_t>> unified_catypes;
+    std::unique_ptr<HostDeviceBuffer<int3>> ngbrs_14;
+    std::vector<int3> ngbrs_14_builder;
+
+    std::unique_ptr<HostDeviceBuffer<int>> p_atoms_list;
+    std::unique_ptr<HostDeviceBuffer<int>> w_atoms_list;
+    std::unique_ptr<HostDeviceBuffer<int>> q_atoms_list;
+
+    std::unique_ptr<HostDeviceBuffer<ccharge_t>> charge_table_all;
+    std::unique_ptr<HostDeviceBuffer<double>> charge_pair_products;
+    std::unique_ptr<HostDeviceBuffer<int>> p_charge_types;
+    std::unique_ptr<HostDeviceBuffer<int>> w_charge_types;
+    std::unique_ptr<HostDeviceBuffer<int>> q_charge_types;
+
+    std::unique_ptr<HostDeviceBuffer<catype_t>> catype_table_all;
+    std::unique_ptr<HostDeviceBuffer<vdw_pair_param_t>> catype_pair_params;
+    std::unique_ptr<HostDeviceBuffer<int>> p_catype_types;
+    std::unique_ptr<HostDeviceBuffer<int>> w_catype_types;
+    std::unique_ptr<HostDeviceBuffer<int>> q_catype_types;
+
+    std::map<std::array<double, 4>, int> catype_to_type_host;
+    int n_charge_types = 0;
+    int zero_charge_type = -1;
+    int n_catype_types = 0;
+    int zero_catype_type = -1;
 
-    std::vector<int3> ngbrs_14;
-
-    std::unique_ptr<bool[]> excluded;
-    std::unique_ptr<bool[]> heavy;
     std::vector<int> molecules;
-    std::vector<double> winv;
     std::vector<cgrp_t> charge_groups;
 
     topo_t topo = {};
@@ -106,7 +184,6 @@ class Context {
      */
 
     int n_lambdas = 0;
-    std::vector<double> lambdas;
 
     int n_qangcouples = 0;
     int n_qangles = 0;
@@ -143,7 +220,7 @@ class Context {
     std::vector<atype_t> q_atypes;
     std::vector<bond_t> q_bonds;
     std::vector<ccharge_t> q_charges;
-    std::vector<q_elscale_t> q_elscales;
+    std::unique_ptr<HostDeviceBuffer<q_elscale_t>> q_elscales;
     std::vector<q_exclpair_t> q_exclpairs;
     std::vector<q_improper_t> q_impropers;
     std::vector<q_shake_t> q_shakes;
@@ -155,20 +232,6 @@ class Context {
      * =============================================
      */
 
-    int n_restrseqs = 0;
-    int n_restrspos = 0;
-    int n_restrdists = 0;
-    int n_restrangs = 0;
-    int n_restrwalls = 0;
-
-    std::vector<restrseq_t> restrseqs;
-    std::vector<restrpos_t> restrspos;
-    std::vector<restrdis_t> restrdists;
-    std::vector<restrang_t> restrangs;
-    std::vector<restrwall_t> restrwalls;
-
-    std::unique_ptr<bool[]> shell;
-
     /* =============================================
      * == SHELLS / SOLVENT
      * =============================================
@@ -185,16 +248,12 @@ class Context {
     int n_shells = 0;
     std::vector<std::vector<int>> list_sh;
     std::vector<std::vector<int>> nsort;
-    std::vector<shell_t> wshells;
 
     /* =============================================
      * == SHAKE
      * =============================================
      */
 
-    int n_shake_constraints = 0;
-    std::vector<int> mol_n_shakes;
-    std::vector<shake_bond_t> shake_bonds;
 
     /* =============================================
      * == CALCULATED IN THE INTEGRATION
@@ -202,10 +261,6 @@ class Context {
      */
 
     std::vector<int> p_atoms;
-    std::vector<coord_t> coords;
-    std::vector<coord_t> xcoords;
-    std::vector<vel_t> velocities;
-    std::vector<dvel_t> dvelocities;
 
     energy_t E_total = {};
     std::vector<energy_t> EQ_total;
@@ -225,7 +280,6 @@ class Context {
     std::vector<E_nonbonded_t> EQ_nonbond_qx;
 
     E_restraint_t E_restraint = {};
-    std::vector<E_restraint_t> EQ_restraint;
 
     double Temp = 0.0;
     double Tfree = 0.0;
@@ -244,6 +298,7 @@ class Context {
      * =============================================
      */
 
+    bool separate_scaling = false;
     double Ndegf = 0.0;
     double Ndegfree = 0.0;
     double Ndegf_solvent = 0.0;
@@ -270,7 +325,7 @@ class Context {
     }
 
     const ccharge_t& unified_ccharge_by_code(int code) const {
-        return unified_ccharges[code - 1];
+        return unified_ccharges->cpu_data_p[code - 1];
     }
 
     const ccharge_t& unified_ccharge(int atom_idx, int state) const {
@@ -282,21 +337,27 @@ class Context {
     }
 
     const catype_t& unified_catype_by_code(int code) const {
-        return unified_catypes[code - 1];
+        return unified_catypes->cpu_data_p[code - 1];
     }
 
     const catype_t& unified_catype(int atom_idx, int state) const {
         return unified_catype_by_code(unified_atype_code(atom_idx, state));
     }
 
+    void cuda_initialize_helpers();
+    void cuda_free_helpers();
+    void cuda_reset_energies();
+    void cuda_sync_all_to_device();
+
    private:
     Context() = default;
     ~Context() {}
 
+    void cuda_initialize_atom_lists_host();
+    void cuda_initialize_ngbrs14_host();
+    void cuda_initialize_charge_tables_host();
+    void cuda_initialize_catype_tables_host();
+
     Context(const Context&) = delete;
     Context& operator=(const Context&) = delete;
-
-
-
-
 };
diff --git a/src/core/common/include/cuda_runtime_utility.h b/src/core/common/include/cuda_runtime_utility.h
new file mode 100644
index 00000000..ea019882
--- /dev/null
+++ b/src/core/common/include/cuda_runtime_utility.h
@@ -0,0 +1,17 @@
+#pragma once
+
+#include <cuda_runtime.h>
+
+#include <cstdio>
+#include <cstdlib>
+
+inline void check_cuda(cudaError_t status) {
+    if (status != cudaSuccess) {
+        std::printf(">>> FATAL: CUDA call failed with error code %d: %s\n", status, cudaGetErrorString(status));
+        std::exit(EXIT_FAILURE);
+    }
+}
+
+inline void check_cudaMalloc(void** dev_ptr, size_t size) {
+    check_cuda(cudaMalloc(dev_ptr, size));
+}
diff --git a/src/core/common/include/host_device_buffer.h b/src/core/common/include/host_device_buffer.h
new file mode 100644
index 00000000..d0590f8c
--- /dev/null
+++ b/src/core/common/include/host_device_buffer.h
@@ -0,0 +1,149 @@
+#pragma once
+
+#include <cstddef>
+#include <cstring>
+#include <stdexcept>
+
+#include "cuda_runtime_utility.h"
+
+template <typename T>
+class HostDeviceBuffer {
+   public:
+    size_t length = 0;       // number of elements
+    bool cpu_mem_flag = true;  // whether host memory exists
+    bool gpu_mem_flag = true;  // whether device memory exists
+
+    T* cpu_data_p = nullptr;  // CPU pointer
+    T* gpu_data_p = nullptr;  // GPU pointer
+
+    HostDeviceBuffer(int length, bool cpu_mem_flag = true, bool gpu_mem_flag = true);
+    HostDeviceBuffer(unsigned int length, bool cpu_mem_flag = true, bool gpu_mem_flag = true);
+    HostDeviceBuffer(size_t length, bool cpu_mem_flag = true, bool gpu_mem_flag = true);
+
+    ~HostDeviceBuffer();
+
+    void allocate();
+    void deallocate();
+
+    // copy CPU -> GPU
+    void upload(T* buffer = nullptr);
+
+    // copy GPU -> CPU
+    void download(T* buffer = nullptr);
+};
+
+template <typename T>
+HostDeviceBuffer<T>::HostDeviceBuffer(int length, bool cpu_mem_flag, bool gpu_mem_flag)
+    : length(length), cpu_mem_flag(cpu_mem_flag), gpu_mem_flag(gpu_mem_flag) {
+    allocate();
+}
+
+template <typename T>
+HostDeviceBuffer<T>::HostDeviceBuffer(unsigned int length, bool cpu_mem_flag, bool gpu_mem_flag)
+    : length(length), cpu_mem_flag(cpu_mem_flag), gpu_mem_flag(gpu_mem_flag) {
+    allocate();
+}
+
+template <typename T>
+HostDeviceBuffer<T>::HostDeviceBuffer(size_t length, bool cpu_mem_flag, bool gpu_mem_flag)
+    : length(length), cpu_mem_flag(cpu_mem_flag), gpu_mem_flag(gpu_mem_flag) {
+    allocate();
+}
+
+template <typename T>
+HostDeviceBuffer<T>::~HostDeviceBuffer() {
+    deallocate();
+}
+
+template <typename T>
+void HostDeviceBuffer<T>::allocate() {
+    if (length == 0) {
+        return;
+    }
+
+    const size_t bytes = length * sizeof(T);
+    if (cpu_mem_flag) {
+        cpu_data_p = new T[length];
+        std::memset(cpu_data_p, 0, bytes);
+    }
+
+    if (gpu_mem_flag) {
+        check_cudaMalloc((void**)&gpu_data_p, bytes);
+        check_cuda(cudaMemset(gpu_data_p, 0, bytes));
+    }
+}
+
+template <typename T>
+void HostDeviceBuffer<T>::deallocate() {
+    if (cpu_data_p != nullptr) {
+        delete[] cpu_data_p;
+        cpu_data_p = nullptr;
+    }
+
+    if (gpu_data_p != nullptr) {
+        check_cuda(cudaFree(gpu_data_p));
+        gpu_data_p = nullptr;
+    }
+
+    length = 0;
+}
+
+//---------------------------------------------------------------------------------------------
+// Upload: CPU -> GPU
+//
+// If buffer != nullptr, copy from buffer to device.
+// Otherwise copy from internal cpu_data_p.
+//
+// Requires device memory.
+//---------------------------------------------------------------------------------------------
+template <typename T>
+void HostDeviceBuffer<T>::upload(T* buffer) {
+    if (length == 0) {
+        return;
+    }
+
+    if (gpu_data_p == nullptr) {
+        throw std::runtime_error("HostDeviceBuffer::upload failed: GPU memory is not allocated.");
+    }
+
+    T* src = buffer;
+    if (src == nullptr) {
+        src = cpu_data_p;
+    }
+
+    if (src == nullptr) {
+        throw std::runtime_error("HostDeviceBuffer::upload failed: no CPU source buffer available.");
+    }
+
+    check_cuda(cudaMemcpy(gpu_data_p, src, length * sizeof(T), cudaMemcpyHostToDevice));
+}
+
+//---------------------------------------------------------------------------------------------
+// Download: GPU -> CPU
+//
+// If buffer != nullptr, copy into buffer.
+// Otherwise copy into internal cpu_data_p.
+//
+// Requires device memory.
+//---------------------------------------------------------------------------------------------
+template <typename T>
+void HostDeviceBuffer<T>::download(T* buffer) {
+    if (length == 0) {
+        return;
+    }
+
+    if (gpu_data_p == nullptr) {
+        throw std::runtime_error("HostDeviceBuffer::download failed: GPU memory is not allocated.");
+    }
+
+    T* dst = buffer;
+    if (dst == nullptr) {
+        dst = cpu_data_p;
+    }
+
+    if (dst == nullptr) {
+        throw std::runtime_error("HostDeviceBuffer::download failed: no CPU destination buffer available.");
+    }
+
+    check_cuda(cudaMemcpy(dst, gpu_data_p, length * sizeof(T), cudaMemcpyDeviceToHost));
+}
diff --git a/src/core/common/include/host_device_memory_info.h b/src/core/common/include/host_device_memory_info.h
new file mode 100644
index 00000000..8848341d
--- /dev/null
+++ b/src/core/common/include/host_device_memory_info.h
@@ -0,0 +1,16 @@
+#pragma once
+class HostDeviceMemoryInfo {
+   public:
+    long long total_cpu_memory;
+    long long total_gpu_memory;
+
+    static HostDeviceMemoryInfo& Instance() {
+        static HostDeviceMemoryInfo instance;
+        return instance;
+    }
+
+   protected:
+    HostDeviceMemoryInfo() {
+        total_cpu_memory = total_gpu_memory = 0;
+    }
+};
\ No newline at end of file
diff --git a/src/core/common/include/init.h b/src/core/common/include/init.h
index 23a62a14..9c1e9950 100644
--- a/src/core/common/include/init.h
+++ b/src/core/common/include/init.h
@@ -64,5 +64,4 @@ void clean_variables();
 // void write_energies(int iteration);
 
 void calc_integration();
-void calc_integration_step(int iteration);
 
diff --git a/src/core/common/src/handler.cpp b/src/core/common/src/handler.cpp
index 85a68b13..3fdd1341 100644
--- a/src/core/common/src/handler.cpp
+++ b/src/core/common/src/handler.cpp
@@ -44,8 +44,10 @@ void Handler::run(int num_iterations) {
 
 void Handler::update_energy_totals() {
     auto& host = Context::instance();
+    auto *lambdas = host.lambdas->cpu_data_p;
+    auto *EQ_restraint = host.EQ_restraint->cpu_data_p;
     for (int state = 0; state < host.n_lambdas; state++) {
-        if (host.lambdas[state] == 0) {
+        if (lambdas[state] == 0) {
             host.EQ_bond[state].Uangle = 0;
             host.EQ_bond[state].Ubond = 0;
             host.EQ_bond[state].Utor = 0;
@@ -56,24 +58,24 @@ void Handler::update_energy_totals() {
             host.EQ_nonbond_qp[state].Uvdw = 0;
             host.EQ_nonbond_qw[state].Ucoul = 0;
             host.EQ_nonbond_qw[state].Uvdw = 0;
-            host.EQ_restraint[state].Urestr = 0;
+            EQ_restraint[state].Urestr = 0;
         }
 
         host.EQ_nonbond_qx[state].Ucoul = host.EQ_nonbond_qq[state].Ucoul + host.EQ_nonbond_qp[state].Ucoul + host.EQ_nonbond_qw[state].Ucoul;
         host.EQ_nonbond_qx[state].Uvdw = host.EQ_nonbond_qq[state].Uvdw + host.EQ_nonbond_qp[state].Uvdw + host.EQ_nonbond_qw[state].Uvdw;
 
         host.EQ_total[state].Utot = host.EQ_bond[state].Ubond + host.EQ_bond[state].Uangle + host.EQ_bond[state].Utor + host.EQ_bond[state].Uimp +
-                                    host.EQ_nonbond_qx[state].Ucoul + host.EQ_nonbond_qx[state].Uvdw + host.EQ_restraint[state].Urestr;
+                                    host.EQ_nonbond_qx[state].Ucoul + host.EQ_nonbond_qx[state].Uvdw + EQ_restraint[state].Urestr;
 
-        host.E_bond_q.Ubond += host.EQ_bond[state].Ubond * host.lambdas[state];
-        host.E_bond_q.Uangle += host.EQ_bond[state].Uangle * host.lambdas[state];
-        host.E_bond_q.Utor += host.EQ_bond[state].Utor * host.lambdas[state];
-        host.E_bond_q.Uimp += host.EQ_bond[state].Uimp * host.lambdas[state];
-        host.E_nonbond_qx.Ucoul += host.EQ_nonbond_qx[state].Ucoul * host.lambdas[state];
-        host.E_nonbond_qx.Uvdw += host.EQ_nonbond_qx[state].Uvdw * host.lambdas[state];
+        host.E_bond_q.Ubond += host.EQ_bond[state].Ubond * lambdas[state];
+        host.E_bond_q.Uangle += host.EQ_bond[state].Uangle * lambdas[state];
+        host.E_bond_q.Utor += host.EQ_bond[state].Utor * lambdas[state];
+        host.E_bond_q.Uimp += host.EQ_bond[state].Uimp * lambdas[state];
+        host.E_nonbond_qx.Ucoul += host.EQ_nonbond_qx[state].Ucoul * lambdas[state];
+        host.E_nonbond_qx.Uvdw += host.EQ_nonbond_qx[state].Uvdw * lambdas[state];
 
         // Update protein restraint energies with an average of all states
-        host.E_restraint.Upres += host.EQ_restraint[state].Urestr * host.lambdas[state];
+        host.E_restraint.Upres += EQ_restraint[state].Urestr * lambdas[state];
     }
 
     // Update totals
@@ -94,6 +96,8 @@ void Handler::print_outputs(int iteration) {
 
 void Handler::reset_energies() {
     auto& host = Context::instance();
+    auto &dvelocities = host.dvelocities->cpu_data_p;
+    auto *EQ_restraint = host.EQ_restraint->cpu_data_p;
     host.E_total.Upot = 0;
     host.E_bond_p.Uangle = 0;
     host.E_bond_p.Ubond = 0;
@@ -121,7 +125,8 @@ void Handler::reset_energies() {
     host.E_restraint.Ushell = 0;
     host.E_restraint.Upres = 0;
     host.E_restraint.Urestr = 0;
-    for (auto& dvel : host.dvelocities) {
+    for (int i = 0; i < host.n_atoms; i++) {
+        auto& dvel = dvelocities[i];
         dvel.x = 0;
         dvel.y = 0;
         dvel.z = 0;
@@ -140,6 +145,6 @@ void Handler::reset_energies() {
         host.EQ_nonbond_qx[state].Ucoul = 0;
         host.EQ_nonbond_qx[state].Uvdw = 0;
         host.EQ_total[state].Utot = 0;
-        host.EQ_restraint[state].Urestr = 0;
+        EQ_restraint[state].Urestr = 0;
     }
 }
diff --git a/src/core/common/src/init.cpp b/src/core/common/src/init.cpp
index 0fc9cdb2..0b48cc87 100644
--- a/src/core/common/src/init.cpp
+++ b/src/core/common/src/init.cpp
@@ -17,36 +17,38 @@
 
 static void finalize_ngbrs14_host() {
     auto& ctx = Context::instance();
+    auto &LJ_matrix = ctx.LJ_matrix->cpu_data_p;
+    auto &ngbrs_14 = ctx.ngbrs_14_builder;
 
-    for (auto& pair : ctx.ngbrs_14) {
+    for (auto& pair : ngbrs_14) {
         if (pair.x > pair.y) {
             std::swap(pair.x, pair.y);
         }
     }
 
-    ctx.ngbrs_14.erase(
-        std::remove_if(ctx.ngbrs_14.begin(), ctx.ngbrs_14.end(), [&ctx](const int3& pair) {
+    ngbrs_14.erase(
+        std::remove_if(ngbrs_14.begin(), ngbrs_14.end(), [&ctx, &LJ_matrix](const int3& pair) {
             if (pair.x < 0 || pair.y < 0 || pair.x >= ctx.n_atoms_solute || pair.y >= ctx.n_atoms_solute) {
                 return true;
             }
             if (pair.x == pair.y) {
                 return true;
             }
-            return ctx.LJ_matrix[pair.x * ctx.n_atoms_solute + pair.y] != 1;
+            return LJ_matrix[pair.x * ctx.n_atoms_solute + pair.y] != 1;
         }),
-        ctx.ngbrs_14.end());
+        ngbrs_14.end());
 
-    std::sort(ctx.ngbrs_14.begin(), ctx.ngbrs_14.end(), [](const int3& lhs, const int3& rhs) {
+    std::sort(ngbrs_14.begin(), ngbrs_14.end(), [](const int3& lhs, const int3& rhs) {
         if (lhs.x != rhs.x) return lhs.x < rhs.x;
         return lhs.y < rhs.y;
     });
-    ctx.ngbrs_14.erase(
-        std::unique(ctx.ngbrs_14.begin(), ctx.ngbrs_14.end(), [](const int3& lhs, const int3& rhs) {
+    ngbrs_14.erase(
+        std::unique(ngbrs_14.begin(), ngbrs_14.end(), [](const int3& lhs, const int3& rhs) {
             return lhs.x == rhs.x && lhs.y == rhs.y;
         }),
-        ctx.ngbrs_14.end());
+        ngbrs_14.end());
 
-    for (auto& pair : ctx.ngbrs_14) {
+    for (auto& pair : ngbrs_14) {
         const bool ai_is_q = ctx.atom_to_qi[pair.x] != -1;
         const bool aj_is_q = ctx.atom_to_qi[pair.y] != -1;
         if (ai_is_q && aj_is_q) {
@@ -57,7 +59,12 @@ static void finalize_ngbrs14_host() {
             pair.z = NONBONDED_14_PP;
         }
     }
-    ctx.n_ngbrs14 = static_cast<int>(ctx.ngbrs_14.size());
+    ctx.ngbrs_14 = std::make_unique<HostDeviceBuffer<int3>>(ngbrs_14.size(), true, ctx.run_gpu);
+    if (!ngbrs_14.empty()) {
+        std::copy(ngbrs_14.begin(), ngbrs_14.end(), ctx.ngbrs_14->cpu_data_p);
+    }
+    ctx.n_ngbrs14 = static_cast<int>(ctx.ngbrs_14->length);
+    ngbrs_14.clear();
 }
 
 // Remove bonds, angles, torsions and impropers which are excluded or changed in the FEP file
@@ -69,14 +76,19 @@ void exclude_qatom_definitions() {
 
     excluded = 0;
     int solute_excluded = 0;
+
+    auto &bonds = ctx.bonds->cpu_data_p;
+    auto &angles = ctx.angles->cpu_data_p;
+    auto &torsions = ctx.torsions->cpu_data_p;
+
     if (ctx.n_qangles > 0) {
         for (int i = 0; i < ctx.n_angles; i++) {
-            if (ctx.angles[i].ai == ctx.q_angles[qai].ai && ctx.angles[i].aj == ctx.q_angles[qai].aj && ctx.angles[i].ak == ctx.q_angles[qai].ak) {
+            if (angles[i].ai == ctx.q_angles[qai].ai && angles[i].aj == ctx.q_angles[qai].aj && angles[i].ak == ctx.q_angles[qai].ak) {
                 qai++;
                 excluded++;
                 if (i < ctx.n_angles_solute) solute_excluded++;
             } else {
-                ctx.angles[ai] = ctx.angles[i];
+                angles[ai] = angles[i];
                 ai++;
             }
         }
@@ -88,12 +100,12 @@ void exclude_qatom_definitions() {
     solute_excluded = 0;
     if (ctx.n_qbonds > 0) {
         for (int i = 0; i < ctx.n_bonds; i++) {
-            if (ctx.bonds[i].ai == ctx.q_bonds[qbi].ai && ctx.bonds[i].aj == ctx.q_bonds[qbi].aj) {
+            if (bonds[i].ai == ctx.q_bonds[qbi].ai && bonds[i].aj == ctx.q_bonds[qbi].aj) {
                 qbi++;
                 excluded++;
                 if (i < ctx.n_bonds_solute) solute_excluded++;
             } else {
-                ctx.bonds[bi] = ctx.bonds[i];
+                bonds[bi] = bonds[i];
                 bi++;
             }
         }
@@ -121,12 +133,12 @@ void exclude_qatom_definitions() {
     solute_excluded = 0;
     if (ctx.n_qtorsions > 0) {
         for (int i = 0; i < ctx.n_torsions; i++) {
-            if (ctx.torsions[i].ai == ctx.q_torsions[qti].ai && ctx.torsions[i].aj == ctx.q_torsions[qti].aj && ctx.torsions[i].ak == ctx.q_torsions[qti].ak && ctx.torsions[i].al == ctx.q_torsions[qti].al) {
+            if (torsions[i].ai == ctx.q_torsions[qti].ai && torsions[i].aj == ctx.q_torsions[qti].aj && torsions[i].ak == ctx.q_torsions[qti].ak && torsions[i].al == ctx.q_torsions[qti].al) {
                 qti++;
                 excluded++;
                 if (i < ctx.n_torsions_solute) solute_excluded++;
             } else {
-                ctx.torsions[ti] = ctx.torsions[i];
+                torsions[ti] = torsions[i];
                 ti++;
             }
         }
@@ -139,8 +151,11 @@ void exclude_qatom_definitions() {
 
 void exclude_all_atoms_excluded_definitions() {
     auto& ctx = Context::instance();
+    auto *excluded = ctx.excluded->cpu_data_p;
     int n_excl;
     int ai = 0, bi = 0, ii = 0, ti = 0;
+    auto &impropers = ctx.impropers->cpu_data_p;
+    auto &torsions = ctx.torsions->cpu_data_p;
 
     // n_excl = 0;
     // for (int i = 0; i < n_angles; i++) {
@@ -173,10 +188,10 @@ void exclude_all_atoms_excluded_definitions() {
 
     n_excl = 0;
     for (int i = 0; i < ctx.n_impropers; i++) {
-        if (ctx.excluded[ctx.impropers[i].ai - 1] && ctx.excluded[ctx.impropers[i].aj - 1] && ctx.excluded[ctx.impropers[i].ak - 1] && ctx.excluded[ctx.impropers[i].al - 1]) {
+        if (excluded[impropers[i].ai - 1] && excluded[impropers[i].aj - 1] && excluded[impropers[i].ak - 1] && excluded[impropers[i].al - 1]) {
             n_excl++;
         } else {
-            ctx.impropers[ii] = ctx.impropers[i];
+            impropers[ii] = impropers[i];
             ii++;
         }
     }
@@ -185,10 +200,10 @@ void exclude_all_atoms_excluded_definitions() {
 
     n_excl = 0;
     for (int i = 0; i < ctx.n_torsions; i++) {
-        if (ctx.excluded[ctx.torsions[i].ai - 1] && ctx.excluded[ctx.torsions[i].aj - 1] && ctx.excluded[ctx.torsions[i].ak - 1] && ctx.excluded[ctx.torsions[i].al - 1]) {
+        if (excluded[torsions[i].ai - 1] && excluded[torsions[i].aj - 1] && excluded[torsions[i].ak - 1] && excluded[torsions[i].al - 1]) {
             n_excl++;
         } else {
-            ctx.torsions[ti] = ctx.torsions[i];
+            torsions[ti] = torsions[i];
             ti++;
         }
     }
@@ -204,17 +219,19 @@ void exclude_shaken_definitions() {
     int si = 0;
     int ang_i = 0;
     int ai, aj;
+    auto &shake_bonds = ctx.shake_bonds->cpu_data_p;
 
     excluded = 0;
     solute_excluded = 0;
+    auto &bonds = ctx.bonds->cpu_data_p;
     if (ctx.n_shake_constraints > 0) {
         for (int i = 0; i < ctx.n_bonds; i++) {
-            if (ctx.bonds[i].ai == ctx.shake_bonds[si].ai && ctx.bonds[i].aj == ctx.shake_bonds[si].aj) {
+            if (bonds[i].ai == shake_bonds[si].ai && bonds[i].aj == shake_bonds[si].aj) {
                 si++;
                 excluded++;
                 if (i < ctx.n_bonds_solute) solute_excluded++;
             } else {
-                ctx.bonds[bi] = ctx.bonds[i];
+                bonds[bi] = bonds[i];
                 bi++;
             }
         }
@@ -224,25 +241,26 @@ void exclude_shaken_definitions() {
 
     excluded = 0;
     solute_excluded = 0;
+    auto &angles = ctx.angles->cpu_data_p;
     if (ctx.n_shake_constraints > 0) {
         // Mark angles whose terminal atoms (i and k) match a shaken bond
         for (int i = 0; i < ctx.n_shake_constraints; i++) {
-            ai = ctx.shake_bonds[i].ai;
-            aj = ctx.shake_bonds[i].aj;
+            ai = shake_bonds[i].ai;
+            aj = shake_bonds[i].aj;
             for (int j = 0; j < ctx.n_angles; j++) {
-                if ((ctx.angles[j].ai == ai && ctx.angles[j].ak == aj) || (ctx.angles[j].ai == aj && ctx.angles[j].ak == ai)) {
-                    ctx.angles[j].code = 0;
+                if ((angles[j].ai == ai && angles[j].ak == aj) || (angles[j].ai == aj && angles[j].ak == ai)) {
+                    angles[j].code = 0;
                     break;
                 }
             }
         }
 
         for (int i = 0; i < ctx.n_angles; i++) {
-            if (ctx.angles[i].code == 0) {
+            if (angles[i].code == 0) {
                 excluded++;
                 if (i < ctx.n_angles_solute) solute_excluded++;
             } else {
-                ctx.angles[ang_i] = ctx.angles[i];
+                angles[ang_i] = angles[i];
                 ang_i++;
             }
         }
@@ -253,36 +271,50 @@ void exclude_shaken_definitions() {
 
 void init_velocities() {
     auto& ctx = Context::instance();
-    ctx.velocities.resize(ctx.n_atoms);
+    auto &atypes = ctx.atypes->cpu_data_p;
+    auto &catypes = ctx.catypes->cpu_data_p;
+    ctx.velocities = std::make_unique<HostDeviceBuffer<vel_t>>(ctx.n_atoms, true, ctx.run_gpu);
+    auto &velocities = ctx.velocities->cpu_data_p;
 
     // If not previous value set, use a Maxwell distribution to fill velocities
     double kT = Boltz * ctx.md.initial_temperature;
     double sd, mass;
     for (int i = 0; i < ctx.n_atoms; i++) {
-        mass = ctx.catypes[ctx.atypes[i].code - 1].m;
+        mass = catypes[atypes[i].code - 1].m;
         sd = sqrt(kT / mass);
 
-        ctx.velocities[i].x = gauss(0, sd);
-        ctx.velocities[i].y = gauss(0, sd);
-        ctx.velocities[i].z = gauss(0, sd);
+        velocities[i].x = gauss(0, sd);
+        velocities[i].y = gauss(0, sd);
+        velocities[i].z = gauss(0, sd);
+    }
+
+    if (ctx.run_gpu) {
+        ctx.velocities->upload();
     }
 }
 
 void init_dvelocities() {
     auto& ctx = Context::instance();
-    ctx.dvelocities.assign(ctx.n_atoms, {});
+    ctx.dvelocities = std::make_unique<HostDeviceBuffer<dvel_t>>(ctx.n_atoms, true, ctx.run_gpu);
 }
 
 void init_xcoords() {
     auto& ctx = Context::instance();
-    ctx.xcoords.resize(ctx.n_atoms);
+    ctx.xcoords = std::make_unique<HostDeviceBuffer<coord_t>>(ctx.n_atoms, true, ctx.run_gpu);
 }
 
 void init_inv_mass() {
     auto& ctx = Context::instance();
-    ctx.winv.resize(ctx.n_atoms);
+    auto &atypes = ctx.atypes->cpu_data_p;
+    auto &catypes = ctx.catypes->cpu_data_p;
+    ctx.winv = std::make_unique<HostDeviceBuffer<double>>(ctx.n_atoms, true, ctx.run_gpu);
+    auto *winv = ctx.winv->cpu_data_p;
     for (int ai = 0; ai < ctx.n_atoms; ai++) {
-        ctx.winv[ai] = 1 / ctx.catypes[ctx.atypes[ai].code - 1].m;
+        winv[ai] = 1 / catypes[atypes[ai].code - 1].m;
+    }
+
+    if (ctx.run_gpu) {
+        ctx.winv->upload();
     }
 }
 
@@ -304,10 +336,14 @@ void init_wshells() {
     auto& ctx = Context::instance();
     int n_inshell;
     double drs, router, ri, dr, Vshell, rshell;
+    auto &bonds = ctx.bonds->cpu_data_p;
+    auto &cbonds = ctx.cbonds->cpu_data_p;
+    auto &angles = ctx.angles->cpu_data_p;
+    auto &cangles = ctx.cangles->cpu_data_p;
     if (ctx.mu_w == 0) {
         // Get water properties from first water molecule
-        cbond_t cbondw = ctx.cbonds[ctx.bonds[ctx.n_atoms_solute].code - 1];
-        cangle_t canglew = ctx.cangles[ctx.angles[ctx.n_atoms_solute].code - 1];
+        cbond_t cbondw = cbonds[bonds[ctx.n_atoms_solute].code - 1];
+        cangle_t canglew = cangles[angles[ctx.n_atoms_solute].code - 1];
 
         ctx.crg_ow = ctx.unified_ccharge(ctx.n_atoms_solute, 0).charge;
 
@@ -317,7 +353,8 @@ void init_wshells() {
     drs = wpolr_layer / drouter;
 
     ctx.n_shells = (int)floor(-0.5 + sqrt(2 * drs + 0.25));
-    ctx.wshells.resize(ctx.n_shells);
+    ctx.wshells = std::make_unique<HostDeviceBuffer<shell_t>>(ctx.n_shells, true, ctx.run_gpu);
+    auto *wshells = ctx.wshells->cpu_data_p;
 
     printf("n_shells = %d\n", ctx.n_shells);
 
@@ -325,12 +362,12 @@ void init_wshells() {
     ctx.n_max_inshell = 0;
 
     for (int i = 0; i < ctx.n_shells; i++) {
-        ctx.wshells[i].avtheta = 0;
-        ctx.wshells[i].avn_inshell = 0;
-        ctx.wshells[i].router = router;
+        wshells[i].avtheta = 0;
+        wshells[i].avn_inshell = 0;
+        wshells[i].router = router;
         dr = drouter * (i + 1);
         ri = router - dr;
-        ctx.wshells[i].dr = dr;
+        wshells[i].dr = dr;
         Vshell = pow(router, 3) - pow(ri, 3);
         n_inshell = (int)floor(4 * M_PI / 3 * Vshell * rho_water);
         if (n_inshell > ctx.n_max_inshell) {
@@ -339,13 +376,13 @@ void init_wshells() {
         rshell = pow(0.5 * (pow(router, 3) + pow(ri, 3)), 1.0 / 3.0);
 
         // --- Note below: 0.98750 = (1-1/epsilon) for water
-        ctx.wshells[i].cstb = ctx.crgQtot * 0.98750 / (rho_water * ctx.mu_w * 4 * M_PI * pow(rshell, 2));
+        wshells[i].cstb = ctx.crgQtot * 0.98750 / (rho_water * ctx.mu_w * 4 * M_PI * pow(rshell, 2));
 
         router -= dr;
     }
 
     // rc > wshells[n_shells-1].router - wshells[n_shells-1].dr
-    printf("shell 0: (%f, %f). shell 1: (%f, %f). shell 2: (%f, %f).\n", ctx.wshells[0].router, ctx.wshells[0].router - ctx.wshells[0].dr, ctx.wshells[1].router, ctx.wshells[1].router - ctx.wshells[1].dr, ctx.wshells[2].router, ctx.wshells[2].router - ctx.wshells[2].dr);
+    printf("shell 0: (%f, %f). shell 1: (%f, %f). shell 2: (%f, %f).\n", wshells[0].router, wshells[0].router - wshells[0].dr, wshells[1].router, wshells[1].router - wshells[1].dr, wshells[2].router, wshells[2].router - wshells[2].dr);
 
     ctx.n_max_inshell = ctx.n_waters;  // Make largest a little bigger just in case
 
@@ -356,55 +393,73 @@ void init_wshells() {
 
     ctx.list_sh.assign(ctx.n_max_inshell, std::vector<int>(ctx.n_shells, 0));
     ctx.nsort.assign(ctx.n_max_inshell, std::vector<int>(ctx.n_shells, 0));
+
+    if (ctx.run_gpu) {
+        ctx.wshells->upload();
+    }
 }
 
 void init_pshells() {
     auto& ctx = Context::instance();
+    auto &atypes = ctx.atypes->cpu_data_p;
+    auto &catypes = ctx.catypes->cpu_data_p;
+    auto &coords_init = ctx.coords_init->cpu_data_p;
+    auto *excluded = ctx.excluded->cpu_data_p;
     double mass, r2, rin2;
 
-    ctx.heavy = std::make_unique<bool[]>(ctx.n_atoms);
-    ctx.shell = std::make_unique<bool[]>(ctx.n_atoms);
+    ctx.heavy = std::make_unique<HostDeviceBuffer<bool>>(ctx.n_atoms, true, ctx.run_gpu);
+    auto *heavy = ctx.heavy->cpu_data_p;
+    ctx.shell = std::make_unique<HostDeviceBuffer<bool>>(ctx.n_atoms, true, ctx.run_gpu);
+    auto *shell = ctx.shell->cpu_data_p;
     for (int i = 0; i < ctx.n_atoms; ++i) {
-        ctx.heavy[i] = false;
-        ctx.shell[i] = false;
+        heavy[i] = false;
+        shell[i] = false;
     }
     rin2 = pow(shell_default * ctx.topo.exclusion_radius, 2);
 
     int n_heavy = 0, n_inshell = 0;
 
     for (int i = 0; i < ctx.n_atoms; i++) {
-        mass = ctx.catypes[ctx.atypes[i].code - 1].m;
+        mass = catypes[atypes[i].code - 1].m;
         if (mass < 4.0) {
-            ctx.heavy[i] = false;
+            heavy[i] = false;
         } else {
-            ctx.heavy[i] = true;
+            heavy[i] = true;
             n_heavy++;
         }
 
-        if (ctx.heavy[i] && !ctx.excluded[i] && i < ctx.n_atoms_solute) {
-            r2 = pow(ctx.coords_top[i].x - ctx.topo.solute_center.x, 2) + pow(ctx.coords_top[i].y - ctx.topo.solute_center.y, 2) + pow(ctx.coords_top[i].z - ctx.topo.solute_center.z, 2);
+        if (heavy[i] && !excluded[i] && i < ctx.n_atoms_solute) {
+            r2 = pow(coords_init[i].x - ctx.topo.solute_center.x, 2) + pow(coords_init[i].y - ctx.topo.solute_center.y, 2) + pow(coords_init[i].z - ctx.topo.solute_center.z, 2);
             if (r2 > rin2) {
-                ctx.shell[i] = true;
+                shell[i] = true;
                 n_inshell++;
             } else {
-                ctx.shell[i] = false;
+                shell[i] = false;
             }
         }
     }
 
+    if (ctx.run_gpu) {
+        ctx.heavy->upload();
+        ctx.shell->upload();
+    }
+
     printf("n_heavy = %d, n_inshell = %d\n", n_heavy, n_inshell);
 }
 
 // Marks heavy atoms for the shell/excluded arrays.
 // Shared between switch-atom and centroid shell init paths.
 static int mark_heavy_atoms(Context& ctx) {
+    auto &atypes = ctx.atypes->cpu_data_p;
+    auto &catypes = ctx.catypes->cpu_data_p;
+    auto *heavy = ctx.heavy->cpu_data_p;
     int n_heavy = 0;
     for (int i = 0; i < ctx.n_atoms; i++) {
-        double mass = ctx.catypes[ctx.atypes[i].code - 1].m;
+        double mass = catypes[atypes[i].code - 1].m;
         if (mass < 4.0) {
-            ctx.heavy[i] = false;
+            heavy[i] = false;
         } else {
-            ctx.heavy[i] = true;
+            heavy[i] = true;
             n_heavy++;
         }
     }
@@ -415,13 +470,18 @@ static int mark_heavy_atoms(Context& ctx) {
 // Used when iuse_switch_atom == 1.
 void init_pshells_with_switch_atoms() {
     auto& ctx = Context::instance();
+    auto &coords_init = ctx.coords_init->cpu_data_p;
+    auto *excluded = ctx.excluded->cpu_data_p;
+    bool *heavy = nullptr;
     double r2, rin2;
 
-    ctx.heavy = std::make_unique<bool[]>(ctx.n_atoms);
-    ctx.shell = std::make_unique<bool[]>(ctx.n_atoms);
+    ctx.heavy = std::make_unique<HostDeviceBuffer<bool>>(ctx.n_atoms, true, ctx.run_gpu);
+    heavy = ctx.heavy->cpu_data_p;
+    ctx.shell = std::make_unique<HostDeviceBuffer<bool>>(ctx.n_atoms, true, ctx.run_gpu);
+    auto *shell = ctx.shell->cpu_data_p;
     for (int i = 0; i < ctx.n_atoms; ++i) {
-        ctx.heavy[i] = false;
-        ctx.shell[i] = false;
+        heavy[i] = false;
+        shell[i] = false;
     }
     rin2 = pow(ctx.md.shell_radius, 2);
 
@@ -431,11 +491,11 @@ void init_pshells_with_switch_atoms() {
     for (int grp = 0; grp < ctx.n_cgrps_solute; grp++) {
         cgrp_t cgrp = ctx.charge_groups[grp];
         int i = cgrp.iswitch - 1;
-        if (ctx.heavy[i] && !ctx.excluded[i] && i < ctx.n_atoms_solute) {
-            r2 = pow(ctx.coords_top[i].x - ctx.topo.solute_center.x, 2) + pow(ctx.coords_top[i].y - ctx.topo.solute_center.y, 2) + pow(ctx.coords_top[i].z - ctx.topo.solute_center.z, 2);
+        if (heavy[i] && !excluded[i] && i < ctx.n_atoms_solute) {
+            r2 = pow(coords_init[i].x - ctx.topo.solute_center.x, 2) + pow(coords_init[i].y - ctx.topo.solute_center.y, 2) + pow(coords_init[i].z - ctx.topo.solute_center.z, 2);
             bool in_shell = r2 > rin2;
             for (int j = 0; j < cgrp.n_atoms; j++) {
-                ctx.shell[cgrp.a[j] - 1] = in_shell;
+                shell[cgrp.a[j] - 1] = in_shell;
                 if (in_shell) {
                     n_inshell++;
                 }
@@ -443,6 +503,11 @@ void init_pshells_with_switch_atoms() {
         }
     }
 
+    if (ctx.run_gpu) {
+        ctx.heavy->upload();
+        ctx.shell->upload();
+    }
+
     printf("(switch atoms): n_heavy = %d, n_inshell = %d\n", n_heavy, n_inshell);
 }
 
@@ -450,13 +515,18 @@ void init_pshells_with_switch_atoms() {
 // Used when iuse_switch_atom == 0.
 void init_pshells_with_centroids() {
     auto& ctx = Context::instance();
+    auto &coords_init = ctx.coords_init->cpu_data_p;
+    auto *excluded = ctx.excluded->cpu_data_p;
+    bool *heavy = nullptr;
     double r2, rin2;
 
-    ctx.heavy = std::make_unique<bool[]>(ctx.n_atoms);
-    ctx.shell = std::make_unique<bool[]>(ctx.n_atoms);
+    ctx.heavy = std::make_unique<HostDeviceBuffer<bool>>(ctx.n_atoms, true, ctx.run_gpu);
+    heavy = ctx.heavy->cpu_data_p;
+    ctx.shell = std::make_unique<HostDeviceBuffer<bool>>(ctx.n_atoms, true, ctx.run_gpu);
+    auto *shell = ctx.shell->cpu_data_p;
     for (int i = 0; i < ctx.n_atoms; ++i) {
-        ctx.heavy[i] = false;
-        ctx.shell[i] = false;
+        heavy[i] = false;
+        shell[i] = false;
     }
     rin2 = pow(ctx.md.shell_radius, 2);
 
@@ -466,14 +536,14 @@ void init_pshells_with_centroids() {
     for (int grp = 0; grp < ctx.n_cgrps_solute; grp++) {
         cgrp_t cgrp = ctx.charge_groups[grp];
         int i = cgrp.iswitch - 1;
-        if (ctx.heavy[i] && !ctx.excluded[i] && i < ctx.n_atoms_solute) {
+        if (heavy[i] && !excluded[i] && i < ctx.n_atoms_solute) {
             // Compute centroid of charge group
             double cx = 0, cy = 0, cz = 0;
             for (int j = 0; j < cgrp.n_atoms; j++) {
                 int ai = cgrp.a[j] - 1;
-                cx += ctx.coords_top[ai].x;
-                cy += ctx.coords_top[ai].y;
-                cz += ctx.coords_top[ai].z;
+                cx += coords_init[ai].x;
+                cy += coords_init[ai].y;
+                cz += coords_init[ai].z;
             }
             cx /= cgrp.n_atoms;
             cy /= cgrp.n_atoms;
@@ -482,7 +552,7 @@ void init_pshells_with_centroids() {
             r2 = pow(cx - ctx.topo.solute_center.x, 2) + pow(cy - ctx.topo.solute_center.y, 2) + pow(cz - ctx.topo.solute_center.z, 2);
             bool in_shell = r2 > rin2;
             for (int j = 0; j < cgrp.n_atoms; j++) {
-                ctx.shell[cgrp.a[j] - 1] = in_shell;
+                shell[cgrp.a[j] - 1] = in_shell;
                 if (in_shell) {
                     n_inshell++;
                 }
@@ -490,13 +560,18 @@ void init_pshells_with_centroids() {
         }
     }
 
+    if (ctx.run_gpu) {
+        ctx.heavy->upload();
+        ctx.shell->upload();
+    }
+
     printf("(centroids): n_heavy = %d, n_inshell = %d\n", n_heavy, n_inshell);
 }
 
 void init_restrseqs() {
     auto& ctx = Context::instance();
     ctx.n_restrseqs = 1;
-    ctx.restrseqs.resize(1);
+    ctx.restrseqs = std::make_unique<HostDeviceBuffer<restrseq_t>>(1, true, ctx.run_gpu);
 
     restrseq_t seq;
     seq.ai = 1;
@@ -505,7 +580,11 @@ void init_restrseqs() {
     seq.ih = 0;
     seq.to_center = 2;
 
-    ctx.restrseqs[0] = seq;
+    ctx.restrseqs->cpu_data_p[0] = seq;
+
+    if (ctx.run_gpu) {
+        ctx.restrseqs->upload();
+    }
 }
 
 /* =============================================
@@ -515,56 +594,67 @@ void init_restrseqs() {
 
 void init_shake() {
     auto& ctx = Context::instance();
+    auto *heavy = ctx.heavy->cpu_data_p;
+    auto *excluded = ctx.excluded->cpu_data_p;
     int ai, aj;
     int mol = 0;
     int shake;
     int n_solute_shake_constraints = 0;
     double excl_shake = 0;
+    auto &bonds = ctx.bonds->cpu_data_p;
+    auto &cbonds = ctx.cbonds->cpu_data_p;
 
     ctx.n_shake_constraints = 0;
-    ctx.mol_n_shakes.assign(ctx.n_molecules, 0);
+    ctx.mol_n_shakes = std::make_unique<HostDeviceBuffer<int>>(ctx.n_molecules, true, ctx.run_gpu);
+    auto *mol_n_shakes = ctx.mol_n_shakes->cpu_data_p;
 
     for (int bi = 0; bi < ctx.n_bonds; bi++) {
-        ai = ctx.bonds[bi].ai - 1;
-        aj = ctx.bonds[bi].aj - 1;
+        ai = bonds[bi].ai - 1;
+        aj = bonds[bi].aj - 1;
 
         while (mol + 1 < ctx.n_molecules && ai + 1 >= ctx.molecules[mol + 1]) {
             // new molecule
             mol += 1;
         }
 
-        if ((ctx.md.shake_hydrogens && (!ctx.heavy[ai] || !ctx.heavy[aj])) || (ctx.md.shake_solute && ai + 1 <= ctx.n_atoms_solute) || (ctx.md.shake_solvent && ai + 1 > ctx.n_atoms_solute)) {
-            ctx.mol_n_shakes[mol]++;
+        if ((ctx.md.shake_hydrogens && (!heavy[ai] || !heavy[aj])) || (ctx.md.shake_solute && ai + 1 <= ctx.n_atoms_solute) || (ctx.md.shake_solvent && ai + 1 > ctx.n_atoms_solute)) {
+            mol_n_shakes[mol]++;
             ctx.n_shake_constraints++;
 
-            if (ctx.excluded[ai]) excl_shake += 0.5;
-            if (ctx.excluded[aj]) excl_shake += 0.5;
+            if (excluded[ai]) excl_shake += 0.5;
+            if (excluded[aj]) excl_shake += 0.5;
         }
     }
 
-    ctx.shake_bonds.resize(ctx.n_shake_constraints);
+    ctx.shake_bonds = std::make_unique<HostDeviceBuffer<shake_bond_t>>(ctx.n_shake_constraints, true, ctx.run_gpu);
+    auto *shake_bonds = ctx.shake_bonds->cpu_data_p;
     mol = 0;
     shake = 0;
     for (int bi = 0; bi < ctx.n_bonds; bi++) {
-        ai = ctx.bonds[bi].ai - 1;
-        aj = ctx.bonds[bi].aj - 1;
+        ai = bonds[bi].ai - 1;
+        aj = bonds[bi].aj - 1;
 
         while (mol + 1 < ctx.n_molecules && ai + 1 >= ctx.molecules[mol + 1]) {
             // new molecule
             mol += 1;
         }
 
-        if ((ctx.md.shake_hydrogens && (!ctx.heavy[ai] || !ctx.heavy[aj])) || (ctx.md.shake_solute && ai + 1 <= ctx.n_atoms_solute) || (ctx.md.shake_solvent && ai + 1 > ctx.n_atoms_solute)) {
-            ctx.shake_bonds[shake].ai = ai + 1;
-            ctx.shake_bonds[shake].aj = aj + 1;
-            ctx.shake_bonds[shake].dist2 = pow(ctx.cbonds[ctx.bonds[bi].code - 1].b0, 2);
+        if ((ctx.md.shake_hydrogens && (!heavy[ai] || !heavy[aj])) || (ctx.md.shake_solute && ai + 1 <= ctx.n_atoms_solute) || (ctx.md.shake_solvent && ai + 1 > ctx.n_atoms_solute)) {
+            shake_bonds[shake].ai = ai + 1;
+            shake_bonds[shake].aj = aj + 1;
+            shake_bonds[shake].dist2 = pow(cbonds[bonds[bi].code - 1].b0, 2);
             shake++;
         }
     }
 
     // Get total number of shake constraints in solute (used for separate scaling of temperatures)
     for (int i = 0; i < ctx.n_molecules - ctx.n_waters; i++) {
-        n_solute_shake_constraints += ctx.mol_n_shakes[i];
+        n_solute_shake_constraints += mol_n_shakes[i];
+    }
+
+    if (ctx.run_gpu) {
+        ctx.mol_n_shakes->upload();
+        ctx.shake_bonds->upload();
     }
 
     ctx.Ndegf = 3 * ctx.n_atoms - ctx.n_shake_constraints;
@@ -614,6 +704,10 @@ void init_patoms() {
 // states above 0 get appended as extra codes.
 static void init_unified_atom_parameters() {
     auto& ctx = Context::instance();
+    auto &charges = ctx.charges->cpu_data_p;
+    auto &ccharges = ctx.ccharges->cpu_data_p;
+    auto &atypes = ctx.atypes->cpu_data_p;
+    auto &catypes = ctx.catypes->cpu_data_p;
     const int n_states = ctx.n_parameter_states();
 
     ctx.atom_to_qi.assign(ctx.n_atoms, -1);
@@ -623,8 +717,10 @@ static void init_unified_atom_parameters() {
 
     const int n_extra_q_states = n_states > 0 ? n_states - 1 : 0;
     const int n_codes = ctx.n_atoms + ctx.n_qatoms * n_extra_q_states;
-    ctx.unified_ccharges.resize(n_codes);
-    ctx.unified_catypes.resize(n_codes);
+    ctx.unified_ccharges = std::make_unique<HostDeviceBuffer<ccharge_t>>(n_codes, true, ctx.run_gpu);
+    ctx.unified_catypes = std::make_unique<HostDeviceBuffer<catype_t>>(n_codes, true, ctx.run_gpu);
+    auto *unified_ccharges = ctx.unified_ccharges->cpu_data_p;
+    auto *unified_catypes = ctx.unified_catypes->cpu_data_p;
 
     for (int atom_idx = 0; atom_idx < ctx.n_atoms; atom_idx++) {
         const int unified_code = atom_idx + 1;
@@ -641,13 +737,13 @@ static void init_unified_atom_parameters() {
             resolved_type = ctx.q_catypes[ctx.q_atypes[qi].code - 1];
             resolved_type.code = unified_code;
         } else {
-            resolved_charge.charge = ctx.ccharges[ctx.charges[atom_idx].code - 1].charge;
-            resolved_type = ctx.catypes[ctx.atypes[atom_idx].code - 1];
+            resolved_charge.charge = ccharges[charges[atom_idx].code - 1].charge;
+            resolved_type = catypes[atypes[atom_idx].code - 1];
             resolved_type.code = unified_code;
         }
 
-        ctx.unified_ccharges[unified_code - 1] = resolved_charge;
-        ctx.unified_catypes[unified_code - 1] = resolved_type;
+        unified_ccharges[unified_code - 1] = resolved_charge;
+        unified_catypes[unified_code - 1] = resolved_type;
     }
 
     for (int state = 1; state < n_states; state++) {
@@ -661,8 +757,8 @@ static void init_unified_atom_parameters() {
             catype_t resolved_type = ctx.q_catypes[ctx.q_atypes[qi + state * ctx.n_qatoms].code - 1];
             resolved_type.code = unified_code;
 
-            ctx.unified_ccharges[unified_code - 1] = resolved_charge;
-            ctx.unified_catypes[unified_code - 1] = resolved_type;
+            unified_ccharges[unified_code - 1] = resolved_charge;
+            unified_catypes[unified_code - 1] = resolved_type;
         }
     }
 }
@@ -781,7 +877,7 @@ void init_variables() {
     ctx.EQ_nonbond_qp.assign(ctx.n_lambdas, {});
     ctx.EQ_nonbond_qw.assign(ctx.n_lambdas, {});
     ctx.EQ_nonbond_qx.assign(ctx.n_lambdas, {});
-    ctx.EQ_restraint.assign(ctx.n_lambdas, {});
+    ctx.EQ_restraint = std::make_unique<HostDeviceBuffer<E_restraint_t>>(ctx.n_lambdas, true, ctx.run_gpu);
 
     if (ctx.n_shake_constraints > 0) {
         initial_shaking();
@@ -798,6 +894,11 @@ void init_variables() {
 
 void clean_variables() {
     auto& ctx = Context::instance();
+    ctx.angles.reset();
+    ctx.cangles.reset();
+    ctx.bonds.reset();
+    ctx.cbonds.reset();
+    ctx.coords_init.reset();
 
     for (auto& charge_group : ctx.charge_groups) {
         delete[] charge_group.a;
@@ -805,26 +906,42 @@ void clean_variables() {
     }
     ctx.charge_groups.clear();
 
-    ctx.angles.clear();
-    ctx.atypes.clear();
-    ctx.bonds.clear();
-    ctx.cangles.clear();
-    ctx.catypes.clear();
-    ctx.cbonds.clear();
-    ctx.ccharges.clear();
-    ctx.charges.clear();
+    ctx.atypes.reset();
+    ctx.catypes.reset();
+    ctx.ccharges.reset();
+    ctx.charges.reset();
     ctx.atom_to_qi.clear();
-    ctx.unified_ccharges.clear();
-    ctx.unified_catypes.clear();
-    ctx.cimpropers.clear();
-    ctx.coords.clear();
-    ctx.coords_top.clear();
-    ctx.ctorsions.clear();
+    ctx.unified_ccharges.reset();
+    ctx.unified_catypes.reset();
+    ctx.ngbrs_14.reset();
+    ctx.ngbrs_14_builder.clear();
+    ctx.p_atoms_list.reset();
+    ctx.w_atoms_list.reset();
+    ctx.q_atoms_list.reset();
+    ctx.charge_table_all.reset();
+    ctx.charge_pair_products.reset();
+    ctx.p_charge_types.reset();
+    ctx.w_charge_types.reset();
+    ctx.q_charge_types.reset();
+    ctx.catype_table_all.reset();
+    ctx.catype_pair_params.reset();
+    ctx.p_catype_types.reset();
+    ctx.w_catype_types.reset();
+    ctx.q_catype_types.reset();
+    ctx.catype_to_type_host.clear();
+    ctx.n_charge_types = 0;
+    ctx.zero_charge_type = -1;
+    ctx.n_catype_types = 0;
+    ctx.zero_catype_type = -1;
+    ctx.cimpropers.reset();
+    ctx.coords.reset();
+    ctx.ctorsions.reset();
     ctx.excluded.reset();
     ctx.heavy.reset();
-    ctx.impropers.clear();
-    ctx.torsions.clear();
-    ctx.LJ_matrix.clear();
+    ctx.impropers.reset();
+    ctx.winv.reset();
+    ctx.torsions.reset();
+    ctx.LJ_matrix.reset();
     ctx.molecules.clear();
     ctx.q_angcouples.clear();
     ctx.q_atoms.clear();
@@ -841,30 +958,35 @@ void clean_variables() {
     ctx.q_charges.clear();
     ctx.q_angles.clear();
     ctx.q_bonds.clear();
-    ctx.q_elscales.clear();
+    ctx.q_elscales.reset();
     ctx.q_exclpairs.clear();
     ctx.q_impropers.clear();
     ctx.q_shakes.clear();
     ctx.q_softcores.clear();
     ctx.q_torsions.clear();
-    ctx.wshells.clear();
+    ctx.lambdas.reset();
+    ctx.wshells.reset();
     ctx.theta.clear();
     ctx.theta0.clear();
     ctx.tdum.clear();
     ctx.list_sh.clear();
     ctx.nsort.clear();
-    ctx.restrseqs.clear();
+    ctx.restrseqs.reset();
+    ctx.restrangs.reset();
+    ctx.restrdists.reset();
+    ctx.restrspos.reset();
+    ctx.restrwalls.reset();
     ctx.shell.reset();
-    ctx.velocities.clear();
-    ctx.dvelocities.clear();
-    ctx.xcoords.clear();
-    ctx.mol_n_shakes.clear();
-    ctx.shake_bonds.clear();
+    ctx.velocities.reset();
+    ctx.dvelocities.reset();
+    ctx.xcoords.reset();
+    ctx.mol_n_shakes.reset();
+    ctx.shake_bonds.reset();
     ctx.EQ_total.clear();
     ctx.EQ_bond.clear();
     ctx.EQ_nonbond_qq.clear();
     ctx.EQ_nonbond_qp.clear();
     ctx.EQ_nonbond_qw.clear();
     ctx.EQ_nonbond_qx.clear();
-    ctx.EQ_restraint.clear();
+    ctx.EQ_restraint.reset();
 }
diff --git a/src/core/common/src/output.cpp b/src/core/common/src/output.cpp
index ea734cfd..d70d086a 100644
--- a/src/core/common/src/output.cpp
+++ b/src/core/common/src/output.cpp
@@ -3,6 +3,8 @@
 
 void print_energies() {
     auto& host = Context::instance();
+    auto *lambdas = host.lambdas->cpu_data_p;
+    auto *EQ_restraint = host.EQ_restraint->cpu_data_p;
     printf("[temperature]\n");
     printf("Temp\t%f\n", host.Temp);
     printf("\n");
@@ -34,8 +36,8 @@ void print_energies() {
     printf("[q-energies]\n");
     printf("lambda\tSUM\tUbond\tUangle\tUtor\tUimp\tUcoul\tUvdw\tUrestr\n");
     for (int state = 0; state < host.n_lambdas; state++) {
-        printf("%f\t%f\t%f\t%f\t%f\t%f\t%f\t%f\t%f\n", host.lambdas[state], host.EQ_total[state].Utot, host.EQ_bond[state].Ubond,
-               host.EQ_bond[state].Uangle, host.EQ_bond[state].Utor, host.EQ_bond[state].Uimp, host.EQ_nonbond_qx[state].Ucoul, host.EQ_nonbond_qx[state].Uvdw, host.EQ_restraint[state].Urestr);
+        printf("%f\t%f\t%f\t%f\t%f\t%f\t%f\t%f\t%f\n", lambdas[state], host.EQ_total[state].Utot, host.EQ_bond[state].Ubond,
+               host.EQ_bond[state].Uangle, host.EQ_bond[state].Utor, host.EQ_bond[state].Uimp, host.EQ_nonbond_qx[state].Ucoul, host.EQ_nonbond_qx[state].Uvdw, EQ_restraint[state].Urestr);
     }
     printf("\n");
 
@@ -64,6 +66,7 @@ void write_header(const char* filename) {
 // Write header (number of atoms & lambdas) to output file
 void write_energy_header() {
     auto& ctx = Context::instance();
+    auto *lambdas = ctx.lambdas->cpu_data_p;
     FILE* fp;
 
     char path[1024];
@@ -77,7 +80,7 @@ void write_energy_header() {
     fprintf(fp, "%d\n", ctx.n_lambdas);
 
     for (int state = 0; state < ctx.n_lambdas; state++) {
-        fprintf(fp, "%f\n", ctx.lambdas[state]);
+        fprintf(fp, "%f\n", lambdas[state]);
     }
 
     fclose(fp);
@@ -86,6 +89,7 @@ void write_energy_header() {
 // Write step number, coordinates of atoms to coordinate output file
 void write_coords(int iteration) {
     auto& ctx = Context::instance();
+    auto &coords = ctx.coords->cpu_data_p;
     if (iteration % ctx.md.trajectory != 0) return;
     FILE* fp;
     int i;
@@ -97,7 +101,7 @@ void write_coords(int iteration) {
 
     fprintf(fp, "%d\n", iteration / ctx.md.trajectory);
     for (i = 0; i < ctx.n_atoms; i++) {
-        fprintf(fp, "%f;%f;%f\n", ctx.coords[i].x, ctx.coords[i].y, ctx.coords[i].z);
+        fprintf(fp, "%f;%f;%f\n", coords[i].x, coords[i].y, coords[i].z);
     }
 
     fclose(fp);
@@ -106,6 +110,7 @@ void write_coords(int iteration) {
 // Write step number, velocities of atoms to coordinate output file
 void write_velocities(int iteration) {
     auto& ctx = Context::instance();
+    auto &velocities = ctx.velocities->cpu_data_p;
     if (iteration % ctx.md.trajectory != 0) return;
     FILE* fp;
     int i;
@@ -117,7 +122,7 @@ void write_velocities(int iteration) {
 
     fprintf(fp, "%d\n", iteration / ctx.md.trajectory);
     for (i = 0; i < ctx.n_atoms; i++) {
-        fprintf(fp, "%f;%f;%f\n", ctx.velocities[i].x, ctx.velocities[i].y, ctx.velocities[i].z);
+        fprintf(fp, "%f;%f;%f\n", velocities[i].x, velocities[i].y, velocities[i].z);
     }
 
     fclose(fp);
@@ -126,6 +131,8 @@ void write_velocities(int iteration) {
 // Write step number, energies of atoms to coordinate output file
 void write_energies(int iteration) {
     auto& ctx = Context::instance();
+    auto *lambdas = ctx.lambdas->cpu_data_p;
+    auto *EQ_restraint = ctx.EQ_restraint->cpu_data_p;
     if (iteration % ctx.md.energy != 0) return;
     FILE* fp;
 
@@ -167,8 +174,8 @@ void write_energies(int iteration) {
     fprintf(fp, "[q-energies]\n");
     fprintf(fp, "lambda\tSUM\tUbond\tUangle\tUtor\tUimp\tUcoul\tUvdw\tUrestr\n");
     for (int state = 0; state < ctx.n_lambdas; state++) {
-        fprintf(fp, "%f\t%f\t%f\t%f\t%f\t%f\t%f\t%f\t%f\n", ctx.lambdas[state], ctx.EQ_total[state].Utot, ctx.EQ_bond[state].Ubond,
-                ctx.EQ_bond[state].Uangle, ctx.EQ_bond[state].Utor, ctx.EQ_bond[state].Uimp, ctx.EQ_nonbond_qx[state].Ucoul, ctx.EQ_nonbond_qx[state].Uvdw, ctx.EQ_restraint[state].Urestr);
+        fprintf(fp, "%f\t%f\t%f\t%f\t%f\t%f\t%f\t%f\t%f\n", lambdas[state], ctx.EQ_total[state].Utot, ctx.EQ_bond[state].Ubond,
+                ctx.EQ_bond[state].Uangle, ctx.EQ_bond[state].Utor, ctx.EQ_bond[state].Uimp, ctx.EQ_nonbond_qx[state].Ucoul, ctx.EQ_nonbond_qx[state].Uvdw, EQ_restraint[state].Urestr);
     }
     fprintf(fp, "\n");
 
@@ -179,4 +186,4 @@ void write_energies(int iteration) {
     fprintf(fp, "\n");
 
     fclose(fp);
-}
\ No newline at end of file
+}
diff --git a/src/core/common/src/parse.cpp b/src/core/common/src/parse.cpp
index 8a6f3b00..fc0ca7b6 100644
--- a/src/core/common/src/parse.cpp
+++ b/src/core/common/src/parse.cpp
@@ -215,18 +215,23 @@ void init_md(const char* filename) {
 #ifdef VERBOSE
     printf("reading in %d lambdas (%s in file)\n", ctx.n_lambdas, file.buffer[k][1]);
 #endif
-    ctx.lambdas.resize(ctx.n_lambdas);
+    ctx.lambdas = std::make_unique<HostDeviceBuffer<double>>(ctx.n_lambdas, true, ctx.run_gpu);
+    auto *lambdas = ctx.lambdas->cpu_data_p;
     k++;
     for (int i = 0; i < ctx.n_lambdas; i++) {
-        ctx.lambdas[i] = strtod(file.buffer[k][0], &eptr);
+        lambdas[i] = strtod(file.buffer[k][0], &eptr);
         k++;
     }
+    if (ctx.run_gpu) {
+        ctx.lambdas->upload();
+    }
 
     // [sequence_restraints]
     printf("k = %d\n", k);
     ctx.n_restrseqs = atoi(file.buffer[k][0]);
     printf("reading in %d sequence restraints (%s in file)\n", ctx.n_restrseqs, file.buffer[k][1]);
-    ctx.restrseqs.resize(ctx.n_restrseqs);
+    ctx.restrseqs = std::make_unique<HostDeviceBuffer<restrseq_t>>(ctx.n_restrseqs, true, ctx.run_gpu);
+    auto &restrseqs = ctx.restrseqs->cpu_data_p;
     k++;
     for (int i = 0; i < ctx.n_restrseqs; i++) {
         restrseq_t restrseq;
@@ -237,14 +242,19 @@ void init_md(const char* filename) {
         restrseq.ih = strcmp(file.buffer[k][3], "1") == 0;
         restrseq.to_center = atoi(file.buffer[k][4]);
 
-        ctx.restrseqs[i] = restrseq;
+        restrseqs[i] = restrseq;
         k++;
     }
 
+    if (ctx.run_gpu) {
+        ctx.restrseqs->upload();
+    }
+
     // [position_restraints]
     ctx.n_restrspos = atoi(file.buffer[k][0]);
     printf("reading in %d position restraints\n (%s in file )", ctx.n_restrspos, file.buffer[k][1]);
-    ctx.restrspos.resize(ctx.n_restrspos);
+    ctx.restrspos = std::make_unique<HostDeviceBuffer<restrpos_t>>(ctx.n_restrspos, true, ctx.run_gpu);
+    auto &restrspos = ctx.restrspos->cpu_data_p;
     k++;
     for (int i = 0; i < ctx.n_restrspos; i++) {
         restrpos_t restrpos;
@@ -264,13 +274,18 @@ void init_md(const char* filename) {
         restrpos.x = r_x;
         restrpos.k = r_k;
 
-        ctx.restrspos[i] = restrpos;
+        restrspos[i] = restrpos;
         k++;
     }
 
+    if (ctx.run_gpu) {
+        ctx.restrspos->upload();
+    }
+
     // [distance_restraints]
     ctx.n_restrdists = atoi(file.buffer[k][0]);
-    ctx.restrdists.resize(ctx.n_restrdists);
+    ctx.restrdists = std::make_unique<HostDeviceBuffer<restrdis_t>>(ctx.n_restrdists, true, ctx.run_gpu);
+    auto &restrdists = ctx.restrdists->cpu_data_p;
     printf("reading in %d distance restraints (%s in file)\n", ctx.n_restrdists, file.buffer[k][1]);
     k++;
     for (int i = 0; i < ctx.n_restrdists; i++) {
@@ -283,13 +298,18 @@ void init_md(const char* filename) {
         restrdist.k = strtod(file.buffer[k][4], &eptr);
         restrdist.ipsi = atoi(file.buffer[k][5]);
 
-        ctx.restrdists[i] = restrdist;
+        restrdists[i] = restrdist;
         k++;
     }
 
+    if (ctx.run_gpu) {
+        ctx.restrdists->upload();
+    }
+
     // [angle_restraints]
     ctx.n_restrangs = atoi(file.buffer[k][0]);
-    ctx.restrangs.resize(ctx.n_restrangs);
+    ctx.restrangs = std::make_unique<HostDeviceBuffer<restrang_t>>(ctx.n_restrangs, true, ctx.run_gpu);
+    auto &restrangs = ctx.restrangs->cpu_data_p;
     printf("reading in %d angle restraints (%s in file)\n", ctx.n_restrangs, file.buffer[k][1]);
     k++;
     for (int i = 0; i < ctx.n_restrangs; i++) {
@@ -302,13 +322,18 @@ void init_md(const char* filename) {
         restrang.ang = strtod(file.buffer[k][4], &eptr);
         restrang.k = strtod(file.buffer[k][5], &eptr);
 
-        ctx.restrangs[i] = restrang;
+        restrangs[i] = restrang;
         k++;
     }
 
+    if (ctx.run_gpu) {
+        ctx.restrangs->upload();
+    }
+
     // [wall_restraints]
     ctx.n_restrwalls = atoi(file.buffer[k][0]);
-    ctx.restrwalls.resize(ctx.n_restrwalls);
+    ctx.restrwalls = std::make_unique<HostDeviceBuffer<restrwall_t>>(ctx.n_restrwalls, true, ctx.run_gpu);
+    auto &restrwalls = ctx.restrwalls->cpu_data_p;
     printf("reading in %d wall restraints (%s in file)\n", ctx.n_restrwalls, file.buffer[k][1]);
     k++;
     for (int i = 0; i < ctx.n_restrwalls; i++) {
@@ -322,10 +347,14 @@ void init_md(const char* filename) {
         restrwall.aMorse = strtod(file.buffer[k][5], &eptr);
         restrwall.ih = strcmp(file.buffer[k][6], "1") == 0;
 
-        ctx.restrwalls[i] = restrwall;
+        restrwalls[i] = restrwall;
         k++;
     }
 
+    if (ctx.run_gpu) {
+        ctx.restrwalls->upload();
+    }
+
     clean_csv(file);
 }
 
@@ -377,7 +406,6 @@ void init_coords(const char* filename) {
     auto& ctx = Context::instance();
     csvfile_t file = read_csv(filename, 1, ctx.base_folder.c_str());
 
-    ctx.n_coords = 0;
     ctx.n_atoms = 0;
     ctx.n_atoms_solute = 0;
 
@@ -386,22 +414,28 @@ void init_coords(const char* filename) {
         return;
     }
 
-    ctx.n_coords = atoi(file.buffer[0][0]);
-    ctx.n_atoms = ctx.n_coords;
+    ctx.n_atoms = atoi(file.buffer[0][0]);
 
     ctx.n_atoms_solute = atoi(file.buffer[1][0]);
 
-    ctx.coords.resize(ctx.n_atoms);
-    ctx.coords_top.resize(ctx.n_atoms);
+    ctx.coords_init = std::make_unique<HostDeviceBuffer<coord_t>>(ctx.n_atoms, true, ctx.run_gpu);
+    ctx.coords = std::make_unique<HostDeviceBuffer<coord_t>>(ctx.n_atoms, true, ctx.run_gpu);
+    auto &coords = ctx.coords->cpu_data_p;
+    auto &coords_init = ctx.coords_init->cpu_data_p;
 
     for (int i = 0; i < file.n_lines; i++) {
         char* eptr;
 
-        ctx.coords[i].x = strtod(file.buffer[i + 2][0], &eptr);
-        ctx.coords[i].y = strtod(file.buffer[i + 2][1], &eptr);
-        ctx.coords[i].z = strtod(file.buffer[i + 2][2], &eptr);
+        coords[i].x = strtod(file.buffer[i + 2][0], &eptr);
+        coords[i].y = strtod(file.buffer[i + 2][1], &eptr);
+        coords[i].z = strtod(file.buffer[i + 2][2], &eptr);
+
+        coords_init[i] = coords[i];
+    }
 
-        ctx.coords_top[i] = ctx.coords[i];
+    if (ctx.run_gpu) {
+        ctx.coords->upload();
+        ctx.coords_init->upload();
     }
 
     clean_csv(file);
@@ -421,8 +455,8 @@ void init_bonds(const char* filename) {
 
     ctx.n_bonds = atoi(file.buffer[0][0]);
     ctx.n_bonds_solute = atoi(file.buffer[1][0]);
-
-    ctx.bonds.resize(ctx.n_bonds);
+    ctx.bonds = std::make_unique<HostDeviceBuffer<bond_t>>(ctx.n_bonds, true, ctx.run_gpu);
+    auto &bonds = ctx.bonds->cpu_data_p;
 
     for (int i = 0; i < ctx.n_bonds; i++) {
         bond_t bond;
@@ -431,7 +465,11 @@ void init_bonds(const char* filename) {
         bond.aj = atoi(file.buffer[i + 2][1]);
         bond.code = atoi(file.buffer[i + 2][2]);
 
-        ctx.bonds[i] = bond;
+        bonds[i] = bond;
+    }
+
+    if (ctx.run_gpu) {
+        ctx.bonds->upload();
     }
 
     clean_csv(file);
@@ -449,7 +487,8 @@ void init_cbonds(const char* filename) {
     }
 
     ctx.n_cbonds = atoi(file.buffer[0][0]);
-    ctx.cbonds.resize(ctx.n_cbonds);
+    ctx.cbonds = std::make_unique<HostDeviceBuffer<cbond_t>>(ctx.n_cbonds, true, ctx.run_gpu);
+    auto &cbonds = ctx.cbonds->cpu_data_p;
 
     for (int i = 0; i < ctx.n_cbonds; i++) {
         cbond_t cbond;
@@ -459,7 +498,11 @@ void init_cbonds(const char* filename) {
         cbond.kb = strtod(file.buffer[i + 1][1], &eptr);
         cbond.b0 = strtod(file.buffer[i + 1][2], &eptr);
 
-        ctx.cbonds[i] = cbond;
+        cbonds[i] = cbond;
+    }
+
+    if (ctx.run_gpu) {
+        ctx.cbonds->upload();
     }
 
     clean_csv(file);
@@ -479,8 +522,8 @@ void init_angles(const char* filename) {
 
     ctx.n_angles = atoi(file.buffer[0][0]);
     ctx.n_angles_solute = atoi(file.buffer[1][0]);
-
-    ctx.angles.resize(ctx.n_angles);
+    ctx.angles = std::make_unique<HostDeviceBuffer<angle_t>>(ctx.n_angles, true, ctx.run_gpu);
+    auto &angles = ctx.angles->cpu_data_p;
 
     for (int i = 0; i < ctx.n_angles; i++) {
         angle_t angle;
@@ -490,9 +533,14 @@ void init_angles(const char* filename) {
         angle.ak = atoi(file.buffer[i + 2][2]);
         angle.code = atoi(file.buffer[i + 2][3]);
 
-        ctx.angles[i] = angle;
+        angles[i] = angle;
     }
 
+    if (ctx.run_gpu) {
+        ctx.angles->upload();
+    }
+
+
     clean_csv(file);
 }
 
@@ -508,7 +556,8 @@ void init_cangles(const char* filename) {
     }
 
     ctx.n_cangles = atoi(file.buffer[0][0]);
-    ctx.cangles.resize(ctx.n_cangles);
+    ctx.cangles = std::make_unique<HostDeviceBuffer<cangle_t>>(ctx.n_cangles, true, ctx.run_gpu);
+    auto &cangles = ctx.cangles->cpu_data_p;
 
     for (int i = 0; i < ctx.n_cangles; i++) {
         cangle_t cangle;
@@ -518,7 +567,11 @@ void init_cangles(const char* filename) {
         cangle.kth = strtod(file.buffer[i + 1][1], &eptr);
         cangle.th0 = strtod(file.buffer[i + 1][2], &eptr);
 
-        ctx.cangles[i] = cangle;
+        cangles[i] = cangle;
+    }
+
+    if (ctx.run_gpu) {
+        ctx.cangles->upload();
     }
 
     clean_csv(file);
@@ -526,9 +579,10 @@ void init_cangles(const char* filename) {
 
 void init_excluded(const char* filename) {
     auto& ctx = Context::instance();
-    ctx.excluded = std::make_unique<bool[]>(ctx.n_atoms);
+    ctx.excluded = std::make_unique<HostDeviceBuffer<bool>>(ctx.n_atoms, true, ctx.run_gpu);
+    auto *excluded = ctx.excluded->cpu_data_p;
     for (int i = 0; i < ctx.n_atoms; ++i) {
-        ctx.excluded[i] = false;
+        excluded[i] = false;
     }
     ctx.n_excluded = 0;
 
@@ -562,10 +616,14 @@ void init_excluded(const char* filename) {
     ctx.n_excluded = 0;
     for (int i = 0; i < ctx.n_atoms && i < read_len; i++) {
         bool excl = (line[i] == '1');
-        ctx.excluded[i] = excl;
+        excluded[i] = excl;
         if (excl) ctx.n_excluded++;
     }
 
+    if (ctx.run_gpu) {
+        ctx.excluded->upload();
+    }
+
     printf("Number of excluded atoms: %d\n", ctx.n_excluded);
 
     free(line);
@@ -586,8 +644,8 @@ void init_torsions(const char* filename) {
 
     ctx.n_torsions = atoi(file.buffer[0][0]);
     ctx.n_torsions_solute = atoi(file.buffer[1][0]);
-
-    ctx.torsions.resize(ctx.n_torsions);
+    ctx.torsions = std::make_unique<HostDeviceBuffer<torsion_t>>(ctx.n_torsions, true, ctx.run_gpu);
+    auto &torsions = ctx.torsions->cpu_data_p;
 
     for (int i = 0; i < ctx.n_torsions; i++) {
         torsion_t torsion;
@@ -598,7 +656,11 @@ void init_torsions(const char* filename) {
         torsion.al = atoi(file.buffer[i + 2][3]);
         torsion.code = atoi(file.buffer[i + 2][4]);
 
-        ctx.torsions[i] = torsion;
+        torsions[i] = torsion;
+    }
+
+    if (ctx.run_gpu) {
+        ctx.torsions->upload();
     }
 
     clean_csv(file);
@@ -616,8 +678,8 @@ void init_ctorsions(const char* filename) {
     }
 
     ctx.n_ctorsions = atoi(file.buffer[0][0]);
-
-    ctx.ctorsions.resize(ctx.n_ctorsions);
+    ctx.ctorsions = std::make_unique<HostDeviceBuffer<ctorsion_t>>(ctx.n_ctorsions, true, ctx.run_gpu);
+    auto &ctorsions = ctx.ctorsions->cpu_data_p;
 
     for (int i = 0; i < ctx.n_ctorsions; i++) {
         ctorsion_t ctorsion;
@@ -630,7 +692,11 @@ void init_ctorsions(const char* filename) {
         ctorsion.paths = strtod(file.buffer[i + 1][4], &eptr);
         ctorsion.paths = 1.0 / (ctorsion.paths);
 
-        ctx.ctorsions[i] = ctorsion;
+        ctorsions[i] = ctorsion;
+    }
+
+    if (ctx.run_gpu) {
+        ctx.ctorsions->upload();
     }
 
     clean_csv(file);
@@ -650,8 +716,8 @@ void init_impropers(const char* filename) {
 
     ctx.n_impropers = atoi(file.buffer[0][0]);
     ctx.n_impropers_solute = atoi(file.buffer[1][0]);
-
-    ctx.impropers.resize(ctx.n_impropers);
+    ctx.impropers = std::make_unique<HostDeviceBuffer<improper_t>>(ctx.n_impropers, true, ctx.run_gpu);
+    auto &impropers = ctx.impropers->cpu_data_p;
 
     for (int i = 0; i < ctx.n_impropers; i++) {
         improper_t improper;
@@ -662,7 +728,11 @@ void init_impropers(const char* filename) {
         improper.al = atoi(file.buffer[i + 2][3]);
         improper.code = atoi(file.buffer[i + 2][4]);
 
-        ctx.impropers[i] = improper;
+        impropers[i] = improper;
+    }
+
+    if (ctx.run_gpu) {
+        ctx.impropers->upload();
     }
 
     clean_csv(file);
@@ -680,8 +750,8 @@ void init_cimpropers(const char* filename) {
     }
 
     ctx.n_cimpropers = atoi(file.buffer[0][0]);
-
-    ctx.cimpropers.resize(ctx.n_cimpropers);
+    ctx.cimpropers = std::make_unique<HostDeviceBuffer<cimproper_t>>(ctx.n_cimpropers, true, ctx.run_gpu);
+    auto &cimpropers = ctx.cimpropers->cpu_data_p;
 
     for (int i = 0; i < ctx.n_cimpropers; i++) {
         cimproper_t cimproper;
@@ -691,7 +761,11 @@ void init_cimpropers(const char* filename) {
         cimproper.k = strtod(file.buffer[i + 1][1], &eptr);
         cimproper.phi0 = strtod(file.buffer[i + 1][2], &eptr);
 
-        ctx.cimpropers[i] = cimproper;
+        cimpropers[i] = cimproper;
+    }
+
+    if (ctx.run_gpu) {
+        ctx.cimpropers->upload();
     }
 
     clean_csv(file);
@@ -709,8 +783,8 @@ void init_charges(const char* filename) {
     }
 
     ctx.n_charges = atoi(file.buffer[0][0]);
-
-    ctx.charges.resize(ctx.n_charges);
+    ctx.charges = std::make_unique<HostDeviceBuffer<charge_t>>(ctx.n_charges, true, ctx.run_gpu);
+    auto &charges = ctx.charges->cpu_data_p;
 
     for (int i = 0; i < ctx.n_charges; i++) {
         charge_t charge;
@@ -718,7 +792,11 @@ void init_charges(const char* filename) {
         charge.a = atoi(file.buffer[i + 1][0]);
         charge.code = atoi(file.buffer[i + 1][1]);
 
-        ctx.charges[i] = charge;
+        charges[i] = charge;
+    }
+
+    if (ctx.run_gpu) {
+        ctx.charges->upload();
     }
 
     clean_csv(file);
@@ -736,8 +814,8 @@ void init_ccharges(const char* filename) {
     }
 
     ctx.n_ccharges = atoi(file.buffer[0][0]);
-
-    ctx.ccharges.resize(ctx.n_ccharges);
+    ctx.ccharges = std::make_unique<HostDeviceBuffer<ccharge_t>>(ctx.n_ccharges, true, ctx.run_gpu);
+    auto &ccharges = ctx.ccharges->cpu_data_p;
 
     for (int i = 0; i < ctx.n_ccharges; i++) {
         ccharge_t ccharge;
@@ -746,7 +824,11 @@ void init_ccharges(const char* filename) {
         ccharge.code = atoi(file.buffer[i + 1][0]);
         ccharge.charge = strtod(file.buffer[i + 1][1], &eptr);
 
-        ctx.ccharges[i] = ccharge;
+        ccharges[i] = ccharge;
+    }
+
+    if (ctx.run_gpu) {
+        ctx.ccharges->upload();
     }
 
     clean_csv(file);
@@ -754,6 +836,7 @@ void init_ccharges(const char* filename) {
 
 void init_ngbrs14(const char* filename) {
     auto& ctx = Context::instance();
+    auto &LJ_matrix = ctx.LJ_matrix->cpu_data_p;
     FILE* fp;
 
     char path[1024];
@@ -787,10 +870,10 @@ void init_ngbrs14(const char* filename) {
                 int ix = lineI;
                 int jx = (lineI + i + 1) % lines;
                 // if (ix < 100 && jx < 100) printf("i = %d j = %d\n", ix+1, jx+1);
-                ctx.LJ_matrix[ix * ctx.n_atoms_solute + jx] = 1;
-                ctx.LJ_matrix[jx * ctx.n_atoms_solute + ix] = 1;
+                LJ_matrix[ix * ctx.n_atoms_solute + jx] = 1;
+                LJ_matrix[jx * ctx.n_atoms_solute + ix] = 1;
 
-                ctx.ngbrs_14.push_back({ix, jx, NONBONDED_14_PP}); // the type may is wrong, just set in here
+                ctx.ngbrs_14_builder.push_back({ix, jx, NONBONDED_14_PP}); // the type may is wrong, just set in here
             }
         }
         lineI++;
@@ -801,6 +884,7 @@ void init_ngbrs14(const char* filename) {
 
 void init_ngbrs23(const char* filename) {
     auto& ctx = Context::instance();
+    auto &LJ_matrix = ctx.LJ_matrix->cpu_data_p;
     FILE* fp;
 
     char path[1024];
@@ -834,8 +918,8 @@ void init_ngbrs23(const char* filename) {
                 int ix = lineI;
                 int jx = (lineI + i + 1) % lines;
                 // if (ix < 100 && jx < 100) printf("i = %d j = %d\n", ix+1, jx+1);
-                ctx.LJ_matrix[ix * ctx.n_atoms_solute + jx] = 3;
-                ctx.LJ_matrix[jx * ctx.n_atoms_solute + ix] = 3;
+                LJ_matrix[ix * ctx.n_atoms_solute + jx] = 3;
+                LJ_matrix[jx * ctx.n_atoms_solute + ix] = 3;
             }
         }
         lineI++;
@@ -846,6 +930,7 @@ void init_ngbrs23(const char* filename) {
 
 void init_ngbrs14_long(const char* filename) {
     auto& ctx = Context::instance();
+    auto &LJ_matrix = ctx.LJ_matrix->cpu_data_p;
     csvfile_t file = read_csv(filename, 0, ctx.base_folder.c_str());
 
     if (file.n_lines < 1) {
@@ -858,13 +943,13 @@ void init_ngbrs14_long(const char* filename) {
     for (int i = 0; i < n_ngbrs14_long; i++) {
         int ix = atoi(file.buffer[i + 1][0]) - 1;
         int jx = atoi(file.buffer[i + 1][1]) - 1;
-        ctx.LJ_matrix[ix * ctx.n_atoms_solute + jx] = 1;
-        ctx.LJ_matrix[jx * ctx.n_atoms_solute + ix] = 1;
+        LJ_matrix[ix * ctx.n_atoms_solute + jx] = 1;
+        LJ_matrix[jx * ctx.n_atoms_solute + ix] = 1;
 
 
         int mi_x = std::min(ix, jx);
         int mx_x = std::max(ix, jx);
-        ctx.ngbrs_14.push_back({mi_x, mx_x, NONBONDED_14_PP}); // the type may is wrong, just set in here
+        ctx.ngbrs_14_builder.push_back({mi_x, mx_x, NONBONDED_14_PP}); // the type may is wrong, just set in here
 
     }
 
@@ -873,6 +958,7 @@ void init_ngbrs14_long(const char* filename) {
 
 void init_ngbrs23_long(const char* filename) {
     auto& ctx = Context::instance();
+    auto &LJ_matrix = ctx.LJ_matrix->cpu_data_p;
     csvfile_t file = read_csv(filename, 0, ctx.base_folder.c_str());
 
     if (file.n_lines < 1) {
@@ -885,8 +971,8 @@ void init_ngbrs23_long(const char* filename) {
     for (int i = 0; i < n_ngbrs23_long; i++) {
         int ix = atoi(file.buffer[i + 1][0]) - 1;
         int jx = atoi(file.buffer[i + 1][1]) - 1;
-        ctx.LJ_matrix[ix * ctx.n_atoms_solute + jx] = 3;
-        ctx.LJ_matrix[jx * ctx.n_atoms_solute + ix] = 3;
+        LJ_matrix[ix * ctx.n_atoms_solute + jx] = 3;
+        LJ_matrix[jx * ctx.n_atoms_solute + ix] = 3;
     }
 
     clean_csv(file);
@@ -894,7 +980,7 @@ void init_ngbrs23_long(const char* filename) {
 
 void init_LJ_matrix() {
     auto& ctx = Context::instance();
-    ctx.LJ_matrix.assign(ctx.n_atoms_solute * ctx.n_atoms_solute, 0);
+    ctx.LJ_matrix = std::make_unique<HostDeviceBuffer<int>>(ctx.n_atoms_solute * ctx.n_atoms_solute, true, ctx.run_gpu);
 }
 
 void init_catypes(const char* filename) {
@@ -909,7 +995,8 @@ void init_catypes(const char* filename) {
     }
 
     ctx.n_catypes = atoi(file.buffer[0][0]);
-    ctx.catypes.resize(ctx.n_catypes);
+    ctx.catypes = std::make_unique<HostDeviceBuffer<catype_t>>(ctx.n_catypes, true, ctx.run_gpu);
+    auto &catypes = ctx.catypes->cpu_data_p;
 
     for (int i = 0; i < ctx.n_catypes; i++) {
         catype_t catype;
@@ -925,21 +1012,25 @@ void init_catypes(const char* filename) {
         catype.aii_1_4 = strtod(file.buffer[i + 1][6], &eptr);
         catype.bii_1_4 = strtod(file.buffer[i + 1][7], &eptr);
 
-        ctx.catypes[i] = catype;
+        catypes[i] = catype;
     }
 
     // Preprocess bii parameters for arithmetic rule: convert ε to √ε
     // This matches Fortran md.f90:14747-14752 preprocessing
     if (ctx.topo.vdw_rule == VDW_ARITHMETIC) {
         for (int i = 0; i < ctx.n_catypes; i++) {
-            ctx.catypes[i].bii_normal = sqrt(fabs(ctx.catypes[i].bii_normal));
-            ctx.catypes[i].bii_1_4 = sqrt(fabs(ctx.catypes[i].bii_1_4));
+            catypes[i].bii_normal = sqrt(fabs(catypes[i].bii_normal));
+            catypes[i].bii_1_4 = sqrt(fabs(catypes[i].bii_1_4));
         }
 #ifdef VERBOSE
         printf("Preprocessed catypes bii parameters for arithmetic vdW rule\n");
 #endif
     }
 
+    if (ctx.run_gpu) {
+        ctx.catypes->upload();
+    }
+
     clean_csv(file);
 }
 
@@ -956,14 +1047,19 @@ void init_atypes(const char* filename) {
 
     ctx.n_atypes = atoi(file.buffer[0][0]);
 
-    ctx.atypes.resize(ctx.n_atypes);
+    ctx.atypes = std::make_unique<HostDeviceBuffer<atype_t>>(ctx.n_atypes, true, ctx.run_gpu);
+    auto &atypes = ctx.atypes->cpu_data_p;
     for (int i = 0; i < ctx.n_atypes; i++) {
         atype_t atype;
 
         atype.a = atoi(file.buffer[i + 1][0]);
         atype.code = atoi(file.buffer[i + 1][1]);
 
-        ctx.atypes[i] = atype;
+        atypes[i] = atype;
+    }
+
+    if (ctx.run_gpu) {
+        ctx.atypes->upload();
     }
 
     clean_csv(file);
@@ -1398,22 +1494,29 @@ void init_qelscales(const char* filename) {
     csvfile_t file = read_csv(filename, 0, ctx.base_folder.c_str());
 
     if (file.n_lines < 1) {
+        ctx.n_qelscales = 0;
+        ctx.q_elscales = std::make_unique<HostDeviceBuffer<q_elscale_t>>(0, true, ctx.run_gpu);
         clean_csv(file);
         return;
     }
 
     ctx.n_qelscales = atoi(file.buffer[0][0]) / ctx.n_lambdas;
-    ctx.q_elscales.resize(ctx.n_qelscales * ctx.n_lambdas);
+    ctx.q_elscales = std::make_unique<HostDeviceBuffer<q_elscale_t>>(ctx.n_qelscales * ctx.n_lambdas, true, ctx.run_gpu);
+    auto *q_elscales = ctx.q_elscales->cpu_data_p;
 
     for (int i = 0; i < ctx.n_qelscales; i++) {
         for (int j = 0; j < ctx.n_lambdas; j++) {
             char* eptr;
-            ctx.q_elscales[i + j * ctx.n_qelscales].qi = atoi(file.buffer[i + j * ctx.n_qelscales + 1][0]);
-            ctx.q_elscales[i + j * ctx.n_qelscales].qj = atoi(file.buffer[i + j * ctx.n_qelscales + 1][1]);
-            ctx.q_elscales[i + j * ctx.n_qelscales].mu = strtod(file.buffer[i + j * ctx.n_qelscales + 1][2], &eptr);
+            q_elscales[i + j * ctx.n_qelscales].qi = atoi(file.buffer[i + j * ctx.n_qelscales + 1][0]);
+            q_elscales[i + j * ctx.n_qelscales].qj = atoi(file.buffer[i + j * ctx.n_qelscales + 1][1]);
+            q_elscales[i + j * ctx.n_qelscales].mu = strtod(file.buffer[i + j * ctx.n_qelscales + 1][2], &eptr);
         }
     }
 
+    if (ctx.run_gpu) {
+        ctx.q_elscales->upload();
+    }
+
     clean_csv(file);
 }
 
@@ -1552,11 +1655,12 @@ void init_icoords(const char* filename) {
         return;
     }
 
+    auto &coords = ctx.coords->cpu_data_p;
     for (int i = 0; i < ctx.n_atoms; i++) {
         char* eptr;
-        ctx.coords[i].x = strtod(file.buffer[i + 1][0], &eptr);
-        ctx.coords[i].y = strtod(file.buffer[i + 1][1], &eptr);
-        ctx.coords[i].z = strtod(file.buffer[i + 1][2], &eptr);
+        coords[i].x = strtod(file.buffer[i + 1][0], &eptr);
+        coords[i].y = strtod(file.buffer[i + 1][1], &eptr);
+        coords[i].z = strtod(file.buffer[i + 1][2], &eptr);
     }
 
     clean_csv(file);
@@ -1571,11 +1675,12 @@ void init_ivelocities(const char* filename) {
         return;
     }
 
+    auto &velocities = ctx.velocities->cpu_data_p;
     for (int i = 0; i < ctx.n_atoms; i++) {
         char* eptr;
-        ctx.velocities[i].x = strtod(file.buffer[i + 1][0], &eptr);
-        ctx.velocities[i].y = strtod(file.buffer[i + 1][1], &eptr);
-        ctx.velocities[i].z = strtod(file.buffer[i + 1][2], &eptr);
+        velocities[i].x = strtod(file.buffer[i + 1][0], &eptr);
+        velocities[i].y = strtod(file.buffer[i + 1][1], &eptr);
+        velocities[i].z = strtod(file.buffer[i + 1][2], &eptr);
     }
 
     clean_csv(file);
diff --git a/src/core/cpu/src/cpu_angle_force.cpp b/src/core/cpu/src/cpu_angle_force.cpp
index cd49f36c..a9c29c1e 100644
--- a/src/core/cpu/src/cpu_angle_force.cpp
+++ b/src/core/cpu/src/cpu_angle_force.cpp
@@ -7,6 +7,8 @@
 
 double calc_angle_forces(int start, int end) {
     auto& ctx = Context::instance();
+    auto &coords = ctx.coords->cpu_data_p;
+    auto &dvelocities = ctx.dvelocities->cpu_data_p;
     int aii, aji, aki;
 
     coord_t ai, aj, ak;
@@ -19,15 +21,18 @@ double calc_angle_forces(int start, int end) {
     double ener;
     double angle = 0;
 
+    auto &angles = ctx.angles->cpu_data_p;
+    auto &cangles = ctx.cangles->cpu_data_p;
+
     for (int i = start; i < end; i++) {
-        aii = ctx.angles[i].ai - 1;
-        aji = ctx.angles[i].aj - 1;
-        aki = ctx.angles[i].ak - 1;
-        ai = ctx.coords[aii];
-        aj = ctx.coords[aji];
-        ak = ctx.coords[aki];
+        aii = angles[i].ai - 1;
+        aji = angles[i].aj - 1;
+        aki = angles[i].ak - 1;
+        ai = coords[aii];
+        aj = coords[aji];
+        ak = coords[aki];
 
-        cangle = ctx.cangles[ctx.angles[i].code - 1];
+        cangle = cangles[angles[i].code - 1];
 
         rji.x = ai.x - aj.x;
         rji.y = ai.y - aj.y;
@@ -78,17 +83,17 @@ double calc_angle_forces(int start, int end) {
         dk.y = f1 * (rji.y * bjiinv * bjkinv - cos_th * rjk.y * bjk2inv);
         dk.z = f1 * (rji.z * bjiinv * bjkinv - cos_th * rjk.z * bjk2inv);
 
-        ctx.dvelocities[aii].x += dv * di.x;
-        ctx.dvelocities[aii].y += dv * di.y;
-        ctx.dvelocities[aii].z += dv * di.z;
+        dvelocities[aii].x += dv * di.x;
+        dvelocities[aii].y += dv * di.y;
+        dvelocities[aii].z += dv * di.z;
 
-        ctx.dvelocities[aki].x += dv * dk.x;
-        ctx.dvelocities[aki].y += dv * dk.y;
-        ctx.dvelocities[aki].z += dv * dk.z;
+        dvelocities[aki].x += dv * dk.x;
+        dvelocities[aki].y += dv * dk.y;
+        dvelocities[aki].z += dv * dk.z;
 
-        ctx.dvelocities[aji].x -= dv * (di.x + dk.x);
-        ctx.dvelocities[aji].y -= dv * (di.y + dk.y);
-        ctx.dvelocities[aji].z -= dv * (di.z + dk.z);
+        dvelocities[aji].x -= dv * (di.x + dk.x);
+        dvelocities[aji].y -= dv * (di.y + dk.y);
+        dvelocities[aji].z -= dv * (di.z + dk.z);
 
         // printf("ANGLE ener = %f\n", ener);
     }
diff --git a/src/core/cpu/src/cpu_bond_force.cpp b/src/core/cpu/src/cpu_bond_force.cpp
index 38ad3862..2a539f90 100644
--- a/src/core/cpu/src/cpu_bond_force.cpp
+++ b/src/core/cpu/src/cpu_bond_force.cpp
@@ -6,6 +6,10 @@
 
 double calc_bond_forces(int start, int end) {
     auto& ctx = Context::instance();
+    auto &bonds = ctx.bonds->cpu_data_p;
+    auto &cbonds = ctx.cbonds->cpu_data_p;
+    auto &coords = ctx.coords->cpu_data_p;
+    auto &dvelocities = ctx.dvelocities->cpu_data_p;
     int aii, aji;
     coord_t ai, aj, dx;
     cbond_t cbond;
@@ -13,12 +17,12 @@ double calc_bond_forces(int start, int end) {
     double bond = 0;
 
     for (int i = start; i < end; i++) {
-        aii = ctx.bonds[i].ai - 1;
-        aji = ctx.bonds[i].aj - 1;
-        ai = ctx.coords[aii];
-        aj = ctx.coords[aji];
+        aii = bonds[i].ai - 1;
+        aji = bonds[i].aj - 1;
+        ai = coords[aii];
+        aj = coords[aji];
 
-        cbond = ctx.cbonds[ctx.bonds[i].code - 1];
+        cbond = cbonds[bonds[i].code - 1];
 
         // Calculate distance vector, norm of distance vector
         dx.x = aj.x - ai.x;
@@ -36,13 +40,13 @@ double calc_bond_forces(int start, int end) {
         // Update forces
         ampl = cbond.kb * ddx / dx1;
 
-        ctx.dvelocities[aji].x += ampl * dx.x;
-        ctx.dvelocities[aji].y += ampl * dx.y;
-        ctx.dvelocities[aji].z += ampl * dx.z;
+        dvelocities[aji].x += ampl * dx.x;
+        dvelocities[aji].y += ampl * dx.y;
+        dvelocities[aji].z += ampl * dx.z;
 
-        ctx.dvelocities[aii].x -= ampl * dx.x;
-        ctx.dvelocities[aii].y -= ampl * dx.y;
-        ctx.dvelocities[aii].z -= ampl * dx.z;
+        dvelocities[aii].x -= ampl * dx.x;
+        dvelocities[aii].y -= ampl * dx.y;
+        dvelocities[aii].z -= ampl * dx.z;
     }
 
     return bond;
diff --git a/src/core/cpu/src/cpu_improper2_force.cpp b/src/core/cpu/src/cpu_improper2_force.cpp
index c632ec9c..af73a9cc 100644
--- a/src/core/cpu/src/cpu_improper2_force.cpp
+++ b/src/core/cpu/src/cpu_improper2_force.cpp
@@ -7,6 +7,10 @@
 
 double calc_improper2_forces(int start, int end) {
     auto& ctx = Context::instance();
+    auto &impropers = ctx.impropers->cpu_data_p;
+    auto &cimpropers = ctx.cimpropers->cpu_data_p;
+    auto &coords = ctx.coords->cpu_data_p;
+    auto &dvelocities = ctx.dvelocities->cpu_data_p;
     int aii, aji, aki, ali;
 
     coord_t ai, aj, ak, al;
@@ -20,18 +24,18 @@ double calc_improper2_forces(int start, int end) {
     double improper = 0;
 
     for (int i = start; i < end; i++) {
-        imp = ctx.impropers[i];
-        cimp = ctx.cimpropers[imp.code - 1];
+        imp = impropers[i];
+        cimp = cimpropers[imp.code - 1];
 
         aii = imp.ai - 1;
         aji = imp.aj - 1;
         aki = imp.ak - 1;
         ali = imp.al - 1;
 
-        ai = ctx.coords[aii];
-        aj = ctx.coords[aji];
-        ak = ctx.coords[aki];
-        al = ctx.coords[ali];
+        ai = coords[aii];
+        aj = coords[aji];
+        ak = coords[aki];
+        al = coords[ali];
 
         rji.x = ai.x - aj.x;
         rji.y = ai.y - aj.y;
@@ -110,21 +114,21 @@ double calc_improper2_forces(int start, int end) {
         // Update energy and forces
         improper += ener;
 
-        ctx.dvelocities[aii].x += dv * dpi.x;
-        ctx.dvelocities[aii].y += dv * dpi.y;
-        ctx.dvelocities[aii].z += dv * dpi.z;
+        dvelocities[aii].x += dv * dpi.x;
+        dvelocities[aii].y += dv * dpi.y;
+        dvelocities[aii].z += dv * dpi.z;
 
-        ctx.dvelocities[aji].x += dv * dpj.x;
-        ctx.dvelocities[aji].y += dv * dpj.y;
-        ctx.dvelocities[aji].z += dv * dpj.z;
+        dvelocities[aji].x += dv * dpj.x;
+        dvelocities[aji].y += dv * dpj.y;
+        dvelocities[aji].z += dv * dpj.z;
 
-        ctx.dvelocities[aki].x += dv * dpk.x;
-        ctx.dvelocities[aki].y += dv * dpk.y;
-        ctx.dvelocities[aki].z += dv * dpk.z;
+        dvelocities[aki].x += dv * dpk.x;
+        dvelocities[aki].y += dv * dpk.y;
+        dvelocities[aki].z += dv * dpk.z;
 
-        ctx.dvelocities[ali].x += dv * dpl.x;
-        ctx.dvelocities[ali].y += dv * dpl.y;
-        ctx.dvelocities[ali].z += dv * dpl.z;
+        dvelocities[ali].x += dv * dpl.x;
+        dvelocities[ali].y += dv * dpl.y;
+        dvelocities[ali].z += dv * dpl.z;
     }
 
     return improper;
diff --git a/src/core/cpu/src/cpu_leapfrog.cpp b/src/core/cpu/src/cpu_leapfrog.cpp
index 31b96bb7..9d1ff43a 100644
--- a/src/core/cpu/src/cpu_leapfrog.cpp
+++ b/src/core/cpu/src/cpu_leapfrog.cpp
@@ -5,49 +5,55 @@
 
 void calc_leapfrog() {
     auto& ctx = Context::instance();
+    auto &atypes = ctx.atypes->cpu_data_p;
+    auto &catypes = ctx.catypes->cpu_data_p;
+    auto &coords = ctx.coords->cpu_data_p;
+    auto &velocities = ctx.velocities->cpu_data_p;
+    auto &dvelocities = ctx.dvelocities->cpu_data_p;
+    auto *xcoords = ctx.xcoords->cpu_data_p;
     double mass_i;
     double winv_i;
 
     for (int i = 0; i < ctx.n_atoms_solute; i++) {
-        mass_i = ctx.catypes[ctx.atypes[i].code - 1].m;
+        mass_i = catypes[atypes[i].code - 1].m;
         winv_i = 1 / mass_i;
 
-        ctx.velocities[i].x = (ctx.velocities[i].x - ctx.dvelocities[i].x * ctx.dt * winv_i) * ctx.Tscale_solute;
-        ctx.velocities[i].y = (ctx.velocities[i].y - ctx.dvelocities[i].y * ctx.dt * winv_i) * ctx.Tscale_solute;
-        ctx.velocities[i].z = (ctx.velocities[i].z - ctx.dvelocities[i].z * ctx.dt * winv_i) * ctx.Tscale_solute;
+        velocities[i].x = (velocities[i].x - dvelocities[i].x * ctx.dt * winv_i) * ctx.Tscale_solute;
+        velocities[i].y = (velocities[i].y - dvelocities[i].y * ctx.dt * winv_i) * ctx.Tscale_solute;
+        velocities[i].z = (velocities[i].z - dvelocities[i].z * ctx.dt * winv_i) * ctx.Tscale_solute;
 
-        ctx.xcoords[i].x = ctx.coords[i].x;
-        ctx.xcoords[i].y = ctx.coords[i].y;
-        ctx.xcoords[i].z = ctx.coords[i].z;
+        xcoords[i].x = coords[i].x;
+        xcoords[i].y = coords[i].y;
+        xcoords[i].z = coords[i].z;
 
-        ctx.coords[i].x += ctx.velocities[i].x * ctx.dt;
-        ctx.coords[i].y += ctx.velocities[i].y * ctx.dt;
-        ctx.coords[i].z += ctx.velocities[i].z * ctx.dt;
+        coords[i].x += velocities[i].x * ctx.dt;
+        coords[i].y += velocities[i].y * ctx.dt;
+        coords[i].z += velocities[i].z * ctx.dt;
     }
 
     for (int i = ctx.n_atoms_solute; i < ctx.n_atoms; i++) {
-        mass_i = ctx.catypes[ctx.atypes[i].code - 1].m;
+        mass_i = catypes[atypes[i].code - 1].m;
         winv_i = 1 / mass_i;
 
-        ctx.velocities[i].x = (ctx.velocities[i].x - ctx.dvelocities[i].x * ctx.dt * winv_i) * ctx.Tscale_solvent;
-        ctx.velocities[i].y = (ctx.velocities[i].y - ctx.dvelocities[i].y * ctx.dt * winv_i) * ctx.Tscale_solvent;
-        ctx.velocities[i].z = (ctx.velocities[i].z - ctx.dvelocities[i].z * ctx.dt * winv_i) * ctx.Tscale_solvent;
+        velocities[i].x = (velocities[i].x - dvelocities[i].x * ctx.dt * winv_i) * ctx.Tscale_solvent;
+        velocities[i].y = (velocities[i].y - dvelocities[i].y * ctx.dt * winv_i) * ctx.Tscale_solvent;
+        velocities[i].z = (velocities[i].z - dvelocities[i].z * ctx.dt * winv_i) * ctx.Tscale_solvent;
 
-        ctx.xcoords[i].x = ctx.coords[i].x;
-        ctx.xcoords[i].y = ctx.coords[i].y;
-        ctx.xcoords[i].z = ctx.coords[i].z;
+        xcoords[i].x = coords[i].x;
+        xcoords[i].y = coords[i].y;
+        xcoords[i].z = coords[i].z;
 
-        ctx.coords[i].x += ctx.velocities[i].x * ctx.dt;
-        ctx.coords[i].y += ctx.velocities[i].y * ctx.dt;
-        ctx.coords[i].z += ctx.velocities[i].z * ctx.dt;
+        coords[i].x += velocities[i].x * ctx.dt;
+        coords[i].y += velocities[i].y * ctx.dt;
+        coords[i].z += velocities[i].z * ctx.dt;
     }
 
     if (ctx.n_shake_constraints > 0) {
-        calc_shake_constraints(ctx.coords.data(), ctx.xcoords.data());
+        calc_shake_constraints(coords, xcoords);
         for (int i = 0; i < ctx.n_atoms; i++) {
-            ctx.velocities[i].x = (ctx.coords[i].x - ctx.xcoords[i].x) / ctx.dt;
-            ctx.velocities[i].y = (ctx.coords[i].y - ctx.xcoords[i].y) / ctx.dt;
-            ctx.velocities[i].z = (ctx.coords[i].z - ctx.xcoords[i].z) / ctx.dt;
+            velocities[i].x = (coords[i].x - xcoords[i].x) / ctx.dt;
+            velocities[i].y = (coords[i].y - xcoords[i].y) / ctx.dt;
+            velocities[i].z = (coords[i].z - xcoords[i].z) / ctx.dt;
         }
     }
 }
diff --git a/src/core/cpu/src/cpu_nonbonded_pp_force.cpp b/src/core/cpu/src/cpu_nonbonded_pp_force.cpp
index 302586f0..ce744ad0 100644
--- a/src/core/cpu/src/cpu_nonbonded_pp_force.cpp
+++ b/src/core/cpu/src/cpu_nonbonded_pp_force.cpp
@@ -8,6 +8,10 @@
 
 void calc_nonbonded_pp_forces() {
     auto& ctx = Context::instance();
+    auto &coords = ctx.coords->cpu_data_p;
+    auto &dvelocities = ctx.dvelocities->cpu_data_p;
+    auto &LJ_matrix = ctx.LJ_matrix->cpu_data_p;
+    auto *excluded = ctx.excluded->cpu_data_p;
     bool bond14, bond23;
     double scaling;
     coord_t da;
@@ -21,11 +25,11 @@ void calc_nonbonded_pp_forces() {
         for (int pj = pi + 1; pj < ctx.n_patoms; pj++) {
             i = ctx.p_atoms[pi];
             j = ctx.p_atoms[pj];
-            bond23 = ctx.LJ_matrix[i * ctx.n_atoms_solute + j] == 3;
-            bond14 = ctx.LJ_matrix[i * ctx.n_atoms_solute + j] == 1;
+            bond23 = LJ_matrix[i * ctx.n_atoms_solute + j] == 3;
+            bond14 = LJ_matrix[i * ctx.n_atoms_solute + j] == 1;
 
             if (bond23) continue;
-            if (ctx.excluded[i] || ctx.excluded[j]) continue;
+            if (excluded[i] || excluded[j]) continue;
 
             scaling = bond14 ? ctx.topo.el14_scale : 1;
 
@@ -35,9 +39,9 @@ void calc_nonbonded_pp_forces() {
             const catype_t& ai_type = ctx.unified_catype(i, 0);
             const catype_t& aj_type = ctx.unified_catype(j, 0);
 
-            da.x = ctx.coords[j].x - ctx.coords[i].x;
-            da.y = ctx.coords[j].y - ctx.coords[i].y;
-            da.z = ctx.coords[j].z - ctx.coords[i].z;
+            da.x = coords[j].x - coords[i].x;
+            da.y = coords[j].y - coords[i].y;
+            da.z = coords[j].z - coords[i].z;
             r2a = 1 / (std::pow(da.x, 2) + std::pow(da.y, 2) + std::pow(da.z, 2));
             ra = sqrt(r2a);
             r6a = r2a * r2a * r2a;
@@ -56,13 +60,13 @@ void calc_nonbonded_pp_forces() {
             }
             dva = r2a * (-Vela - 12 * V_a + 6 * V_b);
 
-            ctx.dvelocities[i].x -= dva * da.x;
-            ctx.dvelocities[i].y -= dva * da.y;
-            ctx.dvelocities[i].z -= dva * da.z;
+            dvelocities[i].x -= dva * da.x;
+            dvelocities[i].y -= dva * da.y;
+            dvelocities[i].z -= dva * da.z;
 
-            ctx.dvelocities[j].x += dva * da.x;
-            ctx.dvelocities[j].y += dva * da.y;
-            ctx.dvelocities[j].z += dva * da.z;
+            dvelocities[j].x += dva * da.x;
+            dvelocities[j].y += dva * da.y;
+            dvelocities[j].z += dva * da.z;
 
             ctx.E_nonbond_pp.Ucoul += Vela;
             ctx.E_nonbond_pp.Uvdw += (V_a - V_b);
diff --git a/src/core/cpu/src/cpu_nonbonded_pw_force.cpp b/src/core/cpu/src/cpu_nonbonded_pw_force.cpp
index 881f7d54..6bf2c27e 100644
--- a/src/core/cpu/src/cpu_nonbonded_pw_force.cpp
+++ b/src/core/cpu/src/cpu_nonbonded_pw_force.cpp
@@ -8,6 +8,9 @@
 
 void calc_nonbonded_pw_forces() {
     auto& ctx = Context::instance();
+    auto &coords = ctx.coords->cpu_data_p;
+    auto &dvelocities = ctx.dvelocities->cpu_data_p;
+    auto *excluded = ctx.excluded->cpu_data_p;
     if (ctx.n_waters == 0 || ctx.n_patoms == 0) {
         return;
     }
@@ -15,7 +18,7 @@ void calc_nonbonded_pw_forces() {
     for (int pi = 0; pi < ctx.n_patoms; ++pi) {
         const int atom_i = ctx.p_atoms[pi];
         for (int atom_j = ctx.n_atoms_solute; atom_j < ctx.n_atoms; ++atom_j) {
-            if (ctx.excluded[atom_i] || ctx.excluded[atom_j]) {
+            if (excluded[atom_i] || excluded[atom_j]) {
                 continue;
             }
 
@@ -27,9 +30,9 @@ void calc_nonbonded_pw_forces() {
 
             double v_a = 0.0;
             double v_b = 0.0;
-            const double dx = ctx.coords[atom_j].x - ctx.coords[atom_i].x;
-            const double dy = ctx.coords[atom_j].y - ctx.coords[atom_i].y;
-            const double dz = ctx.coords[atom_j].z - ctx.coords[atom_i].z;
+            const double dx = coords[atom_j].x - coords[atom_i].x;
+            const double dy = coords[atom_j].y - coords[atom_i].y;
+            const double dz = coords[atom_j].z - coords[atom_i].z;
             const double r2inv = 1.0 / (dx * dx + dy * dy + dz * dz);
             const double rinv = std::sqrt(r2inv);
             const double r6inv = r2inv * r2inv * r2inv;
@@ -55,13 +58,13 @@ void calc_nonbonded_pw_forces() {
 
             const double scale = r2inv * (-ecoul - 12.0 * v_a + 6.0 * v_b);
 
-            ctx.dvelocities[atom_i].x -= scale * dx;
-            ctx.dvelocities[atom_i].y -= scale * dy;
-            ctx.dvelocities[atom_i].z -= scale * dz;
+            dvelocities[atom_i].x -= scale * dx;
+            dvelocities[atom_i].y -= scale * dy;
+            dvelocities[atom_i].z -= scale * dz;
 
-            ctx.dvelocities[atom_j].x += scale * dx;
-            ctx.dvelocities[atom_j].y += scale * dy;
-            ctx.dvelocities[atom_j].z += scale * dz;
+            dvelocities[atom_j].x += scale * dx;
+            dvelocities[atom_j].y += scale * dy;
+            dvelocities[atom_j].z += scale * dz;
 
             ctx.E_nonbond_pw.Ucoul += ecoul;
             ctx.E_nonbond_pw.Uvdw += (v_a - v_b);
diff --git a/src/core/cpu/src/cpu_nonbonded_qp_force.cpp b/src/core/cpu/src/cpu_nonbonded_qp_force.cpp
index c12c81ae..65a74a6c 100644
--- a/src/core/cpu/src/cpu_nonbonded_qp_force.cpp
+++ b/src/core/cpu/src/cpu_nonbonded_qp_force.cpp
@@ -8,6 +8,11 @@
 
 void calc_nonbonded_qp_forces() {
     auto& ctx = Context::instance();
+    auto *lambdas = ctx.lambdas->cpu_data_p;
+    auto &coords = ctx.coords->cpu_data_p;
+    auto &dvelocities = ctx.dvelocities->cpu_data_p;
+    auto &LJ_matrix = ctx.LJ_matrix->cpu_data_p;
+    auto *excluded = ctx.excluded->cpu_data_p;
     int i, j;
     coord_t da;
     double r2, r6, r;
@@ -20,17 +25,17 @@ void calc_nonbonded_qp_forces() {
             i = ctx.q_atoms[qi];
             j = ctx.p_atoms[pj];
 
-            bond23 = ctx.LJ_matrix[i * ctx.n_atoms_solute + j] == 3;
-            bond14 = ctx.LJ_matrix[i * ctx.n_atoms_solute + j] == 1;
+            bond23 = LJ_matrix[i * ctx.n_atoms_solute + j] == 3;
+            bond14 = LJ_matrix[i * ctx.n_atoms_solute + j] == 1;
 
             if (bond23) continue;
-            if (ctx.excluded[i] || ctx.excluded[j]) continue;
+            if (excluded[i] || excluded[j]) continue;
 
             scaling = bond14 ? ctx.topo.el14_scale : 1;
 
-            da.x = ctx.coords[j].x - ctx.coords[i].x;
-            da.y = ctx.coords[j].y - ctx.coords[i].y;
-            da.z = ctx.coords[j].z - ctx.coords[i].z;
+            da.x = coords[j].x - coords[i].x;
+            da.y = coords[j].y - coords[i].y;
+            da.z = coords[j].z - coords[i].z;
 
             r2 = pow(da.x, 2) + pow(da.y, 2) + pow(da.z, 2);
 
@@ -54,15 +59,15 @@ void calc_nonbonded_qp_forces() {
                 } else {
                     calc_vdw_arithmetic(ai_aii, aj_aii, ai_bii, aj_bii, r6inv, &V_a, &V_b);
                 }
-                dv = r2 * (-Vel - (12 * V_a - 6 * V_b)) * ctx.lambdas[state];
+                dv = r2 * (-Vel - (12 * V_a - 6 * V_b)) * lambdas[state];
 
                 // Update forces
-                ctx.dvelocities[i].x -= dv * da.x;
-                ctx.dvelocities[i].y -= dv * da.y;
-                ctx.dvelocities[i].z -= dv * da.z;
-                ctx.dvelocities[j].x += dv * da.x;
-                ctx.dvelocities[j].y += dv * da.y;
-                ctx.dvelocities[j].z += dv * da.z;
+                dvelocities[i].x -= dv * da.x;
+                dvelocities[i].y -= dv * da.y;
+                dvelocities[i].z -= dv * da.z;
+                dvelocities[j].x += dv * da.x;
+                dvelocities[j].y += dv * da.y;
+                dvelocities[j].z += dv * da.z;
 
                 // Update Q totals
                 ctx.EQ_nonbond_qp[state].Ucoul += Vel;
diff --git a/src/core/cpu/src/cpu_nonbonded_qq_force.cpp b/src/core/cpu/src/cpu_nonbonded_qq_force.cpp
index f4fcfc26..2b062d48 100644
--- a/src/core/cpu/src/cpu_nonbonded_qq_force.cpp
+++ b/src/core/cpu/src/cpu_nonbonded_qq_force.cpp
@@ -8,6 +8,12 @@
 
 void calc_nonbonded_qq_forces() {
     auto& ctx = Context::instance();
+    auto *lambdas = ctx.lambdas->cpu_data_p;
+    auto &coords = ctx.coords->cpu_data_p;
+    auto &dvelocities = ctx.dvelocities->cpu_data_p;
+    auto &LJ_matrix = ctx.LJ_matrix->cpu_data_p;
+    auto *excluded = ctx.excluded->cpu_data_p;
+    auto *q_elscales = ctx.q_elscales->cpu_data_p;
     int ai, aj;
     double crg_i, crg_j;
     double elscale, scaling;
@@ -27,27 +33,27 @@ void calc_nonbonded_qq_forces() {
                 crg_i = ctx.unified_ccharge(ai, state).charge;
                 crg_j = ctx.unified_ccharge(aj, state).charge;
 
-                bond23 = ctx.LJ_matrix[ai * ctx.n_atoms_solute + aj] == 3;
-                bond14 = ctx.LJ_matrix[ai * ctx.n_atoms_solute + aj] == 1;
+                bond23 = LJ_matrix[ai * ctx.n_atoms_solute + aj] == 3;
+                bond14 = LJ_matrix[ai * ctx.n_atoms_solute + aj] == 1;
 
                 if (bond23) continue;
-                if (ctx.excluded[ai] || ctx.excluded[aj]) continue;
+                if (excluded[ai] || excluded[aj]) continue;
 
                 scaling = bond14 ? ctx.topo.el14_scale : 1;
 
                 elscale = 1;
                 for (int k = 0; k < ctx.n_qelscales; k++) {
-                    if (ctx.q_elscales[k + ctx.n_qelscales * state].qi == qi + 1 && ctx.q_elscales[k + ctx.n_qelscales * state].qj == qj + 1) {
-                        elscale = ctx.q_elscales[k + ctx.n_qelscales * state].mu;
+                    if (q_elscales[k + ctx.n_qelscales * state].qi == qi + 1 && q_elscales[k + ctx.n_qelscales * state].qj == qj + 1) {
+                        elscale = q_elscales[k + ctx.n_qelscales * state].mu;
                     }
                 }
 
                 const catype_t& qi_type = ctx.unified_catype(ai, state);
                 const catype_t& qj_type = ctx.unified_catype(aj, state);
 
-                da.x = ctx.coords[aj].x - ctx.coords[ai].x;
-                da.y = ctx.coords[aj].y - ctx.coords[ai].y;
-                da.z = ctx.coords[aj].z - ctx.coords[ai].z;
+                da.x = coords[aj].x - coords[ai].x;
+                da.y = coords[aj].y - coords[ai].y;
+                da.z = coords[aj].z - coords[ai].z;
                 r2a = 1 / (pow(da.x, 2) + pow(da.y, 2) + pow(da.z, 2));
                 ra = sqrt(r2a);
                 r6a = r2a * r2a * r2a;
@@ -64,15 +70,15 @@ void calc_nonbonded_qq_forces() {
                 } else {
                     calc_vdw_arithmetic(ai_aii, aj_aii, ai_bii, aj_bii, r6a, &V_a, &V_b);
                 }
-                dva = r2a * (-Vela - 12 * V_a + 6 * V_b) * ctx.lambdas[state];
+                dva = r2a * (-Vela - 12 * V_a + 6 * V_b) * lambdas[state];
 
-                ctx.dvelocities[ai].x -= dva * da.x;
-                ctx.dvelocities[ai].y -= dva * da.y;
-                ctx.dvelocities[ai].z -= dva * da.z;
+                dvelocities[ai].x -= dva * da.x;
+                dvelocities[ai].y -= dva * da.y;
+                dvelocities[ai].z -= dva * da.z;
 
-                ctx.dvelocities[aj].x += dva * da.x;
-                ctx.dvelocities[aj].y += dva * da.y;
-                ctx.dvelocities[aj].z += dva * da.z;
+                dvelocities[aj].x += dva * da.x;
+                dvelocities[aj].y += dva * da.y;
+                dvelocities[aj].z += dva * da.z;
 
                 ctx.EQ_nonbond_qq[state].Ucoul += Vela;
                 ctx.EQ_nonbond_qq[state].Uvdw += (V_a - V_b);
diff --git a/src/core/cpu/src/cpu_nonbonded_qw_force.cpp b/src/core/cpu/src/cpu_nonbonded_qw_force.cpp
index 7b9b2f23..71d07247 100644
--- a/src/core/cpu/src/cpu_nonbonded_qw_force.cpp
+++ b/src/core/cpu/src/cpu_nonbonded_qw_force.cpp
@@ -7,6 +7,10 @@
 #include "vdw_rules.h"
 void calc_nonbonded_qw_forces() {
     auto& ctx = Context::instance();
+    auto *lambdas = ctx.lambdas->cpu_data_p;
+    auto &coords = ctx.coords->cpu_data_p;
+    auto &dvelocities = ctx.dvelocities->cpu_data_p;
+    auto *excluded = ctx.excluded->cpu_data_p;
     int i;
     coord_t dO, dH1, dH2;
     double r2O, rH1, rH2, r6O, rO, r2H1, r2H2;
@@ -25,16 +29,16 @@ void calc_nonbonded_qw_forces() {
     for (int j = ctx.n_atoms_solute; j < ctx.n_atoms; j += 3) {
         for (int qi = 0; qi < ctx.n_qatoms; qi++) {
             i = ctx.q_atoms[qi];
-            if (ctx.excluded[i] || ctx.excluded[j]) continue;
-            dO.x = ctx.coords[j].x - ctx.coords[i].x;
-            dO.y = ctx.coords[j].y - ctx.coords[i].y;
-            dO.z = ctx.coords[j].z - ctx.coords[i].z;
-            dH1.x = ctx.coords[j + 1].x - ctx.coords[i].x;
-            dH1.y = ctx.coords[j + 1].y - ctx.coords[i].y;
-            dH1.z = ctx.coords[j + 1].z - ctx.coords[i].z;
-            dH2.x = ctx.coords[j + 2].x - ctx.coords[i].x;
-            dH2.y = ctx.coords[j + 2].y - ctx.coords[i].y;
-            dH2.z = ctx.coords[j + 2].z - ctx.coords[i].z;
+            if (excluded[i] || excluded[j]) continue;
+            dO.x = coords[j].x - coords[i].x;
+            dO.y = coords[j].y - coords[i].y;
+            dO.z = coords[j].z - coords[i].z;
+            dH1.x = coords[j + 1].x - coords[i].x;
+            dH1.y = coords[j + 1].y - coords[i].y;
+            dH1.z = coords[j + 1].z - coords[i].z;
+            dH2.x = coords[j + 2].x - coords[i].x;
+            dH2.y = coords[j + 2].y - coords[i].y;
+            dH2.z = coords[j + 2].z - coords[i].z;
             r2O = pow(dO.x, 2) + pow(dO.y, 2) + pow(dO.z, 2);
             rH1 = sqrt(1.0 / (pow(dH1.x, 2) + pow(dH1.y, 2) + pow(dH1.z, 2)));
             rH2 = sqrt(1.0 / (pow(dH2.x, 2) + pow(dH2.y, 2) + pow(dH2.z, 2)));
@@ -69,9 +73,9 @@ void calc_nonbonded_qw_forces() {
 
                 // if (state == 0 && qi == 1) printf("j = %d ai__aii = %f A_O = %f B_O = %f V_a = %f V_b = %f r6O = %f\n", j, ai_aii, A_O, B_O, V_a, V_b, r6O);
 
-                dvO += r2O * (-VelO - (12 * V_a - 6 * V_b)) * ctx.lambdas[state];
-                dvH1 -= r2H1 * VelH1 * ctx.lambdas[state];
-                dvH2 -= r2H2 * VelH2 * ctx.lambdas[state];
+                dvO += r2O * (-VelO - (12 * V_a - 6 * V_b)) * lambdas[state];
+                dvH1 -= r2H1 * VelH1 * lambdas[state];
+                dvH2 -= r2H2 * VelH2 * lambdas[state];
 
                 ctx.EQ_nonbond_qw[state].Ucoul += (VelO + VelH1 + VelH2);
                 ctx.EQ_nonbond_qw[state].Uvdw += (V_a - V_b);
@@ -80,20 +84,20 @@ void calc_nonbonded_qw_forces() {
             // Note r6O is not the usual 1/rO^6, but rather rO^6. be careful!!!
 
             // Update forces on Q-atom
-            ctx.dvelocities[i].x -= (dvO * dO.x + dvH1 * dH1.x + dvH2 * dH2.x);
-            ctx.dvelocities[i].y -= (dvO * dO.y + dvH1 * dH1.y + dvH2 * dH2.y);
-            ctx.dvelocities[i].z -= (dvO * dO.z + dvH1 * dH1.z + dvH2 * dH2.z);
+            dvelocities[i].x -= (dvO * dO.x + dvH1 * dH1.x + dvH2 * dH2.x);
+            dvelocities[i].y -= (dvO * dO.y + dvH1 * dH1.y + dvH2 * dH2.y);
+            dvelocities[i].z -= (dvO * dO.z + dvH1 * dH1.z + dvH2 * dH2.z);
 
             // Update forces on water
-            ctx.dvelocities[j].x += dvO * dO.x;
-            ctx.dvelocities[j].y += dvO * dO.y;
-            ctx.dvelocities[j].z += dvO * dO.z;
-            ctx.dvelocities[j + 1].x += dvH1 * dH1.x;
-            ctx.dvelocities[j + 1].y += dvH1 * dH1.y;
-            ctx.dvelocities[j + 1].z += dvH1 * dH1.z;
-            ctx.dvelocities[j + 2].x += dvH2 * dH2.x;
-            ctx.dvelocities[j + 2].y += dvH2 * dH2.y;
-            ctx.dvelocities[j + 2].z += dvH2 * dH2.z;
+            dvelocities[j].x += dvO * dO.x;
+            dvelocities[j].y += dvO * dO.y;
+            dvelocities[j].z += dvO * dO.z;
+            dvelocities[j + 1].x += dvH1 * dH1.x;
+            dvelocities[j + 1].y += dvH1 * dH1.y;
+            dvelocities[j + 1].z += dvH1 * dH1.z;
+            dvelocities[j + 2].x += dvH2 * dH2.x;
+            dvelocities[j + 2].y += dvH2 * dH2.y;
+            dvelocities[j + 2].z += dvH2 * dH2.z;
         }
     }
 
diff --git a/src/core/cpu/src/cpu_nonbonded_ww_force.cpp b/src/core/cpu/src/cpu_nonbonded_ww_force.cpp
index e094ffca..cf823e0e 100644
--- a/src/core/cpu/src/cpu_nonbonded_ww_force.cpp
+++ b/src/core/cpu/src/cpu_nonbonded_ww_force.cpp
@@ -41,9 +41,11 @@ void accumulate_pair_force(Context& ctx,
                            double vdw_a,
                            double vdw_b,
                            E_nonbonded_t& energy) {
-    const double dx = ctx.coords[atom_j].x - ctx.coords[atom_i].x;
-    const double dy = ctx.coords[atom_j].y - ctx.coords[atom_i].y;
-    const double dz = ctx.coords[atom_j].z - ctx.coords[atom_i].z;
+    auto &coords = ctx.coords->cpu_data_p;
+    auto &dvelocities = ctx.dvelocities->cpu_data_p;
+    const double dx = coords[atom_j].x - coords[atom_i].x;
+    const double dy = coords[atom_j].y - coords[atom_i].y;
+    const double dz = coords[atom_j].z - coords[atom_i].z;
 
     const double r2inv = 1.0 / (dx * dx + dy * dy + dz * dz);
     const double rinv = std::sqrt(r2inv);
@@ -61,13 +63,13 @@ void accumulate_pair_force(Context& ctx,
 
     const double scale = r2inv * dva;
 
-    ctx.dvelocities[atom_i].x -= scale * dx;
-    ctx.dvelocities[atom_i].y -= scale * dy;
-    ctx.dvelocities[atom_i].z -= scale * dz;
+    dvelocities[atom_i].x -= scale * dx;
+    dvelocities[atom_i].y -= scale * dy;
+    dvelocities[atom_i].z -= scale * dz;
 
-    ctx.dvelocities[atom_j].x += scale * dx;
-    ctx.dvelocities[atom_j].y += scale * dy;
-    ctx.dvelocities[atom_j].z += scale * dz;
+    dvelocities[atom_j].x += scale * dx;
+    dvelocities[atom_j].y += scale * dy;
+    dvelocities[atom_j].z += scale * dz;
 
     energy.Ucoul += ecoul;
     energy.Uvdw += evdw;
diff --git a/src/core/cpu/src/cpu_polx_water_force.cpp b/src/core/cpu/src/cpu_polx_water_force.cpp
index 44bfa1c4..9d0e4711 100644
--- a/src/core/cpu/src/cpu_polx_water_force.cpp
+++ b/src/core/cpu/src/cpu_polx_water_force.cpp
@@ -8,6 +8,9 @@
 
 void calc_polx_w_forces(int iteration) {
     auto& ctx = Context::instance();
+    auto &coords = ctx.coords->cpu_data_p;
+    auto &dvelocities = ctx.dvelocities->cpu_data_p;
+    auto *wshells = ctx.wshells->cpu_data_p;
 
     int wi, imin, jw, ii, iis, jmin;
     double tmin;
@@ -18,9 +21,9 @@ void calc_polx_w_forces(int iteration) {
     double ener;
 
     for (int is = 0; is < ctx.n_shells; is++) {
-        ctx.wshells[is].n_inshell = 0;
+        wshells[is].n_inshell = 0;
         if (iteration == 0) {
-            ctx.wshells[is].theta_corr = 0;
+            wshells[is].theta_corr = 0;
         }
     }
 
@@ -30,9 +33,9 @@ void calc_polx_w_forces(int iteration) {
 
         wi = ctx.n_atoms_solute + 3 * i;
 
-        rmu.x = ctx.coords[wi + 1].x + ctx.coords[wi + 2].x - 2 * ctx.coords[wi].x;
-        rmu.y = ctx.coords[wi + 1].y + ctx.coords[wi + 2].y - 2 * ctx.coords[wi].y;
-        rmu.z = ctx.coords[wi + 1].z + ctx.coords[wi + 2].z - 2 * ctx.coords[wi].z;
+        rmu.x = coords[wi + 1].x + coords[wi + 2].x - 2 * coords[wi].x;
+        rmu.y = coords[wi + 1].y + coords[wi + 2].y - 2 * coords[wi].y;
+        rmu.z = coords[wi + 1].z + coords[wi + 2].z - 2 * coords[wi].z;
 
         rm = sqrt(pow(rmu.x, 2) + pow(rmu.y, 2) + pow(rmu.z, 2));
 
@@ -40,9 +43,9 @@ void calc_polx_w_forces(int iteration) {
         rmu.y /= rm;
         rmu.z /= rm;
 
-        rcu.x = ctx.coords[wi].x - ctx.topo.solvent_center.x;
-        rcu.y = ctx.coords[wi].y - ctx.topo.solvent_center.y;
-        rcu.z = ctx.coords[wi].z - ctx.topo.solvent_center.z;
+        rcu.x = coords[wi].x - ctx.topo.solvent_center.x;
+        rcu.y = coords[wi].y - ctx.topo.solvent_center.y;
+        rcu.z = coords[wi].z - ctx.topo.solvent_center.z;
         rc = sqrt(pow(rcu.x, 2) + pow(rcu.y, 2) + pow(rcu.z, 2));
         rcu.x /= rc;
         rcu.y /= rc;
@@ -58,23 +61,23 @@ void calc_polx_w_forces(int iteration) {
         ctx.theta[i] = acos(cos_th);
         ctx.tdum[i] = ctx.theta[i];
 
-        if (rc > ctx.wshells[ctx.n_shells - 1].router - ctx.wshells[ctx.n_shells - 1].dr) {
+        if (rc > wshells[ctx.n_shells - 1].router - wshells[ctx.n_shells - 1].dr) {
             for (iis = ctx.n_shells - 1; iis > 0; iis--) {
-                if (rc <= ctx.wshells[iis].router) {
+                if (rc <= wshells[iis].router) {
                     break;
                 }
             }
 
-            ctx.wshells[iis].n_inshell += 1;
-            ctx.list_sh[ctx.wshells[iis].n_inshell - 1][iis] = i;
+            wshells[iis].n_inshell += 1;
+            ctx.list_sh[wshells[iis].n_inshell - 1][iis] = i;
         }
     }
 
     for (int is = 0; is < ctx.n_shells; is++) {
         imin = 0;
-        for (int il = 0; il < ctx.wshells[is].n_inshell; il++) {
+        for (int il = 0; il < wshells[is].n_inshell; il++) {
             tmin = 2 * M_PI;
-            for (int jl = 0; jl < ctx.wshells[is].n_inshell; jl++) {
+            for (int jl = 0; jl < wshells[is].n_inshell; jl++) {
                 jw = ctx.list_sh[jl][is];
                 if (ctx.tdum[jw] < tmin) {
                     jmin = jw;
@@ -90,29 +93,29 @@ void calc_polx_w_forces(int iteration) {
     if (iteration != 0 && iteration % itdis_update == 0) {
         for (int is = 0; is < ctx.n_shells; is++) {
             printf("SHELL %d\n", is);
-            ctx.wshells[is].avtheta /= (double)itdis_update;
-            ctx.wshells[is].avn_inshell /= (double)itdis_update;
-            ctx.wshells[is].theta_corr =
-                ctx.wshells[is].theta_corr + ctx.wshells[is].avtheta - acos(ctx.wshells[is].cstb);
+            wshells[is].avtheta /= (double)itdis_update;
+            wshells[is].avn_inshell /= (double)itdis_update;
+            wshells[is].theta_corr =
+                wshells[is].theta_corr + wshells[is].avtheta - acos(wshells[is].cstb);
             printf("average theta = %f, average in shell = %f, theta_corr = %f\n",
-                   ctx.wshells[is].avtheta * 180 / M_PI, ctx.wshells[is].avn_inshell,
-                   ctx.wshells[is].theta_corr * 180 / M_PI);
-            ctx.wshells[is].avtheta = 0;
-            ctx.wshells[is].avn_inshell = 0;
+                   wshells[is].avtheta * 180 / M_PI, wshells[is].avn_inshell,
+                   wshells[is].theta_corr * 180 / M_PI);
+            wshells[is].avtheta = 0;
+            wshells[is].avn_inshell = 0;
         }
     }
 
     for (int is = 0; is < ctx.n_shells; is++) {
-        if (ctx.wshells[is].n_inshell == 0) {
+        if (wshells[is].n_inshell == 0) {
             continue;
         }
 
         avtdum = 0;
-        for (int il = 0; il < ctx.wshells[is].n_inshell; il++) {
+        for (int il = 0; il < wshells[is].n_inshell; il++) {
             ii = ctx.nsort[il][is];
-            arg = 1 + ((1 - 2 * (double)(il + 1)) / (double)ctx.wshells[is].n_inshell);
+            arg = 1 + ((1 - 2 * (double)(il + 1)) / (double)wshells[is].n_inshell);
             ctx.theta0[il] = acos(arg);
-            ctx.theta0[il] = ctx.theta0[il] - 3 * sin(ctx.theta0[il]) * ctx.wshells[is].cstb / 2;
+            ctx.theta0[il] = ctx.theta0[il] - 3 * sin(ctx.theta0[il]) * wshells[is].cstb / 2;
             if (ctx.theta0[il] < 0) {
                 ctx.theta0[il] = 0;
             }
@@ -122,16 +125,16 @@ void calc_polx_w_forces(int iteration) {
 
             avtdum += ctx.theta[ii];
             ener = .5 * ctx.md.polarisation_force *
-                   pow(ctx.theta[ii] - ctx.theta0[il] + ctx.wshells[is].theta_corr, 2);
+                   pow(ctx.theta[ii] - ctx.theta0[il] + wshells[is].theta_corr, 2);
             ctx.E_restraint.Upolx += ener;
 
-            dv = ctx.md.polarisation_force * (ctx.theta[ii] - ctx.theta0[il] + ctx.wshells[is].theta_corr);
+            dv = ctx.md.polarisation_force * (ctx.theta[ii] - ctx.theta0[il] + wshells[is].theta_corr);
 
             wi = ctx.n_atoms_solute + 3 * ii;
 
-            rmu.x = ctx.coords[wi + 1].x + ctx.coords[wi + 2].x - 2 * ctx.coords[wi].x;
-            rmu.y = ctx.coords[wi + 1].y + ctx.coords[wi + 2].y - 2 * ctx.coords[wi].y;
-            rmu.z = ctx.coords[wi + 1].z + ctx.coords[wi + 2].z - 2 * ctx.coords[wi].z;
+            rmu.x = coords[wi + 1].x + coords[wi + 2].x - 2 * coords[wi].x;
+            rmu.y = coords[wi + 1].y + coords[wi + 2].y - 2 * coords[wi].y;
+            rmu.z = coords[wi + 1].z + coords[wi + 2].z - 2 * coords[wi].z;
 
             rm = sqrt(pow(rmu.x, 2) + pow(rmu.y, 2) + pow(rmu.z, 2));
 
@@ -139,9 +142,9 @@ void calc_polx_w_forces(int iteration) {
             rmu.y /= rm;
             rmu.z /= rm;
 
-            rcu.x = ctx.coords[wi].x - ctx.topo.solvent_center.x;
-            rcu.y = ctx.coords[wi].y - ctx.topo.solvent_center.y;
-            rcu.z = ctx.coords[wi].z - ctx.topo.solvent_center.z;
+            rcu.x = coords[wi].x - ctx.topo.solvent_center.x;
+            rcu.y = coords[wi].y - ctx.topo.solvent_center.y;
+            rcu.z = coords[wi].z - ctx.topo.solvent_center.z;
             rc = sqrt(pow(rcu.x, 2) + pow(rcu.y, 2) + pow(rcu.z, 2));
             rcu.x /= rc;
             rcu.y /= rc;
@@ -175,18 +178,18 @@ void calc_polx_w_forces(int iteration) {
             f2.y = (rmu.y - rcu.y * cos_th) / rc;
             f2.z = (rmu.z - rcu.z * cos_th) / rc;
 
-            ctx.dvelocities[wi].x += f0 * (f1O.x + f2.x);
-            ctx.dvelocities[wi].y += f0 * (f1O.y + f2.y);
-            ctx.dvelocities[wi].z += f0 * (f1O.z + f2.z);
-            ctx.dvelocities[wi + 1].x += f0 * f1H1.x;
-            ctx.dvelocities[wi + 1].y += f0 * f1H1.y;
-            ctx.dvelocities[wi + 1].z += f0 * f1H1.z;
-            ctx.dvelocities[wi + 2].x += f0 * f1H2.x;
-            ctx.dvelocities[wi + 2].y += f0 * f1H2.y;
-            ctx.dvelocities[wi + 2].z += f0 * f1H2.z;
+            dvelocities[wi].x += f0 * (f1O.x + f2.x);
+            dvelocities[wi].y += f0 * (f1O.y + f2.y);
+            dvelocities[wi].z += f0 * (f1O.z + f2.z);
+            dvelocities[wi + 1].x += f0 * f1H1.x;
+            dvelocities[wi + 1].y += f0 * f1H1.y;
+            dvelocities[wi + 1].z += f0 * f1H1.z;
+            dvelocities[wi + 2].x += f0 * f1H2.x;
+            dvelocities[wi + 2].y += f0 * f1H2.y;
+            dvelocities[wi + 2].z += f0 * f1H2.z;
         }
 
-        ctx.wshells[is].avtheta += avtdum / (double)ctx.wshells[is].n_inshell;
-        ctx.wshells[is].avn_inshell += ctx.wshells[is].n_inshell;
+        wshells[is].avtheta += avtdum / (double)wshells[is].n_inshell;
+        wshells[is].avn_inshell += wshells[is].n_inshell;
     }
 }
diff --git a/src/core/cpu/src/cpu_pshell_force.cpp b/src/core/cpu/src/cpu_pshell_force.cpp
index eef4e4d6..9ff083cc 100644
--- a/src/core/cpu/src/cpu_pshell_force.cpp
+++ b/src/core/cpu/src/cpu_pshell_force.cpp
@@ -7,34 +7,38 @@
 
 void calc_pshell_forces() {
     auto& ctx = Context::instance();
+    auto &coords = ctx.coords->cpu_data_p;
+    auto &dvelocities = ctx.dvelocities->cpu_data_p;
+    auto *excluded = ctx.excluded->cpu_data_p;
+    auto *shell = ctx.shell->cpu_data_p;
 
     coord_t dr;
     double k, r2, ener;
 
     for (int i = 0; i < ctx.n_atoms_solute; i++) {
-        if (ctx.shell[i] || ctx.excluded[i]) {
-            if (ctx.excluded[i]) {
+        if (shell[i] || excluded[i]) {
+            if (excluded[i]) {
                 k = k_fix;
             } else {
                 k = k_pshell;
             }
 
-            dr.x = ctx.coords[i].x - ctx.coords_top[i].x;
-            dr.y = ctx.coords[i].y - ctx.coords_top[i].y;
-            dr.z = ctx.coords[i].z - ctx.coords_top[i].z;
+            dr.x = coords[i].x - ctx.coords_init->cpu_data_p[i].x;
+            dr.y = coords[i].y - ctx.coords_init->cpu_data_p[i].y;
+            dr.z = coords[i].z - ctx.coords_init->cpu_data_p[i].z;
             r2 = pow(dr.x, 2) + pow(dr.y, 2) + pow(dr.z, 2);
             ener = 0.5 * k * r2;
 
-            if (ctx.excluded[i]) {
+            if (excluded[i]) {
                 ctx.E_restraint.Ufix += ener;
             }
-            if (ctx.shell[i]) {
+            if (shell[i]) {
                 ctx.E_restraint.Ushell += ener;
             }
 
-            ctx.dvelocities[i].x += k * dr.x;
-            ctx.dvelocities[i].y += k * dr.y;
-            ctx.dvelocities[i].z += k * dr.z;
+            dvelocities[i].x += k * dr.x;
+            dvelocities[i].y += k * dr.y;
+            dvelocities[i].z += k * dr.z;
         }
     }
 }
diff --git a/src/core/cpu/src/cpu_q_angle_force.cpp b/src/core/cpu/src/cpu_q_angle_force.cpp
index 9cd70f4e..c9c2ea65 100644
--- a/src/core/cpu/src/cpu_q_angle_force.cpp
+++ b/src/core/cpu/src/cpu_q_angle_force.cpp
@@ -8,6 +8,9 @@
 
 void calc_qangle_forces(int state) {
     auto& ctx = Context::instance();
+    auto &coords = ctx.coords->cpu_data_p;
+    auto &dvelocities = ctx.dvelocities->cpu_data_p;
+    auto *lambdas = ctx.lambdas->cpu_data_p;
     int ic;
     int ai, aj, ak;
     coord_t rji, rjk;
@@ -28,13 +31,13 @@ void calc_qangle_forces(int state) {
         aj = ctx.q_angles[i + ctx.n_qangles * state].aj - 1;
         ak = ctx.q_angles[i + ctx.n_qangles * state].ak - 1;
 
-        rji.x = ctx.coords[ai].x - ctx.coords[aj].x;
-        rji.y = ctx.coords[ai].y - ctx.coords[aj].y;
-        rji.z = ctx.coords[ai].z - ctx.coords[aj].z;
+        rji.x = coords[ai].x - coords[aj].x;
+        rji.y = coords[ai].y - coords[aj].y;
+        rji.z = coords[ai].z - coords[aj].z;
 
-        rjk.x = ctx.coords[ak].x - ctx.coords[aj].x;
-        rjk.y = ctx.coords[ak].y - ctx.coords[aj].y;
-        rjk.z = ctx.coords[ak].z - ctx.coords[aj].z;
+        rjk.x = coords[ak].x - coords[aj].x;
+        rjk.y = coords[ak].y - coords[aj].y;
+        rjk.z = coords[ak].z - coords[aj].z;
 
         bji = sqrt(pow(rji.x, 2) + pow(rji.y, 2) + pow(rji.z, 2));
         bjk = sqrt(pow(rjk.x, 2) + pow(rjk.y, 2) + pow(rjk.z, 2));
@@ -51,7 +54,7 @@ void calc_qangle_forces(int state) {
         ener = .5 * ctx.q_cangles[ic].kth * pow(dth, 2);
         ctx.EQ_bond[state].Uangle += ener;
 
-        dv = ctx.q_cangles[ic].kth * dth * ctx.lambdas[state];
+        dv = ctx.q_cangles[ic].kth * dth * lambdas[state];
         f1 = sin(th);
         if (abs(f1) < 1E-12) {
             f1 = 1E-12;
@@ -65,14 +68,14 @@ void calc_qangle_forces(int state) {
         dk.y = f1 * (rji.y / (bji * bjk) - cos_th * rjk.y / pow(bjk, 2));
         dk.z = f1 * (rji.z / (bji * bjk) - cos_th * rjk.z / pow(bjk, 2));
 
-        ctx.dvelocities[ai].x += dv * di.x;
-        ctx.dvelocities[ai].y += dv * di.y;
-        ctx.dvelocities[ai].z += dv * di.z;
-        ctx.dvelocities[ak].x += dv * dk.x;
-        ctx.dvelocities[ak].y += dv * dk.y;
-        ctx.dvelocities[ak].z += dv * dk.z;
-        ctx.dvelocities[aj].x -= dv * (di.x + dk.x);
-        ctx.dvelocities[aj].y -= dv * (di.y + dk.y);
-        ctx.dvelocities[aj].z -= dv * (di.z + dk.z);
+        dvelocities[ai].x += dv * di.x;
+        dvelocities[ai].y += dv * di.y;
+        dvelocities[ai].z += dv * di.z;
+        dvelocities[ak].x += dv * dk.x;
+        dvelocities[ak].y += dv * dk.y;
+        dvelocities[ak].z += dv * dk.z;
+        dvelocities[aj].x -= dv * (di.x + dk.x);
+        dvelocities[aj].y -= dv * (di.y + dk.y);
+        dvelocities[aj].z -= dv * (di.z + dk.z);
     }
 }
diff --git a/src/core/cpu/src/cpu_q_bond_force.cpp b/src/core/cpu/src/cpu_q_bond_force.cpp
index 35d86abe..5f2f7203 100644
--- a/src/core/cpu/src/cpu_q_bond_force.cpp
+++ b/src/core/cpu/src/cpu_q_bond_force.cpp
@@ -6,6 +6,9 @@
 
 void calc_qbond_forces(int state) {
     auto& ctx = Context::instance();
+    auto &coords = ctx.coords->cpu_data_p;
+    auto &dvelocities = ctx.dvelocities->cpu_data_p;
+    auto *lambdas = ctx.lambdas->cpu_data_p;
     int ic;
     int ai, aj;
     double b, db, ener, dv;
@@ -21,22 +24,22 @@ void calc_qbond_forces(int state) {
         ai = ctx.q_bonds[i + ctx.n_qbonds * state].ai - 1;
         aj = ctx.q_bonds[i + ctx.n_qbonds * state].aj - 1;
 
-        rij.x = ctx.coords[aj].x - ctx.coords[ai].x;
-        rij.y = ctx.coords[aj].y - ctx.coords[ai].y;
-        rij.z = ctx.coords[aj].z - ctx.coords[ai].z;
+        rij.x = coords[aj].x - coords[ai].x;
+        rij.y = coords[aj].y - coords[ai].y;
+        rij.z = coords[aj].z - coords[ai].z;
 
         b = sqrt(pow(rij.x, 2) + pow(rij.y, 2) + pow(rij.z, 2));
         db = b - ctx.q_cbonds[ic].b0;
 
         ener = 0.5 * ctx.q_cbonds[ic].kb * pow(db, 2);
         ctx.EQ_bond[state].Ubond += ener;
-        dv = db * ctx.q_cbonds[ic].kb * ctx.lambdas[state] / b;
-
-        ctx.dvelocities[ai].x -= dv * rij.x;
-        ctx.dvelocities[ai].y -= dv * rij.y;
-        ctx.dvelocities[ai].z -= dv * rij.z;
-        ctx.dvelocities[aj].x += dv * rij.x;
-        ctx.dvelocities[aj].y += dv * rij.y;
-        ctx.dvelocities[aj].z += dv * rij.z;
+        dv = db * ctx.q_cbonds[ic].kb * lambdas[state] / b;
+
+        dvelocities[ai].x -= dv * rij.x;
+        dvelocities[ai].y -= dv * rij.y;
+        dvelocities[ai].z -= dv * rij.z;
+        dvelocities[aj].x += dv * rij.x;
+        dvelocities[aj].y += dv * rij.y;
+        dvelocities[aj].z += dv * rij.z;
     }
 }
diff --git a/src/core/cpu/src/cpu_q_torsion_force.cpp b/src/core/cpu/src/cpu_q_torsion_force.cpp
index 35ab4b14..be309347 100644
--- a/src/core/cpu/src/cpu_q_torsion_force.cpp
+++ b/src/core/cpu/src/cpu_q_torsion_force.cpp
@@ -7,6 +7,9 @@
 
 void calc_qtorsion_forces(int state) {
     auto& ctx = Context::instance();
+    auto &coords = ctx.coords->cpu_data_p;
+    auto &dvelocities = ctx.dvelocities->cpu_data_p;
+    auto *lambdas = ctx.lambdas->cpu_data_p;
     int ic;
     int ai, aj, ak, al;
     coord_t rji, rjk, rkl, rnj, rnk, rki, rlj;
@@ -29,15 +32,15 @@ void calc_qtorsion_forces(int state) {
         ak = ctx.q_torsions[i + ctx.n_qtorsions * state].ak - 1;
         al = ctx.q_torsions[i + ctx.n_qtorsions * state].al - 1;
 
-        rji.x = ctx.coords[ai].x - ctx.coords[aj].x;
-        rji.y = ctx.coords[ai].y - ctx.coords[aj].y;
-        rji.z = ctx.coords[ai].z - ctx.coords[aj].z;
-        rjk.x = ctx.coords[ak].x - ctx.coords[aj].x;
-        rjk.y = ctx.coords[ak].y - ctx.coords[aj].y;
-        rjk.z = ctx.coords[ak].z - ctx.coords[aj].z;
-        rkl.x = ctx.coords[al].x - ctx.coords[ak].x;
-        rkl.y = ctx.coords[al].y - ctx.coords[ak].y;
-        rkl.z = ctx.coords[al].z - ctx.coords[ak].z;
+        rji.x = coords[ai].x - coords[aj].x;
+        rji.y = coords[ai].y - coords[aj].y;
+        rji.z = coords[ai].z - coords[aj].z;
+        rjk.x = coords[ak].x - coords[aj].x;
+        rjk.y = coords[ak].y - coords[aj].y;
+        rjk.z = coords[ak].z - coords[aj].z;
+        rkl.x = coords[al].x - coords[ak].x;
+        rkl.y = coords[al].y - coords[ak].y;
+        rkl.z = coords[al].z - coords[ak].z;
         rnj.x = rji.y * rjk.z - rji.z * rjk.y;
         rnj.y = rji.z * rjk.x - rji.x * rjk.z;
         rnj.z = rji.x * rjk.y - rji.y * rjk.x;
@@ -69,7 +72,7 @@ void calc_qtorsion_forces(int state) {
         // Energy
         arg = ctx.q_ctorsions[ic].n * phi - to_radians(ctx.q_ctorsions[ic].d);
         ener = ctx.q_ctorsions[ic].k * (1 + cos(arg));
-        dv = -ctx.q_ctorsions[ic].n * ctx.q_ctorsions[ic].k * sin(arg) * ctx.lambdas[state];
+        dv = -ctx.q_ctorsions[ic].n * ctx.q_ctorsions[ic].k * sin(arg) * lambdas[state];
 
         // Forces
         f1 = sin(phi);
@@ -108,20 +111,20 @@ void calc_qtorsion_forces(int state) {
         // Update energy and forces
         ctx.EQ_bond[state].Utor += ener;
 
-        ctx.dvelocities[ai].x += dv * dpi.x;
-        ctx.dvelocities[ai].y += dv * dpi.y;
-        ctx.dvelocities[ai].z += dv * dpi.z;
+        dvelocities[ai].x += dv * dpi.x;
+        dvelocities[ai].y += dv * dpi.y;
+        dvelocities[ai].z += dv * dpi.z;
 
-        ctx.dvelocities[aj].x += dv * dpj.x;
-        ctx.dvelocities[aj].y += dv * dpj.y;
-        ctx.dvelocities[aj].z += dv * dpj.z;
+        dvelocities[aj].x += dv * dpj.x;
+        dvelocities[aj].y += dv * dpj.y;
+        dvelocities[aj].z += dv * dpj.z;
 
-        ctx.dvelocities[ak].x += dv * dpk.x;
-        ctx.dvelocities[ak].y += dv * dpk.y;
-        ctx.dvelocities[ak].z += dv * dpk.z;
+        dvelocities[ak].x += dv * dpk.x;
+        dvelocities[ak].y += dv * dpk.y;
+        dvelocities[ak].z += dv * dpk.z;
 
-        ctx.dvelocities[al].x += dv * dpl.x;
-        ctx.dvelocities[al].y += dv * dpl.y;
-        ctx.dvelocities[al].z += dv * dpl.z;
+        dvelocities[al].x += dv * dpl.x;
+        dvelocities[al].y += dv * dpl.y;
+        dvelocities[al].z += dv * dpl.z;
     }
 }
diff --git a/src/core/cpu/src/cpu_radix_water_force.cpp b/src/core/cpu/src/cpu_radix_water_force.cpp
index 217b343f..a887ad31 100644
--- a/src/core/cpu/src/cpu_radix_water_force.cpp
+++ b/src/core/cpu/src/cpu_radix_water_force.cpp
@@ -7,6 +7,8 @@
 
 void calc_radix_w_forces() {
     auto& ctx = Context::instance();
+    auto &coords = ctx.coords->cpu_data_p;
+    auto &dvelocities = ctx.dvelocities->cpu_data_p;
 
     double b, db, ener, dv, fexp;
     coord_t dr;
@@ -19,9 +21,9 @@ void calc_radix_w_forces() {
     }
 
     for (int i = ctx.n_atoms_solute; i < ctx.n_atoms; i += 3) {
-        dr.x = ctx.coords[i].x - ctx.topo.solvent_center.x;
-        dr.y = ctx.coords[i].y - ctx.topo.solvent_center.y;
-        dr.z = ctx.coords[i].z - ctx.topo.solvent_center.z;
+        dr.x = coords[i].x - ctx.topo.solvent_center.x;
+        dr.y = coords[i].y - ctx.topo.solvent_center.y;
+        dr.z = coords[i].z - ctx.topo.solvent_center.z;
         b = sqrt(pow(dr.x, 2) + pow(dr.y, 2) + pow(dr.z, 2));
         db = b - (ctx.topo.solvent_radius - shift);
 
@@ -38,8 +40,8 @@ void calc_radix_w_forces() {
         }
 
         ctx.E_restraint.Uradx += ener;
-        ctx.dvelocities[i].x += dv * dr.x;
-        ctx.dvelocities[i].y += dv * dr.y;
-        ctx.dvelocities[i].z += dv * dr.z;
+        dvelocities[i].x += dv * dr.x;
+        dvelocities[i].y += dv * dr.y;
+        dvelocities[i].z += dv * dr.z;
     }
 }
diff --git a/src/core/cpu/src/cpu_restrang_force.cpp b/src/core/cpu/src/cpu_restrang_force.cpp
index baaf2385..d809a9c1 100644
--- a/src/core/cpu/src/cpu_restrang_force.cpp
+++ b/src/core/cpu/src/cpu_restrang_force.cpp
@@ -7,6 +7,11 @@
 
 void calc_restrang_forces() {
     auto& ctx = Context::instance();
+    auto &coords = ctx.coords->cpu_data_p;
+    auto &dvelocities = ctx.dvelocities->cpu_data_p;
+    auto &restrangs = ctx.restrangs->cpu_data_p;
+    auto *lambdas = ctx.lambdas->cpu_data_p;
+    auto *EQ_restraint = ctx.EQ_restraint->cpu_data_p;
 
     int state, i, j, k;
     coord_t dr, dr2, di, dk;
@@ -14,21 +19,21 @@ void calc_restrang_forces() {
     double dth, dv, ener, f1;
 
     for (int ir = 0; ir < ctx.n_restrangs; ir++) {
-        state = ctx.restrangs[ir].ipsi - 1;
-        i = ctx.restrangs[ir].ai - 1;
-        j = ctx.restrangs[ir].aj - 1;
-        k = ctx.restrangs[ir].ak - 1;
+        state = restrangs[ir].ipsi - 1;
+        i = restrangs[ir].ai - 1;
+        j = restrangs[ir].aj - 1;
+        k = restrangs[ir].ak - 1;
 
-        dr.x = ctx.coords[i].x - ctx.coords[j].x;
-        dr.y = ctx.coords[i].y - ctx.coords[j].y;
-        dr.z = ctx.coords[i].z - ctx.coords[j].z;
+        dr.x = coords[i].x - coords[j].x;
+        dr.y = coords[i].y - coords[j].y;
+        dr.z = coords[i].z - coords[j].z;
 
-        dr2.x = ctx.coords[k].x - ctx.coords[j].x;
-        dr2.y = ctx.coords[k].y - ctx.coords[j].y;
-        dr2.z = ctx.coords[k].z - ctx.coords[j].z;
+        dr2.x = coords[k].x - coords[j].x;
+        dr2.y = coords[k].y - coords[j].y;
+        dr2.z = coords[k].z - coords[j].z;
 
-        if (ctx.restrangs[ir].ipsi != 0) {
-            lambda = ctx.lambdas[state];
+        if (restrangs[ir].ipsi != 0) {
+            lambda = lambdas[state];
         } else {
             lambda = 1;
         }
@@ -50,10 +55,10 @@ void calc_restrang_forces() {
         }
 
         th = acos(cos_th);
-        dth = th - to_radians(ctx.restrangs[ir].ang);
+        dth = th - to_radians(restrangs[ir].ang);
 
-        ener = .5 * ctx.restrangs[ir].k * pow(dth, 2);
-        dv = lambda * ctx.restrangs[ir].k * dth;
+        ener = .5 * restrangs[ir].k * pow(dth, 2);
+        dv = lambda * restrangs[ir].k * dth;
 
         f1 = sin(th);
         if (fabs(f1) < 1E-12) {
@@ -69,25 +74,25 @@ void calc_restrang_forces() {
         dk.y = f1 * (dr.y / (rij * rjk) - cos_th * dr2.y / r2jk);
         dk.z = f1 * (dr.z / (rij * rjk) - cos_th * dr2.z / r2jk);
 
-        ctx.dvelocities[i].x += dv * di.x;
-        ctx.dvelocities[i].y += dv * di.y;
-        ctx.dvelocities[i].z += dv * di.z;
-        ctx.dvelocities[k].x += dv * dk.x;
-        ctx.dvelocities[k].y += dv * dk.y;
-        ctx.dvelocities[k].z += dv * dk.z;
-        ctx.dvelocities[j].x -= dv * (di.x + dk.x);
-        ctx.dvelocities[j].y -= dv * (di.y + dk.y);
-        ctx.dvelocities[j].z -= dv * (di.z + dk.z);
-
-        if (ctx.restrangs[ir].ipsi == 0) {
+        dvelocities[i].x += dv * di.x;
+        dvelocities[i].y += dv * di.y;
+        dvelocities[i].z += dv * di.z;
+        dvelocities[k].x += dv * dk.x;
+        dvelocities[k].y += dv * dk.y;
+        dvelocities[k].z += dv * dk.z;
+        dvelocities[j].x -= dv * (di.x + dk.x);
+        dvelocities[j].y -= dv * (di.y + dk.y);
+        dvelocities[j].z -= dv * (di.z + dk.z);
+
+        if (restrangs[ir].ipsi == 0) {
             for (int lambda_idx = 0; lambda_idx < ctx.n_lambdas; lambda_idx++) {
-                ctx.EQ_restraint[lambda_idx].Urestr += ener;
+                EQ_restraint[lambda_idx].Urestr += ener;
             }
             if (ctx.n_lambdas == 0) {
                 ctx.E_restraint.Upres += ener;
             }
         } else {
-            ctx.EQ_restraint[state].Urestr += ener;
+            EQ_restraint[state].Urestr += ener;
         }
     }
 }
diff --git a/src/core/cpu/src/cpu_restrdis_force.cpp b/src/core/cpu/src/cpu_restrdis_force.cpp
index 8c6346b2..c15cbef7 100644
--- a/src/core/cpu/src/cpu_restrdis_force.cpp
+++ b/src/core/cpu/src/cpu_restrdis_force.cpp
@@ -6,54 +6,59 @@
 
 void calc_restrdis_forces() {
     auto& ctx = Context::instance();
+    auto &coords = ctx.coords->cpu_data_p;
+    auto &dvelocities = ctx.dvelocities->cpu_data_p;
+    auto &restrdists = ctx.restrdists->cpu_data_p;
+    auto *lambdas = ctx.lambdas->cpu_data_p;
+    auto *EQ_restraint = ctx.EQ_restraint->cpu_data_p;
 
     int state, i, j;
     coord_t dr;
     double lambda, b, db, dv, ener;
 
     for (int ir = 0; ir < ctx.n_restrdists; ir++) {
-        state = ctx.restrdists[ir].ipsi - 1;
-        i = ctx.restrdists[ir].ai - 1;
-        j = ctx.restrdists[ir].aj - 1;
+        state = restrdists[ir].ipsi - 1;
+        i = restrdists[ir].ai - 1;
+        j = restrdists[ir].aj - 1;
 
-        dr.x = ctx.coords[j].x - ctx.coords[i].x;
-        dr.y = ctx.coords[j].y - ctx.coords[i].y;
-        dr.z = ctx.coords[j].z - ctx.coords[i].z;
+        dr.x = coords[j].x - coords[i].x;
+        dr.y = coords[j].y - coords[i].y;
+        dr.z = coords[j].z - coords[i].z;
 
-        if (ctx.restrdists[ir].ipsi != 0) {
-            lambda = ctx.lambdas[state];
+        if (restrdists[ir].ipsi != 0) {
+            lambda = lambdas[state];
         } else {
             lambda = 1;
         }
 
         b = sqrt(pow(dr.x, 2) + pow(dr.y, 2) + pow(dr.z, 2));
-        if (b < ctx.restrdists[ir].d1) {
-            db = b - ctx.restrdists[ir].d1;
-        } else if (b > ctx.restrdists[ir].d2) {
-            db = b - ctx.restrdists[ir].d2;
+        if (b < restrdists[ir].d1) {
+            db = b - restrdists[ir].d1;
+        } else if (b > restrdists[ir].d2) {
+            db = b - restrdists[ir].d2;
         } else {
             continue;
         }
 
-        ener = .5 * ctx.restrdists[ir].k * pow(db, 2);
-        dv = lambda * ctx.restrdists[ir].k * db / b;
+        ener = .5 * restrdists[ir].k * pow(db, 2);
+        dv = lambda * restrdists[ir].k * db / b;
 
-        ctx.dvelocities[j].x += dr.x * dv;
-        ctx.dvelocities[j].y += dr.y * dv;
-        ctx.dvelocities[j].z += dr.z * dv;
-        ctx.dvelocities[i].x -= dr.x * dv;
-        ctx.dvelocities[i].y -= dr.y * dv;
-        ctx.dvelocities[i].z -= dr.z * dv;
+        dvelocities[j].x += dr.x * dv;
+        dvelocities[j].y += dr.y * dv;
+        dvelocities[j].z += dr.z * dv;
+        dvelocities[i].x -= dr.x * dv;
+        dvelocities[i].y -= dr.y * dv;
+        dvelocities[i].z -= dr.z * dv;
 
-        if (ctx.restrdists[ir].ipsi == 0) {
+        if (restrdists[ir].ipsi == 0) {
             for (int k = 0; k < ctx.n_lambdas; k++) {
-                ctx.EQ_restraint[k].Urestr += ener;
+                EQ_restraint[k].Urestr += ener;
             }
             if (ctx.n_lambdas == 0) {
                 ctx.E_restraint.Upres += ener;
             }
         } else {
-            ctx.EQ_restraint[state].Urestr += ener;
+            EQ_restraint[state].Urestr += ener;
         }
     }
 }
diff --git a/src/core/cpu/src/cpu_restrpos_force.cpp b/src/core/cpu/src/cpu_restrpos_force.cpp
index 98e78352..6db044b4 100644
--- a/src/core/cpu/src/cpu_restrpos_force.cpp
+++ b/src/core/cpu/src/cpu_restrpos_force.cpp
@@ -6,21 +6,26 @@
 
 void calc_restrpos_forces() {
     auto& ctx = Context::instance();
+    auto &coords = ctx.coords->cpu_data_p;
+    auto &dvelocities = ctx.dvelocities->cpu_data_p;
+    auto &restrspos = ctx.restrspos->cpu_data_p;
+    auto *lambdas = ctx.lambdas->cpu_data_p;
+    auto *EQ_restraint = ctx.EQ_restraint->cpu_data_p;
 
     int state, i;
     coord_t dr;
     double lambda, ener, x2, y2, z2;
 
     for (int ir = 0; ir < ctx.n_restrspos; ir++) {
-        state = ctx.restrspos[ir].ipsi - 1;
-        i = ctx.restrspos[ir].a - 1;
+        state = restrspos[ir].ipsi - 1;
+        i = restrspos[ir].a - 1;
 
-        dr.x = ctx.coords[i].x - ctx.restrspos[ir].x.x;
-        dr.y = ctx.coords[i].y - ctx.restrspos[ir].x.y;
-        dr.z = ctx.coords[i].z - ctx.restrspos[ir].x.z;
+        dr.x = coords[i].x - restrspos[ir].x.x;
+        dr.y = coords[i].y - restrspos[ir].x.y;
+        dr.z = coords[i].z - restrspos[ir].x.z;
 
-        if (ctx.restrspos[ir].ipsi != 0) {
-            lambda = ctx.lambdas[state];
+        if (restrspos[ir].ipsi != 0) {
+            lambda = lambdas[state];
         } else {
             lambda = 1;
         }
@@ -29,21 +34,21 @@ void calc_restrpos_forces() {
         y2 = pow(dr.y, 2);
         z2 = pow(dr.z, 2);
 
-        ener = .5 * ctx.restrspos[ir].k.x * x2 + .5 * ctx.restrspos[ir].k.y * y2 + .5 * ctx.restrspos[ir].k.z * z2;
+        ener = .5 * restrspos[ir].k.x * x2 + .5 * restrspos[ir].k.y * y2 + .5 * restrspos[ir].k.z * z2;
 
-        ctx.dvelocities[i].x += ctx.restrspos[ir].k.x * dr.x * lambda;
-        ctx.dvelocities[i].y += ctx.restrspos[ir].k.y * dr.y * lambda;
-        ctx.dvelocities[i].z += ctx.restrspos[ir].k.z * dr.z * lambda;
+        dvelocities[i].x += restrspos[ir].k.x * dr.x * lambda;
+        dvelocities[i].y += restrspos[ir].k.y * dr.y * lambda;
+        dvelocities[i].z += restrspos[ir].k.z * dr.z * lambda;
 
-        if (ctx.restrspos[ir].ipsi == 0) {
+        if (restrspos[ir].ipsi == 0) {
             for (int k = 0; k < ctx.n_lambdas; k++) {
-                ctx.EQ_restraint[k].Urestr += ener;
+                EQ_restraint[k].Urestr += ener;
             }
             if (ctx.n_lambdas == 0) {
                 ctx.E_restraint.Upres += ener;
             }
         } else {
-            ctx.EQ_restraint[state].Urestr += ener;
+            EQ_restraint[state].Urestr += ener;
         }
     }
 }
diff --git a/src/core/cpu/src/cpu_restrseq_force.cpp b/src/core/cpu/src/cpu_restrseq_force.cpp
index e8da11d2..296762e8 100644
--- a/src/core/cpu/src/cpu_restrseq_force.cpp
+++ b/src/core/cpu/src/cpu_restrseq_force.cpp
@@ -6,13 +6,19 @@
 
 void calc_restrseq_forces() {
     auto& ctx = Context::instance();
+    auto &atypes = ctx.atypes->cpu_data_p;
+    auto &catypes = ctx.catypes->cpu_data_p;
+    auto &coords = ctx.coords->cpu_data_p;
+    auto &dvelocities = ctx.dvelocities->cpu_data_p;
+    auto &restrseqs = ctx.restrseqs->cpu_data_p;
+    auto *heavy = ctx.heavy->cpu_data_p;
 
     double k, mass, totmass;
     coord_t dr;
     double r2, ener;
 
     for (int s = 0; s < ctx.n_restrseqs; s++) {
-        k = ctx.restrseqs[s].k;
+        k = restrseqs[s].k;
 
         dr.x = 0;
         dr.y = 0;
@@ -20,13 +26,13 @@ void calc_restrseq_forces() {
         int n_ctr = 0;
         totmass = 0;
 
-        if (ctx.restrseqs[s].to_center == 1) {
-            for (int i = ctx.restrseqs[s].ai - 1; i < ctx.restrseqs[s].aj - 1; i++) {
-                if (ctx.heavy[i] || ctx.restrseqs[s].ih) {
+        if (restrseqs[s].to_center == 1) {
+            for (int i = restrseqs[s].ai - 1; i < restrseqs[s].aj - 1; i++) {
+                if (heavy[i] || restrseqs[s].ih) {
                     n_ctr++;
-                    dr.x += (ctx.coords[i].x - ctx.coords_top[i].x);
-                    dr.y += (ctx.coords[i].y - ctx.coords_top[i].y);
-                    dr.z += (ctx.coords[i].z - ctx.coords_top[i].z);
+                    dr.x += (coords[i].x - ctx.coords_init->cpu_data_p[i].x);
+                    dr.y += (coords[i].y - ctx.coords_init->cpu_data_p[i].y);
+                    dr.z += (coords[i].z - ctx.coords_init->cpu_data_p[i].z);
                 }
             }
 
@@ -38,23 +44,23 @@ void calc_restrseq_forces() {
                 ener = .5 * k * r2;
                 ctx.E_restraint.Upres += ener;
 
-                for (int i = ctx.restrseqs[s].ai - 1; i < ctx.restrseqs[s].aj - 1; i++) {
-                    if (ctx.heavy[i] || ctx.restrseqs[s].ih) {
-                        mass = ctx.catypes[ctx.atypes[i].code - 1].m;
-                        ctx.dvelocities[i].x += (k * dr.x * mass / 12.010);
-                        ctx.dvelocities[i].y += (k * dr.y * mass / 12.010);
-                        ctx.dvelocities[i].z += (k * dr.z * mass / 12.010);
+                for (int i = restrseqs[s].ai - 1; i < restrseqs[s].aj - 1; i++) {
+                    if (heavy[i] || restrseqs[s].ih) {
+                        mass = catypes[atypes[i].code - 1].m;
+                        dvelocities[i].x += (k * dr.x * mass / 12.010);
+                        dvelocities[i].y += (k * dr.y * mass / 12.010);
+                        dvelocities[i].z += (k * dr.z * mass / 12.010);
                     }
                 }
             }
-        } else if (ctx.restrseqs[s].to_center == 2) {
-            for (int i = ctx.restrseqs[s].ai - 1; i < ctx.restrseqs[s].aj - 1; i++) {
-                if (ctx.heavy[i] || ctx.restrseqs[s].ih) {
-                    mass = ctx.catypes[ctx.atypes[i].code - 1].m;
+        } else if (restrseqs[s].to_center == 2) {
+            for (int i = restrseqs[s].ai - 1; i < restrseqs[s].aj - 1; i++) {
+                if (heavy[i] || restrseqs[s].ih) {
+                    mass = catypes[atypes[i].code - 1].m;
                     totmass += mass;
-                    dr.x += (ctx.coords[i].x - ctx.coords_top[i].x) * mass;
-                    dr.y += (ctx.coords[i].y - ctx.coords_top[i].y) * mass;
-                    dr.z += (ctx.coords[i].z - ctx.coords_top[i].z) * mass;
+                    dr.x += (coords[i].x - ctx.coords_init->cpu_data_p[i].x) * mass;
+                    dr.y += (coords[i].y - ctx.coords_init->cpu_data_p[i].y) * mass;
+                    dr.z += (coords[i].z - ctx.coords_init->cpu_data_p[i].z) * mass;
                 }
             }
 
@@ -66,28 +72,28 @@ void calc_restrseq_forces() {
                 ener = .5 * k * r2;
                 ctx.E_restraint.Upres += ener;
 
-                for (int i = ctx.restrseqs[s].ai - 1; i < ctx.restrseqs[s].aj - 1; i++) {
-                    if (ctx.heavy[i] || ctx.restrseqs[s].ih) {
-                        ctx.dvelocities[i].x += k * dr.x;
-                        ctx.dvelocities[i].y += k * dr.y;
-                        ctx.dvelocities[i].z += k * dr.z;
+                for (int i = restrseqs[s].ai - 1; i < restrseqs[s].aj - 1; i++) {
+                    if (heavy[i] || restrseqs[s].ih) {
+                        dvelocities[i].x += k * dr.x;
+                        dvelocities[i].y += k * dr.y;
+                        dvelocities[i].z += k * dr.z;
                     }
                 }
             }
         } else {
-            for (int i = ctx.restrseqs[s].ai - 1; i < ctx.restrseqs[s].aj - 1; i++) {
-                if (ctx.heavy[i] || ctx.restrseqs[s].ih) {
-                    dr.x = ctx.coords[i].x - ctx.coords_top[i].x;
-                    dr.y = ctx.coords[i].y - ctx.coords_top[i].y;
-                    dr.z = ctx.coords[i].z - ctx.coords_top[i].z;
+            for (int i = restrseqs[s].ai - 1; i < restrseqs[s].aj - 1; i++) {
+                if (heavy[i] || restrseqs[s].ih) {
+                    dr.x = coords[i].x - ctx.coords_init->cpu_data_p[i].x;
+                    dr.y = coords[i].y - ctx.coords_init->cpu_data_p[i].y;
+                    dr.z = coords[i].z - ctx.coords_init->cpu_data_p[i].z;
 
                     r2 = pow(dr.x, 2) + pow(dr.y, 2) + pow(dr.z, 2);
                     ener = .5 * k * r2;
                     ctx.E_restraint.Upres += ener;
 
-                    ctx.dvelocities[i].x += k * dr.x;
-                    ctx.dvelocities[i].y += k * dr.y;
-                    ctx.dvelocities[i].z += k * dr.z;
+                    dvelocities[i].x += k * dr.x;
+                    dvelocities[i].y += k * dr.y;
+                    dvelocities[i].z += k * dr.z;
                 }
             }
         }
diff --git a/src/core/cpu/src/cpu_restrwall_force.cpp b/src/core/cpu/src/cpu_restrwall_force.cpp
index 0c57038d..fd49749a 100644
--- a/src/core/cpu/src/cpu_restrwall_force.cpp
+++ b/src/core/cpu/src/cpu_restrwall_force.cpp
@@ -6,34 +6,38 @@
 
 void calc_restrwall_forces() {
     auto& ctx = Context::instance();
+    auto &coords = ctx.coords->cpu_data_p;
+    auto &dvelocities = ctx.dvelocities->cpu_data_p;
+    auto &restrwalls = ctx.restrwalls->cpu_data_p;
+    auto *heavy = ctx.heavy->cpu_data_p;
 
     double k, b, db, ener, dv, fexp;
     coord_t dr;
 
     for (int ir = 0; ir < ctx.n_restrwalls; ir++) {
-        k = ctx.restrwalls[ir].k;
-        for (int i = ctx.restrwalls[ir].ai - 1; i < ctx.restrwalls[ir].aj - 1; i++) {
-            if (ctx.heavy[i] || ctx.restrwalls[ir].ih) {
-                dr.x = ctx.coords[i].x - ctx.topo.solvent_center.x;
-                dr.y = ctx.coords[i].y - ctx.topo.solvent_center.y;
-                dr.z = ctx.coords[i].z - ctx.topo.solvent_center.z;
+        k = restrwalls[ir].k;
+        for (int i = restrwalls[ir].ai - 1; i < restrwalls[ir].aj - 1; i++) {
+            if (heavy[i] || restrwalls[ir].ih) {
+                dr.x = coords[i].x - ctx.topo.solvent_center.x;
+                dr.y = coords[i].y - ctx.topo.solvent_center.y;
+                dr.z = coords[i].z - ctx.topo.solvent_center.z;
 
                 b = sqrt(pow(dr.x, 2) + pow(dr.y, 2) + pow(dr.z, 2));
-                db = b - ctx.restrwalls[ir].d;
+                db = b - restrwalls[ir].d;
 
                 if (db > 0) {
-                    ener = .5 * k * pow(db, 2) - ctx.restrwalls[ir].dMorse;
+                    ener = .5 * k * pow(db, 2) - restrwalls[ir].dMorse;
                     dv = k * db / b;
                 } else {
-                    fexp = exp(ctx.restrwalls[ir].aMorse * db);
-                    ener = ctx.restrwalls[ir].dMorse * (fexp * fexp - 2 * fexp);
-                    dv = -2 * ctx.restrwalls[ir].dMorse * ctx.restrwalls[ir].aMorse * (fexp - fexp * fexp) / b;
+                    fexp = exp(restrwalls[ir].aMorse * db);
+                    ener = restrwalls[ir].dMorse * (fexp * fexp - 2 * fexp);
+                    dv = -2 * restrwalls[ir].dMorse * restrwalls[ir].aMorse * (fexp - fexp * fexp) / b;
                 }
                 ctx.E_restraint.Upres += ener;
 
-                ctx.dvelocities[i].x += dv * dr.x;
-                ctx.dvelocities[i].y += dv * dr.y;
-                ctx.dvelocities[i].z += dv * dr.z;
+                dvelocities[i].x += dv * dr.x;
+                dvelocities[i].y += dv * dr.y;
+                dvelocities[i].z += dv * dr.z;
             }
         }
     }
diff --git a/src/core/cpu/src/cpu_shake.cpp b/src/core/cpu/src/cpu_shake.cpp
index 7bc98b73..cb29a0f0 100644
--- a/src/core/cpu/src/cpu_shake.cpp
+++ b/src/core/cpu/src/cpu_shake.cpp
@@ -9,23 +9,26 @@
 
 int calc_shake_constraints(coord_t* coords, coord_t* xcoords) {
     auto& ctx = Context::instance();
+    auto *winv = ctx.winv->cpu_data_p;
+    auto *mol_n_shakes = ctx.mol_n_shakes->cpu_data_p;
+    auto *shake_bonds = ctx.shake_bonds->cpu_data_p;
     int total_iterations = 0;
     int shake = 0;
 
     for (int mol = 0; mol < ctx.n_molecules; mol++) {
-        if (ctx.mol_n_shakes[mol] == 0) {
+        if (mol_n_shakes[mol] == 0) {
             continue;
         }
         int n_iterations = 0;
 
         bool converged = false;
         do {
-            for (int i = 0; i < ctx.mol_n_shakes[mol]; i++) {
-                ctx.shake_bonds[shake + i].ready = false;
+            for (int i = 0; i < mol_n_shakes[mol]; i++) {
+                shake_bonds[shake + i].ready = false;
             }
 
-            for (int i = 0; i < ctx.mol_n_shakes[mol]; i++) {
-                shake_bond_t& shake_bond = ctx.shake_bonds[shake + i];
+            for (int i = 0; i < mol_n_shakes[mol]; i++) {
+                shake_bond_t& shake_bond = shake_bonds[shake + i];
                 if (!shake_bond.ready) {
                     const int ai = shake_bond.ai - 1;
                     const int aj = shake_bond.aj - 1;
@@ -45,22 +48,22 @@ int calc_shake_constraints(coord_t* coords, coord_t* xcoords) {
                     xxij.y = xcoords[ai].y - xcoords[aj].y;
                     xxij.z = xcoords[ai].z - xcoords[aj].z;
                     scp = xij.x * xxij.x + xij.y * xxij.y + xij.z * xxij.z;
-                    corr = diff / (2.0 * scp * (ctx.winv[ai] + ctx.winv[aj]));
-
-                    coords[ai].x += xxij.x * corr * ctx.winv[ai];
-                    coords[ai].y += xxij.y * corr * ctx.winv[ai];
-                    coords[ai].z += xxij.z * corr * ctx.winv[ai];
-                    coords[aj].x -= xxij.x * corr * ctx.winv[aj];
-                    coords[aj].y -= xxij.y * corr * ctx.winv[aj];
-                    coords[aj].z -= xxij.z * corr * ctx.winv[aj];
+                    corr = diff / (2.0 * scp * (winv[ai] + winv[aj]));
+
+                    coords[ai].x += xxij.x * corr * winv[ai];
+                    coords[ai].y += xxij.y * corr * winv[ai];
+                    coords[ai].z += xxij.z * corr * winv[ai];
+                    coords[aj].x -= xxij.x * corr * winv[aj];
+                    coords[aj].y -= xxij.y * corr * winv[aj];
+                    coords[aj].z -= xxij.z * corr * winv[aj];
                 }
             }
 
             n_iterations++;
 
             converged = true;
-            for (int i = 0; i < ctx.mol_n_shakes[mol]; i++) {
-                if (!ctx.shake_bonds[shake + i].ready) {
+            for (int i = 0; i < mol_n_shakes[mol]; i++) {
+                if (!shake_bonds[shake + i].ready) {
                     converged = false;
                     break;
                 }
@@ -68,9 +71,9 @@ int calc_shake_constraints(coord_t* coords, coord_t* xcoords) {
         } while (n_iterations < shake_max_iter && !converged);
 
         if (!converged) {
-            for (int i = 0; i < ctx.mol_n_shakes[mol]; i++) {
-                const int ai = ctx.shake_bonds[shake + i].ai - 1;
-                const int aj = ctx.shake_bonds[shake + i].aj - 1;
+            for (int i = 0; i < mol_n_shakes[mol]; i++) {
+                const int ai = shake_bonds[shake + i].ai - 1;
+                const int aj = shake_bonds[shake + i].aj - 1;
                 coord_t xxij;
                 double xxij2;
 
@@ -79,12 +82,12 @@ int calc_shake_constraints(coord_t* coords, coord_t* xcoords) {
                 xxij.z = xcoords[ai].z - xcoords[aj].z;
                 xxij2 = std::pow(xxij.x, 2) + std::pow(xxij.y, 2) + std::pow(xxij.z, 2);
                 std::printf(">>> Shake failed, i = %d,j = %d, d = %f, d0 = %f", ai, aj, std::sqrt(xxij2),
-                            ctx.shake_bonds[shake + i].dist2);
+                            shake_bonds[shake + i].dist2);
             }
             std::exit(EXIT_FAILURE);
         }
 
-        shake += ctx.mol_n_shakes[mol];
+        shake += mol_n_shakes[mol];
         total_iterations += n_iterations;
     }
 
@@ -94,37 +97,43 @@ int calc_shake_constraints(coord_t* coords, coord_t* xcoords) {
 
 void initial_shaking() {
     auto& ctx = Context::instance();
+    auto &coords = ctx.coords->cpu_data_p;
+    auto &velocities = ctx.velocities->cpu_data_p;
+    auto *xcoords = ctx.xcoords->cpu_data_p;
 
     for (int i = 0; i < ctx.n_atoms; i++) {
-        ctx.xcoords[i].x = ctx.coords[i].x;
-        ctx.xcoords[i].y = ctx.coords[i].y;
-        ctx.xcoords[i].z = ctx.coords[i].z;
+        xcoords[i].x = coords[i].x;
+        xcoords[i].y = coords[i].y;
+        xcoords[i].z = coords[i].z;
     }
-    calc_shake_constraints(ctx.coords.data(), ctx.xcoords.data());
+    calc_shake_constraints(coords, xcoords);
     for (int i = 0; i < ctx.n_atoms; i++) {
-        ctx.xcoords[i].x = ctx.coords[i].x - ctx.dt * ctx.velocities[i].x;
-        ctx.xcoords[i].y = ctx.coords[i].y - ctx.dt * ctx.velocities[i].y;
-        ctx.xcoords[i].z = ctx.coords[i].z - ctx.dt * ctx.velocities[i].z;
+        xcoords[i].x = coords[i].x - ctx.dt * velocities[i].x;
+        xcoords[i].y = coords[i].y - ctx.dt * velocities[i].y;
+        xcoords[i].z = coords[i].z - ctx.dt * velocities[i].z;
     }
-    calc_shake_constraints(ctx.xcoords.data(), ctx.coords.data());
+    calc_shake_constraints(xcoords, coords);
     for (int i = 0; i < ctx.n_atoms; i++) {
-        ctx.velocities[i].x = (ctx.coords[i].x - ctx.xcoords[i].x) / ctx.dt;
-        ctx.velocities[i].y = (ctx.coords[i].y - ctx.xcoords[i].y) / ctx.dt;
-        ctx.velocities[i].z = (ctx.coords[i].z - ctx.xcoords[i].z) / ctx.dt;
+        velocities[i].x = (coords[i].x - xcoords[i].x) / ctx.dt;
+        velocities[i].y = (coords[i].y - xcoords[i].y) / ctx.dt;
+        velocities[i].z = (coords[i].z - xcoords[i].z) / ctx.dt;
     }
 }
 
 void stop_cm_translation() {
     auto& ctx = Context::instance();
+    auto &atypes = ctx.atypes->cpu_data_p;
+    auto &catypes = ctx.catypes->cpu_data_p;
+    auto &velocities = ctx.velocities->cpu_data_p;
     double total_mass = 0;
     coord_t vcm = {};
 
     for (int ai = 0; ai < ctx.n_atoms; ai++) {
-        const double rmass = ctx.catypes[ctx.atypes[ai].code - 1].m;
+        const double rmass = catypes[atypes[ai].code - 1].m;
         total_mass += rmass;
-        vcm.x += ctx.velocities[ai].x * rmass;
-        vcm.y += ctx.velocities[ai].y;
-        vcm.z += ctx.velocities[ai].z;
+        vcm.x += velocities[ai].x * rmass;
+        vcm.y += velocities[ai].y;
+        vcm.z += velocities[ai].z;
     }
 
     vcm.x /= total_mass;
@@ -132,8 +141,8 @@ void stop_cm_translation() {
     vcm.z /= total_mass;
 
     for (int ai = 0; ai < ctx.n_atoms; ai++) {
-        ctx.velocities[ai].x -= vcm.x;
-        ctx.velocities[ai].y -= vcm.y;
-        ctx.velocities[ai].z -= vcm.z;
+        velocities[ai].x -= vcm.x;
+        velocities[ai].y -= vcm.y;
+        velocities[ai].z -= vcm.z;
     }
 }
diff --git a/src/core/cpu/src/cpu_temperature.cpp b/src/core/cpu/src/cpu_temperature.cpp
index 0542b418..f2080635 100644
--- a/src/core/cpu/src/cpu_temperature.cpp
+++ b/src/core/cpu/src/cpu_temperature.cpp
@@ -11,6 +11,10 @@
 
 void calc_temperature() {
     auto& ctx = Context::instance();
+    auto &atypes = ctx.atypes->cpu_data_p;
+    auto &catypes = ctx.catypes->cpu_data_p;
+    auto &velocities = ctx.velocities->cpu_data_p;
+    auto *excluded = ctx.excluded->cpu_data_p;
     printf("Ndegf = %f, Ndegfree = %f, n_excluded = %d, Ndegfree_solvent = %f, Ndegfree_solute = %f\n",
            ctx.Ndegf, ctx.Ndegfree, ctx.n_excluded, ctx.Ndegfree_solvent, ctx.Ndegfree_solute);
     ctx.Temp = 0;
@@ -23,10 +27,10 @@ void calc_temperature() {
 
     ctx.Temp = 0;
     for (int i = 0; i < ctx.n_atoms_solute; i++) {
-        mass_i = ctx.catypes[ctx.atypes[i].code - 1].m;
-        ener = .5 * mass_i * (pow(ctx.velocities[i].x, 2) + pow(ctx.velocities[i].y, 2) + pow(ctx.velocities[i].z, 2));
+        mass_i = catypes[atypes[i].code - 1].m;
+        ener = .5 * mass_i * (pow(velocities[i].x, 2) + pow(velocities[i].y, 2) + pow(velocities[i].z, 2));
         Temp_solute += ener;
-        if (!ctx.excluded[i]) {
+        if (!excluded[i]) {
             Tfree_solute += ener;
         } else {
             Texcl_solute += ener;
@@ -37,10 +41,10 @@ void calc_temperature() {
     }
 
     for (int i = ctx.n_atoms_solute; i < ctx.n_atoms; i++) {
-        mass_i = ctx.catypes[ctx.atypes[i].code - 1].m;
-        ener = .5 * mass_i * (pow(ctx.velocities[i].x, 2) + pow(ctx.velocities[i].y, 2) + pow(ctx.velocities[i].z, 2));
+        mass_i = catypes[atypes[i].code - 1].m;
+        ener = .5 * mass_i * (pow(velocities[i].x, 2) + pow(velocities[i].y, 2) + pow(velocities[i].z, 2));
         Temp_solvent += ener;
-        if (!ctx.excluded[i]) {
+        if (!excluded[i]) {
             Tfree_solvent += ener;
         } else {
             Texcl_solvent += ener;
@@ -66,4 +70,4 @@ void calc_temperature() {
         ctx.Tscale_solute = ctx.Tscale_solvent;
     }
     printf("Tscale = %f, tau_T = %f, Temp = %f, Tfree = %f\n", ctx.Tscale_solvent, ctx.tau_T, ctx.Temp, ctx.Tfree);
-}
\ No newline at end of file
+}
diff --git a/src/core/cpu/src/cpu_torsion_force.cpp b/src/core/cpu/src/cpu_torsion_force.cpp
index b352a31f..e8aaa2a3 100644
--- a/src/core/cpu/src/cpu_torsion_force.cpp
+++ b/src/core/cpu/src/cpu_torsion_force.cpp
@@ -7,6 +7,10 @@
 
 double calc_torsion_forces(int start, int end) {
     auto& ctx = Context::instance();
+    auto &torsions = ctx.torsions->cpu_data_p;
+    auto &ctorsions = ctx.ctorsions->cpu_data_p;
+    auto &coords = ctx.coords->cpu_data_p;
+    auto &dvelocities = ctx.dvelocities->cpu_data_p;
     int aii, aji, aki, ali;
 
     coord_t ai, aj, ak, al;
@@ -23,18 +27,18 @@ double calc_torsion_forces(int start, int end) {
     ctorsion_t ctors;
 
     for (int i = start; i < end; i++) {
-        t = ctx.torsions[i];
-        ctors = ctx.ctorsions[t.code - 1];
+        t = torsions[i];
+        ctors = ctorsions[t.code - 1];
 
         aii = t.ai - 1;
         aji = t.aj - 1;
         aki = t.ak - 1;
         ali = t.al - 1;
 
-        ai = ctx.coords[aii];
-        aj = ctx.coords[aji];
-        ak = ctx.coords[aki];
-        al = ctx.coords[ali];
+        ai = coords[aii];
+        aj = coords[aji];
+        ak = coords[aki];
+        al = coords[ali];
 
         rji.x = ai.x - aj.x;
         rji.y = ai.y - aj.y;
@@ -119,21 +123,21 @@ double calc_torsion_forces(int start, int end) {
         // Update energy and forces
         torsion += ener;
 
-        ctx.dvelocities[aii].x += dv * dpi.x;
-        ctx.dvelocities[aii].y += dv * dpi.y;
-        ctx.dvelocities[aii].z += dv * dpi.z;
+        dvelocities[aii].x += dv * dpi.x;
+        dvelocities[aii].y += dv * dpi.y;
+        dvelocities[aii].z += dv * dpi.z;
 
-        ctx.dvelocities[aji].x += dv * dpj.x;
-        ctx.dvelocities[aji].y += dv * dpj.y;
-        ctx.dvelocities[aji].z += dv * dpj.z;
+        dvelocities[aji].x += dv * dpj.x;
+        dvelocities[aji].y += dv * dpj.y;
+        dvelocities[aji].z += dv * dpj.z;
 
-        ctx.dvelocities[aki].x += dv * dpk.x;
-        ctx.dvelocities[aki].y += dv * dpk.y;
-        ctx.dvelocities[aki].z += dv * dpk.z;
+        dvelocities[aki].x += dv * dpk.x;
+        dvelocities[aki].y += dv * dpk.y;
+        dvelocities[aki].z += dv * dpk.z;
 
-        ctx.dvelocities[ali].x += dv * dpl.x;
-        ctx.dvelocities[ali].y += dv * dpl.y;
-        ctx.dvelocities[ali].z += dv * dpl.z;
+        dvelocities[ali].x += dv * dpl.x;
+        dvelocities[ali].y += dv * dpl.y;
+        dvelocities[ali].z += dv * dpl.z;
     }
 
     return torsion;
diff --git a/src/core/cuda/include/cuda_context.cuh b/src/core/cuda/include/cuda_context.cuh
deleted file mode 100644
index c3aad50c..00000000
--- a/src/core/cuda/include/cuda_context.cuh
+++ /dev/null
@@ -1,187 +0,0 @@
-#pragma once
-
-#include <cuda_runtime.h>
-
-#include <array>
-#include <map>
-#include <vector>
-
-#include "common/include/md_types.h"
-
-#include "cuda_utility.cuh"
-
-class CudaContext {
-   public:
-    /*
-    Common data
-    */
-    coord_t* d_coords = nullptr;
-    dvel_t* d_dvelocities = nullptr;
-    vel_t* d_velocities = nullptr;
-
-    /*
-    Used in cuda_angle_force.cu
-    */
-    angle_t* d_angles = nullptr;
-    cangle_t* d_cangles = nullptr;
-
-    /*
-    Used in cuda_bond_force.cu
-    */
-    bond_t* d_bonds = nullptr;
-    cbond_t* d_cbonds = nullptr;
-
-    /*
-    Used in cuda_improper2_force.cu
-    */
-    improper_t* d_impropers = nullptr;
-    cimproper_t* d_cimpropers = nullptr;
-
-    /*
-    Used in cuda_leapfrog.cu
-    */
-    atype_t* d_atypes = nullptr;
-    catype_t* d_catypes = nullptr;
-
-    /*
-    Used in cuda_shake_constraints.cu
-    */
-    int* d_mol_n_shakes = nullptr;
-    shake_bond_t* d_shake_bonds = nullptr;
-    double* d_winv = nullptr;
-    coord_t* d_xcoords = nullptr;
-
-    /*
-    Used in cuda_nonbonded_qq_force.cu
-    */
-    int* d_q_atoms = nullptr;
-    ccharge_t* d_q_charges = nullptr;
-    int* d_LJ_matrix = nullptr;
-    bool* d_excluded = nullptr;
-    q_elscale_t* d_q_elscales = nullptr;
-    atype_t* d_q_atypes = nullptr;
-    E_nonbonded_t* d_EQ_nonbond_qq = nullptr;
-    double* d_lambdas = nullptr;
-
-    /*
-    Used in cuda_polx_water_force.cu
-    */
-    shell_t* d_wshells = nullptr;
-    /*
-    Used in cuda_pshell_force.cu
-    */
-    bool* d_shell = nullptr;
-    coord_t* d_coords_top = nullptr;
-
-    /*
-    Used in cuda_restrang_force.cu
-    */
-    restrang_t* d_restrangs = nullptr;
-    E_restraint_t* d_EQ_restraint = nullptr;
-    restrdis_t* d_restrdists = nullptr;
-
-    /*
-    Used in cuda_restrpos_force.cu
-    */
-    restrpos_t* d_restrspos = nullptr;
-
-    /*
-    Used in cuda_restrseq_force.cu
-    */
-    restrseq_t* d_restrseqs = nullptr;
-    bool* d_heavy = nullptr;
-
-    /*
-    Used in cuda_restrwall_force.cu
-    */
-    restrwall_t* d_restrwalls = nullptr;
-
-    /*
-    Used in cuda_torsion_force.cu
-    */
-    torsion_t* d_torsions = nullptr;
-    ctorsion_t* d_ctorsions = nullptr;
-
-    /*
-    Used in cuda_nonbonded_pp_force.cu
-    */
-    ccharge_t* d_ccharges;
-    charge_t* d_charges;
-    int* d_p_atoms = nullptr;
-
-    /*
-    Other helper arrays
-    */
-    ccharge_t* d_charge_table_all;  // Device copy of h_charge_table_all
-    double* d_charge_pair_products = nullptr;
-    catype_t* d_catype_table_all;   // Device copy of h_catype_table_all
-    vdw_pair_param_t* d_catype_pair_params = nullptr;
-    ccharge_t* d_unified_ccharges = nullptr;
-    catype_t* d_unified_catypes = nullptr;
-    int3* d_ngbrs_14 = nullptr;
-    std::map<std::array<double, 4>, int> catype_to_type_host;
-    int n_charge_types = 0;
-    int zero_charge_type = -1;
-    int n_catype_types = 0;
-    int zero_catype_type = -1;
-    int* d_p_charge_types;
-    int* d_w_charge_types;
-    int* d_q_charge_types;  // [0, lambdas * n_qatoms) is the normal q_charge_type, [lambdas * n_qatoms, ... ) is the lambda-scaled q_charge_type]
-    
-    int* d_p_catype_types;
-    int* d_w_catype_types;
-    int* d_q_catype_types;  // [0, lambdas * n_qatoms) is the normal q_catype_type, [lambdas * n_qatoms, ... ) is the lambda-scaled q_catype_type]
-
-    int *d_p_atoms_list;
-    int *d_w_atoms_list;
-    int *d_q_atoms_list;
-    std::vector<int> h_p_atoms_list;
-    std::vector<int> h_w_atoms_list;
-    std::vector<int> h_q_atoms_list;
-
-    static CudaContext& instance() {
-        static CudaContext ctx;
-        return ctx;
-    }
-    void init();
-
-    template <typename T>
-    void sync_array_to_device(T* dst, const T* src, int count);
-
-    template <typename T>
-    void sync_array_to_host(T* dst, const T* src, int count);
-
-    void sync_all_to_device();
-    void sync_all_to_host();
-    void reset_energies();
-
-    std::array<double, 4> get_catype_key(const catype_t& catype) {
-        return {
-            catype.aii_normal,
-            catype.bii_normal,
-            catype.aii_1_4,
-            catype.bii_1_4};
-    }
-
-   private:
-    CudaContext() = default;
-
-    void free();
-
-    ~CudaContext() { free(); }
-    CudaContext(const CudaContext&) = delete;
-    CudaContext& operator=(const CudaContext&) = delete;
-
-    void initialize_charge_tables_host();
-    void initialize_catype_tables_host();
-    void initialize_atom_lists_host();
-    void initialize_ngbrs14_host();
-};
-template <typename T>
-void CudaContext::sync_array_to_device(T* dst, const T* src, int count) {
-    check_cuda(cudaMemcpy(dst, src, count * sizeof(T), cudaMemcpyHostToDevice));
-}
-template <typename T>
-void CudaContext::sync_array_to_host(T* dst, const T* src, int count) {
-    check_cuda(cudaMemcpy(dst, src, count * sizeof(T), cudaMemcpyDeviceToHost));
-}
diff --git a/src/core/cuda/include/cuda_handler.cuh b/src/core/cuda/include/cuda_handler.cuh
index ec34d0f4..68325c5b 100644
--- a/src/core/cuda/include/cuda_handler.cuh
+++ b/src/core/cuda/include/cuda_handler.cuh
@@ -1,7 +1,6 @@
 #pragma once
 
 #include "handler.h"
-#include "cuda_context.cuh"
 
 class CudaHandler : public Handler {
    public:
@@ -19,7 +18,6 @@ class CudaHandler : public Handler {
 
    protected:
     bool initialized_ = false;
-    CudaContext& ctx_ = CudaContext::instance();
 
     void calc_internal_forces(int iteration) override;
     void calc_nonbonded_forces() override;
diff --git a/src/core/cuda/include/cuda_utility.cuh b/src/core/cuda/include/cuda_utility.cuh
index 784cebf2..36767be0 100644
--- a/src/core/cuda/include/cuda_utility.cuh
+++ b/src/core/cuda/include/cuda_utility.cuh
@@ -1,12 +1,9 @@
 #pragma once
 
-#include <cuda_runtime.h>
-
 #include <math.h>
 
+#include "common/include/cuda_runtime_utility.h"
+
 __device__ inline double to_radians_device(double degrees) {
     return degrees * (M_PI / 180.0);
 }
-
-void check_cudaMalloc(void** devPtr, size_t size);
-void check_cuda(cudaError_t status);
diff --git a/src/core/cuda/src/cuda_angle_force.cu b/src/core/cuda/src/cuda_angle_force.cu
index 3ed51957..7c49cffb 100644
--- a/src/core/cuda/src/cuda_angle_force.cu
+++ b/src/core/cuda/src/cuda_angle_force.cu
@@ -1,6 +1,6 @@
 #include "cuda/include/cuda_angle_force.cuh"
-#include "cuda/include/cuda_context.cuh"
 #include "cuda/include/cuda_utility.cuh"
+#include "context.h"
 
 namespace CudaAngleForce {
 bool is_initialized = false;
@@ -77,11 +77,11 @@ double calc_angle_forces_host(int start, int end) {
     int blockSize = 256;
     int numBlocks = (N + blockSize - 1) / blockSize;
 
-    CudaContext& ctx = CudaContext::instance();
-    auto d_angles = ctx.d_angles;
-    auto d_coords = ctx.d_coords;
-    auto d_cangles = ctx.d_cangles;
-    auto d_dvelocities = ctx.d_dvelocities;
+    auto &host_ctx = Context::instance();
+    auto d_angles = host_ctx.angles->gpu_data_p;
+    auto d_coords = host_ctx.coords->gpu_data_p;
+    auto d_cangles = host_ctx.cangles->gpu_data_p;
+    auto d_dvelocities = host_ctx.dvelocities->gpu_data_p;
     // todo: now have to do that, after moving all to CudaContext, can remove it
     // ctx.sync_all_to_device();
 
diff --git a/src/core/cuda/src/cuda_bond_force.cu b/src/core/cuda/src/cuda_bond_force.cu
index 77fab9a8..9b31a660 100644
--- a/src/core/cuda/src/cuda_bond_force.cu
+++ b/src/core/cuda/src/cuda_bond_force.cu
@@ -1,5 +1,5 @@
 #include "cuda/include/cuda_bond_force.cuh"
-#include "cuda/include/cuda_context.cuh"
+#include "context.h"
 #include "cuda_utility.cuh"
 namespace CudaBondForce {
 bool is_initialized = false;
@@ -43,11 +43,11 @@ double calc_bond_forces_host(int start, int end) {
     double energy = 0.0;
     cudaMemcpy(d_energy_sum, &energy, sizeof(double), cudaMemcpyHostToDevice);
 
-    CudaContext& context = CudaContext::instance();
-    bond_t* d_bonds = context.d_bonds;
-    coord_t* d_coords = context.d_coords;
-    cbond_t* d_cbonds = context.d_cbonds;
-    dvel_t* d_dvelocities = context.d_dvelocities;
+    auto& host_ctx = Context::instance();
+    bond_t* d_bonds = host_ctx.bonds->gpu_data_p;
+    coord_t* d_coords = host_ctx.coords->gpu_data_p;
+    cbond_t* d_cbonds = host_ctx.cbonds->gpu_data_p;
+    dvel_t* d_dvelocities = host_ctx.dvelocities->gpu_data_p;
 
     calc_bond_forces_kernel<<<numBlocks, blockSize>>>(start, end, d_bonds, d_coords, d_cbonds, d_dvelocities, d_energy_sum);
     cudaDeviceSynchronize();
diff --git a/src/core/cuda/src/cuda_context.cu b/src/core/cuda/src/cuda_context.cu
index 7092d017..a1d95e5c 100644
--- a/src/core/cuda/src/cuda_context.cu
+++ b/src/core/cuda/src/cuda_context.cu
@@ -1,327 +1,153 @@
-#include "cuda_context.cuh"
+#include <algorithm>
+#include <map>
+#include <vector>
+
 #include "context.h"
 #include "common/include/constants.h"
 #include "common/include/vdw_rules.h"
 
-void CudaContext::init() {
-    auto& host = Context::instance();
-
-    check_cudaMalloc((void**)&d_coords, sizeof(coord_t) * host.n_atoms);
-    check_cudaMalloc((void**)&d_dvelocities, sizeof(dvel_t) * host.n_atoms);
-    check_cudaMalloc((void**)&d_velocities, sizeof(vel_t) * host.n_atoms);
-    check_cudaMalloc((void**)&d_angles, sizeof(angle_t) * host.n_angles);
-    check_cudaMalloc((void**)&d_cangles, sizeof(cangle_t) * host.n_cangles);
-    check_cudaMalloc((void**)&d_bonds, sizeof(bond_t) * host.n_bonds);
-    check_cudaMalloc((void**)&d_cbonds, sizeof(cbond_t) * host.n_cbonds);
-    check_cudaMalloc((void**)&d_impropers, sizeof(improper_t) * host.n_impropers);
-    check_cudaMalloc((void**)&d_cimpropers, sizeof(cimproper_t) * host.n_cimpropers);
-
-    check_cudaMalloc((void**)&d_mol_n_shakes, sizeof(int) * host.n_molecules);
-    check_cudaMalloc((void**)&d_shake_bonds, sizeof(shake_bond_t) * host.n_shake_constraints);
-    check_cudaMalloc((void**)&d_winv, sizeof(double) * host.n_atoms);
-    check_cudaMalloc((void**)&d_xcoords, sizeof(coord_t) * host.n_atoms);
-
-    check_cudaMalloc((void**)&d_atypes, sizeof(atype_t) * host.n_atypes);
-    check_cudaMalloc((void**)&d_catypes, sizeof(catype_t) * host.n_catypes);
-
-    check_cudaMalloc((void**)&d_q_atoms, sizeof(int) * host.n_qatoms);
-    check_cudaMalloc((void**)&d_q_charges, sizeof(ccharge_t) * host.n_qatoms * host.n_lambdas);
-    check_cudaMalloc((void**)&d_LJ_matrix, sizeof(int) * host.n_atoms_solute * host.n_atoms_solute);
-    check_cudaMalloc((void**)&d_excluded, sizeof(bool) * host.n_atoms);
-    check_cudaMalloc((void**)&d_q_elscales, sizeof(q_elscale_t) * host.n_qelscales);
-    check_cudaMalloc((void**)&d_q_atypes, sizeof(atype_t) * host.n_qatoms * host.n_lambdas);
-    check_cudaMalloc((void**)&d_EQ_nonbond_qq, sizeof(E_nonbonded_t) * host.n_lambdas);
-    check_cudaMalloc((void**)&d_lambdas, sizeof(double) * host.n_lambdas);
-
-    check_cudaMalloc((void**)&d_wshells, host.n_shells * sizeof(shell_t));
-    check_cudaMalloc((void**)&d_shell, sizeof(bool) * host.n_atoms);
-    check_cudaMalloc((void**)&d_coords_top, sizeof(coord_t) * host.n_atoms);
-
-    check_cudaMalloc((void**)&d_restrangs, sizeof(restrang_t) * host.n_restrangs);
-    check_cudaMalloc((void**)&d_EQ_restraint, sizeof(E_restraint_t) * host.n_lambdas);
-    check_cudaMalloc((void**)&d_restrdists, sizeof(restrdis_t) * host.n_restrdists);
-
-    check_cudaMalloc((void**)&d_restrspos, sizeof(restrpos_t) * host.n_restrspos);
-
-    check_cudaMalloc((void**)&d_restrseqs, sizeof(restrseq_t) * host.n_restrseqs);
-    check_cudaMalloc((void**)&d_heavy, sizeof(bool) * host.n_atoms);
-
-    check_cudaMalloc((void**)&d_restrwalls, sizeof(restrwall_t) * host.n_restrwalls);
-
-    check_cudaMalloc((void**)&d_torsions, sizeof(torsion_t) * host.n_torsions);
-    check_cudaMalloc((void**)&d_ctorsions, sizeof(ctorsion_t) * host.n_ctorsions);
-
-    check_cudaMalloc((void**)&d_ccharges, sizeof(ccharge_t) * host.n_ccharges);
-    check_cudaMalloc((void**)&d_charges, sizeof(charge_t) * host.n_charges);
-    check_cudaMalloc((void**)&d_p_atoms, sizeof(int) * host.n_patoms);
-    check_cudaMalloc((void**)&d_unified_ccharges, sizeof(ccharge_t) * host.unified_ccharges.size());
-    check_cudaMalloc((void**)&d_unified_catypes, sizeof(catype_t) * host.unified_catypes.size());
-
-    initialize_atom_lists_host();
-    initialize_ngbrs14_host();
-    initialize_charge_tables_host();
-    initialize_catype_tables_host();
-
-    sync_all_to_device();
+namespace {
+template <typename T>
+std::unique_ptr<HostDeviceBuffer<T>> make_host_device_buffer_from_vector(
+    const std::vector<T>& src,
+    bool run_gpu) {
+    auto buffer = std::make_unique<HostDeviceBuffer<T>>(src.size(), true, run_gpu);
+    if (!src.empty()) {
+        std::copy(src.begin(), src.end(), buffer->cpu_data_p);
+    }
+    return buffer;
 }
-
-void CudaContext::sync_all_to_device() {
-    auto& host = Context::instance();
-
-    sync_array_to_device<coord_t>(d_coords, host.coords.data(), host.n_atoms);
-    sync_array_to_device<dvel_t>(d_dvelocities, host.dvelocities.data(), host.n_atoms);
-    sync_array_to_device<vel_t>(d_velocities, host.velocities.data(), host.n_atoms);
-    sync_array_to_device<angle_t>(d_angles, host.angles.data(), host.n_angles);
-    sync_array_to_device<cangle_t>(d_cangles, host.cangles.data(), host.n_cangles);
-    sync_array_to_device<bond_t>(d_bonds, host.bonds.data(), host.n_bonds);
-    sync_array_to_device<cbond_t>(d_cbonds, host.cbonds.data(), host.n_cbonds);
-    sync_array_to_device<improper_t>(d_impropers, host.impropers.data(), host.n_impropers);
-    sync_array_to_device<cimproper_t>(d_cimpropers, host.cimpropers.data(), host.n_cimpropers);
-
-    sync_array_to_device<int>(d_mol_n_shakes, host.mol_n_shakes.data(), host.n_molecules);
-    sync_array_to_device<shake_bond_t>(d_shake_bonds, host.shake_bonds.data(), host.n_shake_constraints);
-    sync_array_to_device<double>(d_winv, host.winv.data(), host.n_atoms);
-    sync_array_to_device<coord_t>(d_xcoords, host.xcoords.data(), host.n_atoms);
-
-    sync_array_to_device<atype_t>(d_atypes, host.atypes.data(), host.n_atypes);
-    sync_array_to_device<catype_t>(d_catypes, host.catypes.data(), host.n_catypes);
-
-    sync_array_to_device<int>(d_q_atoms, host.q_atoms.data(), host.n_qatoms);
-    sync_array_to_device<ccharge_t>(d_q_charges, host.q_charges.data(), host.n_qatoms * host.n_lambdas);
-    sync_array_to_device<int>(d_LJ_matrix, host.LJ_matrix.data(), host.n_atoms_solute * host.n_atoms_solute);
-    sync_array_to_device<bool>(d_excluded, host.excluded.get(), host.n_atoms);
-    sync_array_to_device<q_elscale_t>(d_q_elscales, host.q_elscales.data(), host.n_qelscales);
-    sync_array_to_device<atype_t>(d_q_atypes, host.q_atypes.data(), host.n_qatoms * host.n_lambdas);
-    sync_array_to_device<E_nonbonded_t>(d_EQ_nonbond_qq, host.EQ_nonbond_qq.data(), host.n_lambdas);
-    sync_array_to_device<double>(d_lambdas, host.lambdas.data(), host.n_lambdas);
-    sync_array_to_device<shell_t>(d_wshells, host.wshells.data(), host.n_shells);
-
-    sync_array_to_device<bool>(d_shell, host.shell.get(), host.n_atoms);
-    sync_array_to_device<coord_t>(d_coords_top, host.coords_top.data(), host.n_atoms);
-
-    sync_array_to_device<restrang_t>(d_restrangs, host.restrangs.data(), host.n_restrangs);
-    sync_array_to_device<E_restraint_t>(d_EQ_restraint, host.EQ_restraint.data(), host.n_lambdas);
-    sync_array_to_device<restrdis_t>(d_restrdists, host.restrdists.data(), host.n_restrdists);
-
-    sync_array_to_device<restrpos_t>(d_restrspos, host.restrspos.data(), host.n_restrspos);
-
-    sync_array_to_device<restrseq_t>(d_restrseqs, host.restrseqs.data(), host.n_restrseqs);
-    sync_array_to_device<bool>(d_heavy, host.heavy.get(), host.n_atoms);
-    sync_array_to_device<restrwall_t>(d_restrwalls, host.restrwalls.data(), host.n_restrwalls);
-
-    sync_array_to_device<torsion_t>(d_torsions, host.torsions.data(), host.n_torsions);
-    sync_array_to_device<ctorsion_t>(d_ctorsions, host.ctorsions.data(), host.n_ctorsions);
-
-    sync_array_to_device<ccharge_t>(d_ccharges, host.ccharges.data(), host.n_ccharges);
-    sync_array_to_device<charge_t>(d_charges, host.charges.data(), host.n_charges);
-    sync_array_to_device<int>(d_p_atoms, host.p_atoms.data(), host.n_patoms);
-    sync_array_to_device<ccharge_t>(d_unified_ccharges, host.unified_ccharges.data(), host.unified_ccharges.size());
-    sync_array_to_device<catype_t>(d_unified_catypes, host.unified_catypes.data(), host.unified_catypes.size());
+}  // namespace
+
+void Context::cuda_initialize_helpers() {
+    cuda_initialize_atom_lists_host();
+    cuda_initialize_ngbrs14_host();
+    cuda_initialize_charge_tables_host();
+    cuda_initialize_catype_tables_host();
+    cuda_sync_all_to_device();
 }
 
-void CudaContext::sync_all_to_host() {
-    auto& host = Context::instance();
-
-    sync_array_to_host<coord_t>(host.coords.data(), d_coords, host.n_atoms);
-    sync_array_to_host<dvel_t>(host.dvelocities.data(), d_dvelocities, host.n_atoms);
-    sync_array_to_host<vel_t>(host.velocities.data(), d_velocities, host.n_atoms);
-    sync_array_to_host<angle_t>(host.angles.data(), d_angles, host.n_angles);
-    sync_array_to_host<cangle_t>(host.cangles.data(), d_cangles, host.n_cangles);
-    sync_array_to_host<bond_t>(host.bonds.data(), d_bonds, host.n_bonds);
-    sync_array_to_host<cbond_t>(host.cbonds.data(), d_cbonds, host.n_cbonds);
-    sync_array_to_host<improper_t>(host.impropers.data(), d_impropers, host.n_impropers);
-    sync_array_to_host<cimproper_t>(host.cimpropers.data(), d_cimpropers, host.n_cimpropers);
-
-    sync_array_to_host<int>(host.mol_n_shakes.data(), d_mol_n_shakes, host.n_molecules);
-    sync_array_to_host<shake_bond_t>(host.shake_bonds.data(), d_shake_bonds, host.n_shake_constraints);
-    sync_array_to_host<double>(host.winv.data(), d_winv, host.n_atoms);
-    sync_array_to_host<coord_t>(host.xcoords.data(), d_xcoords, host.n_atoms);
-
-    sync_array_to_host<atype_t>(host.atypes.data(), d_atypes, host.n_atypes);
-    sync_array_to_host<catype_t>(host.catypes.data(), d_catypes, host.n_catypes);
-
-    sync_array_to_host<int>(host.q_atoms.data(), d_q_atoms, host.n_qatoms);
-    sync_array_to_host<ccharge_t>(host.q_charges.data(), d_q_charges, host.n_qatoms * host.n_lambdas);
-    sync_array_to_host<int>(host.LJ_matrix.data(), d_LJ_matrix, host.n_atoms_solute * host.n_atoms_solute);
-    sync_array_to_host<bool>(host.excluded.get(), d_excluded, host.n_atoms);
-    sync_array_to_host<q_elscale_t>(host.q_elscales.data(), d_q_elscales, host.n_qelscales);
-    sync_array_to_host<atype_t>(host.q_atypes.data(), d_q_atypes, host.n_qatoms * host.n_lambdas);
-    sync_array_to_host<E_nonbonded_t>(host.EQ_nonbond_qq.data(), d_EQ_nonbond_qq, host.n_lambdas);
-    sync_array_to_host<double>(host.lambdas.data(), d_lambdas, host.n_lambdas);
-    sync_array_to_host<shell_t>(host.wshells.data(), d_wshells, host.n_shells);
-    sync_array_to_host<bool>(host.shell.get(), d_shell, host.n_atoms);
-    sync_array_to_host<coord_t>(host.coords_top.data(), d_coords_top, host.n_atoms);
-
-    sync_array_to_host<restrang_t>(host.restrangs.data(), d_restrangs, host.n_restrangs);
-    sync_array_to_host<E_restraint_t>(host.EQ_restraint.data(), d_EQ_restraint, host.n_lambdas);
-    sync_array_to_host<restrdis_t>(host.restrdists.data(), d_restrdists, host.n_restrdists);
-
-    sync_array_to_host<restrpos_t>(host.restrspos.data(), d_restrspos, host.n_restrspos);
-
-    sync_array_to_host<restrseq_t>(host.restrseqs.data(), d_restrseqs, host.n_restrseqs);
-    sync_array_to_host<bool>(host.heavy.get(), d_heavy, host.n_atoms);
-    sync_array_to_host<restrwall_t>(host.restrwalls.data(), d_restrwalls, host.n_restrwalls);
-
-    sync_array_to_host<torsion_t>(host.torsions.data(), d_torsions, host.n_torsions);
-    sync_array_to_host<ctorsion_t>(host.ctorsions.data(), d_ctorsions, host.n_ctorsions);
-
-    sync_array_to_host<ccharge_t>(host.ccharges.data(), d_ccharges, host.n_ccharges);
-    sync_array_to_host<charge_t>(host.charges.data(), d_charges, host.n_charges);
-    sync_array_to_host<int>(host.p_atoms.data(), d_p_atoms, host.n_patoms);
-    sync_array_to_host<ccharge_t>(host.unified_ccharges.data(), d_unified_ccharges, host.unified_ccharges.size());
-    sync_array_to_host<catype_t>(host.unified_catypes.data(), d_unified_catypes, host.unified_catypes.size());
+void Context::cuda_sync_all_to_device() {
+    coords->upload();
+    dvelocities->upload();
+    velocities->upload();
+    LJ_matrix->upload();
+    xcoords->upload();
+    winv->upload();
+    mol_n_shakes->upload();
+    shake_bonds->upload();
+    heavy->upload();
+    excluded->upload();
+    q_elscales->upload();
+    shell->upload();
+    wshells->upload();
+    lambdas->upload();
+    EQ_restraint->upload();
+    unified_ccharges->upload();
+    unified_catypes->upload();
+    ngbrs_14->upload();
+    p_atoms_list->upload();
+    w_atoms_list->upload();
+    q_atoms_list->upload();
+    charge_table_all->upload();
+    charge_pair_products->upload();
+    p_charge_types->upload();
+    w_charge_types->upload();
+    q_charge_types->upload();
+    catype_table_all->upload();
+    catype_pair_params->upload();
+    p_catype_types->upload();
+    w_catype_types->upload();
+    q_catype_types->upload();
 }
 
-void CudaContext::free() {
-    cudaFree(d_coords);
-    cudaFree(d_dvelocities);
-    cudaFree(d_velocities);
-    cudaFree(d_angles);
-    cudaFree(d_cangles);
-    cudaFree(d_bonds);
-    cudaFree(d_cbonds);
-    cudaFree(d_impropers);
-    cudaFree(d_cimpropers);
-
-    cudaFree(d_atypes);
-    cudaFree(d_catypes);
-
-    cudaFree(d_mol_n_shakes);
-    cudaFree(d_shake_bonds);
-    cudaFree(d_winv);
-    cudaFree(d_xcoords);
-
-    cudaFree(d_q_atoms);
-    cudaFree(d_q_charges);
-    cudaFree(d_LJ_matrix);
-    cudaFree(d_excluded);
-    cudaFree(d_q_elscales);
-    cudaFree(d_q_atypes);
-    cudaFree(d_EQ_nonbond_qq);
-    cudaFree(d_lambdas);
-
-    cudaFree(d_wshells);
-    cudaFree(d_shell);
-    cudaFree(d_coords_top);
-
-    cudaFree(d_restrangs);
-    cudaFree(d_EQ_restraint);
-    cudaFree(d_restrdists);
-
-    cudaFree(d_restrspos);
-
-    cudaFree(d_restrseqs);
-    cudaFree(d_heavy);
-
-    cudaFree(d_restrwalls);
-
-    cudaFree(d_torsions);
-    cudaFree(d_ctorsions);
-
-    cudaFree(d_ccharges);
-    cudaFree(d_charges);
-    cudaFree(d_p_atoms);
-    cudaFree(d_charge_table_all);
-    cudaFree(d_charge_pair_products);
-    cudaFree(d_catype_table_all);
-    cudaFree(d_catype_pair_params);
-    cudaFree(d_unified_ccharges);
-    cudaFree(d_unified_catypes);
-    cudaFree(d_ngbrs_14);
-    cudaFree(d_p_charge_types);
-    cudaFree(d_w_charge_types);
-    cudaFree(d_q_charge_types);
-    cudaFree(d_p_catype_types);
-    cudaFree(d_w_catype_types);
-    cudaFree(d_q_catype_types);
-    cudaFree(d_p_atoms_list);
-    cudaFree(d_w_atoms_list);
-    cudaFree(d_q_atoms_list);
-
-    d_charge_table_all = nullptr;
-    d_charge_pair_products = nullptr;
-    d_catype_table_all = nullptr;
-    d_catype_pair_params = nullptr;
-    d_unified_ccharges = nullptr;
-    d_unified_catypes = nullptr;
-    d_ngbrs_14 = nullptr;
+
+void Context::cuda_free_helpers() {
+    unified_ccharges.reset();
+    unified_catypes.reset();
+    ngbrs_14.reset();
+    p_atoms_list.reset();
+    w_atoms_list.reset();
+    q_atoms_list.reset();
+    charge_table_all.reset();
+    charge_pair_products.reset();
+    p_charge_types.reset();
+    w_charge_types.reset();
+    q_charge_types.reset();
+    catype_table_all.reset();
+    catype_pair_params.reset();
+    p_catype_types.reset();
+    w_catype_types.reset();
+    q_catype_types.reset();
+    catype_to_type_host.clear();
     n_charge_types = 0;
     zero_charge_type = -1;
     n_catype_types = 0;
     zero_catype_type = -1;
-    d_p_charge_types = nullptr;
-    d_w_charge_types = nullptr;
-    d_q_charge_types = nullptr;
-    d_p_catype_types = nullptr;
-    d_w_catype_types = nullptr;
-    d_q_catype_types = nullptr;
-    d_p_atoms_list = nullptr;
-    d_w_atoms_list = nullptr;
-    d_q_atoms_list = nullptr;
 }
 
-void CudaContext::reset_energies() {
-    auto& host = Context::instance();
-    cudaMemset(d_dvelocities, 0, sizeof(dvel_t) * host.n_atoms);
-    cudaMemset(d_EQ_nonbond_qq, 0, sizeof(E_nonbonded_t) * host.n_lambdas);
-    cudaMemset(d_EQ_restraint, 0, sizeof(E_restraint_t) * host.n_lambdas);
+void Context::cuda_reset_energies() {
+    cudaMemset(dvelocities->gpu_data_p, 0, sizeof(dvel_t) * n_atoms);
+    cudaMemset(EQ_restraint->gpu_data_p, 0, sizeof(E_restraint_t) * n_lambdas);
 }
 
-void CudaContext::initialize_atom_lists_host() {
-    auto& host = Context::instance();
+void Context::cuda_initialize_atom_lists_host() {
+    auto *excluded = this->excluded->cpu_data_p;
 
-    h_p_atoms_list.clear();
-    for (int i = 0; i < host.n_patoms; i++) {
-        int id = host.p_atoms[i];
-        if (!host.excluded[id]) {
+    std::vector<int> h_p_atoms_list;
+    h_p_atoms_list.reserve(n_patoms);
+    for (int i = 0; i < n_patoms; i++) {
+        int id = p_atoms[i];
+        if (!excluded[id]) {
             h_p_atoms_list.push_back(id);
         }
     }
 
-    h_w_atoms_list.clear();
-    for (int i = host.n_atoms_solute; i < host.n_atoms; i++) {
-        int id = i;
-        if (!host.excluded[id]) {
-            h_w_atoms_list.push_back(id);
+    std::vector<int> h_w_atoms_list;
+    h_w_atoms_list.reserve(n_atoms - n_atoms_solute);
+    for (int i = n_atoms_solute; i < n_atoms; i++) {
+        if (!excluded[i]) {
+            h_w_atoms_list.push_back(i);
         }
     }
-    printf("Number of water atoms: %d, number of all water atoms: %d\n", (int)h_w_atoms_list.size(), host.n_atoms - host.n_atoms_solute);
+    printf("Number of water atoms: %d, number of all water atoms: %d\n", (int)h_w_atoms_list.size(), n_atoms - n_atoms_solute);
 
-    h_q_atoms_list.clear();
-    for (int i = 0; i < host.n_qatoms; i++) {
-        int id = host.q_atoms[i];
-        if (!host.excluded[id]) {
+    std::vector<int> h_q_atoms_list;
+    h_q_atoms_list.reserve(n_qatoms);
+    for (int i = 0; i < n_qatoms; i++) {
+        int id = q_atoms[i];
+        if (!excluded[id]) {
             h_q_atoms_list.push_back(id);
         }
     }
 
-    check_cudaMalloc((void**)&d_p_atoms_list, sizeof(int) * h_p_atoms_list.size());
-    cudaMemcpy(d_p_atoms_list, h_p_atoms_list.data(), sizeof(int) * h_p_atoms_list.size(), cudaMemcpyHostToDevice);
-    check_cudaMalloc((void**)&d_w_atoms_list, sizeof(int) * h_w_atoms_list.size());
-    cudaMemcpy(d_w_atoms_list, h_w_atoms_list.data(), sizeof(int) * h_w_atoms_list.size(), cudaMemcpyHostToDevice);
-    check_cudaMalloc((void**)&d_q_atoms_list, sizeof(int) * h_q_atoms_list.size());
-    cudaMemcpy(d_q_atoms_list, h_q_atoms_list.data(), sizeof(int) * h_q_atoms_list.size(), cudaMemcpyHostToDevice);
+    p_atoms_list = make_host_device_buffer_from_vector(h_p_atoms_list, run_gpu);
+    w_atoms_list = make_host_device_buffer_from_vector(h_w_atoms_list, run_gpu);
+    q_atoms_list = make_host_device_buffer_from_vector(h_q_atoms_list, run_gpu);
 }
 
-void CudaContext::initialize_ngbrs14_host() {
-    auto& host = Context::instance();
-
-    if (!host.ngbrs_14.empty()) {
-        check_cudaMalloc((void**)&d_ngbrs_14, sizeof(int3) * host.ngbrs_14.size());
-        cudaMemcpy(d_ngbrs_14, host.ngbrs_14.data(), sizeof(int3) * host.ngbrs_14.size(), cudaMemcpyHostToDevice);
+void Context::cuda_initialize_ngbrs14_host() {
+    if (ngbrs_14 == nullptr) {
+        ngbrs_14 = std::make_unique<HostDeviceBuffer<int3>>(0, true, run_gpu);
     }
 }
 
-void CudaContext::initialize_charge_tables_host() {
-    auto& host = Context::instance();
+void Context::cuda_initialize_charge_tables_host() {
+    auto *excluded = this->excluded->cpu_data_p;
+    auto &charges = this->charges->cpu_data_p;
+    auto &ccharges = this->ccharges->cpu_data_p;
+    auto *lambda_values = lambdas->cpu_data_p;
+    auto *p_atoms_cpu = p_atoms_list->cpu_data_p;
+    auto *w_atoms_cpu = w_atoms_list->cpu_data_p;
+    auto *q_atoms_cpu = q_atoms_list->cpu_data_p;
 
     std::map<double, int> charge_to_type_host;
-    std::vector<ccharge_t> h_charge_table_all;  // h_charge_table_all[charge type] = (code, charge)
+    std::vector<ccharge_t> h_charge_table_all;
 
     auto add_charge = [&](double charge) -> int {
         if (charge_to_type_host.count(charge) == 0) {
-            int sz = h_charge_table_all.size();
-            ccharge_t new_ccharge;
+            int sz = static_cast<int>(h_charge_table_all.size());
+            ccharge_t new_ccharge = {};
             new_ccharge.code = sz;
             new_ccharge.charge = charge;
             h_charge_table_all.push_back(new_ccharge);
@@ -330,16 +156,14 @@ void CudaContext::initialize_charge_tables_host() {
         return charge_to_type_host[charge];
     };
 
-    for (int i = 0; i < host.n_ccharges; i++) {
-        double charge = host.ccharges[i].charge;
-        add_charge(charge);
+    for (int i = 0; i < n_ccharges; i++) {
+        add_charge(ccharges[i].charge);
     }
-    for (int state = 0; state < host.n_lambdas; state++) {
-        for (int i = 0; i < host.n_qatoms; i++) {
-            double charge = host.q_charges[i + host.n_qatoms * state].charge;
-            add_charge(charge);
-            charge *= host.lambdas[state];
+    for (int state = 0; state < n_lambdas; state++) {
+        for (int i = 0; i < n_qatoms; i++) {
+            double charge = q_charges[i + n_qatoms * state].charge;
             add_charge(charge);
+            add_charge(charge * lambda_values[state]);
         }
     }
 
@@ -353,61 +177,63 @@ void CudaContext::initialize_charge_tables_host() {
         }
     }
 
-    std::vector<int> p_charge_types(h_p_atoms_list.size());
-    for (int i = 0; i < static_cast<int>(h_p_atoms_list.size()); i++) {
-        int id = h_p_atoms_list[i];
-        double charge = host.ccharges[host.charges[id].code - 1].charge;
-        p_charge_types[i] = charge_to_type_host[charge];
+    std::vector<int> p_charge_types_cpu(p_atoms_list->length);
+    for (int i = 0; i < static_cast<int>(p_atoms_list->length); i++) {
+        const int id = p_atoms_cpu[i];
+        const double charge = ccharges[charges[id].code - 1].charge;
+        p_charge_types_cpu[i] = charge_to_type_host[charge];
     }
 
-    int h_q_atoms_size = h_q_atoms_list.size();
-    std::vector<int> q_charge_types(h_q_atoms_size * host.n_lambdas);
+    std::vector<int> q_charge_types_cpu(q_atoms_list->length * n_lambdas);
     std::map<int, int> q_idx;
-    for (int i = 0; i < host.n_qatoms; i++) {
-        int id = host.q_atoms[i];
-        if (!host.excluded[id]) {
+    for (int i = 0; i < n_qatoms; i++) {
+        int id = q_atoms[i];
+        if (!excluded[id]) {
             q_idx[id] = i;
         }
     }
 
-    for (int state = 0; state < host.n_lambdas; state++) {
-        for (int i = 0; i < h_q_atoms_size; i++) {
-            int id = h_q_atoms_list[i];
-            double charge = host.q_charges[q_idx[id] + host.n_qatoms * state].charge;
-            q_charge_types[state * h_q_atoms_size + i] = charge_to_type_host[charge];
+    for (int state = 0; state < n_lambdas; state++) {
+        for (int i = 0; i < static_cast<int>(q_atoms_list->length); i++) {
+            const int id = q_atoms_cpu[i];
+            const double charge = q_charges[q_idx[id] + n_qatoms * state].charge;
+            q_charge_types_cpu[state * q_atoms_list->length + i] = charge_to_type_host[charge];
         }
     }
 
-    int h_w_atoms_size = h_w_atoms_list.size();
-    std::vector<int> w_charge_types(h_w_atoms_size);
-    for (int i = 0; i < h_w_atoms_size; i++) {
-        int id = h_w_atoms_list[i];
-        double charge = host.ccharges[host.charges[id].code - 1].charge;
-        w_charge_types[i] = charge_to_type_host[charge];
+    std::vector<int> w_charge_types_cpu(w_atoms_list->length);
+    for (int i = 0; i < static_cast<int>(w_atoms_list->length); i++) {
+        const int id = w_atoms_cpu[i];
+        const double charge = ccharges[charges[id].code - 1].charge;
+        w_charge_types_cpu[i] = charge_to_type_host[charge];
     }
 
-    check_cudaMalloc((void**)&d_charge_table_all, sizeof(ccharge_t) * h_charge_table_all.size());
-    cudaMemcpy(d_charge_table_all, h_charge_table_all.data(), sizeof(ccharge_t) * h_charge_table_all.size(), cudaMemcpyHostToDevice);
-    check_cudaMalloc((void**)&d_charge_pair_products, sizeof(double) * h_charge_pair_products.size());
-    cudaMemcpy(d_charge_pair_products, h_charge_pair_products.data(), sizeof(double) * h_charge_pair_products.size(), cudaMemcpyHostToDevice);
-    check_cudaMalloc((void**)&d_p_charge_types, sizeof(int) * p_charge_types.size());
-    cudaMemcpy(d_p_charge_types, p_charge_types.data(), sizeof(int) * p_charge_types.size(), cudaMemcpyHostToDevice);
-    check_cudaMalloc((void**)&d_q_charge_types, sizeof(int) * q_charge_types.size());
-    cudaMemcpy(d_q_charge_types, q_charge_types.data(), sizeof(int) * q_charge_types.size(), cudaMemcpyHostToDevice);
-    check_cudaMalloc((void**)&d_w_charge_types, sizeof(int) * w_charge_types.size());
-    cudaMemcpy(d_w_charge_types, w_charge_types.data(), sizeof(int) * w_charge_types.size(), cudaMemcpyHostToDevice);
+    charge_table_all = make_host_device_buffer_from_vector(h_charge_table_all, run_gpu);
+    charge_pair_products = make_host_device_buffer_from_vector(h_charge_pair_products, run_gpu);
+    p_charge_types = make_host_device_buffer_from_vector(p_charge_types_cpu, run_gpu);
+    q_charge_types = make_host_device_buffer_from_vector(q_charge_types_cpu, run_gpu);
+    w_charge_types = make_host_device_buffer_from_vector(w_charge_types_cpu, run_gpu);
 }
 
-void CudaContext::initialize_catype_tables_host() {
-    auto& host = Context::instance();
+void Context::cuda_initialize_catype_tables_host() {
+    auto *excluded = this->excluded->cpu_data_p;
+    auto &atypes = this->atypes->cpu_data_p;
+    auto &catypes = this->catypes->cpu_data_p;
+    auto *p_atoms_cpu = p_atoms_list->cpu_data_p;
+    auto *w_atoms_cpu = w_atoms_list->cpu_data_p;
+    auto *q_atoms_cpu = q_atoms_list->cpu_data_p;
 
-    std::vector<catype_t> h_catype_table_all;  // h_catype_table_all[catype code] = catype_t
+    std::vector<catype_t> h_catype_table_all;
     catype_to_type_host.clear();
 
     auto add_catype = [&](catype_t catype) -> int {
-        auto key = get_catype_key(catype);
+        const std::array<double, 4> key = {
+            catype.aii_normal,
+            catype.bii_normal,
+            catype.aii_1_4,
+            catype.bii_1_4};
         if (catype_to_type_host.count(key) == 0) {
-            int sz = h_catype_table_all.size();
+            int sz = static_cast<int>(h_catype_table_all.size());
             catype.code = sz;
             h_catype_table_all.push_back(catype);
             catype_to_type_host[key] = sz;
@@ -415,14 +241,14 @@ void CudaContext::initialize_catype_tables_host() {
         return catype_to_type_host[key];
     };
 
-    for (int i = 0; i < host.n_catypes; i++) {
-        add_catype(host.catypes[i]);
+    for (int i = 0; i < n_catypes; i++) {
+        add_catype(catypes[i]);
     }
 
-    for (int state = 0; state < host.n_lambdas; state++) {
-        for (int i = 0; i < host.n_qatoms; i++) {
-            const atype_t& qat = host.q_atypes[i + host.n_qatoms * state];
-            add_catype(host.q_catypes[qat.code - 1]);
+    for (int state = 0; state < n_lambdas; state++) {
+        for (int i = 0; i < n_qatoms; i++) {
+            const atype_t& qat = q_atypes[i + n_qatoms * state];
+            add_catype(q_catypes[qat.code - 1]);
         }
     }
 
@@ -436,7 +262,7 @@ void CudaContext::initialize_catype_tables_host() {
             const catype_t& ci = h_catype_table_all[i];
             const catype_t& cj = h_catype_table_all[j];
             vdw_pair_param_t pair_param = {};
-            if (host.topo.vdw_rule == VDW_GEOMETRIC) {
+            if (topo.vdw_rule == VDW_GEOMETRIC) {
                 calc_vdw_geometric(ci.aii_normal, cj.aii_normal, ci.bii_normal, cj.bii_normal, 1.0, &pair_param.a, &pair_param.b);
             } else {
                 calc_vdw_arithmetic(ci.aii_normal, cj.aii_normal, ci.bii_normal, cj.bii_normal, 1.0, &pair_param.a, &pair_param.b);
@@ -445,53 +271,45 @@ void CudaContext::initialize_catype_tables_host() {
         }
     }
 
-    std::vector<int> p_catype_types(h_p_atoms_list.size());
-    for (int i = 0; i < static_cast<int>(h_p_atoms_list.size()); i++) {
-        int id = h_p_atoms_list[i];
-        auto catype = host.catypes[host.atypes[id].code - 1];
-        auto key = get_catype_key(catype);
-        p_catype_types[i] = catype_to_type_host[key];
+    std::vector<int> p_catype_types_cpu(p_atoms_list->length);
+    for (int i = 0; i < static_cast<int>(p_atoms_list->length); i++) {
+        const int id = p_atoms_cpu[i];
+        const catype_t catype = catypes[atypes[id].code - 1];
+        const std::array<double, 4> key = {catype.aii_normal, catype.bii_normal, catype.aii_1_4, catype.bii_1_4};
+        p_catype_types_cpu[i] = catype_to_type_host[key];
     }
 
-    int h_q_atoms_size = h_q_atoms_list.size();
-    std::vector<int> q_catype_types(h_q_atoms_size * host.n_lambdas);
+    std::vector<int> q_catype_types_cpu(q_atoms_list->length * n_lambdas);
     std::map<int, int> q_idx;
-    for (int i = 0; i < host.n_qatoms; i++) {
-        int id = host.q_atoms[i];
-        if (!host.excluded[id]) {
+    for (int i = 0; i < n_qatoms; i++) {
+        int id = q_atoms[i];
+        if (!excluded[id]) {
             q_idx[id] = i;
         }
     }
 
-
-    for (int state = 0; state < host.n_lambdas; state++) {
-        for (int i = 0; i < h_q_atoms_size; i++) {
-            int id = h_q_atoms_list[i];
-            const atype_t& qat = host.q_atypes[q_idx[id] + host.n_qatoms * state];
-            auto key_normal = get_catype_key(host.q_catypes[qat.code - 1]);
-            q_catype_types[state * h_q_atoms_size + i] = catype_to_type_host[key_normal];
+    for (int state = 0; state < n_lambdas; state++) {
+        for (int i = 0; i < static_cast<int>(q_atoms_list->length); i++) {
+            const int id = q_atoms_cpu[i];
+            const atype_t& qat = q_atypes[q_idx[id] + n_qatoms * state];
+            const catype_t& qcatype = q_catypes[qat.code - 1];
+            const std::array<double, 4> key = {qcatype.aii_normal, qcatype.bii_normal, qcatype.aii_1_4, qcatype.bii_1_4};
+            q_catype_types_cpu[state * q_atoms_list->length + i] = catype_to_type_host[key];
         }
     }
 
-    int h_w_atoms_size = h_w_atoms_list.size();
-    std::vector<int> w_catype_types(h_w_atoms_size);
-    for (int i = 0; i < h_w_atoms_size; i++) {
-        int id = h_w_atoms_list[i];
-        auto catype = host.catypes[host.atypes[id].code - 1];
-        auto key = get_catype_key(catype);
-        w_catype_types[i] = catype_to_type_host[key];
-        // printf("Water atom %d: catype code %d mapped to %d, data: m=%f, aii_normal=%f, bii_normal=%f, aii_polar=%f, bii_polar=%f, aii_1_4=%f, bii_1_4=%f\n", id, atypes[id].code, w_catype_types[i], catype.m, catype.aii_normal, catype.bii_normal, catype.aii_polar, catype.bii_polar, catype.aii_1_4, catype.bii_1_4);
+    std::vector<int> w_catype_types_cpu(w_atoms_list->length);
+    for (int i = 0; i < static_cast<int>(w_atoms_list->length); i++) {
+        const int id = w_atoms_cpu[i];
+        const catype_t catype = catypes[atypes[id].code - 1];
+        const std::array<double, 4> key = {catype.aii_normal, catype.bii_normal, catype.aii_1_4, catype.bii_1_4};
+        w_catype_types_cpu[i] = catype_to_type_host[key];
     }
-    printf("Total water atom number: %d, w_catype_types size: %lu\n", h_w_atoms_size, w_catype_types.size());
-
-    check_cudaMalloc((void**)&d_catype_table_all, sizeof(catype_t) * h_catype_table_all.size());
-    cudaMemcpy(d_catype_table_all, h_catype_table_all.data(), sizeof(catype_t) * h_catype_table_all.size(), cudaMemcpyHostToDevice);
-    check_cudaMalloc((void**)&d_catype_pair_params, sizeof(vdw_pair_param_t) * h_catype_pair_params.size());
-    cudaMemcpy(d_catype_pair_params, h_catype_pair_params.data(), sizeof(vdw_pair_param_t) * h_catype_pair_params.size(), cudaMemcpyHostToDevice);
-    check_cudaMalloc((void**)&d_p_catype_types, sizeof(int) * p_catype_types.size());
-    cudaMemcpy(d_p_catype_types, p_catype_types.data(), sizeof(int) * p_catype_types.size(), cudaMemcpyHostToDevice);
-    check_cudaMalloc((void**)&d_q_catype_types, sizeof(int) * q_catype_types.size());
-    cudaMemcpy(d_q_catype_types, q_catype_types.data(), sizeof(int) * q_catype_types.size(), cudaMemcpyHostToDevice);
-    check_cudaMalloc((void**)&d_w_catype_types, sizeof(int) * w_catype_types.size());
-    cudaMemcpy(d_w_catype_types, w_catype_types.data(), sizeof(int) * w_catype_types.size(), cudaMemcpyHostToDevice);
+    printf("Total water atom number: %lu, w_catype_types size: %lu\n", w_atoms_list->length, w_catype_types_cpu.size());
+
+    catype_table_all = make_host_device_buffer_from_vector(h_catype_table_all, run_gpu);
+    catype_pair_params = make_host_device_buffer_from_vector(h_catype_pair_params, run_gpu);
+    p_catype_types = make_host_device_buffer_from_vector(p_catype_types_cpu, run_gpu);
+    q_catype_types = make_host_device_buffer_from_vector(q_catype_types_cpu, run_gpu);
+    w_catype_types = make_host_device_buffer_from_vector(w_catype_types_cpu, run_gpu);
 }
diff --git a/src/core/cuda/src/cuda_handler.cu b/src/core/cuda/src/cuda_handler.cu
index bf45497c..b38c7376 100644
--- a/src/core/cuda/src/cuda_handler.cu
+++ b/src/core/cuda/src/cuda_handler.cu
@@ -3,7 +3,6 @@
 #include "common/include/context.h"
 #include "cuda/include/cuda_angle_force.cuh"
 #include "cuda/include/cuda_bond_force.cuh"
-#include "cuda/include/cuda_context.cuh"
 #include "cuda/include/cuda_handler.cuh"
 #include "cuda/include/cuda_improper2_force.cuh"
 #include "cuda/include/cuda_leapfrog.cuh"
@@ -29,7 +28,7 @@
 
 void CudaHandler::initialize() {
     if (!initialized_) {
-        ctx_.init();
+        Context::instance().cuda_initialize_helpers();
         init_angle_force_kernel_data();
         init_bond_force_kernel_data();
         init_improper2_force_kernel_data();
@@ -82,6 +81,7 @@ void CudaHandler::shutdown() {
         cleanup_shake_constraints();
         cleanup_temperature();
         cleanup_torsion_force();
+        Context::instance().cuda_free_helpers();
         initialized_ = false;
     }
 }
@@ -135,5 +135,5 @@ void CudaHandler::calc_leapfrog() {
 
 void CudaHandler::reset_energies() {
     Handler::reset_energies();
-    ctx_.reset_energies();
+    Context::instance().cuda_reset_energies();
 }
diff --git a/src/core/cuda/src/cuda_improper2_force.cu b/src/core/cuda/src/cuda_improper2_force.cu
index 7e8cea14..e44678e0 100644
--- a/src/core/cuda/src/cuda_improper2_force.cu
+++ b/src/core/cuda/src/cuda_improper2_force.cu
@@ -1,6 +1,6 @@
-#include "cuda/include/cuda_context.cuh"
 #include "cuda/include/cuda_improper2_force.cuh"
 #include "cuda/include/cuda_utility.cuh"
+#include "context.h"
 
 namespace CudaImproper2Force {
 bool is_initialized = false;
@@ -134,12 +134,11 @@ double calc_improper2_forces_host(int start, int end) {
     double energy = 0.0;
     cudaMemcpy(d_energy_sum, &energy, sizeof(double), cudaMemcpyHostToDevice);
 
-    CudaContext& context = CudaContext::instance();
-    // context.sync_all_to_device();
-    coord_t* d_coords = context.d_coords;
-    dvel_t* d_dvelocities = context.d_dvelocities;
-    improper_t* d_impropers = context.d_impropers;
-    cimproper_t* d_cimpropers = context.d_cimpropers;
+    auto& host_ctx = Context::instance();
+    coord_t* d_coords = host_ctx.coords->gpu_data_p;
+    dvel_t* d_dvelocities = host_ctx.dvelocities->gpu_data_p;
+    improper_t* d_impropers = host_ctx.impropers->gpu_data_p;
+    cimproper_t* d_cimpropers = host_ctx.cimpropers->gpu_data_p;
 
     calc_improper2_forces_kernel<<<numBlocks, blockSize>>>(start, end, d_impropers, d_cimpropers, d_coords, d_dvelocities, d_energy_sum);
     cudaDeviceSynchronize();
@@ -161,4 +160,4 @@ void cleanup_improper2_force() {
         cudaFree(d_energy_sum);
         is_initialized = false;
     }
-}
\ No newline at end of file
+}
diff --git a/src/core/cuda/src/cuda_leapfrog.cu b/src/core/cuda/src/cuda_leapfrog.cu
index 6b68ef99..49312337 100644
--- a/src/core/cuda/src/cuda_leapfrog.cu
+++ b/src/core/cuda/src/cuda_leapfrog.cu
@@ -1,7 +1,6 @@
 #include <iostream>
 
 #include "common/include/context.h"
-#include "cuda/include/cuda_context.cuh"
 #include "cuda/include/cuda_leapfrog.cuh"
 #include "cuda/include/cuda_shake_constraints.cuh"
 #include "cuda_utility.cuh"
@@ -48,13 +47,12 @@ __global__ void calc_leapfrog_kernel(
 
 void calc_leapfrog_host() {
     auto& host = Context::instance();
-    auto& ctx = CudaContext::instance();
-    auto d_atypes = ctx.d_atypes;
-    auto d_catypes = ctx.d_catypes;
-    auto d_velocities = ctx.d_velocities;
-    auto d_dvelocities = ctx.d_dvelocities;
-    auto d_coords = ctx.d_coords;
-    auto d_xcoords = ctx.d_xcoords;
+    auto d_atypes = host.atypes->gpu_data_p;
+    auto d_catypes = host.catypes->gpu_data_p;
+    auto d_velocities = host.velocities->gpu_data_p;
+    auto d_dvelocities = host.dvelocities->gpu_data_p;
+    auto d_coords = host.coords->gpu_data_p;
+    auto d_xcoords = host.xcoords->gpu_data_p;
 
     int blockSize = 256;
     int numBlocks = (host.n_atoms + blockSize - 1) / blockSize;
@@ -72,20 +70,23 @@ void calc_leapfrog_host() {
         host.dt);
     check_cuda(cudaDeviceSynchronize());
 
-    check_cuda(cudaMemcpy(host.velocities.data(), d_velocities, sizeof(vel_t) * host.n_atoms, cudaMemcpyDeviceToHost));
-    check_cuda(cudaMemcpy(host.dvelocities.data(), d_dvelocities, sizeof(dvel_t) * host.n_atoms, cudaMemcpyDeviceToHost));
-    check_cuda(cudaMemcpy(host.coords.data(), d_coords, sizeof(coord_t) * host.n_atoms, cudaMemcpyDeviceToHost));
-    check_cuda(cudaMemcpy(host.xcoords.data(), d_xcoords, sizeof(coord_t) * host.n_atoms, cudaMemcpyDeviceToHost));
+    host.velocities->download();
+    host.dvelocities->download();
+    host.coords->download();
+    host.xcoords->download();
 
     // shake
     // todo: Here is some problem, it writes into cpu memory, but we use gpu..
     printf("n_shake_constraints: %d\n", host.n_shake_constraints);
     if (host.n_shake_constraints > 0) {
         calc_shake_constraints_host();
+        auto &velocities = host.velocities->cpu_data_p;
+        auto &coords = host.coords->cpu_data_p;
+        auto *xcoords = host.xcoords->cpu_data_p;
         for (int i = 0; i < host.n_atoms; i++) {
-            host.velocities[i].x = (host.coords[i].x - host.xcoords[i].x) / host.dt;
-            host.velocities[i].y = (host.coords[i].y - host.xcoords[i].y) / host.dt;
-            host.velocities[i].z = (host.coords[i].z - host.xcoords[i].z) / host.dt;
+            velocities[i].x = (coords[i].x - xcoords[i].x) / host.dt;
+            velocities[i].y = (coords[i].y - xcoords[i].y) / host.dt;
+            velocities[i].z = (coords[i].z - xcoords[i].z) / host.dt;
         }
     }
 }
diff --git a/src/core/cuda/src/cuda_nonbonded_14_force.cu b/src/core/cuda/src/cuda_nonbonded_14_force.cu
index 4d5c1cca..fa404ee7 100644
--- a/src/core/cuda/src/cuda_nonbonded_14_force.cu
+++ b/src/core/cuda/src/cuda_nonbonded_14_force.cu
@@ -1,7 +1,6 @@
 #include "constants.h"
 #include "common/include/context.h"
 #include "common/include/vdw_rules.h"
-#include "cuda/include/cuda_context.cuh"
 #include "cuda/include/cuda_nonbonded_14_force.cuh"
 #include "cuda_utility.cuh"
 
@@ -153,7 +152,6 @@ static Nonbonded14EnergyBuckets calc_nonbonded_14_force_state_host(
     using namespace CudaNonbonded14Force;
 
     auto& host = Context::instance();
-    auto& context = CudaContext::instance();
     const int n_ngbrs_14 = host.n_ngbrs14;
     Nonbonded14EnergyBuckets energies = {};
     if (n_ngbrs_14 == 0) return energies;
@@ -166,14 +164,14 @@ static Nonbonded14EnergyBuckets calc_nonbonded_14_force_state_host(
 
     calc_nonbonded_14_force_kernel<<<num_blocks, block_size>>>(
         n_ngbrs_14,
-        context.d_ngbrs_14,
+        host.ngbrs_14->gpu_data_p,
         host.topo,
-        context.d_excluded,
+        host.excluded->gpu_data_p,
         d_atom_to_qi,
-        context.d_unified_ccharges,
-        context.d_unified_catypes,
-        context.d_coords,
-        context.d_dvelocities,
+        host.unified_ccharges->gpu_data_p,
+        host.unified_catypes->gpu_data_p,
+        host.coords->gpu_data_p,
+        host.dvelocities->gpu_data_p,
         d_evdw_totals,
         d_ecoul_totals,
         include_pp,
@@ -192,11 +190,12 @@ static Nonbonded14EnergyBuckets calc_nonbonded_14_force_state_host(
 
 void calc_nonbonded_14_forces_host() {
     auto& host = Context::instance();
+    auto *lambdas = host.lambdas->cpu_data_p;
 
     if (host.n_ngbrs14 == 0) return;
 
     for (int state = 0; state < host.n_lambdas; state++) {
-        const double lambda = host.lambdas[state];
+        const double lambda = lambdas[state];
         const bool include_pp = (state == 0);
         Nonbonded14EnergyBuckets energies = calc_nonbonded_14_force_state_host(state, lambda, include_pp);
 
diff --git a/src/core/cuda/src/cuda_nonbonded_force.cu b/src/core/cuda/src/cuda_nonbonded_force.cu
index f2ed84ad..432a7137 100644
--- a/src/core/cuda/src/cuda_nonbonded_force.cu
+++ b/src/core/cuda/src/cuda_nonbonded_force.cu
@@ -1,7 +1,6 @@
 #include <iostream>
 
 #include "context.h"
-#include "cuda_context.cuh"
 #include "cuda_nonbonded_force.cuh"
 #include "constants.h"
 #include "cuda_utility.cuh"
@@ -293,7 +292,6 @@ std::pair<double, double> calc_nonbonded_force_host(
     const bool disable_water_h_lj, const double lambda) {
     using namespace CudaNonbondedForce;
     Context& host = Context::instance();
-    CudaContext& context = CudaContext::instance();
     const int thread_num = 256;
     dim3 block_sz = dim3(thread_num);
 
@@ -315,29 +313,29 @@ std::pair<double, double> calc_nonbonded_force_host(
         ny,
         x_charges_types,
         y_charges_types,
-        context.d_charge_pair_products,
+        host.charge_pair_products->gpu_data_p,
         x_atypes_types,
         y_atypes_types,
-        context.d_catype_pair_params,
+        host.catype_pair_params->gpu_data_p,
         host.topo,
-        context.d_excluded,
-        context.d_LJ_matrix,
+        host.excluded->gpu_data_p,
+        host.LJ_matrix->gpu_data_p,
         x_idx_list,
         y_idx_list,
-        context.d_coords,
-        context.d_dvelocities,
+        host.coords->gpu_data_p,
+        host.dvelocities->gpu_data_p,
         d_evdw_total,
         d_ecoul_total,
         symmetric,
         disable_water_h_lj,
         host.n_atoms_solute,
-        context.n_charge_types,
-        context.zero_charge_type,
-        context.n_catype_types,
-        context.zero_catype_type,
+        host.n_charge_types,
+        host.zero_charge_type,
+        host.n_catype_types,
+        host.zero_catype_type,
         host.n_qelscales,
         lambda,
-        context.d_q_elscales);
+        host.q_elscales->gpu_data_p);
 
     cudaDeviceSynchronize();
 
diff --git a/src/core/cuda/src/cuda_nonbonded_pp_force.cu b/src/core/cuda/src/cuda_nonbonded_pp_force.cu
index 4e483c81..76992aeb 100644
--- a/src/core/cuda/src/cuda_nonbonded_pp_force.cu
+++ b/src/core/cuda/src/cuda_nonbonded_pp_force.cu
@@ -2,7 +2,6 @@
 #include <vector>
 
 #include "common/include/context.h"
-#include "cuda/include/cuda_context.cuh"
 #include "cuda/include/cuda_nonbonded_force.cuh"
 #include "cuda/include/cuda_nonbonded_pp_force.cuh"
 #include "cuda_utility.cuh"
@@ -20,22 +19,21 @@ void init_nonbonded_pp_force_kernel_data() {
 }
 
 void calc_nonbonded_pp_forces_host_v2() {
-    int nx = CudaContext::instance().h_p_atoms_list.size();
+    Context& host = Context::instance();
+    int nx = static_cast<int>(host.p_atoms_list->length);
     int ny = nx;
 
     auto result = calc_nonbonded_force_host(
         nx,
         ny,
-        CudaContext::instance().d_p_atoms_list,
-        CudaContext::instance().d_p_atoms_list,
+        host.p_atoms_list->gpu_data_p,
+        host.p_atoms_list->gpu_data_p,
         true,  // symmetric
-        CudaContext::instance().d_p_charge_types,
-        CudaContext::instance().d_p_charge_types,
-        CudaContext::instance().d_p_catype_types,
-        CudaContext::instance().d_p_catype_types,
+        host.p_charge_types->gpu_data_p,
+        host.p_charge_types->gpu_data_p,
+        host.p_catype_types->gpu_data_p,
+        host.p_catype_types->gpu_data_p,
         false);
-
-    Context& host = Context::instance();
     host.E_nonbond_pp.Uvdw = result.first;
     host.E_nonbond_pp.Ucoul = result.second;
 }
diff --git a/src/core/cuda/src/cuda_nonbonded_pw_force.cu b/src/core/cuda/src/cuda_nonbonded_pw_force.cu
index efcf0865..461e46e2 100644
--- a/src/core/cuda/src/cuda_nonbonded_pw_force.cu
+++ b/src/core/cuda/src/cuda_nonbonded_pw_force.cu
@@ -2,7 +2,6 @@
 #include <vector>
 
 #include "common/include/context.h"
-#include "cuda/include/cuda_context.cuh"
 #include "cuda/include/cuda_nonbonded_force.cuh"
 #include "cuda/include/cuda_nonbonded_pw_force.cuh"
 
@@ -14,21 +13,20 @@ bool is_initialized = false;
 void calc_nonbonded_pw_forces_host_v2() {
     using namespace CudaNonbondedPWForce;
 
-    int nx = CudaContext::instance().h_p_atoms_list.size();
-    int ny = CudaContext::instance().h_w_atoms_list.size();
+    Context& host = Context::instance();
+    int nx = static_cast<int>(host.p_atoms_list->length);
+    int ny = static_cast<int>(host.w_atoms_list->length);
     auto result = calc_nonbonded_force_host(
         nx, ny,
-        CudaContext::instance().d_p_atoms_list,
-        CudaContext::instance().d_w_atoms_list,
+        host.p_atoms_list->gpu_data_p,
+        host.w_atoms_list->gpu_data_p,
         false,
-        CudaContext::instance().d_p_charge_types,
-        CudaContext::instance().d_w_charge_types,
-        CudaContext::instance().d_p_catype_types,
-        CudaContext::instance().d_w_catype_types,
+        host.p_charge_types->gpu_data_p,
+        host.w_charge_types->gpu_data_p,
+        host.p_catype_types->gpu_data_p,
+        host.w_catype_types->gpu_data_p,
         false);
     // printf("Nonbonded PW Force (Host) - VdW: %f, Coulomb: %f\n", result.first, result.second);
-
-    Context& host = Context::instance();
     host.E_nonbond_pw.Uvdw = result.first;
     host.E_nonbond_pw.Ucoul = result.second;
 }
diff --git a/src/core/cuda/src/cuda_nonbonded_qp_force.cu b/src/core/cuda/src/cuda_nonbonded_qp_force.cu
index 79d2ea34..37e6d1e5 100644
--- a/src/core/cuda/src/cuda_nonbonded_qp_force.cu
+++ b/src/core/cuda/src/cuda_nonbonded_qp_force.cu
@@ -3,7 +3,6 @@
 #include <vector>
 
 #include "common/include/context.h"
-#include "cuda/include/cuda_context.cuh"
 #include "cuda/include/cuda_nonbonded_force.cuh"
 #include "cuda/include/cuda_nonbonded_qp_force.cuh"
 #include "cuda_utility.cuh"
@@ -15,25 +14,25 @@ bool is_initialized = false;
 
 void calc_nonbonded_qp_forces_host_v2() {
     using namespace CudaNonbondedQPForce;
-    int nx = CudaContext::instance().h_q_atoms_list.size();
-    int ny = CudaContext::instance().h_p_atoms_list.size();
-    
     Context& host = Context::instance();
+    int nx = static_cast<int>(host.q_atoms_list->length);
+    int ny = static_cast<int>(host.p_atoms_list->length);
+    auto *lambdas = host.lambdas->cpu_data_p;
     for (int state = 0; state < host.n_lambdas; state++) {
         auto result = calc_nonbonded_force_host(
             nx,
             ny,
-            CudaContext::instance().d_q_atoms_list,
-            CudaContext::instance().d_p_atoms_list,
+            host.q_atoms_list->gpu_data_p,
+            host.p_atoms_list->gpu_data_p,
             false,
-            CudaContext::instance().d_q_charge_types + state * nx,
-            CudaContext::instance().d_p_charge_types,
-            CudaContext::instance().d_q_catype_types + state * nx,
-            CudaContext::instance().d_p_catype_types,
-            false, host.lambdas[state]);
+            host.q_charge_types->gpu_data_p + state * nx,
+            host.p_charge_types->gpu_data_p,
+            host.q_catype_types->gpu_data_p + state * nx,
+            host.p_catype_types->gpu_data_p,
+            false, lambdas[state]);
 
-        host.EQ_nonbond_qp[state].Uvdw = result.first / host.lambdas[state];
-        host.EQ_nonbond_qp[state].Ucoul = result.second / host.lambdas[state];
+        host.EQ_nonbond_qp[state].Uvdw = result.first / lambdas[state];
+        host.EQ_nonbond_qp[state].Ucoul = result.second / lambdas[state];
         // printf("Nonbonded QP Force State %d: Uvdw = %f, Ucoul = %f\n", state, result.first, result.second);
     }
 }
diff --git a/src/core/cuda/src/cuda_nonbonded_qq_force.cu b/src/core/cuda/src/cuda_nonbonded_qq_force.cu
index 25d82b05..b71a5fa8 100644
--- a/src/core/cuda/src/cuda_nonbonded_qq_force.cu
+++ b/src/core/cuda/src/cuda_nonbonded_qq_force.cu
@@ -2,7 +2,6 @@
 #include <iostream>
 
 #include "common/include/context.h"
-#include "cuda/include/cuda_context.cuh"
 #include "cuda/include/cuda_nonbonded_force.cuh"
 #include "cuda/include/cuda_nonbonded_qq_force.cuh"
 #include "cuda_utility.cuh"
@@ -16,22 +15,23 @@ void calc_nonbonded_qq_forces_host() {
     using namespace CudaNonbondedQQForce;
 
     Context& host = Context::instance();
-    int n = CudaContext::instance().h_q_atoms_list.size();
+    auto *lambdas = host.lambdas->cpu_data_p;
+    int n = static_cast<int>(host.q_atoms_list->length);
     for (int state = 0; state < host.n_lambdas; state++) {
         auto result = calc_nonbonded_force_host(
             n,
             n,
-            CudaContext::instance().d_q_atoms_list,
-            CudaContext::instance().d_q_atoms_list,
+            host.q_atoms_list->gpu_data_p,
+            host.q_atoms_list->gpu_data_p,
             true,
-            CudaContext::instance().d_q_charge_types + state * n,
-            CudaContext::instance().d_q_charge_types + state * n,
-            CudaContext::instance().d_q_catype_types + state * n,
-            CudaContext::instance().d_q_catype_types + state * n,
-            false, host.lambdas[state]);
-
-        host.EQ_nonbond_qq[state].Uvdw = result.first / host.lambdas[state];
-        host.EQ_nonbond_qq[state].Ucoul = result.second / host.lambdas[state];
+            host.q_charge_types->gpu_data_p + state * n,
+            host.q_charge_types->gpu_data_p + state * n,
+            host.q_catype_types->gpu_data_p + state * n,
+            host.q_catype_types->gpu_data_p + state * n,
+            false, lambdas[state]);
+
+        host.EQ_nonbond_qq[state].Uvdw = result.first / lambdas[state];
+        host.EQ_nonbond_qq[state].Ucoul = result.second / lambdas[state];
     }
 }
 
diff --git a/src/core/cuda/src/cuda_nonbonded_qw_force.cu b/src/core/cuda/src/cuda_nonbonded_qw_force.cu
index bb826cc5..32978e19 100644
--- a/src/core/cuda/src/cuda_nonbonded_qw_force.cu
+++ b/src/core/cuda/src/cuda_nonbonded_qw_force.cu
@@ -2,7 +2,6 @@
 #include <iostream>
 
 #include "common/include/context.h"
-#include "cuda/include/cuda_context.cuh"
 #include "cuda/include/cuda_nonbonded_force.cuh"
 #include "cuda/include/cuda_nonbonded_qw_force.cuh"
 #include "cuda_utility.cuh"
@@ -14,25 +13,26 @@ bool is_initialized = false;
 
 void calc_nonbonded_qw_forces_host_v2() {
     using namespace CudaNonbondedQWForce;
-    int nx = CudaContext::instance().h_q_atoms_list.size();
-    int ny = CudaContext::instance().h_w_atoms_list.size();
 
     Context& host = Context::instance();
+    int nx = static_cast<int>(host.q_atoms_list->length);
+    int ny = static_cast<int>(host.w_atoms_list->length);
+    auto *lambdas = host.lambdas->cpu_data_p;
     for (int state = 0; state < host.n_lambdas; state++) {
         auto result = calc_nonbonded_force_host(
             nx,
             ny,
-            CudaContext::instance().d_q_atoms_list,
-            CudaContext::instance().d_w_atoms_list,
+            host.q_atoms_list->gpu_data_p,
+            host.w_atoms_list->gpu_data_p,
             false,
-            CudaContext::instance().d_q_charge_types + state * nx,
-            CudaContext::instance().d_w_charge_types,
-            CudaContext::instance().d_q_catype_types + state * nx,
-            CudaContext::instance().d_w_catype_types,
-            true, host.lambdas[state]);
-
-        host.EQ_nonbond_qw[state].Uvdw = result.first / host.lambdas[state];
-        host.EQ_nonbond_qw[state].Ucoul = result.second / host.lambdas[state];
+            host.q_charge_types->gpu_data_p + state * nx,
+            host.w_charge_types->gpu_data_p,
+            host.q_catype_types->gpu_data_p + state * nx,
+            host.w_catype_types->gpu_data_p,
+            true, lambdas[state]);
+
+        host.EQ_nonbond_qw[state].Uvdw = result.first / lambdas[state];
+        host.EQ_nonbond_qw[state].Ucoul = result.second / lambdas[state];
         // printf("Nonbonded QW Force State %d: Uvdw = %f, Ucoul = %f\n", state, result.first, result.second);
     }
 }
diff --git a/src/core/cuda/src/cuda_nonbonded_ww_force.cu b/src/core/cuda/src/cuda_nonbonded_ww_force.cu
index 304fa21a..dc5dcbb1 100644
--- a/src/core/cuda/src/cuda_nonbonded_ww_force.cu
+++ b/src/core/cuda/src/cuda_nonbonded_ww_force.cu
@@ -2,7 +2,6 @@
 #include <vector>
 
 #include "common/include/context.h"
-#include "cuda/include/cuda_context.cuh"
 #include "cuda/include/cuda_nonbonded_force.cuh"
 #include "cuda/include/cuda_nonbonded_ww_force.cuh"
 namespace CudaNonbondedWWForce {
@@ -12,19 +11,19 @@ bool is_initialized = false;
 void calc_nonbonded_ww_forces_host_v2() {
     using namespace CudaNonbondedWWForce;
 
-    int nx = CudaContext::instance().h_w_atoms_list.size();
+    Context& host = Context::instance();
+    int nx = static_cast<int>(host.w_atoms_list->length);
     int ny = nx;
     auto result = calc_nonbonded_force_host(
         nx, ny,
-        CudaContext::instance().d_w_atoms_list,
-        CudaContext::instance().d_w_atoms_list,
+        host.w_atoms_list->gpu_data_p,
+        host.w_atoms_list->gpu_data_p,
         true,
-        CudaContext::instance().d_w_charge_types,
-        CudaContext::instance().d_w_charge_types,
-        CudaContext::instance().d_w_catype_types,
-        CudaContext::instance().d_w_catype_types,
+        host.w_charge_types->gpu_data_p,
+        host.w_charge_types->gpu_data_p,
+        host.w_catype_types->gpu_data_p,
+        host.w_catype_types->gpu_data_p,
         true);
-    Context& host = Context::instance();
     host.E_nonbond_ww.Uvdw = result.first;
     host.E_nonbond_ww.Ucoul = result.second;
 }
diff --git a/src/core/cuda/src/cuda_polx_water_force.cu b/src/core/cuda/src/cuda_polx_water_force.cu
index ac2347aa..9b0eb667 100644
--- a/src/core/cuda/src/cuda_polx_water_force.cu
+++ b/src/core/cuda/src/cuda_polx_water_force.cu
@@ -3,7 +3,6 @@
 
 #include "common/include/context.h"
 #include "common/include/constants.h"
-#include "cuda/include/cuda_context.cuh"
 #include "cuda/include/cuda_polx_water_force.cuh"
 #include "cuda_utility.cuh"
 
@@ -171,16 +170,17 @@ __global__ void calc_polx_water_forces_kernel(
 void sort_waters() {
     using namespace CudaPolxWaterForce;
     auto& ctx = Context::instance();
+    auto *wshells = ctx.wshells->cpu_data_p;
 
     int imin, jmin, jw;
     double tmin;
     // Sort the waters according to theta
     for (int is = 0; is < ctx.n_shells; is++) {
         imin = 0;
-        for (int il = 0; il < ctx.wshells[is].n_inshell; il++) {
+        for (int il = 0; il < wshells[is].n_inshell; il++) {
             tmin = 2 * M_PI;
-            for (int jl = 0; jl < ctx.wshells[is].n_inshell; jl++) {
-                // printf("Searching water %d in shell %d, total number: %d\n", jl, is, ctx.wshells[is].n_inshell);
+            for (int jl = 0; jl < wshells[is].n_inshell; jl++) {
+                // printf("Searching water %d in shell %d, total number: %d\n", jl, is, wshells[is].n_inshell);
                 jw = polx_list_sh[jl * ctx.n_shells + is];
                 // printf("Water %d in shell %d has theta %f\n", jw, is, ctx.tdum[jw] * 180 / M_PI);
                 if (ctx.tdum[jw] < tmin) {
@@ -198,23 +198,21 @@ void sort_waters() {
 }
 
 void calc_polx_water_forces_host(int iteration) {
-    CudaContext& cuda_ctx = CudaContext::instance();
     auto& ctx = Context::instance();
+    auto *wshells = ctx.wshells->cpu_data_p;
 
     for (int is = 0; is < ctx.n_shells; is++) {
-        ctx.wshells[is].n_inshell = 0;
+        wshells[is].n_inshell = 0;
         if (iteration == 0) {
-            ctx.wshells[is].theta_corr = 0;
+            wshells[is].theta_corr = 0;
         }
     }
     using namespace CudaPolxWaterForce;
 
-    // cuda_ctx.sync_all_to_device();
-    cudaMemcpy(cuda_ctx.d_wshells, ctx.wshells.data(), ctx.n_shells * sizeof(shell_t), cudaMemcpyHostToDevice);
-
-    coord_t* d_coords = cuda_ctx.d_coords;
-    dvel_t* d_dvelocities = cuda_ctx.d_dvelocities;
-    shell_t* d_wshells = cuda_ctx.d_wshells;
+    coord_t* d_coords = ctx.coords->gpu_data_p;
+    dvel_t* d_dvelocities = ctx.dvelocities->gpu_data_p;
+    shell_t* d_wshells = ctx.wshells->gpu_data_p;
+    ctx.wshells->upload();
 
     int blockSize = 256;
     int numBlocks = (ctx.n_waters + blockSize - 1) / blockSize;
@@ -224,7 +222,7 @@ void calc_polx_water_forces_host(int iteration) {
     // printf("Calculated theta for %d waters in %d shells\n", ctx.n_waters, ctx.n_shells);
 
     // todo: sort in cpu now..
-    cudaMemcpy(ctx.wshells.data(), d_wshells, ctx.n_shells * sizeof(shell_t), cudaMemcpyDeviceToHost);
+    ctx.wshells->download();
     cudaMemcpy(ctx.tdum.data(), d_tdum, ctx.n_waters * sizeof(double), cudaMemcpyDeviceToHost);
     cudaMemcpy(polx_list_sh, d_list_sh, ctx.n_max_inshell * ctx.n_shells * sizeof(int), cudaMemcpyDeviceToHost);
 
@@ -245,15 +243,15 @@ void calc_polx_water_forces_host(int iteration) {
     if (iteration != 0 && iteration % itdis_update == 0) {
         for (int is = 0; is < ctx.n_shells; is++) {
             printf("SHELL %d\n", is);
-            ctx.wshells[is].avtheta /= (double)itdis_update;
-            ctx.wshells[is].avn_inshell /= (double)itdis_update;
-            ctx.wshells[is].theta_corr = ctx.wshells[is].theta_corr + ctx.wshells[is].avtheta - acos(ctx.wshells[is].cstb);
+            wshells[is].avtheta /= (double)itdis_update;
+            wshells[is].avn_inshell /= (double)itdis_update;
+            wshells[is].theta_corr = wshells[is].theta_corr + wshells[is].avtheta - acos(wshells[is].cstb);
             printf("average theta = %f, average in shell = %f, theta_corr = %f\n",
-                   ctx.wshells[is].avtheta * 180 / M_PI, ctx.wshells[is].avn_inshell, ctx.wshells[is].theta_corr * 180 / M_PI);
-            ctx.wshells[is].avtheta = 0;
-            ctx.wshells[is].avn_inshell = 0;
+                   wshells[is].avtheta * 180 / M_PI, wshells[is].avn_inshell, wshells[is].theta_corr * 180 / M_PI);
+            wshells[is].avtheta = 0;
+            wshells[is].avn_inshell = 0;
         }
-        cudaMemcpy(d_wshells, ctx.wshells.data(), ctx.n_shells * sizeof(shell_t), cudaMemcpyHostToDevice);
+        ctx.wshells->upload();
     }
 
     // Calculate energy and force
@@ -264,7 +262,7 @@ void calc_polx_water_forces_host(int iteration) {
     double energy;
     cudaMemcpy(&energy, d_energy, sizeof(double), cudaMemcpyDeviceToHost);
     ctx.E_restraint.Upolx += energy;
-    cudaMemcpy(ctx.wshells.data(), d_wshells, ctx.n_shells * sizeof(shell_t), cudaMemcpyDeviceToHost);
+    ctx.wshells->download();
     // Copy back forces for all atoms (solute + solvent); water forces were being dropped.
 }
 
diff --git a/src/core/cuda/src/cuda_pshell_force.cu b/src/core/cuda/src/cuda_pshell_force.cu
index 39fb4ff3..a01fb536 100644
--- a/src/core/cuda/src/cuda_pshell_force.cu
+++ b/src/core/cuda/src/cuda_pshell_force.cu
@@ -1,4 +1,3 @@
-#include "cuda/include/cuda_context.cuh"
 #include "cuda/include/cuda_pshell_force.cuh"
 #include "common/include/constants.h"
 #include "common/include/context.h"
@@ -15,7 +14,7 @@ __global__ void calc_pshell_force_kernel(
     bool* shell,
     bool* excluded,
     coord_t* coords,
-    coord_t* coords_top,
+    coord_t* coords_init,
     double* ufix_energy,
     double* ushell_energy,
     dvel_t* dvelocities) {
@@ -32,9 +31,9 @@ __global__ void calc_pshell_force_kernel(
         } else {
             k = k_pshell;
         }
-        dr.x = coords[i].x - coords_top[i].x;
-        dr.y = coords[i].y - coords_top[i].y;
-        dr.z = coords[i].z - coords_top[i].z;
+        dr.x = coords[i].x - coords_init[i].x;
+        dr.y = coords[i].y - coords_init[i].y;
+        dr.z = coords[i].z - coords_init[i].z;
         r2 = pow(dr.x, 2) + pow(dr.y, 2) + pow(dr.z, 2);
         ener = 0.5 * k * r2;
         // printf("dr = %f %f %f\n", dr.x, dr.y, dr.z);
@@ -49,15 +48,14 @@ __global__ void calc_pshell_force_kernel(
 }
 
 void calc_pshell_forces_host() {
-    CudaContext& ctx = CudaContext::instance();
     auto& host = Context::instance();
     using namespace CudaPshellForce;
 
-    auto d_shell = ctx.d_shell;
-    auto d_excluded = ctx.d_excluded;
-    auto d_coords = ctx.d_coords;
-    auto d_coords_top = ctx.d_coords_top;
-    auto d_dvelocities = ctx.d_dvelocities;
+    auto d_shell = host.shell->gpu_data_p;
+    auto d_excluded = host.excluded->gpu_data_p;
+    auto d_coords = host.coords->gpu_data_p;
+    auto d_coords_init = host.coords_init->gpu_data_p;
+    auto d_dvelocities = host.dvelocities->gpu_data_p;
 
     cudaMemset(d_ufix_energy, 0, sizeof(double));
     cudaMemset(d_ushell_energy, 0, sizeof(double));
@@ -69,7 +67,7 @@ void calc_pshell_forces_host() {
         d_shell,
         d_excluded,
         d_coords,
-        d_coords_top,
+        d_coords_init,
         d_ufix_energy,
         d_ushell_energy,
         d_dvelocities);
@@ -100,4 +98,4 @@ void cleanup_pshell_force() {
         cudaFree(d_ushell_energy);
         is_initialized = false;
     }
-}
\ No newline at end of file
+}
diff --git a/src/core/cuda/src/cuda_radix_water_force.cu b/src/core/cuda/src/cuda_radix_water_force.cu
index c911574d..06f5f5a3 100644
--- a/src/core/cuda/src/cuda_radix_water_force.cu
+++ b/src/core/cuda/src/cuda_radix_water_force.cu
@@ -2,7 +2,6 @@
 
 #include "common/include/constants.h"
 #include "common/include/context.h"
-#include "cuda/include/cuda_context.cuh"
 #include "cuda/include/cuda_radix_water_force.cuh"
 #include "cuda/include/cuda_utility.cuh"
 namespace CudaRadixWaterForce {
@@ -69,11 +68,8 @@ void calc_radix_water_forces_host() {
     int oxygen_atoms = water_atoms / 3;
     int numBlocks = (oxygen_atoms + blockSize - 1) / blockSize;
 
-    CudaContext& ctx = CudaContext::instance();
-    // ctx.sync_all_to_device();
-
-    auto d_coords = ctx.d_coords;
-    auto d_dvelocities = ctx.d_dvelocities;
+    auto d_coords = host.coords->gpu_data_p;
+    auto d_dvelocities = host.dvelocities->gpu_data_p;
     check_cuda(cudaMemset(d_energy, 0, sizeof(double)));
 
     double shift;
@@ -95,7 +91,7 @@ void calc_radix_water_forces_host() {
                                                              d_dvelocities,
                                                              d_energy);
     check_cuda(cudaDeviceSynchronize());
-    check_cuda(cudaMemcpy(host.dvelocities.data(), d_dvelocities, sizeof(dvel_t) * host.n_atoms, cudaMemcpyDeviceToHost));
+    host.dvelocities->download();
     check_cuda(cudaMemcpy(&energy, d_energy, sizeof(double), cudaMemcpyDeviceToHost));
     host.E_restraint.Uradx += energy;
 }
diff --git a/src/core/cuda/src/cuda_restrang_force.cu b/src/core/cuda/src/cuda_restrang_force.cu
index 76cc436f..eb0813f5 100644
--- a/src/core/cuda/src/cuda_restrang_force.cu
+++ b/src/core/cuda/src/cuda_restrang_force.cu
@@ -1,4 +1,3 @@
-#include "cuda/include/cuda_context.cuh"
 #include "cuda/include/cuda_restrang_force.cuh"
 #include "cuda/include/cuda_utility.cuh"
 #include "common/include/context.h"
@@ -104,13 +103,12 @@ void calc_restrang_force_host() {
     auto& host = Context::instance();
     if (host.n_restrangs == 0) return;
     using namespace CudaRestrangForce;
-    CudaContext& ctx = CudaContext::instance();
 
-    auto d_restrangs = ctx.d_restrangs;
-    auto d_coords = ctx.d_coords;
-    auto d_lambdas = ctx.d_lambdas;
-    auto d_dvelocities = ctx.d_dvelocities;
-    auto d_EQ_restraint = ctx.d_EQ_restraint;
+    auto d_restrangs = host.restrangs->gpu_data_p;
+    auto d_coords = host.coords->gpu_data_p;
+    auto d_lambdas = host.lambdas->gpu_data_p;
+    auto d_dvelocities = host.dvelocities->gpu_data_p;
+    auto d_EQ_restraint = host.EQ_restraint->gpu_data_p;
 
     double val = 0;
     cudaMemcpy(d_E_restraint, &val, sizeof(double), cudaMemcpyHostToDevice);
@@ -127,7 +125,7 @@ void calc_restrang_force_host() {
         d_EQ_restraint,
         d_E_restraint);
     cudaDeviceSynchronize();
-    cudaMemcpy(host.EQ_restraint.data(), d_EQ_restraint, sizeof(E_restraint_t) * host.n_lambdas, cudaMemcpyDeviceToHost);
+    host.EQ_restraint->download();
     cudaMemcpy(&val, d_E_restraint, sizeof(double), cudaMemcpyDeviceToHost);
     host.E_restraint.Upres += val;
 }
diff --git a/src/core/cuda/src/cuda_restrdis_force.cu b/src/core/cuda/src/cuda_restrdis_force.cu
index d6fb0f3c..9aacf977 100644
--- a/src/core/cuda/src/cuda_restrdis_force.cu
+++ b/src/core/cuda/src/cuda_restrdis_force.cu
@@ -1,6 +1,5 @@
 #include <iostream>
 
-#include "cuda/include/cuda_context.cuh"
 #include "cuda/include/cuda_restrdis_force.cuh"
 #include "cuda/include/cuda_utility.cuh"
 #include "common/include/context.h"
@@ -77,12 +76,11 @@ void calc_restrdis_forces_host() {
     auto& host = Context::instance();
     if (host.n_restrdists == 0) return;
     using namespace CudaRestrdisForce;
-    CudaContext& ctx = CudaContext::instance();
-    auto d_restrdists = ctx.d_restrdists;
-    auto d_coords = ctx.d_coords;
-    auto d_lambdas = ctx.d_lambdas;
-    auto d_dvelocities = ctx.d_dvelocities;
-    auto d_EQ_restraint = ctx.d_EQ_restraint;
+    auto d_restrdists = host.restrdists->gpu_data_p;
+    auto d_coords = host.coords->gpu_data_p;
+    auto d_lambdas = host.lambdas->gpu_data_p;
+    auto d_dvelocities = host.dvelocities->gpu_data_p;
+    auto d_EQ_restraint = host.EQ_restraint->gpu_data_p;
 
     cudaMemset(d_E_restraint, 0, sizeof(double));
 
@@ -98,7 +96,7 @@ void calc_restrdis_forces_host() {
         d_EQ_restraint,
         d_E_restraint);
     cudaDeviceSynchronize();
-    cudaMemcpy(host.EQ_restraint.data(), d_EQ_restraint, sizeof(E_restraint_t) * host.n_lambdas, cudaMemcpyDeviceToHost);
+    host.EQ_restraint->download();
     double ener;
     cudaMemcpy(&ener, d_E_restraint, sizeof(double), cudaMemcpyDeviceToHost);
     printf("Energy restraint: %f\n", ener);
diff --git a/src/core/cuda/src/cuda_restrpos_force.cu b/src/core/cuda/src/cuda_restrpos_force.cu
index 8409ee61..5f479364 100644
--- a/src/core/cuda/src/cuda_restrpos_force.cu
+++ b/src/core/cuda/src/cuda_restrpos_force.cu
@@ -1,6 +1,5 @@
 #include <iostream>
 
-#include "cuda/include/cuda_context.cuh"
 #include "cuda/include/cuda_restrpos_force.cuh"
 #include "cuda/include/cuda_utility.cuh"
 #include "common/include/context.h"
@@ -68,12 +67,11 @@ void calc_restrpos_forces_host() {
     double val = 0.0;
     cudaMemcpy(d_E_restraint, &val, sizeof(double), cudaMemcpyHostToDevice);
 
-    CudaContext& ctx = CudaContext::instance();
-    auto d_restrspos = ctx.d_restrspos;
-    auto d_coords = ctx.d_coords;
-    auto d_lambdas = ctx.d_lambdas;
-    auto d_EQ_restraint = ctx.d_EQ_restraint;
-    auto d_dvelocities = ctx.d_dvelocities;
+    auto d_restrspos = host.restrspos->gpu_data_p;
+    auto d_coords = host.coords->gpu_data_p;
+    auto d_lambdas = host.lambdas->gpu_data_p;
+    auto d_EQ_restraint = host.EQ_restraint->gpu_data_p;
+    auto d_dvelocities = host.dvelocities->gpu_data_p;
 
     int blockSize = 256;
     int numBlocks = (host.n_restrspos + blockSize - 1) / blockSize;
@@ -89,7 +87,7 @@ void calc_restrpos_forces_host() {
     cudaDeviceSynchronize();
     cudaMemcpy(&val, d_E_restraint, sizeof(double), cudaMemcpyDeviceToHost);
     host.E_restraint.Upres += val;
-    cudaMemcpy(host.EQ_restraint.data(), d_EQ_restraint, sizeof(E_restraint_t) * host.n_lambdas, cudaMemcpyDeviceToHost);
+    host.EQ_restraint->download();
 }
 
 void init_restrpos_force_kernel_data() {
diff --git a/src/core/cuda/src/cuda_restrseq_force.cu b/src/core/cuda/src/cuda_restrseq_force.cu
index 488354e3..b5db3552 100644
--- a/src/core/cuda/src/cuda_restrseq_force.cu
+++ b/src/core/cuda/src/cuda_restrseq_force.cu
@@ -1,4 +1,3 @@
-#include "cuda/include/cuda_context.cuh"
 #include "cuda/include/cuda_restrseq_force.cuh"
 #include "common/include/context.h"
 #include "iostream"
@@ -11,7 +10,7 @@ __global__ void calc_restrseq_forces_kernel(
     int n_restrseqs,
     restrseq_t* restrseqs,
     coord_t* coords,
-    coord_t* coords_top,
+    coord_t* coords_init,
     atype_t* atypes,
     catype_t* catypes,
     bool* heavy,
@@ -37,9 +36,9 @@ __global__ void calc_restrseq_forces_kernel(
         for (int i = restrseqs[s].ai - 1; i < restrseqs[s].aj - 1; i++) {
             if (heavy[i] || restrseqs[s].ih) {
                 n_ctr++;
-                dr.x += (coords[i].x - coords_top[i].x);
-                dr.y += (coords[i].y - coords_top[i].y);
-                dr.z += (coords[i].z - coords_top[i].z);
+                dr.x += (coords[i].x - coords_init[i].x);
+                dr.y += (coords[i].y - coords_init[i].y);
+                dr.z += (coords[i].z - coords_init[i].z);
             }
         }
 
@@ -68,9 +67,9 @@ __global__ void calc_restrseq_forces_kernel(
             if (heavy[i] || restrseqs[s].ih) {
                 mass = catypes[atypes[i].code - 1].m;
                 totmass += mass;
-                dr.x += (coords[i].x - coords_top[i].x) * mass;
-                dr.y += (coords[i].y - coords_top[i].y) * mass;
-                dr.z += (coords[i].z - coords_top[i].z) * mass;
+                dr.x += (coords[i].x - coords_init[i].x) * mass;
+                dr.y += (coords[i].y - coords_init[i].y) * mass;
+                dr.z += (coords[i].z - coords_init[i].z) * mass;
             }
         }
 
@@ -97,9 +96,9 @@ __global__ void calc_restrseq_forces_kernel(
     else {
         for (int i = restrseqs[s].ai - 1; i < restrseqs[s].aj - 1; i++) {
             if (heavy[i] || restrseqs[s].ih) {
-                dr.x = coords[i].x - coords_top[i].x;
-                dr.y = coords[i].y - coords_top[i].y;
-                dr.z = coords[i].z - coords_top[i].z;
+                dr.x = coords[i].x - coords_init[i].x;
+                dr.y = coords[i].y - coords_init[i].y;
+                dr.z = coords[i].z - coords_init[i].z;
 
                 r2 = pow(dr.x, 2) + pow(dr.y, 2) + pow(dr.z, 2);
                 ener = .5 * k * r2;
@@ -117,14 +116,13 @@ void calc_restrseq_forces_host() {
     auto& host = Context::instance();
     if (host.n_restrseqs == 0) return;
     using namespace CudaRestrseqForce;
-    CudaContext& ctx = CudaContext::instance();
-    auto d_restrseq = ctx.d_restrseqs;
-    auto d_coords = ctx.d_coords;
-    auto d_coords_top = ctx.d_coords_top;
-    auto d_atypes = ctx.d_atypes;
-    auto d_catypes = ctx.d_catypes;
-    auto d_heavy = ctx.d_heavy;
-    auto d_dvelocities = ctx.d_dvelocities;
+    auto d_restrseq = host.restrseqs->gpu_data_p;
+    auto d_coords = host.coords->gpu_data_p;
+    auto d_coords_init = host.coords_init->gpu_data_p;
+    auto d_atypes = host.atypes->gpu_data_p;
+    auto d_catypes = host.catypes->gpu_data_p;
+    auto d_heavy = host.heavy->gpu_data_p;
+    auto d_dvelocities = host.dvelocities->gpu_data_p;
     cudaMemset(d_upres_energy, 0, sizeof(double));
     // ctx.sync_all_to_device();
 
@@ -134,7 +132,7 @@ void calc_restrseq_forces_host() {
         host.n_restrseqs,
         d_restrseq,
         d_coords,
-        d_coords_top,
+        d_coords_init,
         d_atypes,
         d_catypes,
         d_heavy,
diff --git a/src/core/cuda/src/cuda_restrwall_force.cu b/src/core/cuda/src/cuda_restrwall_force.cu
index 24183427..12d890ad 100644
--- a/src/core/cuda/src/cuda_restrwall_force.cu
+++ b/src/core/cuda/src/cuda_restrwall_force.cu
@@ -1,6 +1,5 @@
 #include <iostream>
 
-#include "cuda/include/cuda_context.cuh"
 #include "cuda/include/cuda_restrwall_force.cuh"
 #include "common/include/context.h"
 
@@ -55,11 +54,10 @@ void calc_restrwall_forces_host() {
     auto& host = Context::instance();
     if (host.n_restrwalls == 0) return;
     using namespace CudaRestrwallForce;
-    CudaContext& ctx = CudaContext::instance();
-    auto d_restrwalls = ctx.d_restrwalls;
-    auto d_coords = ctx.d_coords;
-    auto d_dvelocities = ctx.d_dvelocities;
-    auto d_heavy = ctx.d_heavy;
+    auto d_restrwalls = host.restrwalls->gpu_data_p;
+    auto d_coords = host.coords->gpu_data_p;
+    auto d_dvelocities = host.dvelocities->gpu_data_p;
+    auto d_heavy = host.heavy->gpu_data_p;
     cudaMemset(d_energies, 0, sizeof(double));
 
     int blockSize = 256;
diff --git a/src/core/cuda/src/cuda_shake_constraints.cu b/src/core/cuda/src/cuda_shake_constraints.cu
index 8c68a92e..e9dfd051 100644
--- a/src/core/cuda/src/cuda_shake_constraints.cu
+++ b/src/core/cuda/src/cuda_shake_constraints.cu
@@ -1,6 +1,5 @@
 #include <iostream>
 
-#include "cuda/include/cuda_context.cuh"
 #include "cuda/include/cuda_shake_constraints.cuh"
 #include "common/include/constants.h"
 #include "common/include/context.h"
@@ -99,13 +98,14 @@ __global__ void calc_shake_constraints_kernel(
 
 void init_shake_constraints_kernel_data() {
     auto& host = Context::instance();
+    auto *mol_n_shakes = host.mol_n_shakes->cpu_data_p;
     using namespace CudaShakeConstraints;
     if (!is_initialized) {
         if (host.n_molecules > 0) {
             int* mol_shake_offset_host = (int*)malloc(sizeof(int) * host.n_molecules);
             mol_shake_offset_host[0] = 0;
             for (int i = 1; i < host.n_molecules; i++) {
-                mol_shake_offset_host[i] = mol_shake_offset_host[i - 1] + host.mol_n_shakes[i - 1];
+                mol_shake_offset_host[i] = mol_shake_offset_host[i - 1] + mol_n_shakes[i - 1];
             }
             check_cudaMalloc((void**)&d_mol_shake_offset, sizeof(int) * host.n_molecules);
             cudaMemcpy(d_mol_shake_offset, mol_shake_offset_host, sizeof(int) * host.n_molecules, cudaMemcpyHostToDevice);
@@ -137,12 +137,11 @@ int calc_shake_constraints_host() {
     int blocks = host.n_molecules;
     int threads = 32;
 
-    CudaContext& ctx = CudaContext::instance();
-    auto d_mol_n_shakes = ctx.d_mol_n_shakes;
-    auto d_shake_bonds = ctx.d_shake_bonds;
-    auto d_coords = ctx.d_coords;
-    auto d_xcoords = ctx.d_xcoords;
-    auto d_winv = ctx.d_winv;
+    auto d_mol_n_shakes = host.mol_n_shakes->gpu_data_p;
+    auto d_shake_bonds = host.shake_bonds->gpu_data_p;
+    auto d_coords = host.coords->gpu_data_p;
+    auto d_xcoords = host.xcoords->gpu_data_p;
+    auto d_winv = host.winv->gpu_data_p;
 
     calc_shake_constraints_kernel<<<blocks, threads>>>(
         host.n_molecules,
@@ -155,6 +154,6 @@ int calc_shake_constraints_host() {
         d_mol_shake_offset);
     cudaDeviceSynchronize();
     cudaMemcpy(&total_iterations_host, d_total_iterations, sizeof(int), cudaMemcpyDeviceToHost);
-    cudaMemcpy(host.coords.data(), d_coords, sizeof(coord_t) * host.n_atoms, cudaMemcpyDeviceToHost);
+    host.coords->download();
     return host.n_molecules == 0 ? 0 : total_iterations_host / host.n_molecules;
 }
diff --git a/src/core/cuda/src/cuda_temperature.cu b/src/core/cuda/src/cuda_temperature.cu
index 138fa617..22b4dc82 100644
--- a/src/core/cuda/src/cuda_temperature.cu
+++ b/src/core/cuda/src/cuda_temperature.cu
@@ -1,4 +1,3 @@
-#include "cuda/include/cuda_context.cuh"
 #include "cuda/include/cuda_temperature.cuh"
 #include "cuda/include/cuda_utility.cuh"
 #include "common/include/constants.h"
@@ -58,11 +57,10 @@ void calc_temperature_host() {
     cudaMemcpy(d_Tfree_solvent, &h_Tfree_solvent, sizeof(double), cudaMemcpyHostToDevice);
     cudaMemcpy(d_Texcl_solvent, &h_Texcl_solvent, sizeof(double), cudaMemcpyHostToDevice);
 
-    CudaContext& ctx = CudaContext::instance();
-    atype_t* d_atypes = ctx.d_atypes;
-    catype_t* d_catypes = ctx.d_catypes;
-    vel_t* d_velocities = ctx.d_velocities;
-    bool* d_excluded = ctx.d_excluded;
+    atype_t* d_atypes = host.atypes->gpu_data_p;
+    catype_t* d_catypes = host.catypes->gpu_data_p;
+    vel_t* d_velocities = host.velocities->gpu_data_p;
+    bool* d_excluded = host.excluded->gpu_data_p;
 
     int blockSize = 256;
     int numBlocks = (host.n_atoms + blockSize - 1) / blockSize;
diff --git a/src/core/cuda/src/cuda_torsion_force.cu b/src/core/cuda/src/cuda_torsion_force.cu
index 3d6d3174..6ef7cd45 100644
--- a/src/core/cuda/src/cuda_torsion_force.cu
+++ b/src/core/cuda/src/cuda_torsion_force.cu
@@ -1,6 +1,6 @@
-#include "cuda/include/cuda_context.cuh"
 #include "cuda/include/cuda_torsion_force.cuh"
 #include "cuda/include/cuda_utility.cuh"
+#include "context.h"
 
 namespace CudaTorsionForce {
 bool is_initialized = false;
@@ -133,11 +133,11 @@ double calc_torsion_forces_host(int start, int end) {
     double zero = 0.0;
     cudaMemcpy(d_energy_sum, &zero, sizeof(double), cudaMemcpyHostToDevice);
 
-    CudaContext& ctx = CudaContext::instance();
-    coord_t* d_coords = ctx.d_coords;
-    dvel_t* d_dvelocities = ctx.d_dvelocities;
-    torsion_t* d_torsions = ctx.d_torsions;
-    ctorsion_t* d_ctorsions = ctx.d_ctorsions;
+    auto& host_ctx = Context::instance();
+    coord_t* d_coords = host_ctx.coords->gpu_data_p;
+    dvel_t* d_dvelocities = host_ctx.dvelocities->gpu_data_p;
+    torsion_t* d_torsions = host_ctx.torsions->gpu_data_p;
+    ctorsion_t* d_ctorsions = host_ctx.ctorsions->gpu_data_p;
 
     calc_torsion_forces_kernel<<<numBlocks, blockSize>>>(start, end, d_torsions, d_ctorsions, d_coords, d_dvelocities, d_energy_sum);
     cudaDeviceSynchronize();
@@ -161,4 +161,4 @@ void cleanup_torsion_force() {
         cudaFree(d_energy_sum);
         is_initialized = false;
     }
-}
\ No newline at end of file
+}
diff --git a/src/core/cuda/src/cuda_utility.cu b/src/core/cuda/src/cuda_utility.cu
index ab1cca7c..3430736f 100644
--- a/src/core/cuda/src/cuda_utility.cu
+++ b/src/core/cuda/src/cuda_utility.cu
@@ -1,19 +1 @@
 #include "cuda/include/cuda_utility.cuh"
-
-#include <stdio.h>
-#include <stdlib.h>
-
-void check_cudaMalloc(void** devPtr, size_t size) {
-    cudaError_t error = cudaMalloc((void**)devPtr, size);
-    if (error != cudaSuccess) {
-        printf(">>> FATAL: cudaMalloc failed with error code %d: %s\n", error, cudaGetErrorString(error));
-        exit(EXIT_FAILURE);
-    }
-}
-
-void check_cuda(cudaError_t status) {
-    if (status != cudaSuccess) {
-        printf(">>> FATAL: CUDA call failed with error code %d: %s\n", status, cudaGetErrorString(status));
-        exit(EXIT_FAILURE);
-    }
-}