Improve multidimensional array access via VPI

2024-12-29 10:47:34 +00:00 · 2024-12-09 09:15:26 -05:00 · 2024-12-09 09:15:26 -05:00 · f8f00f136e
commit f8f00f136e
parent ff75e5f530
6 changed files with 704 additions and 244 deletions
--- a/include/verilated_vpi.cpp
+++ b/include/verilated_vpi.cpp
@ -85,7 +85,7 @@ public:
        // To simplify our free list, we use a size large enough for all derived types
        // We reserve word zero for the next pointer, as that's safer in case a
        // dangling reference to the original remains around.
-        static constexpr size_t CHUNK_SIZE = 256;
+        static constexpr size_t CHUNK_SIZE = 128;
        if (VL_UNCOVERABLE(size > CHUNK_SIZE))
            VL_FATAL_MT(__FILE__, __LINE__, "", "increase CHUNK_SIZE");
        if (VL_LIKELY(t_freeHeadp)) {
@ -473,7 +473,7 @@ class VerilatedVpioRegIter final : public VerilatedVpio {
 public:
    explicit VerilatedVpioRegIter(const VerilatedVpioVar* vop)
        : m_var{new VerilatedVpioVar(vop)}
-        , m_maxDim{vop->indexedDim() + 1 >= vop->varp()->udims() ? vop->varp()->dims() - 1 : vop->varp()->udims() - 1} {
+        , m_maxDim{vop->varp()->udims() - 1} {
            for (auto it = vop->indexedDim() + 1; it <= m_maxDim; it++)
                m_ranges.push_back(*vop->varp()->range(it));
            for (auto it : m_ranges)
@ -485,7 +485,7 @@ public:
    }
    uint32_t type() const override { return vpiIterator; }
    vpiHandle dovpi_scan() override {
-        if (VL_UNLIKELY(m_var->indexedDim() == m_maxDim)) {
+        if (VL_UNLIKELY(m_var->indexedDim() >= m_maxDim)) {
            // Trying to iterate over a non-array object
            delete this;
            return nullptr;
@ -2193,6 +2193,9 @@ vpiHandle vpi_iterate(PLI_INT32 type, vpiHandle object) {
        for (int dim = vop->indexedDim() + 1; dim <= maxDim; dim++)
            ranges.emplace_back(*vop->varp()->range(dim));

+        // allow one more range layer (regbit)
+        if (ranges.empty())
+            ranges.emplace_back(VerilatedRange(0, 0));
        return ((new VerilatedVpioRangeIter{ranges})->castVpiHandle());
    }
    case vpiReg: {
@ -2282,7 +2285,7 @@ PLI_INT32 vpi_get(PLI_INT32 property, vpiHandle object) {
    case vpiVector: {
        const VerilatedVpioVarBase* const vop = VerilatedVpioVarBase::castp(object);
        if (VL_UNLIKELY(!vop)) return vpiUndefined;
-        return (property == vpiVector) ^ (vop->varp()->entBits() == 1);
+        return (property == vpiVector) ^ (vop->varp()->packedRanges().empty() || !vop->rangep());
    }
    case vpiSize: {
        const VerilatedVpioVarBase* const vop = VerilatedVpioVarBase::castp(object);
@ -2380,7 +2383,9 @@ bool vl_check_format(const VerilatedVar* varp, const p_vpi_value valuep, const c
        switch (varp->vltype()) {
        case VLVT_UINT8:
        case VLVT_UINT16:
-        case VLVT_UINT32: return status;
+        case VLVT_UINT32:
+        case VLVT_UINT64:
+        case VLVT_WDATA: return status;
        default: status = false;
        }
    } else if (valuep->format == vpiRealVal) {
@ -2417,60 +2422,121 @@ static void vl_strprintf(std::string& buffer, char const* fmt, ...) {
    va_end(args);
 }

-// vl_put_word / vl_get_word don't operate across word boundaries!
-// T should match the underlying datatype.
 template<typename T>
-void vl_put_word(const VerilatedVpioVar* vop, T word) {
-    size_t wordBits = sizeof(T) * 8;
-    uint32_t varBits = vop->size();
+struct VarAccessInfo final {  // Location and masks for one bit-field access, as computed by vl_var_access_info()
+    size_t bitOffset;  // Bit position of the field's least significant bit within ptr[wordOffset]
+    size_t wordOffset;  // Index into ptr of the word that holds the field's least significant bit
+    T mask_lo, mask_hi;  // mask_lo: field bits within ptr[wordOffset]; mask_hi: field bits that spill into ptr[wordOffset + 1], 0 when the field fits in one word
+    T* ptr;  // Variable data pointer (varDatap()) viewed as an array of T
+};

+template<typename T>  // T: storage word type; instantiated with CData/SData/IData/QData/EData by vl_get_word/vl_put_word
+VarAccessInfo<T> vl_var_access_info(const VerilatedVpioVarBase* vop, size_t bitCount, size_t addOffset) {  // Locate up to bitCount bits starting addOffset bits past vop->bitOffset(); returns word index, bit offset and masks
+    size_t wordBits = sizeof(T) * 8;
+    uint32_t varBits;
+
+    // Determine the bit width of the accessed object: entBits() while unpacked dimensions remain unindexed, otherwise vop->size()
    if (vop->indexedDim() + 1 < vop->varp()->udims())
        varBits = vop->varp()->entBits();
+    else
+        varBits = vop->size();

-    switch (vop->varp()->vltype()) {
-        case VLVT_REAL: varBits *= sizeof(double) * 8; break;
-        case VLVT_STRING: // TODO
-        default: break;
-    }
-    T* ptr = reinterpret_cast<T*>(vop->varDatap());
-    T mask = 0;
-    assert(varBits > 0);
-    assert(varBits + vop->bitOffset() <= wordBits);
-    if (varBits < wordBits) {
-        mask = ((T)1 << varBits) - 1;
-        mask = mask << vop->bitOffset();
+    if (vop->varp()->vltype() == VLVT_REAL)
+        varBits *= sizeof(double) * 8;
+
+    // Clamp the request so the access (read or write) never reaches past the end of the object
+    assert(varBits > addOffset);
+    bitCount = std::min(bitCount, varBits - addOffset);
+
+    VarAccessInfo<T> info;
+    info.ptr = reinterpret_cast<T*>(vop->varDatap());
+    if (vop->varp()->vltype() == VLVT_WDATA) {  // Wide data: stored as several EData words, so the field may span two of them
+        assert(sizeof(T) == sizeof(EData));
+        assert(bitCount <= wordBits);  // A single call handles at most one word's worth of bits
+        info.wordOffset = (vop->bitOffset() + addOffset) / wordBits;
+        info.bitOffset = (vop->bitOffset() + addOffset) % wordBits;
+        if (bitCount + info.bitOffset <= wordBits) {
+            // Field lies within a single word
+            if (bitCount == wordBits)
+                info.mask_lo = ~(T)0;  // Full-word access: avoid shifting by the whole word width
+            else
+                info.mask_lo = ((T)1 << bitCount) - 1;
+            info.mask_lo = info.mask_lo << info.bitOffset;
+            info.mask_hi = 0;
+        } else {
+            // Field straddles a word boundary (only reachable with bitOffset > 0)
+            info.mask_lo = ((T)1 << (wordBits - info.bitOffset)) - 1;  // Bits from bitOffset up to the top of the first word
+            info.mask_lo = info.mask_lo << info.bitOffset;
+            info.mask_hi = ((T)1 << (bitCount + info.bitOffset - wordBits)) - 1;  // Remaining bits at the bottom of the next word
+        }
    } else {
-        mask = ~mask;
+        info.wordOffset = 0;  // Narrow types: the whole field must fit in one word of type T
+        info.bitOffset = vop->bitOffset() + addOffset;
+        assert(bitCount + info.bitOffset <= wordBits);
+        if (bitCount < wordBits) {
+            info.mask_lo = ((T)1 << bitCount) - 1;
+            info.mask_lo = info.mask_lo << info.bitOffset;
+        } else {
+            info.mask_lo = ~(T)0;  // Full-word access: avoid shifting by the whole word width
+        }
+        info.mask_hi = 0;
    }
-    *ptr = (*ptr & ~mask) | ((word << vop->bitOffset()) & mask);
+    return info;
 }

 template<typename T>
-T vl_get_word(const VerilatedVpioVarBase* vop) {
+T vl_get_word_gen(const VerilatedVpioVarBase* vop, size_t bitCount, size_t addOffset) {  // Read up to bitCount bits starting addOffset bits into vop's data; result is aligned to bit 0
    size_t wordBits = sizeof(T) * 8;
-    uint32_t varBits = vop->size();
-
-    if (vop->indexedDim() + 1 < vop->varp()->udims())
-        varBits = vop->varp()->entBits();
-
-    switch (vop->varp()->vltype()) {
-        case VLVT_REAL: varBits *= sizeof(double) * 8; break;
-        case VLVT_STRING: // TODO
-        default: break;
-    }
-    T* ptr = reinterpret_cast<T*>(vop->varDatap());
-    T mask = 0;
-    assert(varBits > 0);
-    assert(varBits + vop->bitOffset() <= wordBits);
-    if (varBits < wordBits) {
-        mask = ((T)1 << varBits) - 1;
-        mask = mask << vop->bitOffset();
-    } else {
-        mask = ~mask;
-    }
-    return (*ptr & mask) >> vop->bitOffset();
+    VarAccessInfo<T> info = vl_var_access_info<T>(vop, bitCount, addOffset);
+    if (info.mask_hi)  // Non-zero only when the field continues into the following word
+        return ((info.ptr[info.wordOffset] & info.mask_lo) >> info.bitOffset) |  // low part, taken from the first word
+               ((info.ptr[info.wordOffset + 1] & info.mask_hi) << (wordBits - info.bitOffset));  // high part, placed above the low part
+    else
+        return (info.ptr[info.wordOffset] & info.mask_lo) >> info.bitOffset;  // Field is contained in a single word
 }

+template<typename T>
+void vl_put_word_gen(const VerilatedVpioVar* vop, T word, size_t bitCount, size_t addOffset) {
+    // Read-modify-write 'word' into the bit-field located by vl_var_access_info();
+    // bits outside the computed masks are left untouched.
+    const VarAccessInfo<T> where = vl_var_access_info<T>(vop, bitCount, addOffset);
+    T* const lowp = where.ptr + where.wordOffset;
+
+    if (where.mask_hi) {
+        // The field continues into the next word: store the bits that did not fit in the first one
+        const size_t bitsInLow = sizeof(T) * 8 - where.bitOffset;
+        T* const highp = lowp + 1;
+        *highp = (*highp & ~where.mask_hi) | ((word >> bitsInLow) & where.mask_hi);
+    }
+    *lowp = (*lowp & ~where.mask_lo) | ((word << where.bitOffset) & where.mask_lo);
+}
+
+// Read bits from the variable (or indexed element) behind vop, dispatching on its storage type.
+// bitCount: upper bound on the number of bits read; fewer are read if the end of the variable is reached
+// addOffset: extra bit offset at which the read starts
+// Returns the bits aligned to bit 0, or 0 after reporting a VPI error for unsupported storage types.
+QData vl_get_word(const VerilatedVpioVarBase* vop, size_t bitCount, size_t addOffset) {
+    const auto vltype = vop->varp()->vltype();
+    QData result = 0;
+    switch (vltype) {
+    case VLVT_UINT8: result = vl_get_word_gen<CData>(vop, bitCount, addOffset); break;
+    case VLVT_UINT16: result = vl_get_word_gen<SData>(vop, bitCount, addOffset); break;
+    case VLVT_UINT32: result = vl_get_word_gen<IData>(vop, bitCount, addOffset); break;
+    case VLVT_UINT64: result = vl_get_word_gen<QData>(vop, bitCount, addOffset); break;
+    case VLVT_WDATA: result = vl_get_word_gen<EData>(vop, bitCount, addOffset); break;
+    default:
+        VL_VPI_ERROR_(__FILE__, __LINE__, "%s: Unsupported vltype (%d)", __func__, vltype);
+        break;
+    }
+    return result;
+}
+
+// Write bits into the variable (or indexed element) behind vop, dispatching on its storage type.
+// word: data to be written, aligned to bit 0
+// bitCount: maximum number of bits to write (at most 64 are used); the write stops earlier
+//           if it reaches the var bounds
+// addOffset: additional write bitoffset
+// Unsupported storage types report a VPI error and leave the variable unchanged.
+void vl_put_word(const VerilatedVpioVar* vop, QData word, size_t bitCount, size_t addOffset) {
+    switch (vop->varp()->vltype()) {
+        case VLVT_UINT8: vl_put_word_gen<CData>(vop, word, bitCount, addOffset); break;
+        case VLVT_UINT16: vl_put_word_gen<SData>(vop, word, bitCount, addOffset); break;
+        case VLVT_UINT32: vl_put_word_gen<IData>(vop, word, bitCount, addOffset); break;
+        case VLVT_UINT64: vl_put_word_gen<QData>(vop, word, bitCount, addOffset); break;
+        case VLVT_WDATA: {
+            // vl_put_word_gen<EData> can handle at most one EData word per call, but the
+            // vpiIntVal and vpiDecStrVal paths of vpi_put_value request 64 bits regardless of
+            // the storage type. Split the write into EData-sized pieces instead of tripping
+            // the bitCount assertion (or shifting by >= the word width when asserts are off).
+            const size_t edataBits = sizeof(EData) * 8;
+            // Same width rule as vl_var_access_info(), needed to stop at the var bounds
+            size_t varBits;
+            if (vop->indexedDim() + 1 < vop->varp()->udims())
+                varBits = vop->varp()->entBits();
+            else
+                varBits = vop->size();
+            // 'word' carries no more than 64 bits; this also keeps the shift below in range
+            bitCount = std::min<size_t>(bitCount, sizeof(QData) * 8);
+            size_t done = 0;
+            // The first piece is always issued so that single-word requests behave exactly as before
+            do {
+                vl_put_word_gen<EData>(vop, static_cast<EData>(word >> done),
+                                       std::min(edataBits, bitCount - done), addOffset + done);
+                done += edataBits;
+            } while (done < bitCount && addOffset + done < varBits);
+            break;
+        }
+        default:
+            VL_VPI_ERROR_(__FILE__, __LINE__, "%s: Unsupported vltype (%d)", __func__, vop->varp()->vltype());
+    }
+}

 void vl_get_value(const VerilatedVpioVarBase* vop, p_vpi_value valuep) {
    const VerilatedVar* varp = vop->varp();
@ -2480,6 +2546,13 @@ void vl_get_value(const VerilatedVpioVarBase* vop, p_vpi_value valuep) {
    if (!vl_check_format(varp, valuep, fullname, true)) return;
    // string data type is dynamic and may vary in size during simulation
    static thread_local std::string t_outDynamicStr;
+
+    int varBits;
+    if (vop->indexedDim() + 1 < vop->varp()->udims())
+        varBits = vop->varp()->entBits();
+    else
+        varBits = vop->size();
+
    // We used to presume vpiValue.format = vpiIntVal or if single bit vpiScalarVal
    // This may cause backward compatibility issues with older code.
    if (valuep->format == vpiVectorVal) {
@ -2487,27 +2560,8 @@ void vl_get_value(const VerilatedVpioVarBase* vop, p_vpi_value valuep) {
        // It only needs to persist until the next vpi_get_value
        static thread_local t_vpi_vecval t_out[VL_VALUE_STRING_MAX_WORDS * 2];
        valuep->value.vector = t_out;
-        if (varp->vltype() == VLVT_UINT8) {
-            t_out[0].aval = vl_get_word<CData>(vop);
-            t_out[0].bval = 0;
-            return;
-        } else if (varp->vltype() == VLVT_UINT16) {
-            t_out[0].aval = vl_get_word<SData>(vop);
-            t_out[0].bval = 0;
-            return;
-        } else if (varp->vltype() == VLVT_UINT32) {
-            t_out[0].aval = vl_get_word<IData>(vop);
-            t_out[0].bval = 0;
-            return;
-        } else if (varp->vltype() == VLVT_UINT64) {
-            const QData data = vl_get_word<QData>(vop);
-            t_out[1].aval = static_cast<IData>(data >> 32ULL);
-            t_out[1].bval = 0;
-            t_out[0].aval = static_cast<IData>(data);
-            t_out[0].bval = 0;
-            return;
-        } else if (varp->vltype() == VLVT_WDATA) {
-            const int words = VL_WORDS_I(varp->entBits());
+        if (varp->vltype() == VLVT_WDATA) {
+            const int words = VL_WORDS_I(varBits);
            if (VL_UNCOVERABLE(words >= VL_VALUE_STRING_MAX_WORDS)) {
                VL_VPI_ERROR_(
                    __FILE__, __LINE__,
@ -2515,83 +2569,63 @@ void vl_get_value(const VerilatedVpioVarBase* vop, p_vpi_value valuep) {
                    "recompile");
                return;
            }
-            const WDataInP datap = (reinterpret_cast<EData*>(varDatap));
            for (int i = 0; i < words; ++i) {
-                t_out[i].aval = datap[i];
+                t_out[i].aval = vl_get_word(vop, 32, i * 32);
                t_out[i].bval = 0;
            }
            return;
+        } else if (varp->vltype() == VLVT_UINT64 && varBits > 32) {
+            const QData data = vl_get_word(vop, 64, 0);
+            t_out[1].aval = static_cast<IData>(data >> 32ULL);
+            t_out[1].bval = 0;
+            t_out[0].aval = static_cast<IData>(data);
+            t_out[0].bval = 0;
+            return;
+        } else {
+            t_out[0].aval = vl_get_word(vop, 32, 0);
+            t_out[0].bval = 0;
+            return;
        }
    } else if (valuep->format == vpiBinStrVal) {
-        const size_t bits = vop->size();
-        t_outDynamicStr.resize(bits);
+        t_outDynamicStr.resize(varBits);
        const CData* datap = (reinterpret_cast<CData*>(varDatap));
-        for (size_t i = 0; i < bits; ++i) {
+        for (size_t i = 0; i < varBits; ++i) {
            size_t pos = i + vop->bitOffset();
            const char val = (datap[pos >> 3] >> (pos & 7)) & 1;
-            t_outDynamicStr[bits - i - 1] = val ? '1' : '0';
+            t_outDynamicStr[varBits - i - 1] = val ? '1' : '0';
        }
        valuep->value.str = const_cast<PLI_BYTE8*>(t_outDynamicStr.c_str());
        return;
    } else if (valuep->format == vpiOctStrVal) {
-        int chars = (vop->size() + 2) / 3;
+        const int chars = (varBits + 2) / 3;
        t_outDynamicStr.resize(chars);
-        const int bytes = VL_BYTES_I(varp->entBits());
-        const CData* datap = (reinterpret_cast<CData*>(varDatap));
        for (size_t i = 0; i < chars; ++i) {
-            const div_t idx = div(i * 3, 8);
-            int val = datap[idx.quot];
-            if ((idx.quot + 1) < bytes) {
-                // if the next byte is valid or that in
-                // for when the required 3 bits straddle adjacent bytes
-                val |= datap[idx.quot + 1] << 8;
-            }
-            // align so least significant 3 bits represent octal char
-            val >>= idx.rem;
-            if (i == (chars - 1)) {
-                // most significant char, mask off nonexistent bits when vector
-                // size is not a multiple of 3
-                const unsigned int rem = varp->entBits() % 3;
-                if (rem) {
-                    // generate bit mask & zero nonexistent bits
-                    val &= (1 << rem) - 1;
-                }
-            }
-            t_outDynamicStr[chars - i - 1] = '0' + (val & 7);
+            const char val = vl_get_word(vop, 3, i * 3);
+            t_outDynamicStr[chars - i - 1] = '0' + val;
        }
        valuep->value.str = const_cast<PLI_BYTE8*>(t_outDynamicStr.c_str());
        return;
    } else if (valuep->format == vpiDecStrVal) {
        if (varp->vltype() == VLVT_UINT8) {
            vl_strprintf(t_outDynamicStr, "%hhu",
-                         static_cast<unsigned char>(vl_get_word<CData>(vop)));
+                         static_cast<unsigned char>(vl_get_word(vop, 8, 0)));
        } else if (varp->vltype() == VLVT_UINT16) {
            vl_strprintf(t_outDynamicStr, "%hu",
-                         static_cast<unsigned short>(vl_get_word<SData>(vop)));
+                         static_cast<unsigned short>(vl_get_word(vop, 16, 0)));
        } else if (varp->vltype() == VLVT_UINT32) {
            vl_strprintf(t_outDynamicStr, "%u",
-                         static_cast<unsigned int>(vl_get_word<IData>(vop)));
+                         static_cast<unsigned int>(vl_get_word(vop, 32, 0)));
        } else if (varp->vltype() == VLVT_UINT64) {
            vl_strprintf(t_outDynamicStr, "%llu",  // lintok-format-ll
-                         static_cast<unsigned long long>(vl_get_word<QData>(vop)));
+                         static_cast<unsigned long long>(vl_get_word(vop, 64, 0)));
        }
        valuep->value.str = const_cast<PLI_BYTE8*>(t_outDynamicStr.c_str());
        return;
    } else if (valuep->format == vpiHexStrVal) {
-        int chars = (varp->entBits() + 3) >> 2;
+        const int chars = (varBits + 3) >> 2;
        t_outDynamicStr.resize(chars);
-        const CData* datap = (reinterpret_cast<CData*>(varDatap));
        for (size_t i = 0; i < chars; ++i) {
-            char val = (datap[i >> 1] >> ((i & 1) << 2)) & 15;
-            if (i == (chars - 1)) {
-                // most significant char, mask off nonexistent bits when vector
-                // size is not a multiple of 4
-                const unsigned int rem = varp->entBits() & 3;
-                if (rem) {
-                    // generate bit mask & zero nonexistent bits
-                    val &= (1 << rem) - 1;
-                }
-            }
+            const char val = vl_get_word(vop, 4, i * 4);
            t_outDynamicStr[chars - i - 1] = "0123456789abcdef"[static_cast<int>(val)];
        }
        valuep->value.str = const_cast<PLI_BYTE8*>(t_outDynamicStr.c_str());
@ -2607,28 +2641,19 @@ void vl_get_value(const VerilatedVpioVarBase* vop, p_vpi_value valuep) {
                return;
            }
        } else {
-            int bytes = VL_BYTES_I(varp->entBits());
-            t_outDynamicStr.resize(bytes);
-            const CData* datap = (reinterpret_cast<CData*>(varDatap));
-            for (size_t i = 0; i < bytes; ++i) {
-                const char val = datap[bytes - i - 1];
+            const int chars = VL_BYTES_I(varBits);
+            t_outDynamicStr.resize(chars);
+            for (size_t i = 0; i < chars; ++i) {
+                const char val = vl_get_word(vop, 8, i * 8);
                // other simulators replace [leading?] zero chars with spaces, replicate here.
-                t_outDynamicStr[i] = val ? val : ' ';
+                t_outDynamicStr[chars - i - 1] = val ? val : ' ';
            }
            valuep->value.str = const_cast<PLI_BYTE8*>(t_outDynamicStr.c_str());
            return;
        }
    } else if (valuep->format == vpiIntVal) {
-        if (varp->vltype() == VLVT_UINT8) {
-            valuep->value.integer = vl_get_word<CData>(vop);
-            return;
-        } else if (varp->vltype() == VLVT_UINT16) {
-            valuep->value.integer = vl_get_word<SData>(vop);
-            return;
-        } else if (varp->vltype() == VLVT_UINT32) {
-            valuep->value.integer = vl_get_word<IData>(vop);
-            return;
-        }
+        valuep->value.integer = vl_get_word(vop, 32, 0);
+        return;
    } else if (valuep->format == vpiRealVal) {
        valuep->value.real = *(reinterpret_cast<double*>(varDatap));
        return;
@ -2699,38 +2724,31 @@ vpiHandle vpi_put_value(vpiHandle object, p_vpi_value valuep, p_vpi_time /*time_
            return nullptr;
        }
        if (!vl_check_format(vop->varp(), valuep, vop->fullname(), false)) return nullptr;
-        // TODO: ensure we're writing to packed?
+        int varBits;
+        if (vop->indexedDim() + 1 < vop->varp()->udims())
+            varBits = vop->varp()->entBits();
+        else
+            varBits = vop->size();
        if (valuep->format == vpiVectorVal) {
            if (VL_UNLIKELY(!valuep->value.vector)) return nullptr;
-            if (vop->varp()->vltype() == VLVT_UINT8) {
-                vl_put_word<CData>(vop, valuep->value.vector[0].aval);
+            if (vop->varp()->vltype() == VLVT_WDATA) {
+                const int words = VL_WORDS_I(varBits);
+                for (int i = 0; i < words; ++i)
+                    vl_put_word(vop, valuep->value.vector[i].aval, 32, i * 32);
                return object;
-            } else if (vop->varp()->vltype() == VLVT_UINT16) {
-                vl_put_word<SData>(vop, valuep->value.vector[0].aval);
-                return object;
-            } else if (vop->varp()->vltype() == VLVT_UINT32) {
-                vl_put_word<IData>(vop, valuep->value.vector[0].aval);
-                return object;
-            } else if (vop->varp()->vltype() == VLVT_UINT64) {
+            } else if (vop->varp()->vltype() == VLVT_UINT64 && varBits > 32) {
                QData val = ((QData)valuep->value.vector[1].aval << 32)
                            | (QData)valuep->value.vector[0].aval;
-                vl_put_word<QData>(vop, val);
+                vl_put_word(vop, val, 64, 0);
                return object;
-            } else if (vop->varp()->vltype() == VLVT_WDATA) {
-                // TODO
-                const int words = VL_WORDS_I(vop->size());
-                WDataOutP datap = (reinterpret_cast<EData*>(vop->varDatap()));
-                for (int i = 0; i < words; ++i) {
-                    datap[i] = valuep->value.vector[i].aval;
-                    if (i == (words - 1)) datap[i] &= vop->mask();
-                }
+            } else {
+                vl_put_word(vop, valuep->value.vector[0].aval, 32, 0);
                return object;
            }
        } else if (valuep->format == vpiBinStrVal) {
-            const int bits = vop->size();
            const int len = std::strlen(valuep->value.str);
            CData* const datap = (reinterpret_cast<CData*>(vop->varDatap()));
-            for (int i = 0; i < bits; ++i) {
+            for (int i = 0; i < varBits; ++i) {
                bool set = (i < len) && (valuep->value.str[len - i - 1] == '1');
                size_t pos = vop->bitOffset() + i;

@ -2741,54 +2759,21 @@ vpiHandle vpi_put_value(vpiHandle object, p_vpi_value valuep, p_vpi_time /*time_
            }
            return object;
        } else if (valuep->format == vpiOctStrVal) {
-            const int chars = (vop->size() + 2) / 3;
-            const int bytes = VL_BYTES_I(vop->varp()->entBits());
+            const int chars = (varBits + 2) / 3;
            const int len = std::strlen(valuep->value.str);
            CData* const datap = (reinterpret_cast<CData*>(vop->varDatap()));
-            div_t idx;
-            datap[0] = 0;  // reset zero'th byte
-            for (int i = 0; i < chars; ++i) {
-                union {
-                    char byte[2];
-                    uint16_t half;
-                } val;
-                idx = div(i * 3, 8);
-                if (i < len) {
-                    // ignore illegal chars
-                    const char digit = valuep->value.str[len - i - 1];
-                    if (digit >= '0' && digit <= '7') {
-                        val.half = digit - '0';
-                    } else {
-                        VL_VPI_WARNING_(__FILE__, __LINE__,
-                                        "%s: Non octal character '%c' in '%s' as value %s for %s",
-                                        __func__, digit, valuep->value.str,
-                                        VerilatedVpiError::strFromVpiVal(valuep->format),
-                                        vop->fullname());
-                        val.half = 0;
-                    }
-                } else {
-                    val.half = 0;
-                }
-                // align octal character to position within vector, note that
-                // the three bits may straddle a byte boundary so two byte wide
-                // assignments are made to adjacent bytes - but not if the least
-                // significant byte of the aligned value is the most significant
-                // byte of the destination.
-                val.half <<= idx.rem;
-                datap[idx.quot] |= val.byte[0];  // or in value
-                if ((idx.quot + 1) < bytes) {
-                    datap[idx.quot + 1] = val.byte[1];  // this also resets
-                    // all bits to 0 prior to or'ing above
+            for (int i = 0; i < len; ++i) {
+                char digit = valuep->value.str[len - i - 1] - '0';
+                if (digit < 0 || digit > 7) {
+                    VL_VPI_WARNING_(__FILE__, __LINE__,
+                                    "%s: Non octal character '%c' in '%s' as value %s for %s",
+                                    __func__, digit + '0', valuep->value.str,
+                                    VerilatedVpiError::strFromVpiVal(valuep->format),
+                                    vop->fullname());
+                    digit = 0;
                }
+                vl_put_word(vop, digit, 3, i * 3);
            }
-            // mask off non-existent bits in the most significant byte
-            if (idx.quot == (bytes - 1)) {
-                datap[idx.quot] &= vop->mask_byte(idx.quot);
-            } else if (idx.quot + 1 == (bytes - 1)) {
-                datap[idx.quot + 1] &= vop->mask_byte(idx.quot + 1);
-            }
-            // zero off remaining top bytes
-            for (int i = idx.quot + 2; i < bytes; ++i) datap[i] = 0;
            return object;
        } else if (valuep->format == vpiDecStrVal) {
            char remainder[16];
@ -2807,22 +2792,10 @@ vpiHandle vpi_put_value(vpiHandle object, p_vpi_value valuep, p_vpi_time /*time_
                                remainder, valuep->value.str,
                                VerilatedVpiError::strFromVpiVal(valuep->format), vop->fullname());
            }
-            if (vop->varp()->vltype() == VLVT_UINT8) {
-                vl_put_word<CData>(vop, val);
-                return object;
-            } else if (vop->varp()->vltype() == VLVT_UINT16) {
-                vl_put_word<SData>(vop, val);
-                return object;
-            } else if (vop->varp()->vltype() == VLVT_UINT32) {
-                vl_put_word<IData>(vop, val);
-                return object;
-            } else if (vop->varp()->vltype() == VLVT_UINT64) {
-                vl_put_word<QData>(vop, val);
-                return object;
-            }
+            vl_put_word(vop, val, 64, 0);
+            return object;
        } else if (valuep->format == vpiHexStrVal) {
-            const int chars = (vop->varp()->entBits() + 3) >> 2;
-            CData* const datap = (reinterpret_cast<CData*>(vop->varDatap()));
+            const int chars = (varBits + 3) >> 2;
            const char* val = valuep->value.str;
            // skip hex ident if one is detected at the start of the string
            if (val[0] == '0' && (val[1] == 'x' || val[1] == 'X')) val += 2;
@ -2850,46 +2823,29 @@ vpiHandle vpi_put_value(vpiHandle object, p_vpi_value valuep, p_vpi_time /*time_
                    hex = 0;
                }
                // assign hex digit value to destination
-                if (i & 1) {
-                    datap[i >> 1] |= hex << 4;
-                } else {
-                    datap[i >> 1] = hex;  // this also resets all
-                    // bits to 0 prior to or'ing above of the msb
-                }
+                vl_put_word(vop, hex, 4, i * 4);
            }
-            // apply bit mask to most significant byte
-            datap[(chars - 1) >> 1] &= vop->mask_byte((chars - 1) >> 1);
            return object;
        } else if (valuep->format == vpiStringVal) {
            if (vop->varp()->vltype() == VLVT_STRING) {
                *(reinterpret_cast<std::string*>(vop->varDatap())) = valuep->value.str;
                return object;
            } else {
-                const int bytes = VL_BYTES_I(vop->size());
+                const int chars = VL_BYTES_I(varBits);
                const int len = std::strlen(valuep->value.str);
-                CData* const datap = (reinterpret_cast<CData*>(vop->varDatap()));
-                for (int i = 0; i < bytes; ++i) {
+                for (int i = 0; i < chars; ++i) {
                    // prepend with 0 values before placing string the least significant bytes
-                    datap[i] = (i < len) ? valuep->value.str[len - i - 1] : 0;
+                    const char c = (i < len) ? valuep->value.str[len - i - 1] : 0;
+                    vl_put_word(vop, c, 8, i * 8);
                }
            }
            return object;
        } else if (valuep->format == vpiIntVal) {
-            if (vop->varp()->vltype() == VLVT_UINT8) {
-                vl_put_word<CData>(vop, valuep->value.integer);
-                return object;
-            } else if (vop->varp()->vltype() == VLVT_UINT16) {
-                vl_put_word<SData>(vop, valuep->value.integer);
-                return object;
-            } else if (vop->varp()->vltype() == VLVT_UINT32) {
-                vl_put_word<IData>(vop, valuep->value.integer);
-                return object;
-            }
+            vl_put_word(vop, valuep->value.integer, 64, 0);
+            return object;
        } else if (valuep->format == vpiRealVal) {
            if (vop->varp()->vltype() == VLVT_REAL) {
-                QData val;
-                memcpy(&val, &valuep->value.real, sizeof(QData));
-                vl_put_word<QData>(vop, val);
+                *(reinterpret_cast<double*>(vop->varDatap())) = valuep->value.real;
                return object;
            }
        }
--- a/src/V3EmitCSyms.cpp
+++ b/src/V3EmitCSyms.cpp
@ -929,8 +929,6 @@ void EmitCSyms::emitSymImp() {
            checkSplit(true);
            AstScope* const scopep = it->second.m_scopep;
            AstVar* const varp = it->second.m_varp;
-            //
-            int pwidth = 1;
            int pdim = 0;
            int udim = 0;
            string bounds;
@ -944,12 +942,10 @@ void EmitCSyms::emitSymImp() {
                        bounds += cvtToStr(adtypep->left());
                        bounds += ",";
                        bounds += cvtToStr(adtypep->right());
-                        if (VN_IS(dtypep, PackArrayDType)) {
+                        if (VN_IS(dtypep, PackArrayDType))
                            pdim++;
-                            pwidth *= adtypep->elementsConst();
-                        } else {
+                        else
                            udim++;
-                        }
                        dtypep = adtypep->subDTypep();
                    } else {
                        if (basicp->isRanged()) {
@ -958,17 +954,12 @@ void EmitCSyms::emitSymImp() {
                            bounds += ",";
                            bounds += cvtToStr(basicp->lo());
                            pdim++;
-                            pwidth *= basicp->elements();
                        }
                        break;  // AstBasicDType - nothing below, 1
                    }
                }
            }

-            if (pdim == 0) {
-                pdim = 1;
-                bounds += " ,0,0";
-            }

            putns(scopep, protect("__Vscope_" + it->second.m_scopeName));
            putns(varp, ".varInsert(__Vfinal,");
--- a/test_regress/driver.py
+++ b/test_regress/driver.py
@ -1418,7 +1418,7 @@ class VlTest:
            cmd = [
                "echo q | " + run_env + VtOs.getenv_def('VERILATOR_MODELSIM', "vsim"),
                ' '.join(param['ms_run_flags']), ' '.join(param['all_run_flags']), pli_opt,
-                (" top")
+                (" t")
            ]
            self.run(cmd=cmd,
                     check_finished=param['check_finished'],
--- a/test_regress/t/t_vpi_multidim.cpp
+++ b/test_regress/t/t_vpi_multidim.cpp
@ -0,0 +1,447 @@
+// -*- mode: C++; c-file-style: "cc-mode" -*-
+//*************************************************************************
+//
+// Copyright 2024 by Wilson Snyder. This program is free software; you can
+// redistribute it and/or modify it under the terms of either the GNU
+// Lesser General Public License Version 3 or the Perl Artistic License
+// Version 2.0.
+// SPDX-License-Identifier: LGPL-3.0-only OR Artistic-2.0
+//
+//*************************************************************************
+
+#ifdef IS_VPI
+
+#include "vpi_user.h"
+
+#include <cstdlib>
+
+#else
+
+#include "verilated.h"
+#include "verilated_vcd_c.h"
+#include "verilated_vpi.h"
+
+#include "Vt_vpi_multidim.h"
+#include "Vt_vpi_multidim__Dpi.h"
+#include "svdpi.h"
+
+#endif
+
+#include <cinttypes>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <iostream>
+#include <limits>
+#include <random>
+#include <vector>
+
+// These require the above. Comment prevents clang-format moving them
+#include "TestCheck.h"
+#include "TestSimulator.h"
+#include "TestVpi.h"
+
+#define FILENM "t_vpi_multidim.cpp"
+
+int errors = 0;
+
+// TEST START
+
+// Check the VPI view of one test array: the type and size reported at each
+// indexing level, iteration by vpiRegArray, vpiReg and vpiRange, and the bounds
+// reported for the unpacked and the packed ranges.
+//
+// name      unrooted name of the array; resolved through TestSimulator::rooted()
+// wordSize  packed width in bits of one array word (two spans of wordSize / 2)
+void _arr_iter_check(const char* name, int wordSize) {
+    TestVpiHandle arr_h = vpi_handle_by_name((PLI_BYTE8*)TestSimulator::rooted(name), NULL);
+    TEST_CHECK_NZ(arr_h);
+
+    {
+        // variable is size-4 RegArray
+        int vpitype = vpi_get(vpiType, arr_h);
+        TEST_CHECK_EQ(vpitype, vpiRegArray);
+        int vpisize = vpi_get(vpiSize, arr_h);
+        TEST_CHECK_EQ(vpisize, 4);
+    }
+
+    {
+        // can't iterate through RegArrays on a nested RegArray
+        TestVpiHandle arr_iter_h = vpi_iterate(vpiRegArray, arr_h);
+        TEST_CHECK_Z(vpi_scan(arr_iter_h));
+        arr_iter_h.freed();
+    }
+
+    if (!TestSimulator::is_questa()) {
+        // but we can access them by index (Questa can't)
+        for (int idx = 0; idx < 2; idx++) {
+            TestVpiHandle arr_elem_h = vpi_handle_by_index(arr_h, idx);
+            TEST_CHECK_NZ(arr_elem_h);
+            {
+                // first indexing yields size-2 RegArrays
+                int vpitype = vpi_get(vpiType, arr_elem_h);
+                TEST_CHECK_EQ(vpitype, vpiRegArray);
+                int vpisize = vpi_get(vpiSize, arr_elem_h);
+                TEST_CHECK_EQ(vpisize, 2);
+            }
+            for (int idx2 = 0; idx2 < 2; idx2++) {
+                TestVpiHandle arr_elem2_h = vpi_handle_by_index(arr_elem_h, idx2);
+                TEST_CHECK_NZ(arr_elem2_h);
+                {
+                    // second indexing yields wordSize Regs
+                    int vpitype = vpi_get(vpiType, arr_elem2_h);
+                    TEST_CHECK_EQ(vpitype, vpiReg);
+                    int vpisize = vpi_get(vpiSize, arr_elem2_h);
+                    TEST_CHECK_EQ(vpisize, wordSize);
+                }
+            }
+        }
+    }
+
+    {
+        // it's also possible to directly iterate through all four Regs
+        TestVpiHandle arr_iter_h = vpi_iterate(vpiReg, arr_h);
+        for (int idx = 0; idx < 4; idx++) {
+            TestVpiHandle arr_elem_h = vpi_scan(arr_iter_h);
+            TEST_CHECK_NZ(arr_elem_h);
+            {
+                // which gives us wordSize Regs
+                int vpitype = vpi_get(vpiType, arr_elem_h);
+                TEST_CHECK_EQ(vpitype, vpiReg);
+                int vpisize = vpi_get(vpiSize, arr_elem_h);
+                TEST_CHECK_EQ(vpisize, wordSize);
+            }
+
+            {
+                // can't iterate through Regs on a nested Reg
+                TestVpiHandle arr_iter2_h = vpi_iterate(vpiReg, arr_elem_h);
+                TEST_CHECK_Z(vpi_scan(arr_iter2_h));
+                arr_iter2_h.freed();
+            }
+
+            // but we can access them by index
+            for (int idx2 = 0; idx2 < 2; idx2++) {
+                TestVpiHandle arr_elem2_h = vpi_handle_by_index(arr_elem_h, idx2);
+                TEST_CHECK_NZ(arr_elem2_h);
+                {
+                    // first indexing yields wordSize / 2 Regs
+                    int vpitype = vpi_get(vpiType, arr_elem2_h);
+                    TEST_CHECK_EQ(vpitype, vpiReg);
+                    int vpisize = vpi_get(vpiSize, arr_elem2_h);
+                    TEST_CHECK_EQ(vpisize, wordSize / 2);
+                }
+
+                for (int idx3 = 0; idx3 < wordSize / 2; idx3++) {
+                    TestVpiHandle arr_elem3_h = vpi_handle_by_index(arr_elem2_h, idx3);
+                    TEST_CHECK_NZ(arr_elem3_h);
+                    {
+                        // second indexing yields size-1 RegBits (no support for RegBit VPI type yet)
+                        // The type is still fetched, but its check stays disabled for now
+                        int vpitype = vpi_get(vpiType, arr_elem3_h);
+                        //TEST_CHECK_EQ(vpitype, vpiRegBit);
+                        int vpisize = vpi_get(vpiSize, arr_elem3_h);
+                        TEST_CHECK_EQ(vpisize, 1);
+                    }
+                }
+            }
+
+            // iterating through packed ranges
+            // (expected as [1:0] first, then [wordSize / 2 - 1:0])
+            TestVpiHandle range_iter_h = vpi_iterate(vpiRange, arr_elem_h);
+            for (int idx2 = 0; idx2 < 2; idx2++) {
+                TestVpiHandle range_h = vpi_scan(range_iter_h);
+                TEST_CHECK_NZ(range_h);
+                {
+                    s_vpi_value value;
+                    value.format = vpiIntVal;
+                    TestVpiHandle side_h = vpi_handle(vpiLeftRange, range_h);
+                    TEST_CHECK_NZ(side_h);
+                    vpi_get_value(side_h, &value);
+                    if (idx2 == 0) {
+                        TEST_CHECK_EQ(value.value.integer, 1);
+                    } else {
+                        TEST_CHECK_EQ(value.value.integer, wordSize / 2 - 1);
+                    }
+                    side_h = vpi_handle(vpiRightRange, range_h);
+                    TEST_CHECK_NZ(side_h);
+                    vpi_get_value(side_h, &value);
+                    TEST_CHECK_EQ(value.value.integer, 0);
+                }
+            }
+            // a third scan must report the end of the iteration
+            TEST_CHECK_Z(vpi_scan(range_iter_h));
+            range_iter_h.freed();
+        }
+        TEST_CHECK_Z(vpi_scan(arr_iter_h));
+        arr_iter_h.freed();
+    }
+
+    {
+        // iterating through unpacked ranges
+        // (both are expected to be [1:0])
+        TestVpiHandle range_iter_h = vpi_iterate(vpiRange, arr_h);
+        for (int idx = 0; idx < 2; idx++) {
+            TestVpiHandle range_h = vpi_scan(range_iter_h);
+            TEST_CHECK_NZ(range_h);
+            {
+                s_vpi_value value;
+                value.format = vpiIntVal;
+                TestVpiHandle side_h = vpi_handle(vpiLeftRange, range_h);
+                TEST_CHECK_NZ(side_h);
+                vpi_get_value(side_h, &value);
+                TEST_CHECK_EQ(value.value.integer, 1);
+                side_h = vpi_handle(vpiRightRange, range_h);
+                TEST_CHECK_NZ(side_h);
+                vpi_get_value(side_h, &value);
+                TEST_CHECK_EQ(value.value.integer, 0);
+            }
+        }
+        TEST_CHECK_Z(vpi_scan(range_iter_h));
+        range_iter_h.freed();
+    }
+}
+
+// Write both packed halves of reg_h through vpi_put_value() in the requested
+// value format, read each half back in that same format, and finally confirm
+// that the flattened register reads back as octVal_s.
+//
+// reg_h     handle of a packed register made of two spans of wordSize / 2 bits
+// wordSize  total packed width of reg_h in bits
+// octVal_s  expected value of the whole register as a zero-padded octal string
+// format    VPI value format to exercise (vpiIntVal, vpiVectorVal or a vpi*StrVal)
+void _arr_access_format_check(TestVpiHandle& reg_h, int wordSize, char* octVal_s,
+                              PLI_INT32 format) {
+    const int spanSize = wordSize / 2;
+    const int spanOctDigits = spanSize / 3;
+    s_vpi_value value_in;
+    s_vpi_value value_out;
+    s_vpi_error_info e;
+    char zero_s[2] = "0";
+
+    // zero out the vector
+    value_in.format = vpiOctStrVal;
+    value_in.value.str = zero_s;
+    vpi_put_value(reg_h, &value_in, NULL, vpiNoDelay);
+    TEST_CHECK_Z(vpi_chk_error(&e));
+
+    value_in.format = format;
+    value_out.format = format;
+
+    for (int i = 0; i < 2; i++) {
+        TestVpiHandle subreg_h = vpi_handle_by_index(reg_h, i);
+        TEST_CHECK_NZ(subreg_h);
+
+        // Sub-register 0 takes the trailing octal digits, sub-register 1 the leading ones.
+        // Vectors replace the variable-length arrays, which are not standard C++; they
+        // start zero-filled, so the copied digits are always NUL-terminated.
+        std::vector<char> octSpan(spanOctDigits + 1);
+        strncpy(octSpan.data(), &octVal_s[spanOctDigits * (1 - i)], spanOctDigits);
+
+        // Initialized so a failed conversion cannot leave an indeterminate value
+        uint64_t intVal = 0;
+        t_vpi_vecval vecVal[2];
+        sscanf(octSpan.data(), "%" SCNo64, &intVal);
+        std::vector<char> strVal(spanSize + 1);  // max length of the string happens for binary
+        char* const strVal_s = strVal.data();
+
+        if (format == vpiIntVal) {
+            value_in.value.integer = intVal;
+        } else if (format == vpiVectorVal) {
+            if (spanSize > 32) {
+                vecVal[1].aval = intVal >> 32;
+                vecVal[1].bval = 0;
+            }
+            vecVal[0].aval = intVal;
+            vecVal[0].bval = 0;
+            value_in.value.vector = vecVal;
+        } else if (format == vpiBinStrVal) {
+            for (int j = 0; j < spanSize; j++)
+                strVal_s[j] = (intVal >> (spanSize - j - 1)) % 2 + '0';
+            strVal_s[spanSize] = '\0';
+            value_in.value.str = strVal_s;
+        } else if (format == vpiDecStrVal) {
+            snprintf(strVal_s, strVal.size(), "%" PRIu64, intVal);
+            value_in.value.str = strVal_s;
+        } else if (format == vpiHexStrVal) {
+            snprintf(strVal_s, strVal.size(), "%0*" PRIx64, (spanSize + 3) / 4, intVal);
+            value_in.value.str = strVal_s;
+        } else if (format == vpiOctStrVal) {
+            // Zero-pad to the full digit count, as done for hex and binary, so the
+            // string compare below does not depend on the leading digit being nonzero
+            snprintf(strVal_s, strVal.size(), "%0*" PRIo64, (spanSize + 2) / 3, intVal);
+            value_in.value.str = strVal_s;
+        } else if (format == vpiStringVal) {
+            const int byteCount = (spanSize + 7) / 8;
+            for (int j = 0; j < byteCount; j++)
+                strVal_s[j] = (intVal >> (8 * (byteCount - j - 1))) & 0xff;
+            strVal_s[byteCount] = '\0';
+            value_in.value.str = strVal_s;
+        }
+
+        vpi_put_value(subreg_h, &value_in, NULL, vpiNoDelay);
+        TEST_CHECK_Z(vpi_chk_error(&e));
+
+        vpi_get_value(subreg_h, &value_out);
+        switch (format) {
+            case vpiIntVal:
+                TEST_CHECK_EQ(value_out.value.integer, value_in.value.integer);
+                break;
+            case vpiVectorVal:
+                if (spanSize > 32) {
+                    TEST_CHECK_EQ(value_out.value.vector[1].aval, value_in.value.vector[1].aval);
+                }
+                TEST_CHECK_EQ(value_out.value.vector[0].aval, value_in.value.vector[0].aval);
+                break;
+            case vpiStringVal:
+                // Only the first character is compared; a NUL written first is
+                // expected to read back as a space
+                TEST_CHECK_EQ(value_out.value.str[0], value_in.value.str[0] ? value_in.value.str[0] : ' ');
+                break;
+            case vpiBinStrVal:
+            case vpiDecStrVal:
+            case vpiHexStrVal:
+            case vpiOctStrVal:
+                TEST_CHECK_CSTR(value_out.value.str, value_in.value.str);
+                break;
+        }
+    }
+
+    // validate the resulting flattened vector
+    value_out.format = vpiOctStrVal;
+    vpi_get_value(reg_h, &value_out);
+    TEST_CHECK_CSTR(value_out.value.str, octVal_s);
+}
+
+// Generator shared by the access checks. It is default-constructed and never
+// seeded in this file, so every run draws the same sequence of values.
+std::default_random_engine rng;
+
+// Exercise value writes and reads on every packed word of the named array.
+//
+// A random value is built as a zero-padded octal string of wordSize / 3 digits,
+// written to each word, read back, and then re-written span by span in every
+// supported value format by _arr_access_format_check().
+//
+// name      unrooted name of the array; resolved through TestSimulator::rooted()
+// wordSize  packed width in bits of one array word; the digit arithmetic below
+//           assumes a multiple of 3 that is either below 64 or in 64..126
+void _arr_access_check(const char* name, int wordSize) {
+    TestVpiHandle arr_h = vpi_handle_by_name((PLI_BYTE8*)TestSimulator::rooted(name), NULL);
+    TEST_CHECK_NZ(arr_h);
+
+    std::uniform_int_distribution<uint64_t> rand64(
+        std::numeric_limits<uint64_t>::min(),
+        std::numeric_limits<uint64_t>::max()
+    );
+
+    // One octal digit per three bits plus the terminator. A vector replaces the
+    // variable-length array, which is not standard C++.
+    std::vector<char> octVal(wordSize / 3 + 1);
+    char* const octVal_s = octVal.data();
+
+    // fill octVal_s with random octal digits
+    if (wordSize < 64) {
+        snprintf(octVal_s, octVal.size(), "%0*" PRIo64, wordSize / 3,
+                 rand64(rng) % (1ULL << wordSize));
+    } else {
+        // The unmasked 64-bit value can print 22 digits; only the first 21 are
+        // kept, because the second write starts at offset 21 and overwrites the rest
+        snprintf(octVal_s, octVal.size(), "%0*" PRIo64, 63 / 3, rand64(rng));
+        snprintf(octVal_s + 63 / 3, octVal.size() - 63 / 3, "%0*" PRIo64, (wordSize - 63) / 3,
+                 rand64(rng) % (1ULL << (wordSize - 63)));
+    }
+
+    // Assume that reading/writing to the "flattened" packed register is already tested,
+    // check only reading/writing to sub-regs and validate the flattened result.
+    {
+        TestVpiHandle arr_iter_h = vpi_iterate(vpiReg, arr_h);
+        while (TestVpiHandle reg_h = vpi_scan(arr_iter_h)) {
+            s_vpi_value value_in;
+            s_vpi_value value_out;
+            s_vpi_error_info e;
+
+            value_out.format = vpiOctStrVal;
+            value_in.format = vpiOctStrVal;
+            value_in.value.str = octVal_s;
+            vpi_put_value(reg_h, &value_in, NULL, vpiNoDelay);
+            TEST_CHECK_Z(vpi_chk_error(&e));
+            vpi_get_value(reg_h, &value_out);
+            TEST_CHECK_CSTR(value_out.value.str, octVal_s);
+
+            // test each I/O data format
+            // (integer and decimal are only exercised for words of up to 64 bits)
+            if (wordSize <= 64) {
+                _arr_access_format_check(reg_h, wordSize, octVal_s, vpiIntVal);
+                _arr_access_format_check(reg_h, wordSize, octVal_s, vpiDecStrVal);
+            }
+            _arr_access_format_check(reg_h, wordSize, octVal_s, vpiVectorVal);
+            _arr_access_format_check(reg_h, wordSize, octVal_s, vpiBinStrVal);
+            _arr_access_format_check(reg_h, wordSize, octVal_s, vpiOctStrVal);
+            _arr_access_format_check(reg_h, wordSize, octVal_s, vpiHexStrVal);
+            _arr_access_format_check(reg_h, wordSize, octVal_s, vpiStringVal);
+        }
+        // the scan that ended the loop returned null, so the iterator is already gone
+        arr_iter_h.freed();
+    }
+}
+
+// One test case: an array to look up and the packed width of its words
+struct params {
+    const char* name;  // unrooted array name, as passed to the check functions
+    int wordSize;  // packed width of one array word, in bits
+};
+
+// Run the iteration check and then the access check on each test array, in
+// table order
+void _multidim_check() {
+    static const params cases[] = {
+        {"arr_cdata", 6},
+        {"arr_sdata", 12},
+        {"arr_idata", 30},
+        {"arr_qdata", 60},
+        {"arr_wdata", 126},
+    };
+    for (const params& testCase : cases) {
+        _arr_iter_check(testCase.name, testCase.wordSize);
+        _arr_access_check(testCase.name, testCase.wordSize);
+    }
+}
+
+// TEST END
+
+// Entry point called from the initial block of the test module: announces
+// itself, runs all multidimensional checks and returns the global error count
+extern "C" int mon_check() {
+    fputs("-mon_check()\n", stdout);
+    _multidim_check();
+    return errors;
+}
+
+#ifdef IS_VPI
+
+// calltf routine behind $mon_check: runs mon_check() and writes its result
+// back as the integer return value of the system function call
+static int mon_check_vpi() {
+    TestVpiHandle systf_h = vpi_handle(vpiSysTfCall, 0);
+
+    s_vpi_value result;
+    result.format = vpiIntVal;
+    result.value.integer = mon_check();
+
+    vpi_put_value(systf_h, &result, NULL, vpiNoDelay);
+    return 0;
+}
+
+// System task/function table walked by vpi_compat_bootstrap(): a single
+// $mon_check integer function bound to mon_check_vpi, followed by a zero entry
+// whose type of 0 marks the end of the table.
+static s_vpi_systf_data vpi_systf_data[] = {{vpiSysFunc, vpiIntFunc, (PLI_BYTE8*)"$mon_check",
+                                             (PLI_INT32(*)(PLI_BYTE8*))mon_check_vpi, 0, 0, 0},
+                                            0};
+
+// Register each entry of vpi_systf_data in order, stopping at the entry whose
+// type is zero
+void vpi_compat_bootstrap(void) {
+    for (p_vpi_systf_data entryp = &(vpi_systf_data[0]); entryp->type != 0; ++entryp) {
+        vpi_register_systf(entryp);
+    }
+}
+
+// Zero-terminated list of startup routines; presumably the table a VPI
+// simulator looks up and runs when it loads this library (confirm per simulator).
+void (*vlog_startup_routines[])() = {vpi_compat_bootstrap, 0};
+
+#else
+
+// Stand-alone Verilator harness: builds the model, toggles clk once per time
+// step until $finish is seen or sim_time is reached, and fails on a timeout.
+// Returns 0 when the simulation finished.
+int main(int argc, char** argv) {
+    const std::unique_ptr<VerilatedContext> contextp{new VerilatedContext};
+
+    uint64_t sim_time = 1100; // TODO
+    contextp->debug(0);
+    contextp->commandArgs(argc, argv);
+
+    const std::unique_ptr<VM_PREFIX> topp{new VM_PREFIX{contextp.get(),
+                                                        // Note null name - we're flattening it out
+                                                        ""}};
+
+#ifdef VERILATOR
+#ifdef TEST_VERBOSE
+    contextp->scopesDump();
+#endif
+#endif
+
+#if VM_TRACE
+    contextp->traceEverOn(true);
+    VL_PRINTF("Enabling waves...\n");
+    VerilatedVcdC* tfp = new VerilatedVcdC;
+    topp->trace(tfp, 99);
+    tfp->open(STRINGIFY(TEST_OBJ_DIR) "/simx.vcd");
+#endif
+
+    topp->eval();
+    topp->clk = 0;
+    contextp->timeInc(10);
+
+    while (contextp->time() < sim_time && !contextp->gotFinish()) {
+        contextp->timeInc(1);
+        topp->eval();
+        VerilatedVpi::callValueCbs();
+        topp->clk = !topp->clk;
+        // mon_do();
+#if VM_TRACE
+        // Time comes from the context; this file declares no main_time variable
+        if (tfp) tfp->dump(contextp->time());
+#endif
+    }
+    if (!contextp->gotFinish()) {
+        vl_fatal(__FILE__, __LINE__, "main", "%Error: Timeout; never got a $finish");
+    }
+    topp->final();
+
+#if VM_TRACE
+    if (tfp) tfp->close();
+#endif
+
+    return 0;
+}
+
+#endif
--- a/test_regress/t/t_vpi_multidim.py
+++ b/test_regress/t/t_vpi_multidim.py
@ -0,0 +1,22 @@
+#!/usr/bin/env python3
+# DESCRIPTION: Verilator: Verilog Test driver/expect definition
+#
+# Copyright 2024 by Wilson Snyder. This program is free software; you
+# can redistribute it and/or modify it under the terms of either the GNU
+# Lesser General Public License Version 3 or the Perl Artistic License
+# Version 2.0.
+# SPDX-License-Identifier: LGPL-3.0-only OR Artistic-2.0
+
+import vltest_bootstrap
+
+test.scenarios('simulator')
+
+# Build with the test's own C++ file (test.pli_filename) instead of a generated
+# main or top shell. --vpi and --public-flat-rw are passed to Verilator,
+# presumably so the arrays are reachable and writable through VPI.
+test.compile(make_top_shell=False,
+             make_main=False,
+             make_pli=True,
+             v_flags2=["+define+USE_VPI_NOT_DPI"],
+             verilator_flags2=["--exe --vpi --no-l2name --public-flat-rw", test.pli_filename])
+
+# Run the simulation with the VPI library enabled
+test.execute(use_libvpi=True)
+
+test.passes()
--- a/test_regress/t/t_vpi_multidim.v
+++ b/test_regress/t/t_vpi_multidim.v
@ -0,0 +1,44 @@
+// DESCRIPTION: Verilator: Verilog Test module
+//
+// Copyright 2024 by Wilson Snyder. This program is free software; you can
+// redistribute it and/or modify it under the terms of either the GNU
+// Lesser General Public License Version 3 or the Perl Artistic License
+// Version 2.0.
+// SPDX-License-Identifier: LGPL-3.0-only OR Artistic-2.0
+
+// Test bench for multidimensional VPI access: declares the arrays inspected by
+// the C++ side and calls mon_check() once from an initial block.
+module t (/*AUTOARG*/
+    // Inputs
+   clk
+   ); /*verilator public_module*/
+
+`ifdef VERILATOR
+`systemc_header
+extern "C" int mon_check();
+`verilog
+`endif
+
+   input clk;
+
+   // Each array is 2x2 unpacked words, each word being two packed spans.
+   // The cdata/sdata/idata/qdata/wdata names presumably refer to the storage
+   // class Verilator picks for each word width (confirm).
+   logic [1:0][2:0] arr_cdata [1:0][1:0]; // 2x3 (6) bit words
+   logic [1:0][5:0] arr_sdata [1:0][1:0]; // 2x6 (12) bit words
+   logic [1:0][14:0] arr_idata [1:0][1:0]; // 2x15 (30) bit words
+   logic [1:0][29:0] arr_qdata [1:0][1:0]; // 2x30 (60) bit words
+   logic [1:0][62:0] arr_wdata [1:0][1:0]; // 2x63 (126) bit words
+
+   // Value returned by the C++ checks; nonzero means failure
+   int status;
+
+   initial begin
+`ifdef VERILATOR
+      // Verilator calls the C function directly
+      status = $c32("mon_check()");
+`else
+      // other simulators go through the $mon_check system function
+      status = $mon_check();
+`endif
+      if (status!=0) begin
+         $write("%%Error: t_vpi_multidim.cpp:%0d: C Test failed\n", status);
+         $stop;
+      end
+      $write("*-* All Finished *-*\n");
+      $finish;
+   end
+
+endmodule : t