add meminfo and grad warning

2020-05-11 13:26:23 +08:00 · 2020-05-11 13:26:23 +08:00 · 1c19d5837d
parent c657491a51
commit 1c19d5837d
8 changed files with 175 additions and 102 deletions
--- a/src/grad.cc
+++ b/src/grad.cc
@ -122,7 +122,7 @@ vector<VarPtr> grad(Var* loss, vector<Var*> targets) {
        if (var->tflag == nt)
            grad = move(grads[var->custom_data]);
        if (!grad) {
-            LOGvvv << var << "grads[">>i>>"] set to zero";
+            LOGw << "grads[">>i>>"] doesn't have gradient. It will be set to zero:" << var;
            grad = make_number(0.f, var);
            assign_attrs(grad.ptr, var);
            registe_node_trace_grad(grad.ptr, var, 0);
--- a/src/mem/allocator.cc
+++ b/src/mem/allocator.cc
@ -4,12 +4,6 @@
 // file 'LICENSE.txt', which is part of this source code package.
 // ***************************************************************
 #include <typeinfo>
-#include <iomanip>
-#include <sys/sysinfo.h>
-
-#include "var.h"
-#include "op.h"
-#include "var_holder.h"
 #include "misc/cuda_flags.h"

 #include "mem/allocator/aligned_allocator.h"
@ -92,67 +86,4 @@ void gc_all() {
    for (auto& kv : allocators) kv.second->gc();
 }

-struct FloatOutput {
-    double value;
-    string scale;
-    int base;
-    string suffix;
-    int p=4;
-};
-
-std::ostream& operator<<(std::ostream& os, const FloatOutput& o) {
-    int w = 8;
-    os << std::setw(w-2-o.suffix.size());
-    os << std::setprecision(o.p);
-    uint i=0;
-    double k = o.value;
-    for (; i+1<o.scale.size(); i++) {
-        if (k<o.base) break;
-        k /= o.base;
-    }
-    os << k << o.scale[i];
-    return os << o.suffix;
-}
-
-void display_memory_info(const char* fileline) {
-    int p = 2;
-    Log log(fileline, 'i', 0);
-    log << "\n=== display_memory_info ===\n";
-    log << "hold_vars:" << VarHolder::hold_vars.size()
-        << "lived_vars:" << Var::number_of_lived_vars
-        << "lived_ops:" << Op::number_of_lived_ops >> '\n';
-    if (use_stat_allocator) {
-        log << "stat:" << use_stat_allocator;
-        log << "total alloc:" << FloatOutput{(double)(stat_allocator_total_alloc_byte 
-                        - stat_allocator_total_free_byte), " KMG", 1024, "B"};
-        log << "total alloc call:" << FloatOutput{(double)(stat_allocator_total_alloc_call 
-                        - stat_allocator_total_free_call), " KMG", 1000, ""} >> '\n';
-    }
-    for (auto& a : SFRLAllocator::sfrl_allocators) {
-        auto total = a->used_memory + a->unused_memory;
-        log << "name:" << a->name() << "is_cuda:" << a->is_cuda()
-            << "used:" << FloatOutput{(double)a->used_memory, " KMG", 1024, "B"}
-                >> "(" >> std::setprecision(p) >> a->used_memory*100.0 / total >> "%)"
-            << "unused:" << FloatOutput{(double)a->unused_memory, " KMG", 1024, "B"} 
-                >> "(" >> std::setprecision(p) >> a->unused_memory*100.0 / total >> "%)"
-            << "total:" << FloatOutput{(double)total, " KMG", 1024, "B"} >> "\n";
-    }
-    log >> "===========================\n";
-    log.end();
-}
-
-MemInfo::MemInfo() {
-    struct sysinfo info = {0};
-    sysinfo(&info);
-    total_cpu_ram = info.totalram;
-    total_cuda_ram = 0;
-#ifdef HAS_CUDA
-    cudaDeviceProp prop = {0};
-    cudaGetDeviceProperties(&prop, 0);
-    total_cuda_ram = prop.totalGlobalMem;
-#endif
-}
-
-MemInfo mem_info;
-
 } // jittor
--- a/src/mem/allocator.h
+++ b/src/mem/allocator.h
@ -5,6 +5,7 @@
 // ***************************************************************
 #pragma once
 #include "common.h"
+#include "mem/mem_info.h"

 namespace jittor {

@ -51,24 +52,4 @@ Allocator* get_allocator();
 // @pyjt(gc)
 void gc_all();

-// @pyjt(display_memory_info)
-void display_memory_info(const char* fileline="");
-
-// @pyjt(MemInfo)
-struct MemInfo {
-    // @pyjt(total_cpu_ram)
-    int64 total_cpu_ram;
-    // @pyjt(total_cuda_ram)
-    int64 total_cuda_ram;
-
-    inline MemInfo(const MemInfo&) = default;
-
-    MemInfo();
-};
-
-extern MemInfo mem_info;
-
-// @pyjt(get_mem_info)
-inline MemInfo get_mem_info() { return mem_info; }
-
 } // jittor
--- a/src/mem/mem_info.cc
+++ b/src/mem/mem_info.cc
@ -0,0 +1,110 @@
+// ***************************************************************
+// Copyright (c) 2020 Jittor. Authors: Dun Liang <randonlang@gmail.com>. All Rights Reserved.
+// This file is subject to the terms and conditions defined in
+// file 'LICENSE.txt', which is part of this source code package.
+// ***************************************************************
+#include <iomanip>
+#include <algorithm>
+#include <sys/sysinfo.h>
+
+#include "var.h"
+#include "op.h"
+#include "var_holder.h"
+#include "graph.h"
+#include "misc/cuda_flags.h"
+#include "mem/allocator/sfrl_allocator.h"
+#include "mem/allocator/stat_allocator.h"
+#include "mem/mem_info.h"
+
+namespace jittor {
+
+// A number bundled with the hints operator<< (below) uses to print it in
+// scaled, human-readable form.
+struct FloatOutput {
+    // raw quantity to print
+    double value;
+    // one character per magnitude step, smallest first (callers pass " KMG")
+    string scale;
+    // divisor applied when moving to the next scale character
+    int base;
+    // text written after the scale character (callers pass "B" or "")
+    string suffix;
+    // precision handed to std::setprecision
+    int p=4;
+};
+
+// Print `o.value` divided by `o.base` as many times as needed to drop below
+// `o.base` (bounded by the number of characters in `o.scale`), followed by
+// the matching scale character and the suffix.
+std::ostream& operator<<(std::ostream& os, const FloatOutput& o) {
+    const int field_width = 8;
+    os << std::setw(field_width-2-o.suffix.size()) << std::setprecision(o.p);
+    double scaled = o.value;
+    uint step = 0;
+    // stop at the last scale character even if the value is still >= base
+    while (step+1<o.scale.size() && !(scaled<o.base)) {
+        scaled /= o.base;
+        step++;
+    }
+    return os << scaled << o.scale[step] << o.suffix;
+}
+
+// Write a snapshot of memory statistics to the log: total CPU/CUDA RAM,
+// the number of held vars and lived vars/ops, the stat allocator counters
+// (only when use_stat_allocator is set) and the used/unused/total memory of
+// every SFRL allocator. With NODE_MEMCHECK defined, up to ten nodes with the
+// smallest __id() collected from the held vars are listed as well.
+// @param fileline passed to the Log constructor
+void display_memory_info(const char* fileline) {
+    int p = 3;
+    // Share of `part` in `whole` in percent. Yields 0 for an allocator that
+    // holds no memory (whole == 0) instead of dividing by zero and printing
+    // "nan%".
+    auto percent = [](double part, double whole) {
+        return whole ? part*100.0 / whole : 0.0;
+    };
+    Log log(fileline, 'i', 0);
+    log << "\n=== display_memory_info ===\n";
+    log << "total_cpu_ram:" <<
+        FloatOutput{(double)mem_info.total_cpu_ram, " KMG", 1024, "B"};
+    log << "total_cuda_ram:" <<
+        FloatOutput{(double)mem_info.total_cuda_ram, " KMG", 1024, "B"} >> "\n";
+    log << "hold_vars:" << VarHolder::hold_vars.size()
+        << "lived_vars:" << Var::number_of_lived_vars
+        << "lived_ops:" << Op::number_of_lived_ops >> '\n';
+
+    #ifdef NODE_MEMCHECK
+    // get the oldest var
+    vector<Node*> queue;
+    // a fresh tflag value marks the vars already queued in this pass
+    auto t = ++Node::tflag_count;
+    for (auto& vh : VarHolder::hold_vars)
+        if (vh->var->tflag != t) {
+            vh->var->tflag = t;
+            queue.push_back(vh->var);
+        }
+    // NOTE(review): bfs_both presumably extends `queue` with the nodes
+    // connected to the held vars in both directions -- confirm in graph.h
+    bfs_both(queue, [](Node*){return true;});
+    vector<pair<int64, Node*>> nodes;
+    nodes.reserve(queue.size());
+    for (auto* node : queue)
+        nodes.push_back({node->__id(), node});
+    // ascending by id, so the smallest ids come first
+    std::sort(nodes.begin(), nodes.end());
+    log << "list of the oldest nodes:\n";
+    const size_t shown = std::min<size_t>(10, nodes.size());
+    for (size_t i=0; i<shown; i++) {
+        log << "ID#" >> nodes[i].first >> ":" << nodes[i].second << "\n";
+    }
+    #endif
+
+    if (use_stat_allocator) {
+        log << "stat:" << use_stat_allocator;
+        log << "total alloc:" << FloatOutput{(double)(stat_allocator_total_alloc_byte
+                        - stat_allocator_total_free_byte), " KMG", 1024, "B"};
+        log << "total alloc call:" << FloatOutput{(double)(stat_allocator_total_alloc_call
+                        - stat_allocator_total_free_call), " KMG", 1000, ""} >> '\n';
+    }
+    for (auto& a : SFRLAllocator::sfrl_allocators) {
+        auto total = a->used_memory + a->unused_memory;
+        log << "name:" << a->name() << "is_cuda:" << a->is_cuda()
+            << "used:" << FloatOutput{(double)a->used_memory, " KMG", 1024, "B"}
+                >> "(" >> std::setprecision(p) >> percent(a->used_memory, total) >> "%)"
+            << "unused:" << FloatOutput{(double)a->unused_memory, " KMG", 1024, "B"}
+                >> "(" >> std::setprecision(p) >> percent(a->unused_memory, total) >> "%)"
+            << "total:" << FloatOutput{(double)total, " KMG", 1024, "B"} >> "\n";
+    }
+    log >> "===========================\n";
+    log.end();
+}
+
+// Fill in the total CPU RAM (from sysinfo) and, when built with HAS_CUDA,
+// the total global memory of CUDA device 0. A field stays 0 when its query
+// fails.
+MemInfo::MemInfo() {
+    total_cpu_ram = 0;
+    total_cuda_ram = 0;
+    struct sysinfo info = {0};
+    if (sysinfo(&info) == 0) {
+        // sysinfo reports memory sizes in units of mem_unit bytes, so the
+        // byte count is totalram * mem_unit; treat a zero unit as 1
+        int64 unit = info.mem_unit ? info.mem_unit : 1;
+        total_cpu_ram = (int64)info.totalram * unit;
+    }
+#ifdef HAS_CUDA
+    cudaDeviceProp prop = {0};
+    // only trust the properties when the query succeeded
+    if (cudaGetDeviceProperties(&prop, 0) == cudaSuccess)
+        total_cuda_ram = prop.totalGlobalMem;
+#endif
+}
+
+MemInfo mem_info;
+
+} // jittor
--- a/src/mem/mem_info.h
+++ b/src/mem/mem_info.h
@ -0,0 +1,31 @@
+// ***************************************************************
+// Copyright (c) 2020 Jittor. Authors: Dun Liang <randonlang@gmail.com>. All Rights Reserved.
+// This file is subject to the terms and conditions defined in
+// file 'LICENSE.txt', which is part of this source code package.
+// ***************************************************************
+#pragma once
+#include "common.h"
+
+namespace jittor {
+
+// Log a report of current memory statistics; `fileline` is forwarded to the
+// Log object that emits the report (implemented in mem_info.cc).
+// @pyjt(display_memory_info)
+void display_memory_info(const char* fileline="");
+
+// Total memory sizes, captured when a MemInfo is default-constructed.
+// @pyjt(MemInfo)
+struct MemInfo {
+    // total CPU RAM as reported by sysinfo()
+    // @pyjt(total_cpu_ram)
+    int64 total_cpu_ram;
+    // total global memory of CUDA device 0; 0 when built without HAS_CUDA
+    // @pyjt(total_cuda_ram)
+    int64 total_cuda_ram;
+
+    inline MemInfo(const MemInfo&) = default;
+
+    // queries the system (and CUDA device 0 under HAS_CUDA) to fill the fields
+    MemInfo();
+};
+
+// global instance, defined in mem_info.cc
+extern MemInfo mem_info;
+
+// returns a copy of the global mem_info
+// @pyjt(get_mem_info)
+inline MemInfo get_mem_info() { return mem_info; }
+
+} // jittor
--- a/src/pybind/py_var_tracer.cc
+++ b/src/pybind/py_var_tracer.cc
@ -15,23 +15,33 @@ using namespace pybind11::literals;

 namespace jittor {

-DEFINE_FLAG(int, trace_py_var, 0, "Trace py stack for debug.");
+DEFINE_FLAG(int, trace_py_var, 0, "Trace py stack max depth for debug.");

 unordered_map<const Node*, string> trace_data;

 void __registe_node_trace(Node* node) {
+    // Ask Python's traceback.extract_stack for up to trace_py_var frames of
+    // the current stack and store their description in trace_data[node].
-    auto py_stack = 
+    auto py_stacks = 
        py::module::import("traceback")
-        .attr("extract_stack")(nullptr, 1).attr("__getitem__")(0);
-    auto filename = py_stack.attr("filename").cast<string>();
-    auto basename = split(filename, "/").back();
-    basename += ':';
-    basename +=  py_stack.attr("name").cast<string>();
-    basename += ':';
-    basename +=  S(py_stack.attr("lineno").cast<int>());
-    basename += ':';
-    basename +=  py_stack.attr("line").cast<string>();
-    trace_data[node] = basename;
+        .attr("extract_stack")(nullptr, trace_py_var);
+    auto len = py_stacks.attr("__len__")().cast<int>();
+    string info;
+    for (int i=0; i<len; i++) {
+        auto py_stack = py_stacks.attr("__getitem__")(i);
+        auto filename = py_stack.attr("filename").cast<string>();
+        // a lone frame is shown by file base name only; with several frames
+        // each one starts on its own indented line and keeps the full path
+        if (len==1)
+            info += split(filename, "/").back();
+        else {
+            info += "\n        ";
+            info += filename;
+        }
+        // every frame is rendered as file:function:lineno:source-line
+        info += ':';
+        info +=  py_stack.attr("name").cast<string>();
+        info += ':';
+        info +=  S(py_stack.attr("lineno").cast<int>());
+        info += ':';
+        info +=  py_stack.attr("line").cast<string>();
+    }
+    trace_data[node] = info;
 }

 void __unregiste_node_trace(Node* node) {
--- a/src/var_holder.cc
+++ b/src/var_holder.cc
@ -3,6 +3,7 @@
 // This file is subject to the terms and conditions defined in
 // file 'LICENSE.txt', which is part of this source code package.
 // ***************************************************************
+#include <sstream>
 #ifdef HAS_CUDA
 #include <cuda_runtime.h>
 #include <helper_cuda.h>
@ -122,4 +123,10 @@ vector<ArrayArgs> fetch_sync(const vector<VarHolder*>& vh) {
    return ret;
 }

+// Stream the held var into a string buffer and return the resulting text.
+string VarHolder::debug_msg() {
+    std::ostringstream buffer;
+    buffer << var;
+    return buffer.str();
+}
+
 } // jittor
--- a/src/var_holder.h
+++ b/src/var_holder.h
@ -154,6 +154,9 @@ struct VarHolder {
        #endif
        std::memcpy(var->mem_ptr, array.ptr, size);
    }
+
+    // Returns the text produced by streaming the held `var` to an ostream
+    // (implemented in var_holder.cc).
+    // @pyjt(debug_msg)
+    string debug_msg();
 };

 // @pyjt(sync)