1#!/usr/bin/env python
2import getopt
3import sys
4import os
5import time
6from threading import Thread
7
8sys.path.append('.')
9sys.path.append('lib')
10from remote.remote_util import RemoteMachineShellConnection
11
12from TestInput import TestInputParser
13
14
def usage(error=None):
    """Print the command-line help text and terminate the process.

    :param error: optional value handed straight to ``sys.exit``. ``None``
        (the default) exits with status 0; a string is written to stderr
        by the interpreter and the process exits with status 1.
    :raises SystemExit: always.
    """
    # A single parenthesised argument prints identically with the Python 2
    # print statement and the Python 3 print function.
    print("""\
Syntax: getcoredumps.py [options]

Options
 -h               Show this help text and exit.
 -i <file>        Path to .ini file containing cluster information.
 -p <key=val,...> Comma-separated key=value info.

Available keys:
 path=<dir_path>  Local directory the collected crash files are downloaded to

Example:
 getcoredumps.py -i cluster.ini -p path=/tmp/nosql
""")
    sys.exit(error)
30
31
class Getcoredumps(object):
    """Download crash artefacts from one remote server to a local directory.

    On non-Windows hosts the collector looks for ``erl_crash*`` and ``core*``
    files under ``/opt/<type>/var/lib/<type>/``, ``core*`` files under
    ``/tmp/`` and ``*.dmp`` files under ``/opt/<type>/var/lib/<type>/``,
    where ``<type>`` is ``couchbase`` when the remote reports couchbase as
    installed and ``membase`` otherwise.
    """

    def __init__(self, server, path):
        """
        :param server: server description passed to
            ``RemoteMachineShellConnection``; its ``ip`` attribute is used
            when naming the downloaded files.
        :param path: local directory the downloaded files are written to.
        """
        self.server = server
        self.path = path

    def run(self):
        """Collect crashes from the server and archive them remotely.

        :return: ``True`` when at least one file was downloaded and the
            remote crash files were moved to ``/tmp/backup_crash/<stamp>``;
            ``False`` otherwise (nothing downloaded, Windows host, or an
            error occurred during collection).
        """
        remote = RemoteMachineShellConnection(self.server)
        try:
            server_type = 'membase'
            if remote.is_couchbase_installed():
                server_type = 'couchbase'
            stamp = time.strftime("%d_%m_%Y_%H_%M")
            try:
                return self._collect(remote, server_type, stamp)
            except Exception as ex:
                # Best effort: a failure on one node is reported, not raised,
                # so the caller can carry on with the remaining nodes.
                print(ex)
                return False
        finally:
            # Release the connection on every path, including the Windows
            # and error paths which previously left it open.
            remote.disconnect()

    def _collect(self, remote, server_type, stamp):
        """Find, download and back up crash files; return True if any were downloaded."""
        info = remote.extract_remote_info()
        if info.type.lower() == 'windows':
            # Windows hosts are not handled by this collector.
            return False

        print("looking for crashes on {0} ... ".format(info.ip))
        core_files = self._find_crash_files(remote, server_type)
        if core_files:
            print("found crashes on {0}: {1}".format(info.ip, core_files))
        else:
            print("crashes not found on {0}".format(info.ip))

        # The counter only advances on success, so local file names stay
        # consecutive even when some downloads fail.
        downloaded = 0
        for core_file in core_files:
            if self._download(remote, server_type, core_file, downloaded):
                downloaded += 1

        if downloaded == 0:
            return False
        self._backup_remote_crashes(remote, server_type, stamp)
        return True

    def _find_crash_files(self, remote, server_type):
        """Return the remote paths of every crash file found on the host."""
        data_dir = "/opt/{0}/var/lib/{0}/".format(server_type)
        found = []
        print("erl_crash files under {0}".format(data_dir))
        found.extend(remote.file_starts_with(data_dir, "erl_crash"))
        print("core* files under {0}".format(data_dir))
        found.extend(remote.file_starts_with(data_dir, "core"))
        print("core* files under /tmp/")
        found.extend(remote.file_starts_with("/tmp/", "core"))
        # NOTE(review): the backup step also moves <data_dir>crash/*.dmp, but
        # that directory is not searched here -- confirm whether breakpad
        # dumps stored there should be downloaded as well.
        print("breakpad *dmp files under {0}".format(data_dir))
        found.extend(remote.file_ends_with(data_dir, ".dmp"))
        return found

    def _download(self, remote, server_type, core_file, index):
        """Fetch one crash file (or its backtrace); return True on success."""
        if 'erl_crash.dump' in core_file:
            # Erlang crash dumps are copied back unchanged.
            local_name = "erlang-{0}-{1}.log".format(self.server.ip, index)
            source = core_file
            message = 'downloaded core file : {0}'
        elif '.dmp' in core_file:
            local_name = "breakpad-{0}-{1}.dmp".format(self.server.ip, index)
            source = core_file
            message = 'downloaded breakpad .dmp file : {0}'
        else:
            # Anything else is passed to cbanalyze-core on the remote host;
            # the file named after -f is what gets downloaded, not the core.
            local_name = "core-{0}-{1}.log".format(self.server.ip, index)
            source = "/tmp/{0}".format(local_name)
            analyzer = "/opt/{0}/bin/tools/cbanalyze-core".format(server_type)
            output, _ = remote.execute_command(
                '{0} {1} -f {2}'.format(analyzer, core_file, source))
            print(output)
            message = 'downloaded core backtrace : {0}'

        remote_path, file_name = os.path.split(source)
        if remote.get_file(remote_path, file_name, os.path.join(self.path, local_name)):
            print(message.format(source))
            return True
        return False

    def _backup_remote_crashes(self, remote, server_type, stamp):
        """Move the remote crash files into /tmp/backup_crash/<stamp> and list it."""
        command = "mkdir -p /tmp/backup_crash/{0};" \
                  "mv -f /tmp/core* /tmp/backup_crash/{0};" \
                  "mv -f /opt/{1}/var/lib/{1}/erl_crash.dump* /tmp/backup_crash/{0}; " \
                  "mv -f /opt/{1}/var/lib/{1}/*.dmp /tmp/backup_crash/{0};" \
                  "mv -f /opt/{1}/var/lib/{1}/crash/*.dmp /tmp/backup_crash/{0};".\
            format(stamp, server_type)
        print("put all crashes on {0} in backup folder: /tmp/backup_crash/{1}".format(self.server.ip, stamp))
        remote.execute_command(command)
        output, _ = remote.execute_command("ls -la /tmp/backup_crash/{0}".format(stamp))
        for line in output:
            print(line)
105
106
class Clearcoredumps(object):
    """Move crash files on one remote server into a backup folder and delete leftovers.

    On non-Windows hosts the cleaner looks for ``erl_crash*`` and ``core*``
    files under ``/opt/<type>/var/lib/<type>/``, ``core*`` files under
    ``/tmp/`` and ``*.dmp`` files under ``/opt/<type>/var/lib/<type>/crash``,
    where ``<type>`` is ``couchbase`` when the remote reports couchbase as
    installed and ``membase`` otherwise.
    """

    def __init__(self, server, path):
        """
        :param server: server description passed to
            ``RemoteMachineShellConnection``; its ``ip`` attribute is used
            in progress messages.
        :param path: stored on the instance; not read by ``run``.
        """
        self.server = server
        self.path = path

    def run(self):
        """Back up and remove crash files on the server.

        Errors raised while clearing are printed rather than propagated.
        """
        remote = RemoteMachineShellConnection(self.server)
        try:
            server_type = 'membase'
            if remote.is_couchbase_installed():
                server_type = 'couchbase'
            stamp = time.strftime("%d_%m_%Y_%H_%M")
            try:
                self._clear(remote, server_type, stamp)
            except Exception as ex:
                # Best effort: report the failure and let the caller continue.
                print(ex)
        finally:
            # Release the connection on every path, including the Windows
            # and error paths which previously left it open.
            remote.disconnect()

    def _clear(self, remote, server_type, stamp):
        """Locate crash files, move them to the backup folder, delete the rest."""
        info = remote.extract_remote_info()
        if info.type.lower() == 'windows':
            # Windows hosts are not handled by this cleaner.
            return

        data_dir = "/opt/{0}/var/lib/{0}/".format(server_type)
        print("looking for Erlang/Memcached crashes on {0} ... ".format(info.ip))
        core_files = []
        core_files.extend(remote.file_starts_with(data_dir, "erl_crash"))
        core_files.extend(remote.file_starts_with(data_dir, "core"))
        core_files.extend(remote.file_starts_with("/tmp/", "core"))
        core_files.extend(remote.file_ends_with("/opt/{0}/var/lib/{0}/crash".format(server_type), ".dmp"))

        if not core_files:
            print("dump files not found on {0}".format(info.ip))
            return

        print("found dumps on {0}: {1}".format(info.ip, core_files))
        command = "mkdir -p /tmp/backup_crash/{0};" \
                  "mv -f /tmp/core* /tmp/backup_crash/{0};" \
                  "mv -f /opt/{1}/var/lib/{1}/erl_crash.dump* /tmp/backup_crash/{0}; " \
                  "mv -f /opt/{1}/var/lib/{1}/crash/*.dmp /tmp/backup_crash/{0};".\
            format(stamp, server_type)
        print("Moved all dumps on {0} to backup folder: /tmp/backup_crash/{1}".format(self.server.ip, stamp))
        remote.execute_command(command)
        output, _ = remote.execute_command("ls -la /tmp/backup_crash/{0}".format(stamp))
        for line in output:
            print(line)
        # The mv commands above do not cover core* files in the data
        # directory, so every path found earlier is also deleted explicitly.
        for core_file in core_files:
            remote_path, file_name = os.path.split(core_file)
            if remote.delete_file(remote_path, file_name):
                print('deleted core file : {0}'.format(core_file))
150
def main():
    """Parse the command line and collect core dumps from every server.

    Servers are processed one after another: each collector runs in a
    daemon thread that is polled every 15 seconds for up to 20 minutes
    before moving on to the next server.

    :raises Exception: if a collector thread is still running after the
        final 120-second join.
    """
    poll_seconds = 15
    timeout_seconds = 1200

    try:
        # '-p' takes a value (key=val,...), so it needs the trailing colon.
        opts, _ = getopt.getopt(sys.argv[1:], 'hi:p:', [])
        for opt, _ in opts:
            if opt == "-h":
                usage()

        test_input = TestInputParser.get_test_input(sys.argv)
        if not test_input.servers:
            usage("ERROR: no servers specified. Please use the -i parameter.")
    except IndexError:
        usage()
    except getopt.GetoptError as error:
        usage("ERROR: " + str(error))

    # usage() always exits, so test_input is bound whenever we get here.
    file_path = test_input.param("path", ".")
    collectors = [Getcoredumps(server, file_path) for server in test_input.servers]
    remote_threads = [Thread(target=collector.run) for collector in collectors]

    for remote_thread in remote_threads:
        # Daemon threads cannot keep the process alive if a node hangs.
        remote_thread.daemon = True
        remote_thread.start()
        run_time = 0
        while remote_thread.is_alive() and run_time < timeout_seconds:
            time.sleep(poll_seconds)
            run_time += poll_seconds
            print("Waiting for another 15 seconds (time-out after 20 min)")
        # Test the thread itself rather than the elapsed time: a collector
        # that finishes during the last poll has not hung.
        if remote_thread.is_alive():
            print("collect core dumps hung on this node. Jumping to next node")
        else:
            print("collect core dumps info done")

    # Give any straggler a last chance, then fail loudly.
    for remote_thread in remote_threads:
        remote_thread.join(120)
        if remote_thread.is_alive():
            raise Exception("collect core dumps hung on remote node")
186
# Run the collector only when the file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()