#!/usr/bin/env python3
# -*- python -*-
#
# @author Couchbase <info@couchbase.com>
# @copyright 2011-2019 Couchbase, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import sys
import tempfile
import time
import subprocess
import re
import platform
import glob
import socket
import threading
import optparse
import atexit
import signal
import urllib.parse
import shutil
import errno
import hashlib
import uuid
import codecs
from datetime import datetime, timedelta, tzinfo
from io import BytesIO, StringIO


# This is a facility that provides functionality similar to atexit from the
# standard library. We don't use the latter for the following reasons.
#
# When cbcollect_info is started with the --watch-stdin flag, we start a
# thread monitoring stdin that terminates the process when stdin gets
# closed. The issues are several:
#
#  - sys.exit() can only be called from the main thread.
#
#  - os._exit() doesn't invoke any of the cleanup functions registered by
#    atexit.
#
#  - It's possible for the stdin watcher thread to interrupt the main thread
#    by calling _thread.interrupt_main(). This plays nicely with atexit. But
#    the issue is that the thread can't always be interrupted, so it can
#    take a noticeable amount of time for the main thread to terminate.
#
# AltExitC is a solution to these issues. It terminates the process as soon
# as possible by calling os._exit(). The price is that the cleanup actions
# need to be registered with AltExitC, and synchronization is a concern.
class AltExitC(object):
    def __init__(self):
        self.list = []
        self.lock = threading.Lock()
        atexit.register(self.at_exit_handler)

    def register(self, f):
        self.lock.acquire()
        self.register_and_unlock(f)

    def register_and_unlock(self, f):
        try:
            self.list.append(f)
        finally:
            self.lock.release()

    def at_exit_handler(self):
        self.lock.acquire()
        self.list.reverse()
        for f in self.list:
            try:
                f()
            except BaseException:
                # Continue exit handling in spite of any exceptions
                pass

    def exit(self, status):
        self.at_exit_handler()
        os._exit(status)


AltExit = AltExitC()

# Currently we decode bytes in this file via LATIN1. The reason is that UTF8
# (the default in python) is a decoding that can fail - i.e. not all
# sequences of bytes are valid UTF8 - and we cannot currently guarantee that
# all bytes that will be run through cbcollect will be valid UTF8. (We need
# protections elsewhere to make this guarantee that currently don't exist.)
# By contrast, all byte sequences are valid LATIN1, almost all our content
# is ASCII and thus LATIN1, and python2 essentially decoded strings as
# LATIN1, so we are backwards compatible with pre-6.5 behavior. See
# MB-33809. For cases in which one knows for certain UTF8 is being used,
# feel free to use it.
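#
# A minimal illustration of the point above (the byte values here are
# hypothetical):
#
#     b'\xff\xfe'.decode('utf8')    # raises UnicodeDecodeError
#     b'\xff\xfe'.decode('latin1')  # 'ÿþ' - every byte sequence decodes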
LATIN1 = 'latin1'

USAGE = """usage: %prog [options] output_file.zip

- Linux/Windows/OSX:
    %prog output_file.zip
    %prog -v output_file.zip"""

# adapted from pytz


class FixedOffsetTZ(tzinfo):
    def __init__(self, minutes):
        if abs(minutes) >= 1440:
            raise ValueError("absolute offset is too large", minutes)
        self._minutes = minutes
        self._offset = timedelta(minutes=minutes)

    def utcoffset(self, dt):
        return self._offset

    def dst(self, dt):
        return timedelta(0)

    def tzname(self, dt):
        return None


local_tz = FixedOffsetTZ(minutes=time.timezone / 60)
log_stream = StringIO()
local_addr = None
local_url_addr = None


def set_local_addr(ipv6):
    global local_addr
    global local_url_addr

    local_addr = "::1" if ipv6 else "127.0.0.1"
    local_url_addr = "[::1]" if ipv6 else "127.0.0.1"


log_line = None


def buffer_log_line(message, new_line):
    global log_line

    line = log_line
    if line is None:
        now = datetime.now(tz=local_tz)
        line = '[%s] ' % now.isoformat()

    line += message
    if new_line:
        log_line = None
        return line
    else:
        log_line = line
        return None


def log(message, new_line=True):
    global log_stream

    if new_line:
        message += '\n'

    bufline = buffer_log_line(message, new_line)
    if bufline is not None:
        log_stream.write(bufline)

    sys.stderr.write(message)
    sys.stderr.flush()


def generate_hash(val):
    return hashlib.sha1(val.encode())


class AccessLogProcessor:
    def __init__(self, salt):
        self.salt = salt
        self.column_parser = re.compile(
            r'(^\S* \S* )(\S*)( \[.*\] \"\S* )(\S*)( .*$)')
        self.urls_to_redact = [['/settings/rbac/users',
                                re.compile(r'\/(?P<user>[^\/\s#&]+)([#&]|$)'),
                                self._process_user, "user"],
                               ['/_cbauth/checkPermission',
                                re.compile(r'user=(?P<user>[^\s&#]+)'),
                                self._process_user, "user"],
                               ['/pools/default/buckets',
                                re.compile(r'\/(?:[^\/\s#&]+)\/docs\/'
                                           r'(?P<docid>[^\/\s#&]+)$'),
                                self._process_docid, "docid"]]

    def _process_url(self, surl):
        for conf in self.urls_to_redact:
            prefix = conf[0]
            if surl[:len(prefix)] == prefix:
                return prefix + self._process_url_tail(conf[1], conf[2],
                                                       conf[3],
                                                       surl[len(prefix):])
        return surl

    def _process_url_tail(self, rex, fn, key, s):
        m = rex.search(s)
        if m is not None:
            return s[:m.start(key)] + fn(m.group(key)) + s[m.end(key):]
        else:
            return s

    def _process_user(self, user):
        if user == '-' or user[0] == '@':
            return user
        elif user[-3:] == "/UI":
            return self._hash(user[:-3]) + "/UI"
        else:
            return self._hash(user)

    def _process_docid(self, docid):
        return self._hash(docid)

    def _hash(self, token):
        return generate_hash(self.salt + token).hexdigest()

    def _repl_columns(self, matchobj):
        return matchobj.group(1) + \
            self._process_user(matchobj.group(2)) + \
            matchobj.group(3) + \
            self._process_url(matchobj.group(4)) + \
            matchobj.group(5)

    def do(self, line):
        return self.column_parser.sub(self._repl_columns, line)
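
# A sketch of what AccessLogProcessor does to an http_access line (the salt
# and the line itself are made up for illustration):
#
#     p = AccessLogProcessor(salt="0000")
#     p.do('10.0.0.1 - jane [01/Jan/2019:00:00:00 +0000] '
#          '"GET /settings/rbac/users/local/jane HTTP/1.1" 200 0')
#
# The user column ("jane") and the user component of the RBAC URL are both
# replaced by the hexdigest of sha1(salt + token).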


class RegularLogProcessor:
    rexes = [re.compile('(<ud>)(.+?)(</ud>)'),
             # Redact the rest of the line in case we encounter
             # log-redaction-salt. Needed to redact pre-6.5 debug logs
             # as well as occurrences in couchbase.log
             re.compile('(log-redaction-salt)(.+)')]

    def __init__(self, salt):
        self.salt = salt

    def _hash(self, match):
        result = match.group(1)
        if match.lastindex == 3:
            h = generate_hash(self.salt + match.group(2)).hexdigest()
            result += h + match.group(3)
        elif match.lastindex == 2:
            result += " <redacted>"
        return result

    def _process_line(self, line):
        for rex in self.rexes:
            line = rex.sub(self._hash, line)
        return line

    def do(self, line):
        return self._process_line(line)


class CouchbaseLogProcessor(RegularLogProcessor):
    def do(self, line):
        if "RedactLevel" in line:
            # salt + salt to maintain consistency with other
            # occurrences of the hashed salt in the logs.
            return 'RedactLevel:partial,HashOfSalt:%s\n' \
                % generate_hash(self.salt + self.salt).hexdigest()
        else:
            return self._process_line(line)


class LogRedactor:
    def __init__(self, salt, tmpdir, default_name):
        self.default_name = default_name
        self.target_dir = os.path.join(tmpdir, "redacted")
        os.makedirs(self.target_dir)

        self.access_log = AccessLogProcessor(salt)
        self.couchbase_log = CouchbaseLogProcessor(salt)
        self.regular_log = RegularLogProcessor(salt)

    def _process_file(self, ifile, ofile, processor):
        # Don't try to catch any errors here as we want failures (e.g.
        # due to disk full) to abort the collection. This will allow
        # the cause of the failure to be fixed and the collection retried.
        with codecs.open(ifile, 'r', LATIN1) as inp:
            with open(ofile, 'w+') as out:
                for line in inp:
                    out.write(processor.do(line))

    def redact_file(self, name, ifile):
        ofile = os.path.join(self.target_dir, name)
        if "http_access" in name:
            self._process_file(ifile, ofile, self.access_log)
        elif name == self.default_name:
            self._process_file(ifile, ofile, self.couchbase_log)
        else:
            self._process_file(ifile, ofile, self.regular_log)
        return ofile
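
# Similarly, a sketch of the regular-log redaction (made-up input):
#
#     p = RegularLogProcessor(salt="0000")
#     p.do('doc key: <ud>beer-sample-1</ud>')
#
# replaces the tagged payload with the hexdigest of sha1(salt + payload),
# keeping the <ud>...</ud> markers. CouchbaseLogProcessor additionally
# rewrites the RedactLevel line so couchbase.log records the hash of the
# salt itself.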


class Task(object):
    privileged = False
    no_header = False
    num_samples = 1
    interval = 0

    def __init__(self, description, command, timeout=None, **kwargs):
        self.description = description
        self.command = command
        self.timeout = timeout
        self.__dict__.update(kwargs)
        self._is_posix = (os.name == 'posix')

    def _platform_popen_flags(self):
        flags = {}
        if self._is_posix:
            flags['preexec_fn'] = os.setpgrp

        return flags

    def _can_kill(self, p):
        if self._is_posix:
            return True

        return hasattr(p, 'kill')

    def _kill(self, p):
        if self._is_posix:
            group_pid = os.getpgid(p.pid)
            os.killpg(group_pid, signal.SIGKILL)
        else:
            p.kill()

    def _env_flags(self):
        flags = {}
        if hasattr(self, 'addenv'):
            env = os.environ.copy()
            env.update(self.addenv)
            flags['env'] = env

        return flags

    def _cwd_flags(self):
        flags = {}
        if getattr(self, 'change_dir', False):
            cwd = self._task_runner.tmpdir
            if isinstance(self.change_dir, str):
                cwd = self.change_dir

            flags['cwd'] = cwd

        return flags

    def _extra_flags(self):
        flags = self._env_flags()
        flags.update(self._platform_popen_flags())
        flags.update(self._cwd_flags())

        return flags

    def set_task_runner(self, runner):
        self._task_runner = runner

    def execute(self, fp):
        """Run the task"""
        use_shell = not isinstance(self.command, list)
        extra_flags = self._extra_flags()
        try:
            p = subprocess.Popen(self.command, bufsize=-1,
                                 stdin=subprocess.PIPE,
                                 stdout=subprocess.PIPE,
                                 stderr=subprocess.STDOUT,
                                 shell=use_shell,
                                 **extra_flags)
            if hasattr(self, 'to_stdin'):
                p.stdin.write(self.to_stdin.encode())

            p.stdin.close()

        except OSError as e:
            # In python3 IOError is an alias of OSError, so a broken pipe
            # on stdin lands here as well: ignore it, the command simply
            # didn't read its input.
            if e.errno == errno.EPIPE:
                fp.write(
                    f"Ignoring broken pipe on stdin for {self.command}\n"
                    .encode())
            else:
                # if use_shell is False then Popen may raise an exception
                # if the binary is missing. In this case we mimic what a
                # shell does. Namely, complaining to stderr and setting a
                # non-zero status code. This might also automatically
                # handle things like "failed to fork due to some system
                # limit".
                fp.write(f"Failed to execute {self.command}: {e}\n".encode())
                return 127

        from threading import Timer, Event

        timer = None
        timer_fired = Event()

        if self.timeout is not None and self._can_kill(p):
            def on_timeout():
                try:
                    self._kill(p)
                except BaseException:
                    # the process might have died already
                    pass

                timer_fired.set()

            timer = Timer(self.timeout, on_timeout)
            timer.start()

        try:
            while True:
                data = p.stdout.read(64 * 1024)
                if not data:
                    break

                fp.write(data)
        finally:
            if timer is not None:
                timer.cancel()
                timer.join()

                # there's a tiny chance that the command succeeds just
                # before the timer is fired; that would result in a
                # spurious timeout message
                if timer_fired.is_set():
                    fp.write(
                        f"`{self.command}` timed out after "
                        f"{self.timeout} seconds\n".encode())
                    log("[Command timed out after %s seconds] - " %
                        (self.timeout), new_line=False)

        return p.wait()

    def will_run(self):
        """Determine if this task will run on this platform."""
        return sys.platform in self.platforms
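
# A hypothetical task declaration, to show how the pieces are used (extra
# keyword arguments simply land in the instance __dict__, which is how
# attributes like log_file, addenv, artifacts and change_dir get attached):
#
#     task = LinuxTask("kernel version", "uname -r", timeout=10,
#                      log_file="couchbase.log")
#
# The platform subclasses below each define `platforms`, which will_run()
# compares against sys.platform.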


class TaskRunner(object):
    default_name = "couchbase.log"

    def __init__(self, verbosity=0, task_regexp='', tmp_dir=None,
                 salt_value=""):
        self.files = {}
        self.verbosity = verbosity
        self.start_time = time.strftime("%Y%m%d-%H%M%S", time.gmtime())
        self.salt_value = salt_value

        # Depending on platform, mkdtemp() may act unpredictably if passed
        # an empty string.
        if not tmp_dir:
            tmp_dir = None
        else:
            tmp_dir = os.path.abspath(os.path.expanduser(tmp_dir))

        try:
            self.tmpdir = tempfile.mkdtemp(dir=tmp_dir)
        except OSError as e:
            print("Could not use temporary dir {0}: {1}".format(tmp_dir, e))
            sys.exit(1)

        # If a dir wasn't passed by --tmp-dir, check if the env var was set
        # and if we were able to use it
        if not tmp_dir and os.getenv("TMPDIR") and os.path.split(
                self.tmpdir)[0] != os.getenv("TMPDIR"):
            log("Could not use TMPDIR {0}".format(os.getenv("TMPDIR")))
            log("Using temporary dir {0}".format(
                os.path.split(self.tmpdir)[0]))

        self.task_regexp = re.compile(task_regexp)

        AltExit.register(self.finalize)

    def finalize(self):
        try:
            for fp in self.files.values():
                fp.close()
        except BaseException:
            # Continue exit handling in spite of any exceptions
            pass

        shutil.rmtree(self.tmpdir, ignore_errors=True)

    def collect_file(self, filename):
        """Add a file to the list of files collected. Used to capture the
        exact file (including timestamps) from the Couchbase instance.

        filename - Absolute path to the file to collect.
        """
        if filename not in self.files:
            try:
                self.files[filename] = open(filename, 'r')
            except IOError as e:
                log("Failed to collect file '%s': %s" % (filename, str(e)))
        else:
            log("Unable to collect file '%s' - already collected." %
                filename)

    def get_file(self, filename):
        if filename in self.files:
            fp = self.files[filename]
        else:
            fp = open(os.path.join(self.tmpdir, filename), 'wb+')
            self.files[filename] = fp

        return fp

    def header(self, fp, title, subtitle):
        separator = '=' * 78
        if isinstance(subtitle, list):
            subtitle = " ".join(subtitle)
        message = f"{separator}\n{title}\n{subtitle}\n{separator}\n"
        fp.write(message.encode())
        fp.flush()

    def log_result(self, result):
        if result == 0:
            log("OK")
        else:
            log("Exit code %d" % result)

    def run_tasks(self, tasks):
        for task in tasks:
            self.run(task)

    def run(self, task):
        if self.task_regexp.match(task.description) is None:
            log("Skipping task %s because "
                "it doesn't match '%s'" % (task.description,
                                           self.task_regexp.pattern))
        else:
            self._run(task)

    def _run(self, task):
        """Run a task with a file descriptor corresponding to its log file"""
        if task.will_run():
            log("%s (%s) - " % (task.description, task.command),
                new_line=False)
            if task.privileged and os.getuid() != 0:
                log("skipped (needs root privs)")
                return

            task.set_task_runner(self)

            filename = getattr(task, 'log_file', self.default_name)
            fp = self.get_file(filename)
            if not task.no_header:
                self.header(fp, task.description, task.command)

            for i in range(task.num_samples):
                if i > 0:
                    log("Taking sample %d after %f seconds - " %
                        (i + 1, task.interval), new_line=False)
                    time.sleep(task.interval)
                result = task.execute(fp)
                self.log_result(result)

            for artifact in getattr(task, 'artifacts', []):
                path = artifact
                if not os.path.isabs(path):
                    # we assume that "relative" artifacts are produced in
                    # self.tmpdir
                    path = os.path.join(self.tmpdir, path)

                self.collect_file(path)

            fp.flush()

        elif self.verbosity >= 2:
            log('Skipping "%s" (%s): not for platform %s' %
                (task.description, task.command, sys.platform))

    def literal(self, description, value, **kwargs):
        self.run(LiteralTask(description, value, **kwargs))

    def redact_and_zip(self, filename, node):
        files = []
        redactor = LogRedactor(self.salt_value, self.tmpdir,
                               self.default_name)

        for name, fp in self.files.items():
            if "users.dets" in name:
                continue
            files.append(redactor.redact_file(name, fp.name))

        prefix = "cbcollect_info_%s_%s" % (node, self.start_time)
        self._zip_helper(prefix, filename, files)

    def close_all_files(self):
        for name, fp in self.files.items():
            fp.close()

    def zip(self, filename, node):
        prefix = "cbcollect_info_%s_%s" % (node, self.start_time)

        files = []
        for name, fp in self.files.items():
            files.append(fp.name)
        self._zip_helper(prefix, filename, files)

    def _zip_helper(self, prefix, filename, files):
        """Write all our logs to a zipfile"""
        exe = exec_name("gozip")

        fallback = False

        try:
            p = subprocess.Popen([exe,
                                  "-strip-path",
                                  "-prefix",
                                  prefix,
                                  filename] + files,
                                 stderr=subprocess.STDOUT,
                                 stdin=subprocess.PIPE)
            p.stdin.close()
            status = p.wait()

            if status != 0:
                log("gozip terminated with non-zero exit code (%d)" % status)
        except OSError as e:
            log("Exception during compression: %s" % e)
            fallback = True

        if fallback:
            log("IMPORTANT:")
            log("  Compression using gozip failed.")
            log("  Falling back to python implementation.")
            log("  Please let us know about this and provide console "
                "output.")

            self._zip_fallback(filename, prefix, files)

    def _zip_fallback(self, filename, prefix, files):
        from zipfile import ZipFile, ZIP_DEFLATED
        zf = ZipFile(filename, mode='w', compression=ZIP_DEFLATED)
        try:
            for name in files:
                zf.write(name,
                         "%s/%s" % (prefix, os.path.basename(name)))
        finally:
            zf.close()
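
# Schematically, how the runner is driven (paths are illustrative):
#
#     runner = TaskRunner(verbosity=1, task_regexp='Process list.*')
#     runner.run_tasks(make_os_tasks())          # only matching tasks run
#     runner.zip('/tmp/collect.zip', 'ns_1@127.0.0.1')
#
# The default task_regexp of '' compiles to a pattern that matches every
# description, so normally nothing is skipped.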


class SolarisTask(Task):
    platforms = ['sunos5', 'solaris']


class LinuxTask(Task):
    platforms = ['linux']


class WindowsTask(Task):
    platforms = ['win32', 'cygwin']


class MacOSXTask(Task):
    platforms = ['darwin']


class UnixTask(SolarisTask, LinuxTask, MacOSXTask):
    platforms = SolarisTask.platforms + LinuxTask.platforms + \
        MacOSXTask.platforms


class AllOsTask(UnixTask, WindowsTask):
    platforms = UnixTask.platforms + WindowsTask.platforms


class LiteralTask(AllOsTask):
    def __init__(self, description, literal, **kwargs):
        self.description = description
        self.command = ''
        self.literal = literal
        self.__dict__.update(kwargs)

    def execute(self, fp):
        fp.write(self.literal.encode() + b'\n')
        return 0


class CollectFile(AllOsTask):
    def __init__(self, description, file_path, **kwargs):
        self.description = description
        self.command = ''
        self.file_path = file_path
        self.__dict__.update(kwargs)

    def execute(self, fp):
        self._task_runner.collect_file(self.file_path)
        fp.write(f"Collected file {self.file_path}\n".encode())
        return 0


def make_curl_task(name, user, password, url,
                   timeout=60, log_file="couchbase.log", base_task=AllOsTask,
                   **kwargs):
    return base_task(name, ["curl", "-sS", "--proxy", "", "-K-", url],
                     timeout=timeout,
                     log_file=log_file,
                     to_stdin="--user %s:%s" % (user, password),
                     **kwargs)


def make_cbstats_task(kind, memcached_pass, guts):
    port = read_guts(guts, "memcached_port")
    user = read_guts(guts, "memcached_admin")
    return AllOsTask("memcached stats %s" % kind,
                     flatten(["cbstats", "-a", "%s:%s" %
                              (local_url_addr, port), kind, "-u", user]),
                     log_file="stats.log",
                     timeout=60,
                     addenv=[("CB_PASSWORD", memcached_pass)])


def get_local_token(guts, port):
    path = read_guts(guts, "localtoken_path")
    token = ""
    try:
        with open(path, 'r') as f:
            token = f.read().rstrip('\n')
    except IOError as e:
        log("I/O error(%s): %s" % (e.errno, e.strerror))
    return token


def get_diag_password(guts):
    port = read_guts(guts, "rest_port")
    pwd = get_local_token(guts, port)
    url = "http://%s:%s/diag/password" % (local_url_addr, port)
    command = ["curl", "-sS", "--proxy", "", "-u", "@localtoken:%s" % pwd,
               url]

    task = AllOsTask("get diag password", command, timeout=60)
    output_bytes = BytesIO()
    status = task.execute(output_bytes)
    output = output_bytes.getvalue().decode(LATIN1)
    if status == 0:
        return output
    log(output)
    return ""
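
# Note on make_curl_task above: "-K-" makes curl read its config from
# stdin, and the credentials are written there via to_stdin rather than
# being passed on the command line, where they would be visible in `ps`
# output. Schematically (values made up):
#
#     echo '--user @:secret' | curl -sS --proxy "" -K- http://...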
"http://%s:%s/query/service?statement=%s" % ( 749 local_url_addr, port, urllib.parse.quote(statement)) 750 751 return make_curl_task(name="Result of query statement \'%s\'" % statement, 752 user=user, password=password, url=url) 753 754 755def make_index_task(name, api, passwd, index_port, logfile="couchbase.log", **kwargs): 756 index_url = 'http://%s:%s/%s' % (local_url_addr, index_port, api) 757 758 return make_curl_task(name, "@", passwd, index_url, log_file=logfile, **kwargs) 759 760 761def make_redaction_task(): 762 return LiteralTask("Log Redaction", "RedactLevel:none") 763 764 765def basedir(): 766 # We are installed in $INSTALL_DIR/lib/python, so need to go up three 767 # levels 768 return os.path.normpath( 769 os.path.join( 770 os.path.abspath(__file__), 771 '..', '..', '..' 772 ) 773 ) 774 775 776def make_event_log_task(): 777 from datetime import datetime, timedelta 778 779 # I found that wmic ntevent can be extremely slow; so limiting the output 780 # to approximately last month 781 limit = datetime.today() - timedelta(days=31) 782 limit = limit.strftime('%Y%m%d000000.000000-000') 783 784 return WindowsTask( 785 "Event log", 786 "wmic ntevent where " 787 "\"" 788 "(LogFile='application' or LogFile='system') and " 789 "EventType<3 and TimeGenerated>'%(limit)s'" 790 "\" " 791 "get TimeGenerated,LogFile,SourceName,EventType,Message " 792 "/FORMAT:list" % 793 locals()) 794 795 796def make_os_tasks(): 797 programs = " ".join(["moxi", "memcached", "beam.smp", 798 "couch_compact", "godu", "sigar_port", 799 "cbq-engine", "indexer", "projector", "goxdcr", 800 "cbft", "eventing-producer", "eventing-consumer"]) 801 802 _tasks = [ 803 UnixTask("uname", "uname -a"), 804 UnixTask("time and TZ", "date; date -u"), 805 UnixTask("ntp time", 806 "ntpdate -q pool.ntp.org || " 807 "nc time.nist.gov 13 || " 808 "netcat time.nist.gov 13", timeout=60), 809 UnixTask("ntp peers", "ntpq -p"), 810 UnixTask("raw /etc/sysconfig/clock", "cat /etc/sysconfig/clock"), 811 UnixTask("raw /etc/timezone", "cat /etc/timezone"), 812 WindowsTask("System information", "systeminfo"), 813 WindowsTask("Computer system", "wmic computersystem"), 814 WindowsTask("Computer OS", "wmic os"), 815 LinuxTask("System Hardware", "lshw -json || lshw"), 816 SolarisTask("Process list snapshot", 817 "prstat -a -c -n 100 -t -v -L 1 10"), 818 SolarisTask("Process list", "ps -ef"), 819 SolarisTask("Service configuration", "svcs -a"), 820 SolarisTask("Swap configuration", "swap -l"), 821 SolarisTask("Disk activity", "zpool iostat 1 10"), 822 SolarisTask("Disk activity", "iostat -E 1 10"), 823 LinuxTask("Process list snapshot", 824 "export TERM=''; top -Hb -n1 || top -H n1"), 825 LinuxTask( 826 "Process list", 827 "ps -AwwL -o user,pid,lwp,ppid,nlwp,pcpu,maj_flt,min_flt,pri,nice,vsize,rss,tty,stat,wchan:12,start," 828 "bsdtime,comm,command"), 829 LinuxTask("Raw /proc/buddyinfo", "cat /proc/buddyinfo"), 830 LinuxTask("Raw /proc/meminfo", "cat /proc/meminfo"), 831 LinuxTask("Raw /proc/pagetypeinfo", "cat /proc/pagetypeinfo"), 832 LinuxTask("Raw /proc/zoneinfo", "cat /proc/zoneinfo"), 833 LinuxTask("Raw /proc/vmstat", "cat /proc/vmstat"), 834 LinuxTask("Raw /proc/mounts", "cat /proc/mounts"), 835 LinuxTask("Raw /proc/partitions", "cat /proc/partitions"), 836 LinuxTask("Raw /proc/diskstats", 837 "cat /proc/diskstats; echo ''", num_samples=10, interval=1), 838 LinuxTask("Raw /proc/interrupts", "cat /proc/interrupts"), 839 LinuxTask("Swap configuration", "free -t"), 840 LinuxTask("Swap configuration", "swapon -s"), 841 LinuxTask("Kernel modules", 
"lsmod"), 842 LinuxTask("Distro version", "cat /etc/redhat-release"), 843 LinuxTask("Distro version", "cat /etc/oracle-release"), 844 LinuxTask("Distro version", "lsb_release -a"), 845 LinuxTask("Distro version", "cat /etc/SuSE-release"), 846 LinuxTask("Distro version", "cat /etc/issue"), 847 LinuxTask("Distro version", "cat /etc/os-release"), 848 LinuxTask("Distro version", "cat /etc/system-release"), 849 LinuxTask("Installed software", "rpm -qa"), 850 LinuxTask("Ksplice updates", "uptrack-show"), 851 LinuxTask("Hot fix list", "rpm -V couchbase-server"), 852 # NOTE: AFAIK columns _was_ necessary, but it doesn't appear to be 853 # required anymore. I.e. dpkg -l correctly detects stdout as not a 854 # tty and stops playing smart on formatting. Lets keep it for few 855 # years and then drop, however. 856 LinuxTask("Installed software", "COLUMNS=300 dpkg -l"), 857 # NOTE: -V is supported only from dpkg v1.17.2 onwards. 858 LinuxTask("Hot fix list", "COLUMNS=300 dpkg -V couchbase-server"), 859 LinuxTask( 860 "Extended iostat", 861 "iostat -x -p ALL 1 10 || iostat -x 1 10"), 862 LinuxTask("Core dump settings", 863 "find /proc/sys/kernel -type f -name '*core*' -print -exec cat '{}' ';'"), 864 UnixTask("sysctl settings", "sysctl -a"), 865 LinuxTask("Relevant lsof output", 866 "echo %(programs)s | xargs -n1 pgrep | xargs -n1 -r -- lsof -n -p" % locals()), 867 LinuxTask("LVM info", "lvdisplay"), 868 LinuxTask("LVM info", "vgdisplay"), 869 LinuxTask("LVM info", "pvdisplay"), 870 LinuxTask("Block device queue settings", 871 "find /sys/block/*/queue -type f | xargs grep -vH xxxx | sort"), 872 MacOSXTask("Process list snapshot", "top -l 1"), 873 MacOSXTask("Disk activity", "iostat 1 10"), 874 MacOSXTask("Process list", 875 "ps -Aww -o user,pid,lwp,ppid,nlwp,pcpu,pri,nice,vsize,rss,tty," 876 "stat,wchan:12,start,bsdtime,command"), 877 WindowsTask("Installed software", "wmic product get name, version"), 878 WindowsTask( 879 "Service list", "wmic service where state=\"running\" GET caption, name, state"), 880 WindowsTask("Process list", "wmic process"), 881 WindowsTask("Process usage", "tasklist /V /fo list"), 882 WindowsTask("Swap settings", "wmic pagefile"), 883 WindowsTask("Disk partition", "wmic partition"), 884 WindowsTask("Disk volumes", "wmic volume"), 885 UnixTask("Network configuration", "ifconfig -a", interval=10, 886 num_samples=2), 887 LinuxTask("Network configuration", 888 "echo link addr neigh rule route netns | xargs -n1 -- sh -x -c 'ip $1 list' --"), 889 WindowsTask("Network configuration", "ipconfig /all", interval=10, 890 num_samples=2), 891 LinuxTask("Raw /proc/net/dev", "cat /proc/net/dev"), 892 LinuxTask("Network link statistics", "ip -s link"), 893 UnixTask("Network status", "netstat -anp || netstat -an"), 894 WindowsTask("Network status", "netstat -anotb"), 895 AllOsTask("Network routing table", "netstat -rn"), 896 LinuxTask("Network socket statistics", "ss -an"), 897 LinuxTask("Extended socket statistics", 898 "ss -an --info --processes --memory --options", 899 timeout=300), 900 UnixTask("Arp cache", "arp -na"), 901 LinuxTask("Iptables dump", "iptables-save"), 902 UnixTask("Raw /etc/hosts", "cat /etc/hosts"), 903 UnixTask("Raw /etc/resolv.conf", "cat /etc/resolv.conf"), 904 UnixTask("Raw /etc/nsswitch.conf", "cat /etc/nsswitch.conf"), 905 WindowsTask("Arp cache", "arp -a"), 906 WindowsTask("Network Interface Controller", "wmic nic"), 907 WindowsTask("Network Adapter", "wmic nicconfig"), 908 WindowsTask("Active network connection", "wmic netuse"), 909 WindowsTask("Protocols", "wmic 
netprotocol"), 910 WindowsTask( 911 "Hosts file", "type %SystemRoot%\system32\drivers\etc\hosts"), 912 WindowsTask("Cache memory", "wmic memcache"), 913 WindowsTask("Physical memory", "wmic memphysical"), 914 WindowsTask("Physical memory chip info", "wmic memorychip"), 915 WindowsTask("Local storage devices", "wmic logicaldisk"), 916 UnixTask("Filesystem", "df -ha"), 917 UnixTask("System activity reporter", "sar 1 10"), 918 UnixTask("System paging activity", "vmstat 1 10"), 919 UnixTask("System uptime", "uptime"), 920 UnixTask("Last logins of users and ttys", "last -x || last"), 921 UnixTask("couchbase user definition", "getent passwd couchbase"), 922 UnixTask("couchbase user limits", "su couchbase -s /bin/sh -c \"ulimit -a\"", 923 privileged=True), 924 UnixTask("Interrupt status", "intrstat 1 10"), 925 UnixTask("Processor status", "mpstat 1 10"), 926 UnixTask("System log", "cat /var/adm/messages"), 927 LinuxTask("Raw /proc/uptime", "cat /proc/uptime"), 928 LinuxTask("Systemd journal", 929 "journalctl | gzip -c > systemd_journal.gz", 930 change_dir=True, artifacts=['systemd_journal.gz']), 931 LinuxTask("All logs", 932 "tar cz /var/log/syslog* /var/log/dmesg /var/log/messages* /var/log/daemon* /var/log/debug* " 933 "/var/log/kern.log* 2>/dev/null", 934 log_file="syslog.tar.gz", no_header=True), 935 LinuxTask("Relevant proc data", "echo %(programs)s | " 936 "xargs -n1 pgrep | xargs -n1 -- sh -c 'echo $1; cat /proc/$1/status; cat /proc/$1/limits; " 937 "cat /proc/$1/smaps; cat /proc/$1/numa_maps; cat /proc/$1/task/*/sched; echo' --" % locals()), 938 LinuxTask("Processes' environment", "echo %(programs)s | " 939 r"xargs -n1 pgrep | xargs -n1 -- sh -c 'echo $1; ( cat /proc/$1/environ | tr \\0 \\n | " 940 "egrep -v ^CB_MASTER_PASSWORD=\|^CBAUTH_REVRPC_URL=); echo' --" % locals()), 941 LinuxTask("Processes' stack", 942 "for program in %(programs)s; do for thread in $(pgrep --lightweight $program); " 943 "do echo $program/$thread:; cat /proc/$thread/stack; echo; done; done" % locals()), 944 LinuxTask("NUMA data", "numactl --hardware"), 945 LinuxTask("NUMA data", "numactl --show"), 946 LinuxTask("NUMA data", "cat /sys/devices/system/node/node*/numastat"), 947 UnixTask("Kernel log buffer", "dmesg -T || dmesg -H || dmesg"), 948 LinuxTask("Transparent Huge Pages data", 949 "cat /sys/kernel/mm/transparent_hugepage/enabled"), 950 LinuxTask("Transparent Huge Pages data", 951 "cat /sys/kernel/mm/transparent_hugepage/defrag"), 952 LinuxTask("Transparent Huge Pages data", 953 "cat /sys/kernel/mm/redhat_transparent_hugepage/enabled"), 954 LinuxTask("Transparent Huge Pages data", 955 "cat /sys/kernel/mm/redhat_transparent_hugepage/defrag"), 956 LinuxTask("Network statistics", "netstat -s"), 957 LinuxTask("Full raw netstat", "cat /proc/net/netstat"), 958 LinuxTask("CPU throttling info", 959 "echo /sys/devices/system/cpu/cpu*/thermal_throttle/* | xargs -n1 -- sh -c 'echo $1; cat $1' --"), 960 LinuxTask("Raw PID 1 scheduler /proc/1/sched", 961 "cat /proc/1/sched | head -n 1"), 962 LinuxTask("Raw PID 1 control groups /proc/1/cgroup", 963 "cat /proc/1/cgroup"), 964 make_event_log_task(), 965 ] 966 967 return _tasks 968 969# stolen from 970# http://rightfootin.blogspot.com/2006/09/more-on-python-flatten.html 971 972 973def iter_flatten(iterable): 974 it = iter(iterable) 975 for e in it: 976 if isinstance(e, (list, tuple)): 977 for f in iter_flatten(e): 978 yield f 979 else: 980 yield e 981 982 983def flatten(iterable): 984 return [e for e in iter_flatten(iterable)] 985 986 987def read_guts(guts, key): 988 return 
guts.get(key, "") 989 990 991def winquote_path(s): 992 return '"' + s.replace("\\\\", "\\").replace('/', "\\") + '"' 993 994# python's split splits empty string to [''] which doesn't make any 995# sense. So this function works around that. 996 997 998def correct_split(string, splitchar): 999 rv = string.split(splitchar) 1000 if rv == ['']: 1001 rv = [] 1002 return rv 1003 1004 1005def make_stats_archives_task(guts, initargs_path): 1006 escript = exec_name("escript") 1007 escript_wrapper = find_script("escript-wrapper") 1008 dump_stats = find_script("dump-stats") 1009 stats_dir = read_guts(guts, "stats_dir") 1010 1011 if dump_stats is None or escript_wrapper is None or not stats_dir: 1012 return [] 1013 1014 output_file = "stats_archives.json" 1015 return AllOsTask("stats archives", 1016 [escript, 1017 escript_wrapper, 1018 "--initargs-path", initargs_path, "--", 1019 dump_stats, stats_dir, output_file], 1020 change_dir=True, 1021 artifacts=[output_file]) 1022 1023 1024def make_product_task(guts, initargs_path, memcached_pass, options): 1025 root = os.path.abspath(os.path.join(initargs_path, "..", "..", "..", "..")) 1026 dbdir = os.path.realpath(read_guts(guts, "db_dir")) 1027 viewdir = os.path.realpath(read_guts(guts, "idx_dir")) 1028 rebdir = os.path.realpath(os.path.join( 1029 read_guts(guts, "log_path"), "rebalance")) 1030 nodes = correct_split(read_guts(guts, "nodes"), ",") 1031 1032 diag_url = "http://%s:%s/diag" % ( 1033 local_url_addr, read_guts(guts, "rest_port")) 1034 1035 from distutils.spawn import find_executable 1036 1037 lookup_cmd = None 1038 for cmd in ["dig", "nslookup", "host"]: 1039 if find_executable(cmd) is not None: 1040 lookup_cmd = cmd 1041 break 1042 1043 lookup_tasks = [] 1044 if lookup_cmd is not None: 1045 lookup_tasks = [UnixTask(f"DNS lookup information for {node}", 1046 f"{lookup_cmd} '{node}'") 1047 for node in nodes] 1048 1049 getent_tasks = [LinuxTask("Name Service Switch " 1050 "hosts database info for %s" % node, 1051 ["getent", "ahosts", node]) 1052 for node in nodes] 1053 1054 query_tasks = [] 1055 query_port = read_guts(guts, "query_port") 1056 if query_port: 1057 def make(statement): 1058 return make_query_task(statement, user="@", 1059 password=memcached_pass, 1060 port=query_port) 1061 1062 query_tasks = [make("SELECT * FROM system:datastores"), 1063 make("SELECT * FROM system:namespaces"), 1064 make("SELECT * FROM system:keyspaces"), 1065 make("SELECT * FROM system:indexes")] 1066 1067 index_tasks = [] 1068 index_port = read_guts(guts, "indexer_http_port") 1069 if index_port: 1070 index_tasks = [ 1071 make_index_task( 1072 "Index definitions are: ", 1073 "getIndexStatus", 1074 memcached_pass, 1075 index_port), 1076 make_index_task( 1077 "Indexer settings are: ", 1078 "settings", 1079 memcached_pass, 1080 index_port), 1081 make_index_task( 1082 "Indexer stats are: ", 1083 "stats?partition=true", 1084 memcached_pass, 1085 index_port), 1086 make_index_task( 1087 "Index storage stats are: ", 1088 "stats/storage", 1089 memcached_pass, 1090 index_port), 1091 make_index_task( 1092 "MOI allocator stats are: ", 1093 "stats/storage/mm", 1094 memcached_pass, 1095 index_port), 1096 make_index_task( 1097 "Indexer Go routine dump: ", 1098 "debug/pprof/goroutine?debug=1", 1099 memcached_pass, 1100 index_port, 1101 logfile="indexer_pprof.log"), 1102 make_index_task( 1103 "Indexer Rebalance Tokens: ", 1104 "listRebalanceTokens", 1105 memcached_pass, 1106 index_port), 1107 make_index_task( 1108 "Indexer Metadata Tokens: ", 1109 "listMetadataTokens", 1110 


def make_stats_archives_task(guts, initargs_path):
    escript = exec_name("escript")
    escript_wrapper = find_script("escript-wrapper")
    dump_stats = find_script("dump-stats")
    stats_dir = read_guts(guts, "stats_dir")

    if dump_stats is None or escript_wrapper is None or not stats_dir:
        return []

    output_file = "stats_archives.json"
    return AllOsTask("stats archives",
                     [escript,
                      escript_wrapper,
                      "--initargs-path", initargs_path, "--",
                      dump_stats, stats_dir, output_file],
                     change_dir=True,
                     artifacts=[output_file])


def make_product_task(guts, initargs_path, memcached_pass, options):
    root = os.path.abspath(os.path.join(initargs_path,
                                        "..", "..", "..", ".."))
    dbdir = os.path.realpath(read_guts(guts, "db_dir"))
    viewdir = os.path.realpath(read_guts(guts, "idx_dir"))
    rebdir = os.path.realpath(os.path.join(
        read_guts(guts, "log_path"), "rebalance"))
    nodes = correct_split(read_guts(guts, "nodes"), ",")

    diag_url = "http://%s:%s/diag" % (
        local_url_addr, read_guts(guts, "rest_port"))

    from distutils.spawn import find_executable

    lookup_cmd = None
    for cmd in ["dig", "nslookup", "host"]:
        if find_executable(cmd) is not None:
            lookup_cmd = cmd
            break

    lookup_tasks = []
    if lookup_cmd is not None:
        lookup_tasks = [UnixTask(f"DNS lookup information for {node}",
                                 f"{lookup_cmd} '{node}'")
                        for node in nodes]

    getent_tasks = [LinuxTask("Name Service Switch "
                              "hosts database info for %s" % node,
                              ["getent", "ahosts", node])
                    for node in nodes]

    query_tasks = []
    query_port = read_guts(guts, "query_port")
    if query_port:
        def make(statement):
            return make_query_task(statement, user="@",
                                   password=memcached_pass,
                                   port=query_port)

        query_tasks = [make("SELECT * FROM system:datastores"),
                       make("SELECT * FROM system:namespaces"),
                       make("SELECT * FROM system:keyspaces"),
                       make("SELECT * FROM system:indexes")]

    index_tasks = []
    index_port = read_guts(guts, "indexer_http_port")
    if index_port:
        index_tasks = [
            make_index_task("Index definitions are: ",
                            "getIndexStatus",
                            memcached_pass, index_port),
            make_index_task("Indexer settings are: ",
                            "settings",
                            memcached_pass, index_port),
            make_index_task("Indexer stats are: ",
                            "stats?partition=true",
                            memcached_pass, index_port),
            make_index_task("Index storage stats are: ",
                            "stats/storage",
                            memcached_pass, index_port),
            make_index_task("MOI allocator stats are: ",
                            "stats/storage/mm",
                            memcached_pass, index_port),
            make_index_task("Indexer Go routine dump: ",
                            "debug/pprof/goroutine?debug=1",
                            memcached_pass, index_port,
                            logfile="indexer_pprof.log"),
            make_index_task("Indexer Rebalance Tokens: ",
                            "listRebalanceTokens",
                            memcached_pass, index_port),
            make_index_task("Indexer Metadata Tokens: ",
                            "listMetadataTokens",
                            memcached_pass, index_port),
            make_index_task("Indexer CPU Profile: ",
                            "debug/pprof/profile",
                            memcached_pass, index_port,
                            logfile="indexer_cprof.log",
                            no_header=True),
            make_index_task("Indexer Memory Profile: ",
                            "debug/pprof/heap?debug=1",
                            memcached_pass, index_port,
                            logfile="indexer_mprof.log",
                            no_header=True),
        ]
insights: ", 1224 user="@", 1225 password=memcached_pass, 1226 url=eventing_insight_url, 1227 log_file="eventing_insights.log", 1228 no_header=True)] 1229 1230 _tasks = [ 1231 UnixTask("Directory structure", 1232 ["ls", "-lRai", root]), 1233 UnixTask("Database directory structure", 1234 ["ls", "-lRai", dbdir]), 1235 UnixTask("Index directory structure", 1236 ["ls", "-lRai", viewdir]), 1237 UnixTask("couch_dbinfo", 1238 ["find", dbdir, "-type", "f", 1239 "-name", "*.couch.*", 1240 "-exec", "couch_dbinfo", "{}", "+"]), 1241 LinuxTask("Database directory filefrag info", 1242 ["find", dbdir, "-type", "f", "-exec", "filefrag", "-v", "{}", "+"]), 1243 LinuxTask("Index directory filefrag info", 1244 ["find", viewdir, "-type", "f", "-exec", "filefrag", "-v", "{}", "+"]), 1245 WindowsTask("Database directory structure", 1246 "dir /s " + winquote_path(dbdir)), 1247 WindowsTask("Index directory structure", 1248 "dir /s " + winquote_path(viewdir)), 1249 WindowsTask("Version file", 1250 "type " + winquote_path(basedir()) + "\\VERSION.txt"), 1251 WindowsTask("Manifest file", 1252 "type " + winquote_path(basedir()) + "\\manifest.txt"), 1253 WindowsTask("Manifest file", 1254 "type " + winquote_path(basedir()) + "\\manifest.xml"), 1255 LinuxTask("Version file", "cat '%s/VERSION.txt'" % root), 1256 LinuxTask("Variant file", "cat '%s/VARIANT.txt'" % root), 1257 LinuxTask("Manifest file", "cat '%s/manifest.txt'" % root), 1258 LinuxTask("Manifest file", "cat '%s/manifest.xml'" % root), 1259 LiteralTask("Couchbase config", read_guts(guts, "ns_config")), 1260 LiteralTask("Couchbase static config", 1261 read_guts(guts, "static_config")), 1262 LiteralTask("Raw ns_log", read_guts(guts, "ns_log")), 1263 # TODO: just gather those in python 1264 WindowsTask("Memcached logs", 1265 "cd " + winquote_path(read_guts(guts, "memcached_logs_path")) + " && " + 1266 "for /f %a IN ('dir memcached.log.* /od /tw /b') do type %a", 1267 log_file="memcached.log"), 1268 UnixTask("Memcached logs", 1269 ["sh", "-c", 'cd "$1"; for file in $(ls -tr memcached.log.*); do cat \"$file\"; done', 1270 "--", read_guts(guts, "memcached_logs_path")], 1271 log_file="memcached.log"), 1272 [WindowsTask("Ini files (%s)" % p, 1273 "type " + winquote_path(p), 1274 log_file="ini.log") 1275 for p in read_guts(guts, "couch_inis").split(";")], 1276 UnixTask("Ini files", 1277 ["sh", "-c", 'for i in "$@"; do echo "file: $i"; cat "$i"; done', 1278 "--"] + read_guts(guts, "couch_inis").split(";"), 1279 log_file="ini.log"), 1280 1281 make_curl_task(name="couchbase diags", 1282 user="@", 1283 password=memcached_pass, 1284 timeout=600, 1285 url=diag_url, 1286 log_file="diag.log"), 1287 1288 make_curl_task(name="master events", 1289 user="@", 1290 password=memcached_pass, 1291 timeout=300, 1292 url='http://%s:%s/diag/masterEvents?o=1' % ( 1293 local_url_addr, read_guts(guts, "rest_port")), 1294 log_file="master_events.log", 1295 no_header=True), 1296 1297 make_curl_task(name="ale configuration", 1298 user="@", 1299 password=memcached_pass, 1300 url='http://%s:%s/diag/ale' % ( 1301 local_url_addr, read_guts(guts, "rest_port")), 1302 log_file="couchbase.log"), 1303 1304 [AllOsTask("couchbase logs (%s)" % name, "cbbrowse_logs %s" % name, 1305 addenv=[("REPORT_DIR", read_guts(guts, "log_path"))], 1306 log_file="ns_server.%s" % name) 1307 for name in ["debug.log", "info.log", "error.log", "couchdb.log", 1308 "xdcr_target.log", 1309 "views.log", "mapreduce_errors.log", 1310 "stats.log", "babysitter.log", 1311 "reports.log", "http_access.log", 1312 
"http_access_internal.log", "ns_couchdb.log", 1313 "goxdcr.log", "query.log", "projector.log", "indexer.log", 1314 "fts.log", "metakv.log", "json_rpc.log", "eventing.log", 1315 "analytics_info.log", "analytics_debug.log", "analytics_shutdown.log", 1316 "analytics_error.log", "analytics_warn.log", "analytics_dcpdebug.log", 1317 "analytics_trace.json", "analytics_access.log", "analytics_cbas_debug.log"]], 1318 1319 [make_cbstats_task(kind, memcached_pass, guts) 1320 for kind in ["all", "allocator", "checkpoint", "collections", "config", 1321 "dcp", "dcpagg", 1322 ["diskinfo", "detail"], ["dispatcher", "logs"], 1323 "eviction", "failovers", ["hash", "detail"], 1324 "kvstore", "kvtimings", "memory", 1325 "prev-vbucket", ["responses", "all"], 1326 "runtimes", "scheduler", "scopes", 1327 "tasks", 1328 "timings", "uuid", 1329 "vbucket-details", "vbucket-seqno", 1330 "warmup", "workload"]], 1331 1332 [AllOsTask("memcached mcstat %s" % kind, 1333 flatten(["mcstat", "-h", "%s:%s" % (local_url_addr, read_guts(guts, "memcached_port")), 1334 "-u", read_guts(guts, "memcached_admin"), kind]), 1335 log_file="stats.log", 1336 timeout=60, 1337 addenv=[("CB_PASSWORD", memcached_pass)]) 1338 for kind in ["connections", "tracing"]], 1339 1340 [AllOsTask("fts mossScope (%s)" % path, 1341 ["mossScope", "stats", "diag", path], 1342 log_file="fts_store_stats.log") 1343 for path in glob.glob(os.path.join(viewdir, "@fts", "*.pindex", "store")) 1344 if any(".moss" in entry for entry in os.listdir(path))], 1345 1346 [AllOsTask("fts scorch zap (%s)" % path, 1347 ["cbft-bleve", "zap", "footer", path], 1348 log_file="fts_store_stats.log") 1349 for path in glob.glob(os.path.join(viewdir, "@fts", "*.pindex", "store", "*.zap"))], 1350 1351 [AllOsTask("ddocs for %s (%s)" % (bucket, path), 1352 ["couch_dbdump", path], 1353 log_file="ddocs.log") 1354 for bucket in set(correct_split(read_guts(guts, "buckets"), ",")) - \ 1355 set(correct_split(read_guts(guts, "memcached_buckets"), ",")) 1356 for path in glob.glob(os.path.join(dbdir, bucket, "master.couch*"))], 1357 1358 [AllOsTask("Couchstore local documents (%s, %s)" % (bucket, os.path.basename(path)), 1359 ["couch_dbdump", "--local", path], 1360 log_file="couchstore_local.log") 1361 for bucket in set(correct_split(read_guts(guts, "buckets"), ",")) - \ 1362 set(correct_split(read_guts(guts, "memcached_buckets"), ",")) 1363 for path in glob.glob(os.path.join(dbdir, bucket, "*.couch.*"))], 1364 1365 # RocksDB has logs per DB (i.e. vBucket). 'LOG' is the most 1366 # recent file, with old files named LOG.old.<timestamp>. 1367 # Sort so we go from oldest -> newest as per other log files. 
1368 [AllOsTask("RocksDB Log file (%s, %s)" % (bucket, os.path.basename(path)), 1369 "cat '%s'" % (log_file), 1370 log_file="kv_rocks.log") 1371 for bucket in (set(correct_split(read_guts(guts, "buckets"), ",")) - 1372 set(correct_split(read_guts(guts, "memcached_buckets"), ","))) 1373 for path in glob.glob(os.path.join(dbdir, bucket, "rocksdb.*")) 1374 for log_file in sorted(glob.glob(os.path.join(path, "LOG.old.*"))) + [os.path.join(path, "LOG")]], 1375 1376 [UnixTask("moxi stats (port %s)" % port, 1377 "echo stats proxy | nc %s %s" % (local_addr, port), 1378 log_file="stats.log", 1379 timeout=60) 1380 for port in correct_split(read_guts(guts, "moxi_ports"), ",")], 1381 1382 [AllOsTask("mctimings %s" % stat, 1383 ["mctimings", 1384 "-u", read_guts(guts, "memcached_admin"), 1385 "-h", "%s:%s" % (local_url_addr, 1386 read_guts(guts, "memcached_port")), 1387 "-v"] + stat, 1388 log_file="stats.log", 1389 timeout=60, 1390 addenv=[("CB_PASSWORD", memcached_pass)]) 1391 for stat in ([], ["subdoc_execute"])], 1392 1393 CollectFile("Users storage", read_guts(guts, "users_storage_path")), 1394 1395 [CollectFile("Rebalance Report: %s" % path, path) 1396 for path in glob.glob(os.path.join(rebdir, "rebalance_report*"))], 1397 1398 make_stats_archives_task(guts, initargs_path), 1399 AllOsTask("Phosphor Trace", 1400 ["kv_trace_dump", 1401 "-H", "%s:%s" % (local_url_addr, 1402 read_guts(guts, "memcached_port")), 1403 "-u", read_guts(guts, "memcached_admin"), 1404 "-P", memcached_pass, 1405 "kv_trace.json"], 1406 timeout=120, 1407 log_file="stats.log", 1408 change_dir=True, 1409 artifacts=["kv_trace.json"]), 1410 ] 1411 1412 _tasks = flatten([getent_tasks, 1413 lookup_tasks, 1414 query_tasks, 1415 index_tasks, 1416 projector_tasks, 1417 fts_tasks, 1418 cbas_tasks, 1419 eventing_tasks, 1420 goxdcr_tasks, 1421 _tasks]) 1422 1423 return _tasks 1424 1425 1426def find_script(name): 1427 path = os.path.join(basedir(), "bin", name) 1428 if os.path.exists(path): 1429 log("Found %s: %s" % (name, path)) 1430 return os.path.abspath(path) 1431 1432 return None 1433 1434 1435def get_server_guts(initargs_path): 1436 dump_guts_path = find_script("dump-guts") 1437 1438 if dump_guts_path is None: 1439 log("Couldn't find dump-guts script. Some information will be missing") 1440 return {} 1441 1442 # Check if initargs exists and is read accessible. 1443 if not os.path.exists(initargs_path): 1444 log("initargs file '{}' does not exist".format(initargs_path)) 1445 return {} 1446 1447 if not os.access(initargs_path, os.R_OK): 1448 log("Read access to '{}' is required in order to proceed".format( 1449 initargs_path)) 1450 sys.exit(1) 1451 1452 escript = exec_name("escript") 1453 extra_args = os.getenv("EXTRA_DUMP_GUTS_ARGS") 1454 args = [escript, dump_guts_path, "--initargs-path", initargs_path] 1455 if extra_args: 1456 args = args + extra_args.split(";") 1457 print("Checking for server guts in %s..." 


def dump_utilities(*args, **kwargs):
    specific_platforms = {SolarisTask: 'Solaris',
                          LinuxTask: 'Linux',
                          WindowsTask: 'Windows',
                          MacOSXTask: 'Mac OS X'}
    platform_utils = dict((name, set())
                          for name in specific_platforms.values())

    class FakeOptions(object):
        def __getattr__(self, name):
            return None

    tasks = make_os_tasks() + make_product_task({}, "", "", FakeOptions())

    for task in tasks:
        utility = guess_utility(task.command)
        if utility is None:
            continue

        for (platform, name) in specific_platforms.items():
            if isinstance(task, platform):
                platform_utils[name].add(utility)

    print('This is an autogenerated, possibly incomplete and flawed list '
          'of utilities used by cbcollect_info')

    for (name, utilities) in sorted(
            platform_utils.items(), key=lambda x: x[0]):
        print("\n%s:" % name)

        for utility in sorted(utilities):
            print(" - %s" % utility)

    sys.exit(0)


def stdin_watcher():
    fd = sys.stdin.fileno()

    while True:
        buf = os.read(fd, 1024)
        # stdin closed
        if not buf:
            break


def setup_stdin_watcher():
    def _in_thread():
        try:
            stdin_watcher()
        finally:
            AltExit.exit(2)

    th = threading.Thread(target=_in_thread, daemon=True)
    th.start()


class CurlKiller:
    def __init__(self, p):
        self.p = p

    def cleanup(self):
        if self.p is not None:
            print("Killing curl...")
            os.kill(self.p.pid, signal.SIGKILL)
            print("done")

    def disarm(self):
        self.p = None


def do_upload_and_exit(path, url, proxy):
    output_fd, output_file = tempfile.mkstemp()
    os.close(output_fd)

    AltExit.register(lambda: os.unlink(output_file))

    args = ["curl", "-sS",
            "--output", output_file,
            "--proxy", proxy,
            "--write-out", "%{http_code}", "--upload-file", path, url]

    AltExit.lock.acquire()
    try:
        p = subprocess.Popen(args, stdout=subprocess.PIPE)
        k = CurlKiller(p)
        AltExit.register_and_unlock(k.cleanup)
    except Exception as e:
        AltExit.lock.release()
        raise e

    stdout, _ = p.communicate()
    stdout = stdout.decode(LATIN1)
    k.disarm()

    if p.returncode != 0:
        sys.exit(1)
    else:
        if stdout.strip() == '200':
            log('Upload path is: %s' % url)
            log('Done uploading')
            sys.exit(0)
        else:
            log('HTTP status code: %s' % stdout)
            sys.exit(1)


def parse_host(host):
    url = urllib.parse.urlsplit(host)
    if not url.scheme:
        url = urllib.parse.urlsplit('https://' + host)

    return url.scheme, url.netloc, url.path
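
# parse_host() defaults the scheme to https, so both of these forms give
# ('https', 'example.com', '/path') (host is illustrative):
#
#     parse_host('example.com/path')
#     parse_host('https://example.com/path')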


def generate_upload_url(parser, options, zip_filename):
    upload_url = None
    if options.upload_host:
        if not options.upload_customer:
            parser.error("Need --customer when --upload-host is given")

        scheme, netloc, path = parse_host(options.upload_host)

        customer = urllib.parse.quote(options.upload_customer)
        fname = urllib.parse.quote(os.path.basename(zip_filename))
        if options.upload_ticket:
            full_path = '%s/%s/%d/%s' % (path,
                                         customer,
                                         options.upload_ticket,
                                         fname)
        else:
            full_path = '%s/%s/%s' % (path, customer, fname)

        upload_url = urllib.parse.urlunsplit(
            (scheme, netloc, full_path, '', ''))
        log("Will upload collected .zip file into %s" % upload_url)
    return upload_url
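
# For example, --upload-host example.com --customer "Acme Inc"
# --ticket 12345 with collect.zip produces (host is illustrative):
#
#     https://example.com/Acme%20Inc/12345/collect.zip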
Gives upload host") 1686 parser.add_option("--customer", dest="upload_customer", 1687 help="specifies customer name for upload") 1688 parser.add_option("--ticket", dest="upload_ticket", type='ticket', 1689 help="specifies support ticket number for upload") 1690 parser.add_option("--bypass-sensitive-data", dest="bypass_sensitive_data", 1691 action="store_true", default=False, 1692 help="do not collect sensitive data") 1693 parser.add_option( 1694 "--task-regexp", 1695 dest="task_regexp", 1696 default="", 1697 help="Run only tasks matching regexp. For debugging purposes only.") 1698 parser.add_option( 1699 "--tmp-dir", 1700 dest="tmp_dir", 1701 default=None, 1702 help="set the temp dir used while processing collected data. Overrides the TMPDIR env variable if set") 1703 parser.add_option("--upload-proxy", dest="upload_proxy", default="", 1704 help="specifies proxy for upload") 1705 options, args = parser.parse_args() 1706 1707 if len(args) != 1: 1708 parser.error( 1709 "incorrect number of arguments. Expecting filename to collect diagnostics into") 1710 1711 if options.watch_stdin: 1712 setup_stdin_watcher() 1713 1714 zip_filename = args[0] 1715 if zip_filename[-4:] != '.zip': 1716 zip_filename = zip_filename + '.zip' 1717 1718 zip_dir = os.path.dirname(os.path.abspath(zip_filename)) 1719 1720 if not os.access(zip_dir, os.W_OK | os.X_OK): 1721 print("do not have write access to the directory %s" % (zip_dir)) 1722 sys.exit(1) 1723 1724 if options.redact_level != "none" and options.redact_level != "partial": 1725 parser.error( 1726 "Invalid redaction level. Only 'none' and 'partial' are supported.") 1727 1728 redact_zip_file = zip_filename[:-4] + "-redacted" + zip_filename[-4:] 1729 upload_url = "" 1730 if options.redact_level != "none": 1731 upload_url = generate_upload_url(parser, options, redact_zip_file) 1732 else: 1733 upload_url = generate_upload_url(parser, options, zip_filename) 1734 1735 bindir = os.path.join(options.root, 'bin') 1736 if os.name == 'posix': 1737 path = [bindir, 1738 '/opt/couchbase/bin', 1739 os.environ['PATH'], 1740 '/bin', 1741 '/sbin', 1742 '/usr/bin', 1743 '/usr/sbin'] 1744 os.environ['PATH'] = ':'.join(path) 1745 1746 library_path = [os.path.join(options.root, 'lib')] 1747 1748 current_library_path = os.environ.get('LD_LIBRARY_PATH') 1749 if current_library_path is not None: 1750 library_path.append(current_library_path) 1751 1752 os.environ['LD_LIBRARY_PATH'] = ':'.join(library_path) 1753 elif os.name == 'nt': 1754 path = [bindir, os.environ['PATH']] 1755 os.environ['PATH'] = ';'.join(path) 1756 1757 if options.just_upload_into is not None: 1758 do_upload_and_exit(args[0], options.just_upload_into, 1759 options.upload_proxy) 1760 1761 runner = TaskRunner(verbosity=options.verbosity, 1762 task_regexp=options.task_regexp, 1763 tmp_dir=options.tmp_dir, 1764 salt_value=options.salt_value) 1765 # We want this at the top of couchbase.log 1766 runner.run(make_redaction_task()) 1767 1768 if not options.product_only: 1769 runner.run_tasks(make_os_tasks()) 1770 1771 initargs_variants = [ 1772 os.path.abspath( 1773 os.path.join( 1774 options.root, 1775 "var", 1776 "lib", 1777 "couchbase", 1778 "initargs")), 1779 "/opt/couchbase/var/lib/couchbase/initargs", 1780 os.path.expanduser("~/Library/Application Support/Couchbase/var/lib/couchbase/initargs")] 1781 1782 if options.initargs is not None: 1783 initargs_variants = [options.initargs] 1784 1785 guts = None 1786 guts_initargs_path = None 1787 1788 for initargs_path in initargs_variants: 1789 d = 

    for initargs_path in initargs_variants:
        d = get_server_guts(initargs_path)
        # print("for initargs: %s got:\n%s" % (initargs_path, d))
        if len(d) > 0:
            guts = d
            guts_initargs_path = os.path.abspath(initargs_path)
            break

    if guts is None:
        log("Couldn't read server guts. Using some default values.")

        prefix = None
        if platform.system() == 'Windows':
            prefix = 'c:/Program Files/Couchbase/Server'
        elif platform.system() == 'Darwin':
            prefix = '~/Library/Application Support/Couchbase'
        else:
            prefix = '/opt/couchbase'

        guts = {"db_dir": os.path.join(prefix,
                                       "var/lib/couchbase/data"),
                "idx_dir": os.path.join(prefix,
                                        "var/lib/couchbase/data"),
                "ns_log_path": os.path.join(prefix,
                                            "var/lib/couchbase/ns_log"),
                "log_path": os.path.join(prefix,
                                         "var/lib/couchbase/logs"),
                "memcached_logs_path": os.path.join(
                    prefix, "var/lib/couchbase/logs")}

        guts_initargs_path = os.path.abspath(prefix)

    ipv6 = read_guts(guts, "ipv6") == "true"
    set_local_addr(ipv6)

    memcached_password = get_diag_password(guts)

    zip_node = read_guts(guts, "node")
    runner.literal("product diag header",
                   "Found server initargs at %s (%d)" %
                   (guts_initargs_path, len(guts)))

    runner.run_tasks(make_product_task(guts, guts_initargs_path,
                                       memcached_password, options))

    # Collect breakpad crash dumps.
    if options.bypass_sensitive_data:
        log("Bypassing Sensitive Data: Breakpad crash dumps")
    else:
        memcached_breakpad_minidump_dir = read_guts(
            guts, "memcached_breakpad_minidump_dir")
        for dump in glob.glob(os.path.join(
                memcached_breakpad_minidump_dir, "*.dmp")):
            runner.collect_file(dump)

        # Collect indexer breakpad minidumps
        index_port = read_guts(guts, "indexer_http_port")
        if index_port:
            indexer_breakpad_minidump_dir = read_guts(
                guts, "indexer_breakpad_minidump_dir")
            if memcached_breakpad_minidump_dir != \
                    indexer_breakpad_minidump_dir:
                for dump in glob.glob(os.path.join(
                        indexer_breakpad_minidump_dir, "*.dmp")):
                    runner.collect_file(dump)

        fts_port = read_guts(guts, "fts_http_port")
        if fts_port:
            idx_dir = read_guts(guts, "idx_dir")
            for dump in glob.glob(os.path.join(
                    idx_dir, "@fts", "dumps", "*.dump.txt")):
                runner.collect_file(dump)

    addr = zip_node.split("@")[-1]
    if addr == "127.0.0.1" or addr == "::1":
        zip_node = '@'.join(zip_node.split(
            "@")[:-1] + [find_primary_addr(ipv6, addr)])

    if options.verbosity:
        log("Python version: %s" % sys.version)

    runner.literal("cbcollect_info log", log_stream.getvalue(),
                   log_file="cbcollect_info.log", no_header=True)

    runner.close_all_files()

    if options.redact_level != "none":
        log("Redacting log files to level: %s" % options.redact_level)
        runner.redact_and_zip(redact_zip_file, zip_node)

    runner.zip(zip_filename, zip_node)

    if upload_url and options.redact_level != "none":
        do_upload_and_exit(redact_zip_file, upload_url,
                           options.upload_proxy)
    elif upload_url:
        do_upload_and_exit(zip_filename, upload_url, options.upload_proxy)
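

# A note on find_primary_addr() below: connect() on a UDP socket sends no
# packets, but it does make the kernel pick a route and a source address,
# which getsockname() then reports. The Google public DNS address serves
# purely as a well-known routable destination; nothing is sent to it.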
def find_primary_addr(ipv6, default=None):
    Family = socket.AF_INET6 if ipv6 else socket.AF_INET
    DnsAddr = "2001:4860:4860::8844" if ipv6 else "8.8.8.8"
    s = socket.socket(Family, socket.SOCK_DGRAM)
    try:
        s.connect((DnsAddr, 56))
        if ipv6:
            addr, port, _, _ = s.getsockname()
        else:
            addr, port = s.getsockname()

        return addr
    except socket.error:
        return default
    finally:
        s.close()


def exec_name(name):
    if sys.platform == 'win32':
        name += ".exe"
    return name


if __name__ == '__main__':
    main()