diff --git a/qa/suites/rados/singleton/all/mon-config-key-caps.yaml b/qa/suites/rados/singleton/all/mon-config-key-caps.yaml
new file mode 100644
index 0000000000000..0b0b95c52e080
--- /dev/null
+++ b/qa/suites/rados/singleton/all/mon-config-key-caps.yaml
@@ -0,0 +1,17 @@
+roles:
+- - mon.a
+  - mgr.x
+  - osd.0
+  - osd.1
+  - osd.2
+  - client.0
+tasks:
+- install:
+- ceph:
+    log-whitelist:
+      - overall HEALTH_
+      - \(AUTH_BAD_CAPS\)
+- workunit:
+    clients:
+      all:
+        - mon/test_config_key_caps.sh
diff --git a/qa/workunits/mon/test_config_key_caps.sh b/qa/workunits/mon/test_config_key_caps.sh
new file mode 100755
index 0000000000000..77b4b53b701d1
--- /dev/null
+++ b/qa/workunits/mon/test_config_key_caps.sh
@@ -0,0 +1,201 @@
+#!/usr/bin/env bash
+# Exercises mon caps on the config-key service with a set of scratch entities.
+set -x
+set -e
+
+tmp=$(mktemp -d -p /tmp test_mon_config_key_caps.XXXXX)  # scratch dir for exported keyrings
+entities=()  # NOTE(review): never read anywhere in this script
+# cleanup: remove every auth entity that differs between the two exports, then the scratch dir.
+function cleanup()
+{
+  set +e  # best effort: keep going if an individual removal fails
+  set +x
+  if [[ -e $tmp/keyring ]] && [[ -e $tmp/keyring.orig ]]; then
+    grep '\[.*\..*\]' $tmp/keyring.orig > $tmp/entities.orig
+    for e in $(grep '\[.*\..*\]' $tmp/keyring | \
+      diff $tmp/entities.orig - | \
+      sed -n 's/^.*\[\(.*\..*\)\]/\1/p');
+    do
+      ceph auth rm "$e" >/dev/null 2>&1  # discard both stdout and stderr
+    done
+  fi
+  [[ -d $tmp ]] && rm -fr "$tmp"  # the exported keyrings hold secrets; do not leave them in /tmp
+}
+
+trap cleanup EXIT # cleanup on exit
+# expect_false CMD...: succeed (return 0) only when CMD exits non-zero.
+function expect_false()
+{
+  set -x
+  if "$@"; then return 1; else return 0; fi
+}
+
+# for cleanup purposes
+ceph auth export -o $tmp/keyring.orig
+
+k=$tmp/keyring
+
+# setup a few keys
+ceph config-key ls
+ceph config-key set daemon-private/osd.123/test-foo
+ceph config-key set mgr/test-foo
+ceph config-key set device/test-foo
+ceph config-key set test/foo
+# entity names used below; the caps for each are assigned further down
+allow_aa=client.allow_aa
+allow_bb=client.allow_bb
+allow_cc=client.allow_cc
+
+mgr_a=mgr.a
+mgr_b=mgr.b
+osd_a=osd.100
+osd_b=osd.200
+
+prefix_aa=client.prefix_aa
+prefix_bb=client.prefix_bb
+prefix_cc=client.prefix_cc
+match_aa=client.match_aa
+match_bb=client.match_bb
+
+fail_aa=client.fail_aa
+fail_bb=client.fail_bb
+fail_cc=client.fail_cc
+fail_dd=client.fail_dd
+fail_ee=client.fail_ee
+fail_ff=client.fail_ff
+fail_gg=client.fail_gg
+fail_writes=client.fail_writes
+# -- entities the checks below expect to be able to read every key
+ceph auth get-or-create "${allow_aa}" mon 'allow *'
+ceph auth get-or-create "${allow_bb}" mon 'allow service config-key rwx'
+ceph auth get-or-create "${allow_cc}" mon 'allow command "config-key get"'
+# -- daemon entities using the stock mgr / osd profiles
+ceph auth get-or-create "${mgr_a}" mon 'allow profile mgr'
+ceph auth get-or-create "${mgr_b}" mon 'allow profile mgr'
+ceph auth get-or-create "${osd_a}" mon 'allow profile osd'
+ceph auth get-or-create "${osd_b}" mon 'allow profile osd'
+# -- prefix_aa: may only `get` keys under its own client/<name> prefix
+ceph auth get-or-create "${prefix_aa}" mon \
+  "allow command \"config-key get\" with key prefix client/${prefix_aa}"
+# -- prefix_bb: may `set` anywhere under client/, `get` only under its own prefix
+cap="allow command \"config-key set\" with key prefix client/"
+cap="${cap},allow command \"config-key get\" with key prefix client/${prefix_bb}"
+ceph auth get-or-create "${prefix_bb}" mon "${cap}"
+# -- prefix_cc: may `get` and `set` anywhere under client/, and run `config-key ls`
+cap="allow command \"config-key get\" with key prefix client/"
+cap="${cap}, allow command \"config-key set\" with key prefix client/"
+cap="${cap}, allow command \"config-key ls\""
+ceph auth get-or-create "${prefix_cc}" mon "${cap}"
+# -- match_aa / match_bb: caps pinned to one exact key each
+cap="allow command \"config-key get\" with key=client/${match_aa}/foo"
+ceph auth get-or-create "${match_aa}" mon "${cap}"
+cap="allow command \"config-key get\" with key=client/${match_bb}/foo"
+cap="${cap},allow command \"config-key set\" with key=client/${match_bb}/foo"
+ceph auth get-or-create "${match_bb}" mon "${cap}"
+# -- fail_*: caps that the checks below expect NOT to give config-key access
+ceph auth get-or-create "${fail_aa}" mon 'allow rx'
+ceph auth get-or-create "${fail_bb}" mon 'allow r,allow w'
+ceph auth get-or-create "${fail_cc}" mon 'allow rw'
+ceph auth get-or-create "${fail_dd}" mon 'allow rwx'
+ceph auth get-or-create "${fail_ee}" mon 'allow profile bootstrap-rgw'
+ceph auth get-or-create "${fail_ff}" mon 'allow profile bootstrap-rbd'
+# writes need both r and w on the service, so 'wx' alone must be rejected
+ceph auth get-or-create "${fail_gg}" mon 'allow service config-key wx'
+# reads need only 'r', so 'rx' is sufficient for them (but not for writes).
+ceph auth get-or-create $fail_writes mon 'allow service config-key rx'
+
+# grab keyring (now containing every entity created above)
+ceph auth export -o $k
+
+# entities with all the caps can read every key
+for c in $allow_aa $allow_bb $allow_cc $mgr_a $mgr_b; do
+  ceph -k $k --name $c config-key get daemon-private/osd.123/test-foo
+  ceph -k $k --name $c config-key get mgr/test-foo
+  ceph -k $k --name $c config-key get device/test-foo
+  ceph -k $k --name $c config-key get test/foo
+done
+# osd profile: each OSD can write and read its own daemon-private key, nothing else
+for c in $osd_a $osd_b; do
+  ceph -k $k --name $c config-key put daemon-private/$c/test-foo
+  ceph -k $k --name $c config-key get daemon-private/$c/test-foo
+  expect_false ceph -k $k --name $c config-key ls
+  expect_false ceph -k $k --name $c config-key get mgr/test-foo
+  expect_false ceph -k $k --name $c config-key get device/test-foo
+  expect_false ceph -k $k --name $c config-key get test/foo
+done
+# an OSD must not read another OSD's key; name the config-key subcommand so the caps are what fail
+expect_false ceph -k $k --name $osd_a config-key get daemon-private/$osd_b/test-foo
+expect_false ceph -k $k --name $osd_b config-key get daemon-private/$osd_a/test-foo
+# prefix_aa only holds `config-key get` on its own prefix: ls, foreign gets and all sets fail
+expect_false ceph -k $k --name $prefix_aa \
+  config-key ls
+expect_false ceph -k $k --name $prefix_aa \
+  config-key get daemon-private/osd.123/test-foo
+expect_false ceph -k $k --name $prefix_aa \
+  config-key set test/bar
+expect_false ceph -k $k --name $prefix_aa \
+  config-key set client/$prefix_aa/foo
+
+# write something so we can read, use a custom entity
+ceph -k $k --name $allow_bb config-key set client/$prefix_aa/foo
+ceph -k $k --name $prefix_aa config-key get client/$prefix_aa/foo
+# check one writes to the other's prefix, the other is able to read
+ceph -k $k --name $prefix_bb config-key set client/$prefix_aa/bar
+ceph -k $k --name $prefix_aa config-key get client/$prefix_aa/bar
+# prefix_bb can set and get under its own prefix
+ceph -k $k --name $prefix_bb config-key set client/$prefix_bb/foo
+ceph -k $k --name $prefix_bb config-key get client/$prefix_bb/foo
+# but cannot read back what it wrote under prefix_aa, nor list keys
+expect_false ceph -k $k --name $prefix_bb config-key get client/$prefix_aa/bar
+expect_false ceph -k $k --name $prefix_bb config-key ls
+expect_false ceph
-k $k --name $prefix_bb \
+  config-key get daemon-private/osd.123/test-foo
+expect_false ceph -k $k --name $prefix_bb config-key get mgr/test-foo
+expect_false ceph -k $k --name $prefix_bb config-key get device/test-foo
+expect_false ceph -k $k --name $prefix_bb config-key get test/bar
+expect_false ceph -k $k --name $prefix_bb config-key set test/bar
+# prefix_cc: get/set anywhere under client/, plus ls; everything outside client/ is denied
+ceph -k $k --name $prefix_cc config-key set client/$match_aa/foo
+ceph -k $k --name $prefix_cc config-key set client/$match_bb/foo
+ceph -k $k --name $prefix_cc config-key get client/$match_aa/foo
+ceph -k $k --name $prefix_cc config-key get client/$match_bb/foo
+expect_false ceph -k $k --name $prefix_cc config-key set other/prefix
+expect_false ceph -k $k --name $prefix_cc config-key get mgr/test-foo
+ceph -k $k --name $prefix_cc config-key ls >& /dev/null
+# exact-match caps: match_aa may only get its key; match_bb may get and set its key
+ceph -k $k --name $match_aa config-key get client/$match_aa/foo
+expect_false ceph -k $k --name $match_aa config-key get client/$match_bb/foo
+expect_false ceph -k $k --name $match_aa config-key set client/$match_aa/foo
+ceph -k $k --name $match_bb config-key get client/$match_bb/foo
+ceph -k $k --name $match_bb config-key set client/$match_bb/foo
+expect_false ceph -k $k --name $match_bb config-key get client/$match_aa/foo
+expect_false ceph -k $k --name $match_bb config-key set client/$match_aa/foo
+# every key written so far; the deny loop below walks all of them
+keys=(daemon-private/osd.123/test-foo
+  mgr/test-foo
+  device/test-foo
+  test/foo
+  client/$prefix_aa/foo
+  client/$prefix_bb/foo
+  client/$match_aa/foo
+  client/$match_bb/foo
+)
+# expect these all to fail accessing config-key
+for c in $fail_aa $fail_bb $fail_cc \
+  $fail_dd $fail_ee $fail_ff \
+  $fail_gg; do
+  for m in get set; do
+    for key in "${keys[@]}"; do  # keys already lists the client/$prefix_* entries
+      expect_false ceph -k $k --name $c config-key $m $key
+    done
+  done
+done
+
+# fail writes but succeed on reads
+expect_false ceph -k $k --name $fail_writes config-key set client/$match_aa/foo
+expect_false ceph -k $k --name $fail_writes config-key set
test/foo +ceph -k $k --name $fail_writes config-key ls +ceph -k $k --name $fail_writes config-key get client/$match_aa/foo +ceph -k $k --name $fail_writes config-key get daemon-private/osd.123/test-foo + +echo "OK" diff --git a/src/mgr/DaemonServer.cc.orig b/src/mgr/DaemonServer.cc.orig new file mode 100644 index 0000000000000..becd428aca6da --- /dev/null +++ b/src/mgr/DaemonServer.cc.orig @@ -0,0 +1,2893 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2016 John Spray + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + */ + +#include "DaemonServer.h" +#include "mgr/Mgr.h" + +#include "include/stringify.h" +#include "include/str_list.h" +#include "auth/RotatingKeyRing.h" +#include "json_spirit/json_spirit_writer.h" + +#include "mgr/mgr_commands.h" +#include "mgr/DaemonHealthMetricCollector.h" +#include "mgr/OSDPerfMetricCollector.h" +#include "mon/MonCommand.h" + +#include "messages/MMgrOpen.h" +#include "messages/MMgrClose.h" +#include "messages/MMgrConfigure.h" +#include "messages/MMonMgrReport.h" +#include "messages/MCommand.h" +#include "messages/MCommandReply.h" +#include "messages/MMgrCommand.h" +#include "messages/MMgrCommandReply.h" +#include "messages/MPGStats.h" +#include "messages/MOSDScrub.h" +#include "messages/MOSDScrub2.h" +#include "messages/MOSDForceRecovery.h" +#include "common/errno.h" +#include "common/pick_address.h" + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_mgr +#undef dout_prefix +#define dout_prefix *_dout << "mgr.server " << __func__ << " " +using namespace TOPNSPC::common; +namespace { + template + bool map_compare(Map const &lhs, Map const &rhs) { + return lhs.size() == rhs.size() + && std::equal(lhs.begin(), lhs.end(), 
rhs.begin(), + [] (auto a, auto b) { return a.first == b.first && a.second == b.second; }); + } +} + +DaemonServer::DaemonServer(MonClient *monc_, + Finisher &finisher_, + DaemonStateIndex &daemon_state_, + ClusterState &cluster_state_, + PyModuleRegistry &py_modules_, + LogChannelRef clog_, + LogChannelRef audit_clog_) + : Dispatcher(g_ceph_context), + client_byte_throttler(new Throttle(g_ceph_context, "mgr_client_bytes", + g_conf().get_val("mgr_client_bytes"))), + client_msg_throttler(new Throttle(g_ceph_context, "mgr_client_messages", + g_conf().get_val("mgr_client_messages"))), + osd_byte_throttler(new Throttle(g_ceph_context, "mgr_osd_bytes", + g_conf().get_val("mgr_osd_bytes"))), + osd_msg_throttler(new Throttle(g_ceph_context, "mgr_osd_messsages", + g_conf().get_val("mgr_osd_messages"))), + mds_byte_throttler(new Throttle(g_ceph_context, "mgr_mds_bytes", + g_conf().get_val("mgr_mds_bytes"))), + mds_msg_throttler(new Throttle(g_ceph_context, "mgr_mds_messsages", + g_conf().get_val("mgr_mds_messages"))), + mon_byte_throttler(new Throttle(g_ceph_context, "mgr_mon_bytes", + g_conf().get_val("mgr_mon_bytes"))), + mon_msg_throttler(new Throttle(g_ceph_context, "mgr_mon_messsages", + g_conf().get_val("mgr_mon_messages"))), + msgr(nullptr), + monc(monc_), + finisher(finisher_), + daemon_state(daemon_state_), + cluster_state(cluster_state_), + py_modules(py_modules_), + clog(clog_), + audit_clog(audit_clog_), + pgmap_ready(false), + timer(g_ceph_context, lock), + shutting_down(false), + tick_event(nullptr), + osd_perf_metric_collector_listener(this), + osd_perf_metric_collector(osd_perf_metric_collector_listener) +{ + g_conf().add_observer(this); +} + +DaemonServer::~DaemonServer() { + delete msgr; + g_conf().remove_observer(this); +} + +int DaemonServer::init(uint64_t gid, entity_addrvec_t client_addrs) +{ + // Initialize Messenger + std::string public_msgr_type = g_conf()->ms_public_type.empty() ? 
+ g_conf().get_val("ms_type") : g_conf()->ms_public_type; + msgr = Messenger::create(g_ceph_context, public_msgr_type, + entity_name_t::MGR(gid), + "mgr", + Messenger::get_pid_nonce(), + 0); + msgr->set_default_policy(Messenger::Policy::stateless_server(0)); + + msgr->set_auth_client(monc); + + // throttle clients + msgr->set_policy_throttlers(entity_name_t::TYPE_CLIENT, + client_byte_throttler.get(), + client_msg_throttler.get()); + + // servers + msgr->set_policy_throttlers(entity_name_t::TYPE_OSD, + osd_byte_throttler.get(), + osd_msg_throttler.get()); + msgr->set_policy_throttlers(entity_name_t::TYPE_MDS, + mds_byte_throttler.get(), + mds_msg_throttler.get()); + msgr->set_policy_throttlers(entity_name_t::TYPE_MON, + mon_byte_throttler.get(), + mon_msg_throttler.get()); + + entity_addrvec_t addrs; + int r = pick_addresses(cct, CEPH_PICK_ADDRESS_PUBLIC, &addrs); + if (r < 0) { + return r; + } + dout(20) << __func__ << " will bind to " << addrs << dendl; + r = msgr->bindv(addrs); + if (r < 0) { + derr << "unable to bind mgr to " << addrs << dendl; + return r; + } + + msgr->set_myname(entity_name_t::MGR(gid)); + msgr->set_addr_unknowns(client_addrs); + + msgr->start(); + msgr->add_dispatcher_tail(this); + + msgr->set_auth_server(monc); + monc->set_handle_authentication_dispatcher(this); + + started_at = ceph_clock_now(); + + std::lock_guard l(lock); + timer.init(); + + schedule_tick_locked( + g_conf().get_val("mgr_tick_period").count()); + + return 0; +} + +entity_addrvec_t DaemonServer::get_myaddrs() const +{ + return msgr->get_myaddrs(); +} + +int DaemonServer::ms_handle_authentication(Connection *con) +{ + auto s = ceph::make_ref(cct); + con->set_priv(s); + s->inst.addr = con->get_peer_addr(); + s->entity_name = con->peer_name; + dout(10) << __func__ << " new session " << s << " con " << con + << " entity " << con->peer_name + << " addr " << con->get_peer_addrs() + << dendl; + + AuthCapsInfo &caps_info = con->get_peer_caps_info(); + if (caps_info.allow_all) { + 
dout(10) << " session " << s << " " << s->entity_name + << " allow_all" << dendl; + s->caps.set_allow_all(); + } else if (caps_info.caps.length() > 0) { + auto p = caps_info.caps.cbegin(); + string str; + try { + decode(str, p); + } + catch (buffer::error& e) { + dout(10) << " session " << s << " " << s->entity_name + << " failed to decode caps" << dendl; + return -EACCES; + } + if (!s->caps.parse(str)) { + dout(10) << " session " << s << " " << s->entity_name + << " failed to parse caps '" << str << "'" << dendl; + return -EACCES; + } + dout(10) << " session " << s << " " << s->entity_name + << " has caps " << s->caps << " '" << str << "'" << dendl; + } + + if (con->get_peer_type() == CEPH_ENTITY_TYPE_OSD) { + std::lock_guard l(lock); + s->osd_id = atoi(s->entity_name.get_id().c_str()); + dout(10) << "registering osd." << s->osd_id << " session " + << s << " con " << con << dendl; + osd_cons[s->osd_id].insert(con); + } + + return 1; +} + +bool DaemonServer::ms_handle_reset(Connection *con) +{ + if (con->get_peer_type() == CEPH_ENTITY_TYPE_OSD) { + auto priv = con->get_priv(); + auto session = static_cast(priv.get()); + if (!session) { + return false; + } + std::lock_guard l(lock); + dout(10) << "unregistering osd." << session->osd_id + << " session " << session << " con " << con << dendl; + osd_cons[session->osd_id].erase(con); + + auto iter = daemon_connections.find(con); + if (iter != daemon_connections.end()) { + daemon_connections.erase(iter); + } + } + return false; +} + +bool DaemonServer::ms_handle_refused(Connection *con) +{ + // do nothing for now + return false; +} + +bool DaemonServer::ms_dispatch2(const ref_t& m) +{ + // Note that we do *not* take ::lock here, in order to avoid + // serializing all message handling. It's up to each handler + // to take whatever locks it needs. 
+  switch (m->get_type()) {  // route by message type; false means "not handled here"
+    case MSG_PGSTATS:
+      cluster_state.ingest_pgstats(ref_cast(m));  // NOTE(review): ref_cast has no template argument here -- presumably lost in extraction; confirm
+      maybe_ready(m->get_source().num());
+      return true;
+    case MSG_MGR_REPORT:
+      return handle_report(ref_cast(m));
+    case MSG_MGR_OPEN:
+      return handle_open(ref_cast(m));
+    case MSG_MGR_CLOSE:
+      return handle_close(ref_cast(m));
+    case MSG_COMMAND:
+      return handle_command(ref_cast(m));
+    case MSG_MGR_COMMAND:
+      return handle_command(ref_cast(m));
+    default:
+      dout(1) << "Unhandled message type " << m->get_type() << dendl;
+      return false;
+  };
+}
+// Dump the current value of the pgmap_ready flag as boolean field "pg_ready".
+void DaemonServer::dump_pg_ready(ceph::Formatter *f)
+{
+  f->dump_bool("pg_ready", pgmap_ready.load());
+}
+// Record a first report from osd_id; once no up OSD is left unreported, set pgmap_ready and send a report.
+void DaemonServer::maybe_ready(int32_t osd_id)
+{
+  if (pgmap_ready.load()) {
+    // Fast path: we don't need to take lock because pgmap_ready
+    // is already set
+  } else {
+    std::lock_guard l(lock);
+
+    if (reported_osds.find(osd_id) == reported_osds.end()) {
+      dout(4) << "initial report from osd " << osd_id << dendl;
+      reported_osds.insert(osd_id);
+      std::set up_osds;  // NOTE(review): element type missing -- presumably lost in extraction; confirm
+
+      cluster_state.with_osdmap([&](const OSDMap& osdmap) {
+          osdmap.get_up_osds(up_osds);
+      });
+      // unreported_osds = up_osds minus reported_osds
+      std::set unreported_osds;
+      std::set_difference(up_osds.begin(), up_osds.end(),
+                          reported_osds.begin(), reported_osds.end(),
+                          std::inserter(unreported_osds, unreported_osds.begin()));
+
+      if (unreported_osds.size() == 0) {
+        dout(4) << "all osds have reported, sending PG state to mon" << dendl;
+        pgmap_ready = true;
+        reported_osds.clear();
+        // Avoid waiting for next tick
+        send_report();
+      } else {
+        dout(4) << "still waiting for " << unreported_osds.size() << " osds"
+          " to report in before PGMap is ready" << dendl;
+      }
+    }
+  }
+}
+// Periodic work: send a report, adjust PGs, then re-arm the timer after mgr_tick_period.
+void DaemonServer::tick()
+{
+  dout(10) << dendl;
+  send_report();
+  adjust_pgs();
+
+  schedule_tick_locked(
+    g_conf().get_val("mgr_tick_period").count());
+}
+
+// Currently modules do not set health checks in response to events delivered to
+// all modules (e.g. notify) so we do not risk a thundering herd situation here.
+// if this pattern emerges in the future, this scheduler could be modified to +// fire after all modules have had a chance to set their health checks. +void DaemonServer::schedule_tick_locked(double delay_sec) +{ + ceph_assert(ceph_mutex_is_locked_by_me(lock)); + + if (tick_event) { + timer.cancel_event(tick_event); + tick_event = nullptr; + } + + // on shutdown start rejecting explicit requests to send reports that may + // originate from python land which may still be running. + if (shutting_down) + return; + + tick_event = timer.add_event_after(delay_sec, + new LambdaContext([this](int r) { + tick(); + })); +} + +void DaemonServer::schedule_tick(double delay_sec) +{ + std::lock_guard l(lock); + schedule_tick_locked(delay_sec); +} + +void DaemonServer::handle_osd_perf_metric_query_updated() +{ + dout(10) << dendl; + + // Send a fresh MMgrConfigure to all clients, so that they can follow + // the new policy for transmitting stats + finisher.queue(new LambdaContext([this](int r) { + std::lock_guard l(lock); + for (auto &c : daemon_connections) { + if (c->peer_is_osd()) { + _send_configure(c); + } + } + })); +} + +void DaemonServer::shutdown() +{ + dout(10) << "begin" << dendl; + msgr->shutdown(); + msgr->wait(); + cluster_state.shutdown(); + dout(10) << "done" << dendl; + + std::lock_guard l(lock); + shutting_down = true; + timer.shutdown(); +} + +static DaemonKey key_from_service( + const std::string& service_name, + int peer_type, + const std::string& daemon_name) +{ + if (!service_name.empty()) { + return DaemonKey{service_name, daemon_name}; + } else { + return DaemonKey{ceph_entity_type_name(peer_type), daemon_name}; + } +} + +bool DaemonServer::handle_open(const ref_t& m) +{ + std::lock_guard l(lock); + + DaemonKey key = key_from_service(m->service_name, + m->get_connection()->get_peer_type(), + m->daemon_name); + + auto con = m->get_connection(); + dout(10) << "from " << key << " " << con->get_peer_addr() << dendl; + + _send_configure(con); + + 
DaemonStatePtr daemon; + if (daemon_state.exists(key)) { + dout(20) << "updating existing DaemonState for " << key << dendl; + daemon = daemon_state.get(key); + } + if (!daemon) { + if (m->service_daemon) { + dout(4) << "constructing new DaemonState for " << key << dendl; + daemon = std::make_shared(daemon_state.types); + daemon->key = key; + daemon->service_daemon = true; + daemon_state.insert(daemon); + } else { + /* A normal Ceph daemon has connected but we are or should be waiting on + * metadata for it. Close the session so that it tries to reconnect. + */ + dout(2) << "ignoring open from " << key << " " << con->get_peer_addr() + << "; not ready for session (expect reconnect)" << dendl; + con->mark_down(); + return true; + } + } + if (daemon) { + if (m->service_daemon) { + // update the metadata through the daemon state index to + // ensure it's kept up-to-date + daemon_state.update_metadata(daemon, m->daemon_metadata); + } + + std::lock_guard l(daemon->lock); + daemon->perf_counters.clear(); + + daemon->service_daemon = m->service_daemon; + if (m->service_daemon) { + daemon->service_status = m->daemon_status; + + utime_t now = ceph_clock_now(); + auto [d, added] = pending_service_map.get_daemon(m->service_name, + m->daemon_name); + if (added || d->gid != (uint64_t)m->get_source().num()) { + dout(10) << "registering " << key << " in pending_service_map" << dendl; + d->gid = m->get_source().num(); + d->addr = m->get_source_addr(); + d->start_epoch = pending_service_map.epoch; + d->start_stamp = now; + d->metadata = m->daemon_metadata; + pending_service_map_dirty = pending_service_map.epoch; + } + } + + auto p = m->config_bl.cbegin(); + if (p != m->config_bl.end()) { + decode(daemon->config, p); + decode(daemon->ignored_mon_config, p); + dout(20) << " got config " << daemon->config + << " ignored " << daemon->ignored_mon_config << dendl; + } + daemon->config_defaults_bl = m->config_defaults_bl; + daemon->config_defaults.clear(); + dout(20) << " got 
config_defaults_bl " << daemon->config_defaults_bl.length() + << " bytes" << dendl; + } + + if (con->get_peer_type() != entity_name_t::TYPE_CLIENT && + m->service_name.empty()) + { + // Store in set of the daemon/service connections, i.e. those + // connections that require an update in the event of stats + // configuration changes. + daemon_connections.insert(con); + } + + return true; +} + +bool DaemonServer::handle_close(const ref_t& m) +{ + std::lock_guard l(lock); + + DaemonKey key = key_from_service(m->service_name, + m->get_connection()->get_peer_type(), + m->daemon_name); + dout(4) << "from " << m->get_connection() << " " << key << dendl; + + if (daemon_state.exists(key)) { + DaemonStatePtr daemon = daemon_state.get(key); + daemon_state.rm(key); + { + std::lock_guard l(daemon->lock); + if (daemon->service_daemon) { + pending_service_map.rm_daemon(m->service_name, m->daemon_name); + pending_service_map_dirty = pending_service_map.epoch; + } + } + } + + // send same message back as a reply + m->get_connection()->send_message2(m); + return true; +} + +void DaemonServer::update_task_status(DaemonKey key, const ref_t& m) { + dout(10) << "got task status from " << key << dendl; + + auto p = pending_service_map.get_daemon(key.type, key.name); + if (!map_compare(p.first->task_status, *m->task_status)) { + p.first->task_status = *m->task_status; + pending_service_map_dirty = pending_service_map.epoch; + } +} + +bool DaemonServer::handle_report(const ref_t& m) +{ + DaemonKey key; + if (!m->service_name.empty()) { + key.type = m->service_name; + } else { + key.type = ceph_entity_type_name(m->get_connection()->get_peer_type()); + } + key.name = m->daemon_name; + + dout(10) << "from " << m->get_connection() << " " << key << dendl; + + if (m->get_connection()->get_peer_type() == entity_name_t::TYPE_CLIENT && + m->service_name.empty()) { + // Clients should not be sending us stats unless they are declaring + // themselves to be a daemon for some service. 
+ dout(10) << "rejecting report from non-daemon client " << m->daemon_name + << dendl; + clog->warn() << "rejecting report from non-daemon client " << m->daemon_name + << " at " << m->get_connection()->get_peer_addrs(); + m->get_connection()->mark_down(); + return true; + } + + + { + std::unique_lock locker(lock); + + DaemonStatePtr daemon; + // Look up the DaemonState + if (daemon_state.exists(key)) { + dout(20) << "updating existing DaemonState for " << key << dendl; + daemon = daemon_state.get(key); + } else { + locker.unlock(); + + // we don't know the hostname at this stage, reject MMgrReport here. + dout(5) << "rejecting report from " << key << ", since we do not have its metadata now." + << dendl; + // issue metadata request in background + if (!daemon_state.is_updating(key) && + (key.type == "osd" || key.type == "mds" || key.type == "mon")) { + + std::ostringstream oss; + auto c = new MetadataUpdate(daemon_state, key); + if (key.type == "osd") { + oss << "{\"prefix\": \"osd metadata\", \"id\": " + << key.name<< "}"; + + } else if (key.type == "mds") { + c->set_default("addr", stringify(m->get_source_addr())); + oss << "{\"prefix\": \"mds metadata\", \"who\": \"" + << key.name << "\"}"; + + } else if (key.type == "mon") { + oss << "{\"prefix\": \"mon metadata\", \"id\": \"" + << key.name << "\"}"; + } else { + ceph_abort(); + } + + monc->start_mon_command({oss.str()}, {}, &c->outbl, &c->outs, c); + } + + locker.lock(); + + // kill session + auto priv = m->get_connection()->get_priv(); + auto session = static_cast(priv.get()); + if (!session) { + return false; + } + m->get_connection()->mark_down(); + + dout(10) << "unregistering osd." 
<< session->osd_id + << " session " << session << " con " << m->get_connection() << dendl; + + if (osd_cons.find(session->osd_id) != osd_cons.end()) { + osd_cons[session->osd_id].erase(m->get_connection()); + } + + auto iter = daemon_connections.find(m->get_connection()); + if (iter != daemon_connections.end()) { + daemon_connections.erase(iter); + } + + return false; + } + + // Update the DaemonState + ceph_assert(daemon != nullptr); + { + std::lock_guard l(daemon->lock); + auto &daemon_counters = daemon->perf_counters; + daemon_counters.update(*m.get()); + + auto p = m->config_bl.cbegin(); + if (p != m->config_bl.end()) { + decode(daemon->config, p); + decode(daemon->ignored_mon_config, p); + dout(20) << " got config " << daemon->config + << " ignored " << daemon->ignored_mon_config << dendl; + } + + utime_t now = ceph_clock_now(); + if (daemon->service_daemon) { + if (m->daemon_status) { + daemon->service_status_stamp = now; + daemon->service_status = *m->daemon_status; + } + daemon->last_service_beacon = now; + } else if (m->daemon_status) { + derr << "got status from non-daemon " << key << dendl; + } + // update task status + if (m->task_status) { + update_task_status(key, m); + daemon->last_service_beacon = now; + } + if (m->get_connection()->peer_is_osd() || m->get_connection()->peer_is_mon()) { + // only OSD and MON send health_checks to me now + daemon->daemon_health_metrics = std::move(m->daemon_health_metrics); + dout(10) << "daemon_health_metrics " << daemon->daemon_health_metrics + << dendl; + } + } + } + + // if there are any schema updates, notify the python modules + if (!m->declare_types.empty() || !m->undeclare_types.empty()) { + py_modules.notify_all("perf_schema_update", ceph::to_string(key)); + } + + if (m->get_connection()->peer_is_osd()) { + osd_perf_metric_collector.process_reports(m->osd_perf_metric_reports); + } + + if (m->metric_report_message) { + const MetricReportMessage &message = *m->metric_report_message; + 
boost::apply_visitor(HandlePayloadVisitor(this), message.payload);
+  }
+
+  return true;
+}
+// Flatten cmdmap into param_str_map as strings: "prefix" is skipped, and an even-length
+// "caps" vector becomes caps_<name> -> <value> pairs; every other entry is stringified.
+void DaemonServer::_generate_command_map(
+  cmdmap_t& cmdmap,
+  map ¶m_str_map)  // NOTE(review): parameter text looks garbled by extraction; confirm against the real file
+{
+  for (auto p = cmdmap.begin();
+       p != cmdmap.end(); ++p) {
+    if (p->first == "prefix")
+      continue;
+    if (p->first == "caps") {
+      vector cv;
+      if (cmd_getval(cmdmap, "caps", cv) &&
+          cv.size() % 2 == 0) {
+        for (unsigned i = 0; i < cv.size(); i += 2) {
+          string k = string("caps_") + cv[i];
+          param_str_map[k] = cv[i + 1];
+        }
+        continue;
+      }  // otherwise fall through and stringify "caps" like any other entry
+    }
+    param_str_map[p->first] = cmd_vartype_stringify(p->second);
+  }
+}
+// Return the first entry of cmds whose cmdstring begins with cmd_prefix, or nullptr if none does.
+const MonCommand *DaemonServer::_get_mgrcommand(
+  const string &cmd_prefix,
+  const std::vector &cmds)
+{
+  const MonCommand *this_cmd = nullptr;
+  for (const auto &cmd : cmds) {
+    if (cmd.cmdstring.compare(0, cmd_prefix.size(), cmd_prefix) == 0) {
+      this_cmd = &cmd;
+      break;
+    }
+  }
+  return this_cmd;
+}
+// Check whether session s may run this_cmd; this_cmd is dereferenced without a null check.
+bool DaemonServer::_allowed_command(
+  MgrSession *s,
+  const string &service,
+  const string &module,
+  const string &prefix,
+  const cmdmap_t& cmdmap,
+  const map& param_str_map,
+  const MonCommand *this_cmd) {
+
+  if (s->entity_name.is_mon()) {
+    // mon is all-powerful. even when it is forwarding commands on behalf of
+    // old clients; we expect the mon is validating commands before proxying!
+    return true;
+  }
+  // which of r/w/x the command requires
+  bool cmd_r = this_cmd->requires_perm('r');
+  bool cmd_w = this_cmd->requires_perm('w');
+  bool cmd_x = this_cmd->requires_perm('x');
+
+  bool capable = s->caps.is_capable(
+    g_ceph_context,
+    s->entity_name,
+    service, module, prefix, param_str_map,
+    cmd_r, cmd_w, cmd_x,
+    s->get_peer_addr());
+
+  dout(10) << " " << s->entity_name << " "
+           << (capable ? "" : "not ") << "capable" << dendl;
+  return capable;
+}
+
+/**
+ * The working data for processing an MCommand. This lives in
+ * a class to enable passing it into other threads for processing
+ * outside of the thread/locks that called handle_command.
+ */ +class CommandContext { +public: + ceph::ref_t m_tell; + ceph::ref_t m_mgr; + const std::vector& cmd; ///< ref into m_tell or m_mgr + const bufferlist& data; ///< ref into m_tell or m_mgr + bufferlist odata; + cmdmap_t cmdmap; + + explicit CommandContext(ceph::ref_t m) + : m_tell{std::move(m)}, + cmd(m_tell->cmd), + data(m_tell->get_data()) { + } + explicit CommandContext(ceph::ref_t m) + : m_mgr{std::move(m)}, + cmd(m_mgr->cmd), + data(m_mgr->get_data()) { + } + + void reply(int r, const std::stringstream &ss) { + reply(r, ss.str()); + } + + void reply(int r, const std::string &rs) { + // Let the connection drop as soon as we've sent our response + ConnectionRef con = m_tell ? m_tell->get_connection() + : m_mgr->get_connection(); + if (con) { + con->mark_disposable(); + } + + if (r == 0) { + dout(20) << "success" << dendl; + } else { + derr << __func__ << " " << cpp_strerror(r) << " " << rs << dendl; + } + if (con) { + if (m_tell) { + MCommandReply *reply = new MCommandReply(r, rs); + reply->set_tid(m_tell->get_tid()); + reply->set_data(odata); + con->send_message(reply); + } else { + MMgrCommandReply *reply = new MMgrCommandReply(r, rs); + reply->set_tid(m_mgr->get_tid()); + reply->set_data(odata); + con->send_message(reply); + } + } + } +}; + +/** + * A context for receiving a bufferlist/error string from a background + * function and then calling back to a CommandContext when it's done + */ +class ReplyOnFinish : public Context { + std::shared_ptr cmdctx; + +public: + bufferlist from_mon; + string outs; + + explicit ReplyOnFinish(const std::shared_ptr &cmdctx_) + : cmdctx(cmdctx_) + {} + void finish(int r) override { + cmdctx->odata.claim_append(from_mon); + cmdctx->reply(r, outs); + } +}; + +bool DaemonServer::handle_command(const ref_t& m) +{ + std::lock_guard l(lock); + // a blank fsid in MCommand signals a legacy client sending a "mon-mgr" CLI + // command. 
+ if (m->fsid != uuid_d()) { + cct->get_admin_socket()->queue_tell_command(m); + return true; + } else { + // legacy client; send to CLI processing + auto cmdctx = std::make_shared(m); + try { + return _handle_command(cmdctx); + } catch (const bad_cmd_get& e) { + cmdctx->reply(-EINVAL, e.what()); + return true; + } + } +} + +bool DaemonServer::handle_command(const ref_t& m) +{ + std::lock_guard l(lock); + auto cmdctx = std::make_shared(m); + try { + return _handle_command(cmdctx); + } catch (const bad_cmd_get& e) { + cmdctx->reply(-EINVAL, e.what()); + return true; + } +} + +void DaemonServer::log_access_denied( + std::shared_ptr& cmdctx, + MgrSession* session, std::stringstream& ss) { + dout(1) << " access denied" << dendl; + audit_clog->info() << "from='" << session->inst << "' " + << "entity='" << session->entity_name << "' " + << "cmd=" << cmdctx->cmd << ": access denied"; + ss << "access denied: does your client key have mgr caps? " + "See http://docs.ceph.com/docs/master/mgr/administrator/" + "#client-authentication"; +} + +bool DaemonServer::_handle_command( + std::shared_ptr& cmdctx) +{ + MessageRef m; + if (cmdctx->m_tell) { + m = cmdctx->m_tell; + } else { + m = cmdctx->m_mgr; + } + auto priv = m->get_connection()->get_priv(); + auto session = static_cast(priv.get()); + if (!session) { + return true; + } + if (session->inst.name == entity_name_t()) { + session->inst.name = m->get_source(); + } + + std::string format; + boost::scoped_ptr f; + map param_str_map; + std::stringstream ss; + int r = 0; + + if (!cmdmap_from_json(cmdctx->cmd, &(cmdctx->cmdmap), ss)) { + cmdctx->reply(-EINVAL, ss); + return true; + } + + { + cmd_getval(cmdctx->cmdmap, "format", format, string("plain")); + f.reset(Formatter::create(format)); + } + + string prefix; + cmd_getval(cmdctx->cmdmap, "prefix", prefix); + + dout(10) << "decoded-size=" << cmdctx->cmdmap.size() << " prefix=" << prefix << dendl; + + if (prefix == "get_command_descriptions") { + dout(10) << "reading commands 
from python modules" << dendl; + const auto py_commands = py_modules.get_commands(); + + int cmdnum = 0; + JSONFormatter f; + f.open_object_section("command_descriptions"); + + auto dump_cmd = [&cmdnum, &f, m](const MonCommand &mc){ + ostringstream secname; + secname << "cmd" << setfill('0') << std::setw(3) << cmdnum; + dump_cmddesc_to_json(&f, m->get_connection()->get_features(), + secname.str(), mc.cmdstring, mc.helpstring, + mc.module, mc.req_perms, 0); + cmdnum++; + }; + + for (const auto &pyc : py_commands) { + dump_cmd(pyc); + } + + for (const auto &mgr_cmd : mgr_commands) { + dump_cmd(mgr_cmd); + } + + f.close_section(); // command_descriptions + f.flush(cmdctx->odata); + cmdctx->reply(0, ss); + return true; + } + + // lookup command + const MonCommand *mgr_cmd = _get_mgrcommand(prefix, mgr_commands); + _generate_command_map(cmdctx->cmdmap, param_str_map); + + bool is_allowed = false; + ModuleCommand py_command; + if (!mgr_cmd) { + // Resolve the command to the name of the module that will + // handle it (if the command exists) + auto py_commands = py_modules.get_py_commands(); + for (const auto &pyc : py_commands) { + auto pyc_prefix = cmddesc_get_prefix(pyc.cmdstring); + if (pyc_prefix == prefix) { + py_command = pyc; + break; + } + } + + MonCommand pyc = {"", "", "py", py_command.perm}; + is_allowed = _allowed_command(session, "py", py_command.module_name, + prefix, cmdctx->cmdmap, param_str_map, + &pyc); + } else { + // validate user's permissions for requested command + is_allowed = _allowed_command(session, mgr_cmd->module, "", + prefix, cmdctx->cmdmap, param_str_map, mgr_cmd); + } + + if (!is_allowed) { + log_access_denied(cmdctx, session, ss); + cmdctx->reply(-EACCES, ss); + return true; + } + + audit_clog->debug() + << "from='" << session->inst << "' " + << "entity='" << session->entity_name << "' " + << "cmd=" << cmdctx->cmd << ": dispatch"; + + // ---------------- + // service map commands + if (prefix == "service dump") { + if (!f) + 
f.reset(Formatter::create("json-pretty")); + cluster_state.with_servicemap([&](const ServiceMap &service_map) { + f->dump_object("service_map", service_map); + }); + f->flush(cmdctx->odata); + cmdctx->reply(0, ss); + return true; + } + if (prefix == "service status") { + if (!f) + f.reset(Formatter::create("json-pretty")); + // only include state from services that are in the persisted service map + f->open_object_section("service_status"); + for (auto& [type, service] : pending_service_map.services) { + if (ServiceMap::is_normal_ceph_entity(type)) { + continue; + } + + f->open_object_section(type.c_str()); + for (auto& q : service.daemons) { + f->open_object_section(q.first.c_str()); + DaemonKey key{type, q.first}; + ceph_assert(daemon_state.exists(key)); + auto daemon = daemon_state.get(key); + std::lock_guard l(daemon->lock); + f->dump_stream("status_stamp") << daemon->service_status_stamp; + f->dump_stream("last_beacon") << daemon->last_service_beacon; + f->open_object_section("status"); + for (auto& r : daemon->service_status) { + f->dump_string(r.first.c_str(), r.second); + } + f->close_section(); + f->close_section(); + } + f->close_section(); + } + f->close_section(); + f->flush(cmdctx->odata); + cmdctx->reply(0, ss); + return true; + } + + if (prefix == "config set") { + std::string key; + std::string val; + cmd_getval(cmdctx->cmdmap, "key", key); + cmd_getval(cmdctx->cmdmap, "value", val); + r = cct->_conf.set_val(key, val, &ss); + if (r == 0) { + cct->_conf.apply_changes(nullptr); + } + cmdctx->reply(0, ss); + return true; + } + + // ----------- + // PG commands + + if (prefix == "pg scrub" || + prefix == "pg repair" || + prefix == "pg deep-scrub") { + string scrubop = prefix.substr(3, string::npos); + pg_t pgid; + spg_t spgid; + string pgidstr; + cmd_getval(cmdctx->cmdmap, "pgid", pgidstr); + if (!pgid.parse(pgidstr.c_str())) { + ss << "invalid pgid '" << pgidstr << "'"; + cmdctx->reply(-EINVAL, ss); + return true; + } + bool pg_exists = false; + 
cluster_state.with_osdmap([&](const OSDMap& osdmap) { + pg_exists = osdmap.pg_exists(pgid); + }); + if (!pg_exists) { + ss << "pg " << pgid << " does not exist"; + cmdctx->reply(-ENOENT, ss); + return true; + } + int acting_primary = -1; + epoch_t epoch; + cluster_state.with_osdmap([&](const OSDMap& osdmap) { + epoch = osdmap.get_epoch(); + osdmap.get_primary_shard(pgid, &acting_primary, &spgid); + }); + if (acting_primary == -1) { + ss << "pg " << pgid << " has no primary osd"; + cmdctx->reply(-EAGAIN, ss); + return true; + } + auto p = osd_cons.find(acting_primary); + if (p == osd_cons.end()) { + ss << "pg " << pgid << " primary osd." << acting_primary + << " is not currently connected"; + cmdctx->reply(-EAGAIN, ss); + return true; + } + for (auto& con : p->second) { + if (HAVE_FEATURE(con->get_features(), SERVER_MIMIC)) { + vector pgs = { spgid }; + con->send_message(new MOSDScrub2(monc->get_fsid(), + epoch, + pgs, + scrubop == "repair", + scrubop == "deep-scrub")); + } else { + vector pgs = { pgid }; + con->send_message(new MOSDScrub(monc->get_fsid(), + pgs, + scrubop == "repair", + scrubop == "deep-scrub")); + } + } + ss << "instructing pg " << spgid << " on osd." 
<< acting_primary + << " to " << scrubop; + cmdctx->reply(0, ss); + return true; + } else if (prefix == "osd scrub" || + prefix == "osd deep-scrub" || + prefix == "osd repair") { + string whostr; + cmd_getval(cmdctx->cmdmap, "who", whostr); + vector pvec; + get_str_vec(prefix, pvec); + + set osds; + if (whostr == "*" || whostr == "all" || whostr == "any") { + cluster_state.with_osdmap([&](const OSDMap& osdmap) { + for (int i = 0; i < osdmap.get_max_osd(); i++) + if (osdmap.is_up(i)) { + osds.insert(i); + } + }); + } else { + long osd = parse_osd_id(whostr.c_str(), &ss); + if (osd < 0) { + ss << "invalid osd '" << whostr << "'"; + cmdctx->reply(-EINVAL, ss); + return true; + } + cluster_state.with_osdmap([&](const OSDMap& osdmap) { + if (osdmap.is_up(osd)) { + osds.insert(osd); + } + }); + if (osds.empty()) { + ss << "osd." << osd << " is not up"; + cmdctx->reply(-EAGAIN, ss); + return true; + } + } + set sent_osds, failed_osds; + for (auto osd : osds) { + vector spgs; + epoch_t epoch; + cluster_state.with_osdmap_and_pgmap([&](const OSDMap& osdmap, const PGMap& pgmap) { + epoch = osdmap.get_epoch(); + auto p = pgmap.pg_by_osd.find(osd); + if (p != pgmap.pg_by_osd.end()) { + for (auto pgid : p->second) { + int primary; + spg_t spg; + osdmap.get_primary_shard(pgid, &primary, &spg); + if (primary == osd) { + spgs.push_back(spg); + } + } + } + }); + auto p = osd_cons.find(osd); + if (p == osd_cons.end()) { + failed_osds.insert(osd); + } else { + sent_osds.insert(osd); + for (auto& con : p->second) { + if (HAVE_FEATURE(con->get_features(), SERVER_MIMIC)) { + con->send_message(new MOSDScrub2(monc->get_fsid(), + epoch, + spgs, + pvec.back() == "repair", + pvec.back() == "deep-scrub")); + } else { + con->send_message(new MOSDScrub(monc->get_fsid(), + pvec.back() == "repair", + pvec.back() == "deep-scrub")); + } + } + } + } + if (failed_osds.size() == osds.size()) { + ss << "failed to instruct osd(s) " << osds << " to " << pvec.back() + << " (not connected)"; + r = -EAGAIN; 
+ } else { + ss << "instructed osd(s) " << sent_osds << " to " << pvec.back(); + if (!failed_osds.empty()) { + ss << "; osd(s) " << failed_osds << " were not connected"; + } + r = 0; + } + cmdctx->reply(0, ss); + return true; + } else if (prefix == "osd pool scrub" || + prefix == "osd pool deep-scrub" || + prefix == "osd pool repair") { + vector pool_names; + cmd_getval(cmdctx->cmdmap, "who", pool_names); + if (pool_names.empty()) { + ss << "must specify one or more pool names"; + cmdctx->reply(-EINVAL, ss); + return true; + } + epoch_t epoch; + map> pgs_by_primary; // legacy + map> spgs_by_primary; + cluster_state.with_osdmap([&](const OSDMap& osdmap) { + epoch = osdmap.get_epoch(); + for (auto& pool_name : pool_names) { + auto pool_id = osdmap.lookup_pg_pool_name(pool_name); + if (pool_id < 0) { + ss << "unrecognized pool '" << pool_name << "'"; + r = -ENOENT; + return; + } + auto pool_pg_num = osdmap.get_pg_num(pool_id); + for (int i = 0; i < pool_pg_num; i++) { + pg_t pg(i, pool_id); + int primary; + spg_t spg; + auto got = osdmap.get_primary_shard(pg, &primary, &spg); + if (!got) + continue; + pgs_by_primary[primary].push_back(pg); + spgs_by_primary[primary].push_back(spg); + } + } + }); + if (r < 0) { + cmdctx->reply(r, ss); + return true; + } + for (auto& it : spgs_by_primary) { + auto primary = it.first; + auto p = osd_cons.find(primary); + if (p == osd_cons.end()) { + ss << "osd." 
<< primary << " is not currently connected"; + cmdctx->reply(-EAGAIN, ss); + return true; + } + for (auto& con : p->second) { + if (HAVE_FEATURE(con->get_features(), SERVER_MIMIC)) { + con->send_message(new MOSDScrub2(monc->get_fsid(), + epoch, + it.second, + prefix == "osd pool repair", + prefix == "osd pool deep-scrub")); + } else { + // legacy + auto q = pgs_by_primary.find(primary); + ceph_assert(q != pgs_by_primary.end()); + con->send_message(new MOSDScrub(monc->get_fsid(), + q->second, + prefix == "osd pool repair", + prefix == "osd pool deep-scrub")); + } + } + } + cmdctx->reply(0, ""); + return true; + } else if (prefix == "osd reweight-by-pg" || + prefix == "osd reweight-by-utilization" || + prefix == "osd test-reweight-by-pg" || + prefix == "osd test-reweight-by-utilization") { + bool by_pg = + prefix == "osd reweight-by-pg" || prefix == "osd test-reweight-by-pg"; + bool dry_run = + prefix == "osd test-reweight-by-pg" || + prefix == "osd test-reweight-by-utilization"; + int64_t oload; + cmd_getval(cmdctx->cmdmap, "oload", oload, int64_t(120)); + set pools; + vector poolnames; + cmd_getval(cmdctx->cmdmap, "pools", poolnames); + cluster_state.with_osdmap([&](const OSDMap& osdmap) { + for (const auto& poolname : poolnames) { + int64_t pool = osdmap.lookup_pg_pool_name(poolname); + if (pool < 0) { + ss << "pool '" << poolname << "' does not exist"; + r = -ENOENT; + } + pools.insert(pool); + } + }); + if (r) { + cmdctx->reply(r, ss); + return true; + } + + double max_change = g_conf().get_val("mon_reweight_max_change"); + cmd_getval(cmdctx->cmdmap, "max_change", max_change); + if (max_change <= 0.0) { + ss << "max_change " << max_change << " must be positive"; + cmdctx->reply(-EINVAL, ss); + return true; + } + int64_t max_osds = g_conf().get_val("mon_reweight_max_osds"); + cmd_getval(cmdctx->cmdmap, "max_osds", max_osds); + if (max_osds <= 0) { + ss << "max_osds " << max_osds << " must be positive"; + cmdctx->reply(-EINVAL, ss); + return true; + } + bool 
no_increasing = false; + cmd_getval(cmdctx->cmdmap, "no_increasing", no_increasing); + string out_str; + mempool::osdmap::map new_weights; + r = cluster_state.with_osdmap_and_pgmap([&](const OSDMap &osdmap, const PGMap& pgmap) { + return reweight::by_utilization(osdmap, pgmap, + oload, + max_change, + max_osds, + by_pg, + pools.empty() ? NULL : &pools, + no_increasing, + &new_weights, + &ss, &out_str, f.get()); + }); + if (r >= 0) { + dout(10) << "reweight::by_utilization: finished with " << out_str << dendl; + } + if (f) { + f->flush(cmdctx->odata); + } else { + cmdctx->odata.append(out_str); + } + if (r < 0) { + ss << "FAILED reweight-by-pg"; + cmdctx->reply(r, ss); + return true; + } else if (r == 0 || dry_run) { + ss << "no change"; + cmdctx->reply(r, ss); + return true; + } else { + json_spirit::Object json_object; + for (const auto& osd_weight : new_weights) { + json_spirit::Config::add(json_object, + std::to_string(osd_weight.first), + std::to_string(osd_weight.second)); + } + string s = json_spirit::write(json_object); + std::replace(begin(s), end(s), '\"', '\''); + const string cmd = + "{" + "\"prefix\": \"osd reweightn\", " + "\"weights\": \"" + s + "\"" + "}"; + auto on_finish = new ReplyOnFinish(cmdctx); + monc->start_mon_command({cmd}, {}, + &on_finish->from_mon, &on_finish->outs, on_finish); + return true; + } + } else if (prefix == "osd df") { + string method, filter; + cmd_getval(cmdctx->cmdmap, "output_method", method); + cmd_getval(cmdctx->cmdmap, "filter", filter); + stringstream rs; + r = cluster_state.with_osdmap_and_pgmap([&](const OSDMap& osdmap, const PGMap& pgmap) { + // sanity check filter(s) + if (!filter.empty() && + osdmap.lookup_pg_pool_name(filter) < 0 && + !osdmap.crush->class_exists(filter) && + !osdmap.crush->name_exists(filter)) { + rs << "'" << filter << "' not a pool, crush node or device class name"; + return -EINVAL; + } + print_osd_utilization(osdmap, pgmap, ss, + f.get(), method == "tree", filter); + 
cmdctx->odata.append(ss); + return 0; + }); + cmdctx->reply(r, rs); + return true; + } else if (prefix == "osd pool stats") { + string pool_name; + cmd_getval(cmdctx->cmdmap, "pool_name", pool_name); + int64_t poolid = -ENOENT; + bool one_pool = false; + r = cluster_state.with_osdmap_and_pgmap([&](const OSDMap& osdmap, const PGMap& pg_map) { + if (!pool_name.empty()) { + poolid = osdmap.lookup_pg_pool_name(pool_name); + if (poolid < 0) { + ceph_assert(poolid == -ENOENT); + ss << "unrecognized pool '" << pool_name << "'"; + return -ENOENT; + } + one_pool = true; + } + stringstream rs; + if (f) + f->open_array_section("pool_stats"); + else { + if (osdmap.get_pools().empty()) { + ss << "there are no pools!"; + goto stats_out; + } + } + for (auto &p : osdmap.get_pools()) { + if (!one_pool) { + poolid = p.first; + } + pg_map.dump_pool_stats_and_io_rate(poolid, osdmap, f.get(), &rs); + if (one_pool) { + break; + } + } + stats_out: + if (f) { + f->close_section(); + f->flush(cmdctx->odata); + } else { + cmdctx->odata.append(rs.str()); + } + return 0; + }); + if (r != -EOPNOTSUPP) { + cmdctx->reply(r, ss); + return true; + } + } else if (prefix == "osd safe-to-destroy" || + prefix == "osd destroy" || + prefix == "osd purge") { + set osds; + int r = 0; + if (prefix == "osd safe-to-destroy") { + vector ids; + cmd_getval(cmdctx->cmdmap, "ids", ids); + cluster_state.with_osdmap([&](const OSDMap& osdmap) { + r = osdmap.parse_osd_id_list(ids, &osds, &ss); + }); + if (!r && osds.empty()) { + ss << "must specify one or more OSDs"; + r = -EINVAL; + } + } else { + int64_t id; + if (!cmd_getval(cmdctx->cmdmap, "id", id)) { + r = -EINVAL; + ss << "must specify OSD id"; + } else { + osds.insert(id); + } + } + if (r < 0) { + cmdctx->reply(r, ss); + return true; + } + set active_osds, missing_stats, stored_pgs, safe_to_destroy; + int affected_pgs = 0; + cluster_state.with_osdmap_and_pgmap([&](const OSDMap& osdmap, const PGMap& pg_map) { + if (pg_map.num_pg_unknown > 0) { + ss << 
pg_map.num_pg_unknown << " pgs have unknown state; cannot draw" + << " any conclusions"; + r = -EAGAIN; + return; + } + int num_active_clean = 0; + for (auto& p : pg_map.num_pg_by_state) { + unsigned want = PG_STATE_ACTIVE|PG_STATE_CLEAN; + if ((p.first & want) == want) { + num_active_clean += p.second; + } + } + for (auto osd : osds) { + if (!osdmap.exists(osd)) { + safe_to_destroy.insert(osd); + continue; // clearly safe to destroy + } + auto q = pg_map.num_pg_by_osd.find(osd); + if (q != pg_map.num_pg_by_osd.end()) { + if (q->second.acting > 0 || q->second.up_not_acting > 0) { + active_osds.insert(osd); + // XXX: For overlapping PGs, this counts them again + affected_pgs += q->second.acting + q->second.up_not_acting; + continue; + } + } + if (num_active_clean < pg_map.num_pg) { + // all pgs aren't active+clean; we need to be careful. + auto p = pg_map.osd_stat.find(osd); + if (p == pg_map.osd_stat.end() || !osdmap.is_up(osd)) { + missing_stats.insert(osd); + continue; + } else if (p->second.num_pgs > 0) { + stored_pgs.insert(osd); + continue; + } + } + safe_to_destroy.insert(osd); + } + }); + if (r && prefix == "osd safe-to-destroy") { + cmdctx->reply(r, ss); // regardless of formatter + return true; + } + if (!r && (!active_osds.empty() || + !missing_stats.empty() || !stored_pgs.empty())) { + if (!safe_to_destroy.empty()) { + ss << "OSD(s) " << safe_to_destroy + << " are safe to destroy without reducing data durability. "; + } + if (!active_osds.empty()) { + ss << "OSD(s) " << active_osds << " have " << affected_pgs + << " pgs currently mapped to them. "; + } + if (!missing_stats.empty()) { + ss << "OSD(s) " << missing_stats << " have no reported stats, and not all" + << " PGs are active+clean; we cannot draw any conclusions. 
"; + } + if (!stored_pgs.empty()) { + ss << "OSD(s) " << stored_pgs << " last reported they still store some PG" + << " data, and not all PGs are active+clean; we cannot be sure they" + << " aren't still needed."; + } + if (!active_osds.empty() || !stored_pgs.empty()) { + r = -EBUSY; + } else { + r = -EAGAIN; + } + } + + if (prefix == "osd safe-to-destroy") { + if (!r) { + ss << "OSD(s) " << osds << " are safe to destroy without reducing data" + << " durability."; + } + if (f) { + f->open_object_section("osd_status"); + f->open_array_section("safe_to_destroy"); + for (auto i : safe_to_destroy) + f->dump_int("osd", i); + f->close_section(); + f->open_array_section("active"); + for (auto i : active_osds) + f->dump_int("osd", i); + f->close_section(); + f->open_array_section("missing_stats"); + for (auto i : missing_stats) + f->dump_int("osd", i); + f->close_section(); + f->open_array_section("stored_pgs"); + for (auto i : stored_pgs) + f->dump_int("osd", i); + f->close_section(); + f->close_section(); // osd_status + f->flush(cmdctx->odata); + r = 0; + std::stringstream().swap(ss); + } + cmdctx->reply(r, ss); + return true; + } + + if (r) { + bool force = false; + cmd_getval(cmdctx->cmdmap, "force", force); + if (!force) { + // Backward compat + cmd_getval(cmdctx->cmdmap, "yes_i_really_mean_it", force); + } + if (!force) { + ss << "\nYou can proceed by passing --force, but be warned that" + " this will likely mean real, permanent data loss."; + } else { + r = 0; + } + } + if (r) { + cmdctx->reply(r, ss); + return true; + } + const string cmd = + "{" + "\"prefix\": \"" + prefix + "-actual\", " + "\"id\": " + stringify(osds) + ", " + "\"yes_i_really_mean_it\": true" + "}"; + auto on_finish = new ReplyOnFinish(cmdctx); + monc->start_mon_command({cmd}, {}, nullptr, &on_finish->outs, on_finish); + return true; + } else if (prefix == "osd ok-to-stop") { + vector ids; + cmd_getval(cmdctx->cmdmap, "ids", ids); + set osds; + int r; + cluster_state.with_osdmap([&](const 
OSDMap& osdmap) { + r = osdmap.parse_osd_id_list(ids, &osds, &ss); + }); + if (!r && osds.empty()) { + ss << "must specify one or more OSDs"; + r = -EINVAL; + } + if (r < 0) { + cmdctx->reply(r, ss); + return true; + } + int touched_pgs = 0; + int dangerous_pgs = 0; + cluster_state.with_osdmap_and_pgmap([&](const OSDMap& osdmap, const PGMap& pg_map) { + if (pg_map.num_pg_unknown > 0) { + ss << pg_map.num_pg_unknown << " pgs have unknown state; " + << "cannot draw any conclusions"; + r = -EAGAIN; + return; + } + for (const auto& q : pg_map.pg_stat) { + set pg_acting; // net acting sets (with no missing if degraded) + bool found = false; + if (q.second.state & PG_STATE_DEGRADED) { + for (auto& anm : q.second.avail_no_missing) { + if (osds.count(anm.osd)) { + found = true; + continue; + } + if (anm.osd != CRUSH_ITEM_NONE) { + pg_acting.insert(anm.osd); + } + } + } else { + for (auto& a : q.second.acting) { + if (osds.count(a)) { + found = true; + continue; + } + if (a != CRUSH_ITEM_NONE) { + pg_acting.insert(a); + } + } + } + if (!found) { + continue; + } + touched_pgs++; + if (!(q.second.state & PG_STATE_ACTIVE) || + (q.second.state & PG_STATE_DEGRADED)) { + ++dangerous_pgs; + continue; + } + const pg_pool_t *pi = osdmap.get_pg_pool(q.first.pool()); + if (!pi) { + ++dangerous_pgs; // pool is creating or deleting + } else { + if (pg_acting.size() < pi->min_size) { + ++dangerous_pgs; + } + } + } + }); + if (r) { + cmdctx->reply(r, ss); + return true; + } + if (dangerous_pgs) { + ss << dangerous_pgs << " PGs are already too degraded, would become" + << " too degraded or might become unavailable"; + cmdctx->reply(-EBUSY, ss); + return true; + } + ss << "OSD(s) " << osds << " are ok to stop without reducing" + << " availability or risking data, provided there are no other concurrent failures" + << " or interventions." 
<< std::endl; + ss << touched_pgs << " PGs are likely to be" + << " degraded (but remain available) as a result."; + cmdctx->reply(0, ss); + return true; + } else if (prefix == "pg force-recovery" || + prefix == "pg force-backfill" || + prefix == "pg cancel-force-recovery" || + prefix == "pg cancel-force-backfill" || + prefix == "osd pool force-recovery" || + prefix == "osd pool force-backfill" || + prefix == "osd pool cancel-force-recovery" || + prefix == "osd pool cancel-force-backfill") { + vector vs; + get_str_vec(prefix, vs); + auto& granularity = vs.front(); + auto& forceop = vs.back(); + vector pgs; + + // figure out actual op just once + int actual_op = 0; + if (forceop == "force-recovery") { + actual_op = OFR_RECOVERY; + } else if (forceop == "force-backfill") { + actual_op = OFR_BACKFILL; + } else if (forceop == "cancel-force-backfill") { + actual_op = OFR_BACKFILL | OFR_CANCEL; + } else if (forceop == "cancel-force-recovery") { + actual_op = OFR_RECOVERY | OFR_CANCEL; + } + + set candidates; // deduped + if (granularity == "pg") { + // covnert pg names to pgs, discard any invalid ones while at it + vector pgids; + cmd_getval(cmdctx->cmdmap, "pgid", pgids); + for (auto& i : pgids) { + pg_t pgid; + if (!pgid.parse(i.c_str())) { + ss << "invlaid pgid '" << i << "'; "; + r = -EINVAL; + continue; + } + candidates.insert(pgid); + } + } else { + // per pool + vector pool_names; + cmd_getval(cmdctx->cmdmap, "who", pool_names); + if (pool_names.empty()) { + ss << "must specify one or more pool names"; + cmdctx->reply(-EINVAL, ss); + return true; + } + cluster_state.with_osdmap([&](const OSDMap& osdmap) { + for (auto& pool_name : pool_names) { + auto pool_id = osdmap.lookup_pg_pool_name(pool_name); + if (pool_id < 0) { + ss << "unrecognized pool '" << pool_name << "'"; + r = -ENOENT; + return; + } + auto pool_pg_num = osdmap.get_pg_num(pool_id); + for (int i = 0; i < pool_pg_num; i++) + candidates.insert({(unsigned int)i, (uint64_t)pool_id}); + } + }); + if (r < 
0) { + cmdctx->reply(r, ss); + return true; + } + } + + cluster_state.with_pgmap([&](const PGMap& pg_map) { + for (auto& i : candidates) { + auto it = pg_map.pg_stat.find(i); + if (it == pg_map.pg_stat.end()) { + ss << "pg " << i << " does not exist; "; + r = -ENOENT; + continue; + } + auto state = it->second.state; + // discard pgs for which user requests are pointless + switch (actual_op) { + case OFR_RECOVERY: + if ((state & (PG_STATE_DEGRADED | + PG_STATE_RECOVERY_WAIT | + PG_STATE_RECOVERING)) == 0) { + // don't return error, user script may be racing with cluster. + // not fatal. + ss << "pg " << i << " doesn't require recovery; "; + continue; + } else if (state & PG_STATE_FORCED_RECOVERY) { + ss << "pg " << i << " recovery already forced; "; + // return error, as it may be a bug in user script + r = -EINVAL; + continue; + } + break; + case OFR_BACKFILL: + if ((state & (PG_STATE_DEGRADED | + PG_STATE_BACKFILL_WAIT | + PG_STATE_BACKFILLING)) == 0) { + ss << "pg " << i << " doesn't require backfilling; "; + continue; + } else if (state & PG_STATE_FORCED_BACKFILL) { + ss << "pg " << i << " backfill already forced; "; + r = -EINVAL; + continue; + } + break; + case OFR_BACKFILL | OFR_CANCEL: + if ((state & PG_STATE_FORCED_BACKFILL) == 0) { + ss << "pg " << i << " backfill not forced; "; + continue; + } + break; + case OFR_RECOVERY | OFR_CANCEL: + if ((state & PG_STATE_FORCED_RECOVERY) == 0) { + ss << "pg " << i << " recovery not forced; "; + continue; + } + break; + default: + ceph_abort_msg("actual_op value is not supported"); + } + pgs.push_back(i); + } // for + }); + + // respond with error only when no pgs are correct + // yes, in case of mixed errors, only the last one will be emitted, + // but the message presented will be fine + if (pgs.size() != 0) { + // clear error to not confuse users/scripts + r = 0; + } + + // optimize the command -> messages conversion, use only one + // message per distinct OSD + cluster_state.with_osdmap([&](const OSDMap& osdmap) { 
+ // group pgs to process by osd + map> osdpgs; + for (auto& pgid : pgs) { + int primary; + spg_t spg; + if (osdmap.get_primary_shard(pgid, &primary, &spg)) { + osdpgs[primary].push_back(spg); + } + } + for (auto& i : osdpgs) { + if (osdmap.is_up(i.first)) { + auto p = osd_cons.find(i.first); + if (p == osd_cons.end()) { + ss << "osd." << i.first << " is not currently connected"; + r = -EAGAIN; + continue; + } + for (auto& con : p->second) { + con->send_message( + new MOSDForceRecovery(monc->get_fsid(), i.second, actual_op)); + } + ss << "instructing pg(s) " << i.second << " on osd." << i.first + << " to " << forceop << "; "; + } + } + }); + ss << std::endl; + cmdctx->reply(r, ss); + return true; + } else if (prefix == "config show" || + prefix == "config show-with-defaults") { + string who; + cmd_getval(cmdctx->cmdmap, "who", who); + auto [key, valid] = DaemonKey::parse(who); + if (!valid) { + ss << "invalid daemon name: use ."; + cmdctx->reply(-EINVAL, ss); + return true; + } + DaemonStatePtr daemon = daemon_state.get(key); + if (!daemon) { + ss << "no config state for daemon " << who; + cmdctx->reply(-ENOENT, ss); + return true; + } + + std::lock_guard l(daemon->lock); + + int r = 0; + string name; + if (cmd_getval(cmdctx->cmdmap, "key", name)) { + auto p = daemon->config.find(name); + if (p != daemon->config.end() && + !p->second.empty()) { + cmdctx->odata.append(p->second.rbegin()->second + "\n"); + } else { + auto& defaults = daemon->_get_config_defaults(); + auto q = defaults.find(name); + if (q != defaults.end()) { + cmdctx->odata.append(q->second + "\n"); + } else { + r = -ENOENT; + } + } + } else if (daemon->config_defaults_bl.length() > 0) { + TextTable tbl; + if (f) { + f->open_array_section("config"); + } else { + tbl.define_column("NAME", TextTable::LEFT, TextTable::LEFT); + tbl.define_column("VALUE", TextTable::LEFT, TextTable::LEFT); + tbl.define_column("SOURCE", TextTable::LEFT, TextTable::LEFT); + tbl.define_column("OVERRIDES", TextTable::LEFT, 
TextTable::LEFT); + tbl.define_column("IGNORES", TextTable::LEFT, TextTable::LEFT); + } + if (prefix == "config show") { + // show + for (auto& i : daemon->config) { + dout(20) << " " << i.first << " -> " << i.second << dendl; + if (i.second.empty()) { + continue; + } + if (f) { + f->open_object_section("value"); + f->dump_string("name", i.first); + f->dump_string("value", i.second.rbegin()->second); + f->dump_string("source", ceph_conf_level_name( + i.second.rbegin()->first)); + if (i.second.size() > 1) { + f->open_array_section("overrides"); + auto j = i.second.rend(); + for (--j; j != i.second.rbegin(); --j) { + f->open_object_section("value"); + f->dump_string("source", ceph_conf_level_name(j->first)); + f->dump_string("value", j->second); + f->close_section(); + } + f->close_section(); + } + if (daemon->ignored_mon_config.count(i.first)) { + f->dump_string("ignores", "mon"); + } + f->close_section(); + } else { + tbl << i.first; + tbl << i.second.rbegin()->second; + tbl << ceph_conf_level_name(i.second.rbegin()->first); + if (i.second.size() > 1) { + list ov; + auto j = i.second.rend(); + for (--j; j != i.second.rbegin(); --j) { + if (j->second == i.second.rbegin()->second) { + ov.push_front(string("(") + ceph_conf_level_name(j->first) + + string("[") + j->second + string("]") + + string(")")); + } else { + ov.push_front(ceph_conf_level_name(j->first) + + string("[") + j->second + string("]")); + + } + } + tbl << ov; + } else { + tbl << ""; + } + tbl << (daemon->ignored_mon_config.count(i.first) ? 
"mon" : ""); + tbl << TextTable::endrow; + } + } + } else { + // show-with-defaults + auto& defaults = daemon->_get_config_defaults(); + for (auto& i : defaults) { + if (f) { + f->open_object_section("value"); + f->dump_string("name", i.first); + } else { + tbl << i.first; + } + auto j = daemon->config.find(i.first); + if (j != daemon->config.end() && !j->second.empty()) { + // have config + if (f) { + f->dump_string("value", j->second.rbegin()->second); + f->dump_string("source", ceph_conf_level_name( + j->second.rbegin()->first)); + if (j->second.size() > 1) { + f->open_array_section("overrides"); + auto k = j->second.rend(); + for (--k; k != j->second.rbegin(); --k) { + f->open_object_section("value"); + f->dump_string("source", ceph_conf_level_name(k->first)); + f->dump_string("value", k->second); + f->close_section(); + } + f->close_section(); + } + if (daemon->ignored_mon_config.count(i.first)) { + f->dump_string("ignores", "mon"); + } + f->close_section(); + } else { + tbl << j->second.rbegin()->second; + tbl << ceph_conf_level_name(j->second.rbegin()->first); + if (j->second.size() > 1) { + list ov; + auto k = j->second.rend(); + for (--k; k != j->second.rbegin(); --k) { + if (k->second == j->second.rbegin()->second) { + ov.push_front(string("(") + ceph_conf_level_name(k->first) + + string("[") + k->second + string("]") + + string(")")); + } else { + ov.push_front(ceph_conf_level_name(k->first) + + string("[") + k->second + string("]")); + } + } + tbl << ov; + } else { + tbl << ""; + } + tbl << (daemon->ignored_mon_config.count(i.first) ? 
"mon" : ""); + tbl << TextTable::endrow; + } + } else { + // only have default + if (f) { + f->dump_string("value", i.second); + f->dump_string("source", ceph_conf_level_name(CONF_DEFAULT)); + f->close_section(); + } else { + tbl << i.second; + tbl << ceph_conf_level_name(CONF_DEFAULT); + tbl << ""; + tbl << ""; + tbl << TextTable::endrow; + } + } + } + } + if (f) { + f->close_section(); + f->flush(cmdctx->odata); + } else { + cmdctx->odata.append(stringify(tbl)); + } + } + cmdctx->reply(r, ss); + return true; + } else if (prefix == "device ls") { + set devids; + TextTable tbl; + if (f) { + f->open_array_section("devices"); + daemon_state.with_devices([&f](const DeviceState& dev) { + f->dump_object("device", dev); + }); + f->close_section(); + f->flush(cmdctx->odata); + } else { + tbl.define_column("DEVICE", TextTable::LEFT, TextTable::LEFT); + tbl.define_column("HOST:DEV", TextTable::LEFT, TextTable::LEFT); + tbl.define_column("DAEMONS", TextTable::LEFT, TextTable::LEFT); + tbl.define_column("LIFE EXPECTANCY", TextTable::LEFT, TextTable::LEFT); + auto now = ceph_clock_now(); + daemon_state.with_devices([&tbl, now](const DeviceState& dev) { + string h; + for (auto& i : dev.attachments) { + if (h.size()) { + h += " "; + } + h += std::get<0>(i) + ":" + std::get<1>(i); + } + string d; + for (auto& i : dev.daemons) { + if (d.size()) { + d += " "; + } + d += to_string(i); + } + tbl << dev.devid + << h + << d + << dev.get_life_expectancy_str(now) + << TextTable::endrow; + }); + cmdctx->odata.append(stringify(tbl)); + } + cmdctx->reply(0, ss); + return true; + } else if (prefix == "device ls-by-daemon") { + string who; + cmd_getval(cmdctx->cmdmap, "who", who); + if (auto [k, valid] = DaemonKey::parse(who); !valid) { + ss << who << " is not a valid daemon name"; + r = -EINVAL; + } else { + auto dm = daemon_state.get(k); + if (dm) { + if (f) { + f->open_array_section("devices"); + for (auto& i : dm->devices) { + daemon_state.with_device(i.first, [&f] (const DeviceState& 
dev) { + f->dump_object("device", dev); + }); + } + f->close_section(); + f->flush(cmdctx->odata); + } else { + TextTable tbl; + tbl.define_column("DEVICE", TextTable::LEFT, TextTable::LEFT); + tbl.define_column("HOST:DEV", TextTable::LEFT, TextTable::LEFT); + tbl.define_column("EXPECTED FAILURE", TextTable::LEFT, + TextTable::LEFT); + auto now = ceph_clock_now(); + for (auto& i : dm->devices) { + daemon_state.with_device( + i.first, [&tbl, now] (const DeviceState& dev) { + string h; + for (auto& i : dev.attachments) { + if (h.size()) { + h += " "; + } + h += std::get<0>(i) + ":" + std::get<1>(i); + } + tbl << dev.devid + << h + << dev.get_life_expectancy_str(now) + << TextTable::endrow; + }); + } + cmdctx->odata.append(stringify(tbl)); + } + } else { + r = -ENOENT; + ss << "daemon " << who << " not found"; + } + cmdctx->reply(r, ss); + } + } else if (prefix == "device ls-by-host") { + string host; + cmd_getval(cmdctx->cmdmap, "host", host); + set devids; + daemon_state.list_devids_by_server(host, &devids); + if (f) { + f->open_array_section("devices"); + for (auto& devid : devids) { + daemon_state.with_device( + devid, [&f] (const DeviceState& dev) { + f->dump_object("device", dev); + }); + } + f->close_section(); + f->flush(cmdctx->odata); + } else { + TextTable tbl; + tbl.define_column("DEVICE", TextTable::LEFT, TextTable::LEFT); + tbl.define_column("DEV", TextTable::LEFT, TextTable::LEFT); + tbl.define_column("DAEMONS", TextTable::LEFT, TextTable::LEFT); + tbl.define_column("EXPECTED FAILURE", TextTable::LEFT, TextTable::LEFT); + auto now = ceph_clock_now(); + for (auto& devid : devids) { + daemon_state.with_device( + devid, [&tbl, &host, now] (const DeviceState& dev) { + string n; + for (auto& j : dev.attachments) { + if (std::get<0>(j) == host) { + if (n.size()) { + n += " "; + } + n += std::get<1>(j); + } + } + string d; + for (auto& i : dev.daemons) { + if (d.size()) { + d += " "; + } + d += to_string(i); + } + tbl << dev.devid + << n + << d + << 
dev.get_life_expectancy_str(now) + << TextTable::endrow; + }); + } + cmdctx->odata.append(stringify(tbl)); + } + cmdctx->reply(0, ss); + return true; + } else if (prefix == "device info") { + string devid; + cmd_getval(cmdctx->cmdmap, "devid", devid); + int r = 0; + ostringstream rs; + if (!daemon_state.with_device(devid, + [&f, &rs] (const DeviceState& dev) { + if (f) { + f->dump_object("device", dev); + } else { + dev.print(rs); + } + })) { + ss << "device " << devid << " not found"; + r = -ENOENT; + } else { + if (f) { + f->flush(cmdctx->odata); + } else { + cmdctx->odata.append(rs.str()); + } + } + cmdctx->reply(r, ss); + return true; + } else if (prefix == "device set-life-expectancy") { + string devid; + cmd_getval(cmdctx->cmdmap, "devid", devid); + string from_str, to_str; + cmd_getval(cmdctx->cmdmap, "from", from_str); + cmd_getval(cmdctx->cmdmap, "to", to_str); + utime_t from, to; + if (!from.parse(from_str)) { + ss << "unable to parse datetime '" << from_str << "'"; + r = -EINVAL; + cmdctx->reply(r, ss); + } else if (to_str.size() && !to.parse(to_str)) { + ss << "unable to parse datetime '" << to_str << "'"; + r = -EINVAL; + cmdctx->reply(r, ss); + } else { + map meta; + daemon_state.with_device_create( + devid, + [from, to, &meta] (DeviceState& dev) { + dev.set_life_expectancy(from, to, ceph_clock_now()); + meta = dev.metadata; + }); + json_spirit::Object json_object; + for (auto& i : meta) { + json_spirit::Config::add(json_object, i.first, i.second); + } + bufferlist json; + json.append(json_spirit::write(json_object)); + const string cmd = + "{" + "\"prefix\": \"config-key set\", " + "\"key\": \"device/" + devid + "\"" + "}"; + auto on_finish = new ReplyOnFinish(cmdctx); + monc->start_mon_command({cmd}, json, nullptr, nullptr, on_finish); + } + return true; + } else if (prefix == "device rm-life-expectancy") { + string devid; + cmd_getval(cmdctx->cmdmap, "devid", devid); + map meta; + if (daemon_state.with_device_write(devid, [&meta] (DeviceState& dev) 
{ + dev.rm_life_expectancy(); + meta = dev.metadata; + })) { + string cmd; + bufferlist json; + if (meta.empty()) { + cmd = + "{" + "\"prefix\": \"config-key rm\", " + "\"key\": \"device/" + devid + "\"" + "}"; + } else { + json_spirit::Object json_object; + for (auto& i : meta) { + json_spirit::Config::add(json_object, i.first, i.second); + } + json.append(json_spirit::write(json_object)); + cmd = + "{" + "\"prefix\": \"config-key set\", " + "\"key\": \"device/" + devid + "\"" + "}"; + } + auto on_finish = new ReplyOnFinish(cmdctx); + monc->start_mon_command({cmd}, json, nullptr, nullptr, on_finish); + } else { + cmdctx->reply(0, ss); + } + return true; + } else { + if (!pgmap_ready) { + ss << "Warning: due to ceph-mgr restart, some PG states may not be up to date\n"; + } + if (f) { + f->open_object_section("pg_info"); + f->dump_bool("pg_ready", pgmap_ready); + } + + // fall back to feeding command to PGMap + r = cluster_state.with_osdmap_and_pgmap([&](const OSDMap& osdmap, const PGMap& pg_map) { + return process_pg_map_command(prefix, cmdctx->cmdmap, pg_map, osdmap, + f.get(), &ss, &cmdctx->odata); + }); + + if (f) { + f->close_section(); + } + if (r != -EOPNOTSUPP) { + if (f) { + f->flush(cmdctx->odata); + } + cmdctx->reply(r, ss); + return true; + } + } + + // Was the command unfound? 
+ if (py_command.cmdstring.empty()) { + ss << "No handler found for '" << prefix << "'"; + dout(4) << "No handler found for '" << prefix << "'" << dendl; + cmdctx->reply(-EINVAL, ss); + return true; + } + + dout(10) << "passing through " << cmdctx->cmdmap.size() << dendl; + finisher.queue(new LambdaContext([this, cmdctx, session, py_command, prefix] + (int r_) mutable { + std::stringstream ss; + + // Validate that the module is enabled + auto& py_handler_name = py_command.module_name; + PyModuleRef module = py_modules.get_module(py_handler_name); + ceph_assert(module); + if (!module->is_enabled()) { + ss << "Module '" << py_handler_name << "' is not enabled (required by " + "command '" << prefix << "'): use `ceph mgr module enable " + << py_handler_name << "` to enable it"; + dout(4) << ss.str() << dendl; + cmdctx->reply(-EOPNOTSUPP, ss); + return; + } + + // Hack: allow the self-test method to run on unhealthy modules. + // Fix this in future by creating a special path for self test rather + // than having the hook be a normal module command. 
+ std::string self_test_prefix = py_handler_name + " " + "self-test"; + + // Validate that the module is healthy + bool accept_command; + if (module->is_loaded()) { + if (module->get_can_run() && !module->is_failed()) { + // Healthy module + accept_command = true; + } else if (self_test_prefix == prefix) { + // Unhealthy, but allow because it's a self test command + accept_command = true; + } else { + accept_command = false; + ss << "Module '" << py_handler_name << "' has experienced an error and " + "cannot handle commands: " << module->get_error_string(); + } + } else { + // Module not loaded + accept_command = false; + ss << "Module '" << py_handler_name << "' failed to load and " + "cannot handle commands: " << module->get_error_string(); + } + + if (!accept_command) { + dout(4) << ss.str() << dendl; + cmdctx->reply(-EIO, ss); + return; + } + + std::stringstream ds; + bufferlist inbl = cmdctx->data; + int r = py_modules.handle_command(py_command, *session, cmdctx->cmdmap, + inbl, &ds, &ss); + if (r == -EACCES) { + log_access_denied(cmdctx, session, ss); + } + + cmdctx->odata.append(ds); + cmdctx->reply(r, ss); + })); + return true; +} + +void DaemonServer::_prune_pending_service_map() +{ + utime_t cutoff = ceph_clock_now(); + cutoff -= g_conf().get_val("mgr_service_beacon_grace"); + auto p = pending_service_map.services.begin(); + while (p != pending_service_map.services.end()) { + auto q = p->second.daemons.begin(); + while (q != p->second.daemons.end()) { + DaemonKey key{p->first, q->first}; + if (!daemon_state.exists(key)) { + derr << "missing key " << key << dendl; + ++q; + continue; + } + auto daemon = daemon_state.get(key); + std::lock_guard l(daemon->lock); + if (daemon->last_service_beacon == utime_t()) { + // we must have just restarted; assume they are alive now. + daemon->last_service_beacon = ceph_clock_now(); + ++q; + continue; + } + if (daemon->last_service_beacon < cutoff) { + dout(10) << "pruning stale " << p->first << "." 
<< q->first + << " last_beacon " << daemon->last_service_beacon << dendl; + q = p->second.daemons.erase(q); + pending_service_map_dirty = pending_service_map.epoch; + } else { + ++q; + } + } + if (p->second.daemons.empty()) { + p = pending_service_map.services.erase(p); + pending_service_map_dirty = pending_service_map.epoch; + } else { + ++p; + } + } +} + +void DaemonServer::send_report() +{ + if (!pgmap_ready) { + if (ceph_clock_now() - started_at > g_conf().get_val("mgr_stats_period") * 4.0) { + pgmap_ready = true; + reported_osds.clear(); + dout(1) << "Giving up on OSDs that haven't reported yet, sending " + << "potentially incomplete PG state to mon" << dendl; + } else { + dout(1) << "Not sending PG status to monitor yet, waiting for OSDs" + << dendl; + return; + } + } + + auto m = ceph::make_message(); + py_modules.get_health_checks(&m->health_checks); + py_modules.get_progress_events(&m->progress_events); + + cluster_state.with_mutable_pgmap([&](PGMap& pg_map) { + cluster_state.update_delta_stats(); + + if (pending_service_map.epoch) { + _prune_pending_service_map(); + if (pending_service_map_dirty >= pending_service_map.epoch) { + pending_service_map.modified = ceph_clock_now(); + encode(pending_service_map, m->service_map_bl, CEPH_FEATURES_ALL); + dout(10) << "sending service_map e" << pending_service_map.epoch + << dendl; + pending_service_map.epoch++; + } + } + + cluster_state.with_osdmap([&](const OSDMap& osdmap) { + // FIXME: no easy way to get mon features here. this will do for + // now, though, as long as we don't make a backward-incompat change. 
+ pg_map.encode_digest(osdmap, m->get_data(), CEPH_FEATURES_ALL); + dout(10) << pg_map << dendl; + + pg_map.get_health_checks(g_ceph_context, osdmap, + &m->health_checks); + + dout(10) << m->health_checks.checks.size() << " health checks" + << dendl; + dout(20) << "health checks:\n"; + JSONFormatter jf(true); + jf.dump_object("health_checks", m->health_checks); + jf.flush(*_dout); + *_dout << dendl; + if (osdmap.require_osd_release >= ceph_release_t::luminous) { + clog->debug() << "pgmap v" << pg_map.version << ": " << pg_map; + } + }); + }); + + map> accumulated; + for (auto service : {"osd", "mon"} ) { + auto daemons = daemon_state.get_by_service(service); + for (const auto& [key,state] : daemons) { + std::lock_guard l{state->lock}; + for (const auto& metric : state->daemon_health_metrics) { + auto acc = accumulated.find(metric.get_type()); + if (acc == accumulated.end()) { + auto collector = DaemonHealthMetricCollector::create(metric.get_type()); + if (!collector) { + derr << __func__ << " " << key + << " sent me an unknown health metric: " + << std::hex << static_cast(metric.get_type()) + << std::dec << dendl; + continue; + } + dout(20) << " + " << state->key << " " + << metric << dendl; + tie(acc, std::ignore) = accumulated.emplace(metric.get_type(), + std::move(collector)); + } + acc->second->update(key, metric); + } + } + } + for (const auto& acc : accumulated) { + acc.second->summarize(m->health_checks); + } + // TODO? We currently do not notify the PyModules + // TODO: respect needs_send, so we send the report only if we are asked to do + // so, or the state is updated. 
+ monc->send_mon_message(std::move(m)); +} + +void DaemonServer::adjust_pgs() +{ + dout(20) << dendl; + unsigned max = std::max(1, g_conf()->mon_osd_max_creating_pgs); + double max_misplaced = g_conf().get_val("target_max_misplaced_ratio"); + bool aggro = g_conf().get_val("mgr_debug_aggressive_pg_num_changes"); + + map pg_num_to_set; + map pgp_num_to_set; + set upmaps_to_clear; + cluster_state.with_osdmap_and_pgmap([&](const OSDMap& osdmap, const PGMap& pg_map) { + unsigned creating_or_unknown = 0; + for (auto& i : pg_map.num_pg_by_state) { + if ((i.first & (PG_STATE_CREATING)) || + i.first == 0) { + creating_or_unknown += i.second; + } + } + unsigned left = max; + if (creating_or_unknown >= max) { + return; + } + left -= creating_or_unknown; + dout(10) << "creating_or_unknown " << creating_or_unknown + << " max_creating " << max + << " left " << left + << dendl; + + // FIXME: These checks are fundamentally racy given that adjust_pgs() + // can run more frequently than we get updated pg stats from OSDs. We + // may make multiple adjustments with stale information. + double misplaced_ratio, degraded_ratio; + double inactive_pgs_ratio, unknown_pgs_ratio; + pg_map.get_recovery_stats(&misplaced_ratio, &degraded_ratio, + &inactive_pgs_ratio, &unknown_pgs_ratio); + dout(20) << "misplaced_ratio " << misplaced_ratio + << " degraded_ratio " << degraded_ratio + << " inactive_pgs_ratio " << inactive_pgs_ratio + << " unknown_pgs_ratio " << unknown_pgs_ratio + << "; target_max_misplaced_ratio " << max_misplaced + << dendl; + + for (auto& i : osdmap.get_pools()) { + const pg_pool_t& p = i.second; + + // adjust pg_num? 
+ if (p.get_pg_num_target() != p.get_pg_num()) { + dout(20) << "pool " << i.first + << " pg_num " << p.get_pg_num() + << " target " << p.get_pg_num_target() + << dendl; + if (p.has_flag(pg_pool_t::FLAG_CREATING)) { + dout(10) << "pool " << i.first + << " pg_num_target " << p.get_pg_num_target() + << " pg_num " << p.get_pg_num() + << " - still creating initial pgs" + << dendl; + } else if (p.get_pg_num_target() < p.get_pg_num()) { + // pg_num decrease (merge) + pg_t merge_source(p.get_pg_num() - 1, i.first); + pg_t merge_target = merge_source.get_parent(); + bool ok = true; + + if (p.get_pg_num() != p.get_pg_num_pending()) { + dout(10) << "pool " << i.first + << " pg_num_target " << p.get_pg_num_target() + << " pg_num " << p.get_pg_num() + << " - decrease and pg_num_pending != pg_num, waiting" + << dendl; + ok = false; + } else if (p.get_pg_num() == p.get_pgp_num()) { + dout(10) << "pool " << i.first + << " pg_num_target " << p.get_pg_num_target() + << " pg_num " << p.get_pg_num() + << " - decrease blocked by pgp_num " + << p.get_pgp_num() + << dendl; + ok = false; + } + vector source_acting; + for (auto &merge_participant : {merge_source, merge_target}) { + bool is_merge_source = merge_participant == merge_source; + if (osdmap.have_pg_upmaps(merge_participant)) { + dout(10) << "pool " << i.first + << " pg_num_target " << p.get_pg_num_target() + << " pg_num " << p.get_pg_num() + << (is_merge_source ? " - merge source " : " - merge target ") + << merge_participant + << " has upmap" << dendl; + upmaps_to_clear.insert(merge_participant); + ok = false; + } + auto q = pg_map.pg_stat.find(merge_participant); + if (q == pg_map.pg_stat.end()) { + dout(10) << "pool " << i.first + << " pg_num_target " << p.get_pg_num_target() + << " pg_num " << p.get_pg_num() + << " - no state for " << merge_participant + << (is_merge_source ? 
" (merge source)" : " (merge target)") + << dendl; + ok = false; + } else if ((q->second.state & (PG_STATE_ACTIVE | PG_STATE_CLEAN)) != + (PG_STATE_ACTIVE | PG_STATE_CLEAN)) { + dout(10) << "pool " << i.first + << " pg_num_target " << p.get_pg_num_target() + << " pg_num " << p.get_pg_num() + << (is_merge_source ? " - merge source " : " - merge target ") + << merge_participant + << " not clean (" << pg_state_string(q->second.state) + << ")" << dendl; + ok = false; + } + if (is_merge_source) { + if (q != pg_map.pg_stat.end()) source_acting = q->second.acting; // q is end() when the source has no pg_stat entry ('ok' is already false then); never dereference it + } else if (ok && q->second.acting != source_acting) { + dout(10) << "pool " << i.first + << " pg_num_target " << p.get_pg_num_target() + << " pg_num " << p.get_pg_num() + << (is_merge_source ? " - merge source " : " - merge target ") + << merge_participant + << " acting does not match (source " << source_acting + << " != target " << q->second.acting + << ")" << dendl; + ok = false; + } + } + + if (ok) { + unsigned target = p.get_pg_num() - 1; + dout(10) << "pool " << i.first + << " pg_num_target " << p.get_pg_num_target() + << " pg_num " << p.get_pg_num() + << " -> " << target + << " (merging " << merge_source + << " and " << merge_target + << ")" << dendl; + pg_num_to_set[osdmap.get_pool_name(i.first)] = target; + continue; + } + } else if (p.get_pg_num_target() > p.get_pg_num()) { + // pg_num increase (split) + bool active = true; + auto q = pg_map.num_pg_by_pool_state.find(i.first); + if (q != pg_map.num_pg_by_pool_state.end()) { + for (auto& j : q->second) { + if ((j.first & (PG_STATE_ACTIVE|PG_STATE_PEERED)) == 0) { + dout(20) << "pool " << i.first << " has " << j.second + << " pgs in " << pg_state_string(j.first) + << dendl; + active = false; + break; + } + } + } else { + active = false; + } + if (!active) { + dout(10) << "pool " << i.first + << " pg_num_target " << p.get_pg_num_target() + << " pg_num " << p.get_pg_num() + << " - not all pgs active" + << dendl; + } else { + unsigned add = std::min( + left, + p.get_pg_num_target() - 
p.get_pg_num()); + unsigned target = p.get_pg_num() + add; + left -= add; + dout(10) << "pool " << i.first + << " pg_num_target " << p.get_pg_num_target() + << " pg_num " << p.get_pg_num() + << " -> " << target << dendl; + pg_num_to_set[osdmap.get_pool_name(i.first)] = target; + } + } + } + + // adjust pgp_num? + unsigned target = std::min(p.get_pg_num_pending(), + p.get_pgp_num_target()); + if (target != p.get_pgp_num()) { + dout(20) << "pool " << i.first + << " pgp_num_target " << p.get_pgp_num_target() + << " pgp_num " << p.get_pgp_num() + << " -> " << target << dendl; + if (target > p.get_pgp_num() && + p.get_pgp_num() == p.get_pg_num()) { + dout(10) << "pool " << i.first + << " pgp_num_target " << p.get_pgp_num_target() + << " pgp_num " << p.get_pgp_num() + << " - increase blocked by pg_num " << p.get_pg_num() + << dendl; + } else if (!aggro && (inactive_pgs_ratio > 0 || + degraded_ratio > 0 || + unknown_pgs_ratio > 0)) { + dout(10) << "pool " << i.first + << " pgp_num_target " << p.get_pgp_num_target() + << " pgp_num " << p.get_pgp_num() + << " - inactive|degraded|unknown pgs, deferring pgp_num" + << " update" << dendl; + } else if (!aggro && (misplaced_ratio > max_misplaced)) { + dout(10) << "pool " << i.first + << " pgp_num_target " << p.get_pgp_num_target() + << " pgp_num " << p.get_pgp_num() + << " - misplaced_ratio " << misplaced_ratio + << " > max " << max_misplaced + << ", deferring pgp_num update" << dendl; + } else { + // NOTE: this calculation assumes objects are + // basically uniformly distributed across all PGs + // (regardless of pool), which is probably not + // perfectly correct, but it's a start. make no + // single adjustment that's more than half of the + // max_misplaced, to somewhat limit the magnitude of + // our potential error here. 
+ int next; + static constexpr unsigned MAX_NUM_OBJECTS_PER_PG_FOR_LEAP = 1; + pool_stat_t s = pg_map.get_pg_pool_sum_stat(i.first); + if (aggro || + // pool is (virtually) empty; just jump to final pgp_num? + (p.get_pgp_num_target() > p.get_pgp_num() && + s.stats.sum.num_objects <= (MAX_NUM_OBJECTS_PER_PG_FOR_LEAP * + p.get_pgp_num_target()))) { + next = target; + } else { + double room = + std::min(max_misplaced - misplaced_ratio, + max_misplaced / 2.0); + unsigned estmax = std::max( + (double)p.get_pg_num() * room, 1u); + next = std::clamp(target, + p.get_pgp_num() - estmax, + p.get_pgp_num() + estmax); + dout(20) << " room " << room << " estmax " << estmax + << " delta " << (target-p.get_pgp_num()) + << " next " << next << dendl; + if (p.get_pgp_num_target() == p.get_pg_num_target() && + p.get_pgp_num_target() < p.get_pg_num()) { + // since pgp_num is tracking pg_num, ceph is handling + // pgp_num. so, be responsible: don't let pgp_num get + // too far out ahead of merges (if we are merging). + // this avoids moving lots of unmerged pgs onto a + // small number of OSDs where we might blow out the + // per-osd pg max. 
+ unsigned max_outpace_merges = + std::max(8, p.get_pg_num() * max_misplaced); + if (next + max_outpace_merges < p.get_pg_num()) { + next = p.get_pg_num() - max_outpace_merges; + dout(10) << " using next " << next + << " to avoid outpacing merges (max_outpace_merges " + << max_outpace_merges << ")" << dendl; + } + } + } + dout(10) << "pool " << i.first + << " pgp_num_target " << p.get_pgp_num_target() + << " pgp_num " << p.get_pgp_num() + << " -> " << next << dendl; + pgp_num_to_set[osdmap.get_pool_name(i.first)] = next; + } + } + if (left == 0) { + return; + } + } + }); + for (auto i : pg_num_to_set) { + const string cmd = + "{" + "\"prefix\": \"osd pool set\", " + "\"pool\": \"" + i.first + "\", " + "\"var\": \"pg_num_actual\", " + "\"val\": \"" + stringify(i.second) + "\"" + "}"; + monc->start_mon_command({cmd}, {}, nullptr, nullptr, nullptr); + } + for (auto i : pgp_num_to_set) { + const string cmd = + "{" + "\"prefix\": \"osd pool set\", " + "\"pool\": \"" + i.first + "\", " + "\"var\": \"pgp_num_actual\", " + "\"val\": \"" + stringify(i.second) + "\"" + "}"; + monc->start_mon_command({cmd}, {}, nullptr, nullptr, nullptr); + } + for (auto pg : upmaps_to_clear) { + const string cmd = + "{" + "\"prefix\": \"osd rm-pg-upmap\", " + "\"pgid\": \"" + stringify(pg) + "\"" + "}"; + monc->start_mon_command({cmd}, {}, nullptr, nullptr, nullptr); + const string cmd2 = + "{" + "\"prefix\": \"osd rm-pg-upmap-items\", " + "\"pgid\": \"" + stringify(pg) + "\"" + + "}"; + monc->start_mon_command({cmd2}, {}, nullptr, nullptr, nullptr); + } +} + +void DaemonServer::got_service_map() +{ + std::lock_guard l(lock); + + cluster_state.with_servicemap([&](const ServiceMap& service_map) { + if (pending_service_map.epoch == 0) { + // we just started up + dout(10) << "got initial map e" << service_map.epoch << dendl; + pending_service_map = service_map; + } else { + // we were already active and therefore must have persisted it, + // which means ours is the same or newer. 
+ dout(10) << "got updated map e" << service_map.epoch << dendl; + } + pending_service_map.epoch = service_map.epoch + 1; + }); + + // cull missing daemons, populate new ones + std::set types; + for (auto& [type, service] : pending_service_map.services) { + if (ServiceMap::is_normal_ceph_entity(type)) { + continue; + } + + types.insert(type); + + std::set names; + for (auto& q : service.daemons) { + names.insert(q.first); + DaemonKey key{type, q.first}; + if (!daemon_state.exists(key)) { + auto daemon = std::make_shared(daemon_state.types); + daemon->key = key; + daemon->set_metadata(q.second.metadata); + daemon->service_daemon = true; + daemon_state.insert(daemon); + dout(10) << "added missing " << key << dendl; + } + } + daemon_state.cull(type, names); + } + daemon_state.cull_services(types); +} + +void DaemonServer::got_mgr_map() +{ + std::lock_guard l(lock); + set have; + cluster_state.with_mgrmap([&](const MgrMap& mgrmap) { + auto md_update = [&] (DaemonKey key) { + std::ostringstream oss; + auto c = new MetadataUpdate(daemon_state, key); + // FIXME remove post-nautilus: include 'id' for luminous mons + oss << "{\"prefix\": \"mgr metadata\", \"who\": \"" + << key.name << "\", \"id\": \"" << key.name << "\"}"; + monc->start_mon_command({oss.str()}, {}, &c->outbl, &c->outs, c); + }; + if (mgrmap.active_name.size()) { + DaemonKey key{"mgr", mgrmap.active_name}; + have.insert(mgrmap.active_name); + if (!daemon_state.exists(key) && !daemon_state.is_updating(key)) { + md_update(key); + dout(10) << "triggered addition of " << key << " via metadata update" << dendl; + } + } + for (auto& i : mgrmap.standbys) { + DaemonKey key{"mgr", i.second.name}; + have.insert(i.second.name); + if (!daemon_state.exists(key) && !daemon_state.is_updating(key)) { + md_update(key); + dout(10) << "triggered addition of " << key << " via metadata update" << dendl; + } + } + }); + daemon_state.cull("mgr", have); +} + +const char** DaemonServer::get_tracked_conf_keys() const +{ + static 
const char *KEYS[] = { + "mgr_stats_threshold", + "mgr_stats_period", + nullptr + }; + + return KEYS; +} + +void DaemonServer::handle_conf_change(const ConfigProxy& conf, + const std::set &changed) +{ + + if (changed.count("mgr_stats_threshold") || changed.count("mgr_stats_period")) { + dout(4) << "Updating stats threshold/period on " + << daemon_connections.size() << " clients" << dendl; + // Send a fresh MMgrConfigure to all clients, so that they can follow + // the new policy for transmitting stats + finisher.queue(new LambdaContext([this](int r) { + std::lock_guard l(lock); + for (auto &c : daemon_connections) { + _send_configure(c); + } + })); + } +} + +void DaemonServer::_send_configure(ConnectionRef c) +{ + ceph_assert(ceph_mutex_is_locked_by_me(lock)); + + auto configure = make_message(); + configure->stats_period = g_conf().get_val("mgr_stats_period"); + configure->stats_threshold = g_conf().get_val("mgr_stats_threshold"); + + if (c->peer_is_osd()) { + configure->osd_perf_metric_queries = + osd_perf_metric_collector.get_queries(); + } + + c->send_message2(configure); +} + +MetricQueryID DaemonServer::add_osd_perf_query( + const OSDPerfMetricQuery &query, + const std::optional &limit) +{ + return osd_perf_metric_collector.add_query(query, limit); +} + +int DaemonServer::remove_osd_perf_query(MetricQueryID query_id) +{ + return osd_perf_metric_collector.remove_query(query_id); +} + +int DaemonServer::get_osd_perf_counters( + MetricQueryID query_id, + std::map *counters) +{ + return osd_perf_metric_collector.get_counters(query_id, counters); +} diff --git a/src/mon/MonCap.cc b/src/mon/MonCap.cc index 644d614bdf94a..c397e10a58553 100644 --- a/src/mon/MonCap.cc +++ b/src/mon/MonCap.cc @@ -214,6 +214,12 @@ mon_rwxa_t MonCapGrant::get_allowed(CephContext *cct, } return MON_CAP_ALL; } + // we don't allow config-key service to be accessed with blanket caps other + // than '*' (i.e., 'any'), and that should have been checked by the caller + // via 'is_allow_all()'. 
+ if (s == "config-key") { + return 0; + } return allow; } @@ -346,7 +352,7 @@ struct MonCapParser : qi::grammar quoted_string %= lexeme['"' >> +(char_ - '"') >> '"'] | lexeme['\'' >> +(char_ - '\'') >> '\'']; - unquoted_word %= +char_("a-zA-Z0-9_.-"); + unquoted_word %= +char_("a-zA-Z0-9_/.-"); str %= quoted_string | unquoted_word; spaces = +lit(' '); diff --git a/src/mon/MonCap.cc.orig b/src/mon/MonCap.cc.orig new file mode 100644 index 0000000000000..644d614bdf94a --- /dev/null +++ b/src/mon/MonCap.cc.orig @@ -0,0 +1,450 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2013 Inktank + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "MonCap.h" +#include "include/stringify.h" +#include "common/config.h" +#include "common/debug.h" +#include "common/Formatter.h" + +#include + +static inline bool is_not_alnum_space(char c) +{ + return !(isalpha(c) || isdigit(c) || (c == '-') || (c == '_')); +} + +static string maybe_quote_string(const std::string& str) +{ + if (find_if(str.begin(), str.end(), is_not_alnum_space) == str.end()) + return str; + return string("\"") + str + string("\""); +} + +using std::ostream; +using std::vector; + +#define dout_subsys ceph_subsys_mon + +ostream& operator<<(ostream& out, mon_rwxa_t p) +{ + if (p == MON_CAP_ANY) + return out << "*"; + + if (p & MON_CAP_R) + out << "r"; + if (p & MON_CAP_W) + out << "w"; + if (p & MON_CAP_X) + out << "x"; + return out; +} + +ostream& operator<<(ostream& out, const StringConstraint& c) +{ + if (c.prefix.length()) + return out << "prefix " << c.prefix; + else + return out << "value " << c.value; +} + +ostream& operator<<(ostream& 
out, const MonCapGrant& m) +{ + out << "allow"; + if (m.service.length()) { + out << " service " << maybe_quote_string(m.service); + } + if (m.command.length()) { + out << " command " << maybe_quote_string(m.command); + if (!m.command_args.empty()) { + out << " with"; + for (map::const_iterator p = m.command_args.begin(); + p != m.command_args.end(); + ++p) { + if (p->second.value.length()) + out << " " << maybe_quote_string(p->first) << "=" << maybe_quote_string(p->second.value); + else + out << " " << maybe_quote_string(p->first) << " prefix " << maybe_quote_string(p->second.prefix); + } + } + } + if (m.profile.length()) { + out << " profile " << maybe_quote_string(m.profile); + } + if (m.allow != 0) + out << " " << m.allow; + return out; +} + + +// +// fusion lets us easily populate structs via the qi parser. + +typedef map kvmap; + +BOOST_FUSION_ADAPT_STRUCT(MonCapGrant, + (std::string, service) + (std::string, profile) + (std::string, command) + (kvmap, command_args) + (mon_rwxa_t, allow)) + +BOOST_FUSION_ADAPT_STRUCT(StringConstraint, + (std::string, value) + (std::string, prefix)) + +// + +void MonCapGrant::expand_profile(entity_name_t name) const +{ + // only generate this list once + if (!profile_grants.empty()) + return; + + if (profile == "mon") { + profile_grants.push_back(MonCapGrant("mon", MON_CAP_ALL)); + profile_grants.push_back(MonCapGrant("log", MON_CAP_ALL)); + } + if (profile == "osd") { + profile_grants.push_back(MonCapGrant("osd", MON_CAP_ALL)); + profile_grants.push_back(MonCapGrant("mon", MON_CAP_R)); + profile_grants.push_back(MonCapGrant("pg", MON_CAP_R | MON_CAP_W)); + profile_grants.push_back(MonCapGrant("log", MON_CAP_W)); + } + if (profile == "mds") { + profile_grants.push_back(MonCapGrant("mds", MON_CAP_ALL)); + profile_grants.push_back(MonCapGrant("mon", MON_CAP_R)); + profile_grants.push_back(MonCapGrant("osd", MON_CAP_R)); + profile_grants.push_back(MonCapGrant("log", MON_CAP_W)); + } + if (profile == "osd" || profile == "mds" || 
profile == "mon") { + string prefix = string("daemon-private/") + stringify(name) + string("/"); + profile_grants.push_back(MonCapGrant("config-key get", "key", StringConstraint("", prefix))); + profile_grants.push_back(MonCapGrant("config-key put", "key", StringConstraint("", prefix))); + profile_grants.push_back(MonCapGrant("config-key exists", "key", StringConstraint("", prefix))); + profile_grants.push_back(MonCapGrant("config-key delete", "key", StringConstraint("", prefix))); + } + if (profile == "bootstrap-osd") { + profile_grants.push_back(MonCapGrant("mon", MON_CAP_R)); // read monmap + profile_grants.push_back(MonCapGrant("osd", MON_CAP_R)); // read osdmap + profile_grants.push_back(MonCapGrant("mon getmap")); + profile_grants.push_back(MonCapGrant("osd create")); + profile_grants.push_back(MonCapGrant("auth add")); + profile_grants.back().command_args["entity"] = StringConstraint("", "osd."); + profile_grants.back().command_args["caps_mon"] = StringConstraint("allow profile osd", ""); + profile_grants.back().command_args["caps_osd"] = StringConstraint("allow *", ""); + } + if (profile == "bootstrap-mds") { + profile_grants.push_back(MonCapGrant("mon", MON_CAP_R)); // read monmap + profile_grants.push_back(MonCapGrant("osd", MON_CAP_R)); // read osdmap + profile_grants.push_back(MonCapGrant("mon getmap")); + profile_grants.push_back(MonCapGrant("auth get-or-create")); // FIXME: this can expose other mds keys + profile_grants.back().command_args["entity"] = StringConstraint("", "mds."); + profile_grants.back().command_args["caps_mon"] = StringConstraint("allow profile mds", ""); + profile_grants.back().command_args["caps_osd"] = StringConstraint("allow rwx", ""); + profile_grants.back().command_args["caps_mds"] = StringConstraint("allow", ""); + } + if (profile == "fs-client") { + profile_grants.push_back(MonCapGrant("mon", MON_CAP_R)); + profile_grants.push_back(MonCapGrant("mds", MON_CAP_R)); + profile_grants.push_back(MonCapGrant("osd", MON_CAP_R)); + 
profile_grants.push_back(MonCapGrant("pg", MON_CAP_R)); + } + if (profile == "simple-rados-client") { + profile_grants.push_back(MonCapGrant("mon", MON_CAP_R)); + profile_grants.push_back(MonCapGrant("osd", MON_CAP_R)); + profile_grants.push_back(MonCapGrant("pg", MON_CAP_R)); + } +} + +mon_rwxa_t MonCapGrant::get_allowed(CephContext *cct, + entity_name_t name, + const std::string& s, const std::string& c, + const map& c_args) const +{ + if (profile.length()) { + expand_profile(name); + mon_rwxa_t a; + for (list::const_iterator p = profile_grants.begin(); + p != profile_grants.end(); ++p) + a = a | p->get_allowed(cct, name, s, c, c_args); + return a; + } + if (service.length()) { + if (service != s) + return 0; + return allow; + } + if (command.length()) { + if (command != c) + return 0; + for (map::const_iterator p = command_args.begin(); p != command_args.end(); ++p) { + map::const_iterator q = c_args.find(p->first); + // argument must be present if a constraint exists + if (q == c_args.end()) + return 0; + if (p->second.value.length()) { + // match value + if (p->second.value != q->second) + return 0; + } else { + // match prefix + if (q->second.find(p->second.prefix) != 0) + return 0; + } + } + return MON_CAP_ALL; + } + return allow; +} + +ostream& operator<<(ostream&out, const MonCap& m) +{ + for (vector::const_iterator p = m.grants.begin(); p != m.grants.end(); ++p) { + if (p != m.grants.begin()) + out << ", "; + out << *p; + } + return out; +} + +bool MonCap::is_allow_all() const +{ + for (vector::const_iterator p = grants.begin(); p != grants.end(); ++p) + if (p->is_allow_all()) + return true; + return false; +} + +void MonCap::set_allow_all() +{ + grants.clear(); + grants.push_back(MonCapGrant(MON_CAP_ANY)); + text = "allow *"; +} + +bool MonCap::is_capable(CephContext *cct, + entity_name_t name, + const string& service, + const string& command, const map& command_args, + bool op_may_read, bool op_may_write, bool op_may_exec) const +{ + if (cct) + 
ldout(cct, 20) << "is_capable service=" << service << " command=" << command + << (op_may_read ? " read":"") + << (op_may_write ? " write":"") + << (op_may_exec ? " exec":"") + << " on cap " << *this + << dendl; + mon_rwxa_t allow = 0; + for (vector::const_iterator p = grants.begin(); + p != grants.end(); ++p) { + if (cct) + ldout(cct, 20) << " allow so far " << allow << ", doing grant " << *p << dendl; + + if (p->is_allow_all()) { + if (cct) + ldout(cct, 20) << " allow all" << dendl; + return true; + } + + // check enumerated caps + allow = allow | p->get_allowed(cct, name, service, command, command_args); + if ((!op_may_read || (allow & MON_CAP_R)) && + (!op_may_write || (allow & MON_CAP_W)) && + (!op_may_exec || (allow & MON_CAP_X))) { + if (cct) + ldout(cct, 20) << " match" << dendl; + return true; + } + } + return false; +} + +void MonCap::encode(bufferlist& bl) const +{ + ENCODE_START(4, 4, bl); // legacy MonCaps was 3, 3 + ::encode(text, bl); + ENCODE_FINISH(bl); +} + +void MonCap::decode(bufferlist::iterator& bl) +{ + string s; + DECODE_START(4, bl); + ::decode(s, bl); + DECODE_FINISH(bl); + parse(s, NULL); +} + +void MonCap::dump(Formatter *f) const +{ + f->dump_string("text", text); +} + +void MonCap::generate_test_instances(list& ls) +{ + ls.push_back(new MonCap); + ls.push_back(new MonCap); + ls.back()->parse("allow *"); + ls.push_back(new MonCap); + ls.back()->parse("allow rwx"); + ls.push_back(new MonCap); + ls.back()->parse("allow service foo x"); + ls.push_back(new MonCap); + ls.back()->parse("allow command bar x"); + ls.push_back(new MonCap); + ls.back()->parse("allow service foo r, allow command bar x"); + ls.push_back(new MonCap); + ls.back()->parse("allow command bar with k1=v1 x"); + ls.push_back(new MonCap); + ls.back()->parse("allow command bar with k1=v1 k2=v2 x"); +} + +// grammar +namespace qi = boost::spirit::qi; +namespace ascii = boost::spirit::ascii; +namespace phoenix = boost::phoenix; + + +template +struct MonCapParser : qi::grammar 
+{ + MonCapParser() : MonCapParser::base_type(moncap) + { + using qi::char_; + using qi::int_; + using qi::ulong_long; + using qi::lexeme; + using qi::alnum; + using qi::_val; + using qi::_1; + using qi::_2; + using qi::_3; + using qi::eps; + using qi::lit; + + quoted_string %= + lexeme['"' >> +(char_ - '"') >> '"'] | + lexeme['\'' >> +(char_ - '\'') >> '\'']; + unquoted_word %= +char_("a-zA-Z0-9_.-"); + str %= quoted_string | unquoted_word; + + spaces = +lit(' '); + + // command := command[=]cmd [k1=v1 k2=v2 ...] + str_match = '=' >> str >> qi::attr(string()); + str_prefix = spaces >> lit("prefix") >> spaces >> qi::attr(string()) >> str; + kv_pair = str >> (str_match | str_prefix); + kv_map %= kv_pair >> *(spaces >> kv_pair); + command_match = -spaces >> lit("allow") >> spaces >> lit("command") >> (lit('=') | spaces) + >> qi::attr(string()) >> qi::attr(string()) + >> str + >> -(spaces >> lit("with") >> spaces >> kv_map) + >> qi::attr(0); + + // service foo rwxa + service_match %= -spaces >> lit("allow") >> spaces >> lit("service") >> (lit('=') | spaces) + >> str >> qi::attr(string()) >> qi::attr(string()) + >> qi::attr(map()) + >> spaces >> rwxa; + + // profile foo + profile_match %= -spaces >> lit("allow") >> spaces >> lit("profile") >> (lit('=') | spaces) + >> qi::attr(string()) + >> str + >> qi::attr(string()) + >> qi::attr(map()) + >> qi::attr(0); + + // rwxa + rwxa_match %= -spaces >> lit("allow") >> spaces + >> qi::attr(string()) >> qi::attr(string()) >> qi::attr(string()) + >> qi::attr(map()) + >> rwxa; + + // rwxa := * | [r][w][x] + rwxa = + (lit("*")[_val = MON_CAP_ANY]) | + ( eps[_val = 0] >> + ( lit('r')[_val |= MON_CAP_R] || + lit('w')[_val |= MON_CAP_W] || + lit('x')[_val |= MON_CAP_X] + ) + ); + + // grant := allow ... + grant = -spaces >> (rwxa_match | profile_match | service_match | command_match) >> -spaces; + + // moncap := grant [grant ...] 
+ grants %= (grant % (*lit(' ') >> (lit(';') | lit(',')) >> *lit(' '))); + moncap = grants [_val = phoenix::construct(_1)]; + + } + qi::rule spaces; + qi::rule rwxa; + qi::rule quoted_string; + qi::rule unquoted_word; + qi::rule str; + + qi::rule str_match, str_prefix; + qi::rule()> kv_pair; + qi::rule()> kv_map; + + qi::rule rwxa_match; + qi::rule command_match; + qi::rule service_match; + qi::rule profile_match; + qi::rule grant; + qi::rule()> grants; + qi::rule moncap; +}; + +bool MonCap::parse(const string& str, ostream *err) +{ + string s = str; + string::iterator iter = s.begin(); + string::iterator end = s.end(); + + MonCapParser g; + bool r = qi::parse(iter, end, g, *this); + //MonCapGrant foo; + //bool r = qi::phrase_parse(iter, end, g, ascii::space, foo); + if (r && iter == end) { + text = str; + return true; + } + + // Make sure no grants are kept after parsing failed! + grants.clear(); + + if (err) { + if (iter != end) + *err << "moncap parse failed, stopped at '" << std::string(iter, end) + << "' of '" << str << "'\n"; + else + *err << "moncap parse failed, stopped at end of '" << str << "'\n"; + } + + return false; +} + diff --git a/src/mon/MonClient.cc.orig b/src/mon/MonClient.cc.orig new file mode 100644 index 0000000000000..57fb9b1554f3e --- /dev/null +++ b/src/mon/MonClient.cc.orig @@ -0,0 +1,2008 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#include +#include +#include +#include +#include +#include +#include +#include "common/weighted_shuffle.h" + +#include "include/scope_guard.h" +#include "include/stringify.h" + +#include "messages/MMonGetMap.h" +#include "messages/MMonGetVersion.h" +#include "messages/MMonGetVersionReply.h" +#include "messages/MMonMap.h" +#include "messages/MConfig.h" +#include "messages/MGetConfig.h" +#include "messages/MAuth.h" +#include "messages/MLogAck.h" +#include "messages/MAuthReply.h" +#include "messages/MMonCommand.h" +#include "messages/MMonCommandAck.h" +#include "messages/MCommand.h" +#include "messages/MCommandReply.h" +#include "messages/MPing.h" + +#include "messages/MMonSubscribe.h" +#include "messages/MMonSubscribeAck.h" +#include "common/errno.h" +#include "common/hostname.h" +#include "common/LogClient.h" + +#include "MonClient.h" +#include "MonMap.h" + +#include "auth/Auth.h" +#include "auth/KeyRing.h" +#include "auth/AuthClientHandler.h" +#include "auth/AuthRegistry.h" +#include "auth/RotatingKeyRing.h" + +#define dout_subsys ceph_subsys_monc +#undef dout_prefix +#define dout_prefix *_dout << "monclient" << (_hunting() ? 
"(hunting)":"") << ": " + +using std::string; + +MonClient::MonClient(CephContext *cct_) : + Dispatcher(cct_), + AuthServer(cct_), + messenger(NULL), + timer(cct_, monc_lock), + finisher(cct_), + initialized(false), + log_client(NULL), + more_log_pending(false), + want_monmap(true), + had_a_connection(false), + reopen_interval_multiplier( + cct_->_conf.get_val("mon_client_hunt_interval_min_multiple")), + last_mon_command_tid(0), + version_req_id(0) +{} + +MonClient::~MonClient() +{ +} + +int MonClient::build_initial_monmap() +{ + ldout(cct, 10) << __func__ << dendl; + int r = monmap.build_initial(cct, false, std::cerr); + ldout(cct,10) << "monmap:\n"; + monmap.print(*_dout); + *_dout << dendl; + return r; +} + +int MonClient::get_monmap() +{ + ldout(cct, 10) << __func__ << dendl; + std::unique_lock l(monc_lock); + + sub.want("monmap", 0, 0); + if (!_opened()) + _reopen_session(); + map_cond.wait(l, [this] { return !want_monmap; }); + ldout(cct, 10) << __func__ << " done" << dendl; + return 0; +} + +int MonClient::get_monmap_and_config() +{ + ldout(cct, 10) << __func__ << dendl; + ceph_assert(!messenger); + + int tries = 10; + + cct->init_crypto(); + auto shutdown_crypto = make_scope_guard([this] { + cct->shutdown_crypto(); + }); + + int r = build_initial_monmap(); + if (r < 0) { + lderr(cct) << __func__ << " cannot identify monitors to contact" << dendl; + return r; + } + + messenger = Messenger::create_client_messenger( + cct, "temp_mon_client"); + ceph_assert(messenger); + messenger->add_dispatcher_head(this); + messenger->start(); + auto shutdown_msgr = make_scope_guard([this] { + messenger->shutdown(); + messenger->wait(); + delete messenger; + messenger = nullptr; + if (!monmap.fsid.is_zero()) { + cct->_conf.set_val("fsid", stringify(monmap.fsid)); + } + }); + + while (tries-- > 0) { + r = init(); + if (r < 0) { + return r; + } + r = authenticate(cct->_conf->client_mount_timeout); + if (r == -ETIMEDOUT) { + shutdown(); + continue; + } + if (r < 0) { + break; + 
} + { + std::unique_lock l(monc_lock); + if (monmap.get_epoch() && + !monmap.persistent_features.contains_all( + ceph::features::mon::FEATURE_MIMIC)) { + ldout(cct,10) << __func__ << " pre-mimic monitor, no config to fetch" + << dendl; + r = 0; + break; + } + while ((!got_config || monmap.get_epoch() == 0) && r == 0) { + ldout(cct,20) << __func__ << " waiting for monmap|config" << dendl; + map_cond.wait_for(l, ceph::make_timespan( + cct->_conf->mon_client_hunt_interval)); + } + if (got_config) { + ldout(cct,10) << __func__ << " success" << dendl; + r = 0; + break; + } + } + lderr(cct) << __func__ << " failed to get config" << dendl; + shutdown(); + continue; + } + + shutdown(); + return r; +} + + +/** + * Ping the monitor with id @p mon_id and set the resulting reply in + * the provided @p result_reply, if this last parameter is not NULL. + * + * So that we don't rely on the MonClient's default messenger, set up + * during connect(), we create our own messenger to comunicate with the + * specified monitor. This is advantageous in the following ways: + * + * - Isolate the ping procedure from the rest of the MonClient's operations, + * allowing us to not acquire or manage the big monc_lock, thus not + * having to block waiting for some other operation to finish before we + * can proceed. + * * for instance, we can ping mon.FOO even if we are currently hunting + * or blocked waiting for auth to complete with mon.BAR. + * + * - Ping a monitor prior to establishing a connection (using connect()) + * and properly establish the MonClient's messenger. This frees us + * from dealing with the complex foo that happens in connect(). + * + * We also don't rely on MonClient as a dispatcher for this messenger, + * unlike what happens with the MonClient's default messenger. 
This allows + * us to sandbox the whole ping, having it much as a separate entity in + * the MonClient class, considerably simplifying the handling and dispatching + * of messages without needing to consider monc_lock. + * + * Current drawback is that we will establish a messenger for each ping + * we want to issue, instead of keeping a single messenger instance that + * would be used for all pings. + */ +int MonClient::ping_monitor(const string &mon_id, string *result_reply) +{ + ldout(cct, 10) << __func__ << dendl; + + string new_mon_id; + if (monmap.contains("noname-"+mon_id)) { + new_mon_id = "noname-"+mon_id; + } else { + new_mon_id = mon_id; + } + + if (new_mon_id.empty()) { + ldout(cct, 10) << __func__ << " specified mon id is empty!" << dendl; + return -EINVAL; + } else if (!monmap.contains(new_mon_id)) { + ldout(cct, 10) << __func__ << " no such monitor 'mon." << new_mon_id << "'" + << dendl; + return -ENOENT; + } + + // N.B. monc isn't initialized + + auth_registry.refresh_config(); + + KeyRing keyring; + keyring.from_ceph_context(cct); + RotatingKeyRing rkeyring(cct, cct->get_module_type(), &keyring); + + MonClientPinger *pinger = new MonClientPinger(cct, + &rkeyring, + result_reply); + + Messenger *smsgr = Messenger::create_client_messenger(cct, "temp_ping_client"); + smsgr->add_dispatcher_head(pinger); + smsgr->set_auth_client(pinger); + smsgr->start(); + + ConnectionRef con = smsgr->connect_to_mon(monmap.get_addrs(new_mon_id)); + ldout(cct, 10) << __func__ << " ping mon." 
<< new_mon_id + << " " << con->get_peer_addr() << dendl; + + pinger->mc.reset(new MonConnection(cct, con, 0, &auth_registry)); + pinger->mc->start(monmap.get_epoch(), entity_name); + con->send_message(new MPing); + + int ret = pinger->wait_for_reply(cct->_conf->mon_client_ping_timeout); + if (ret == 0) { + ldout(cct,10) << __func__ << " got ping reply" << dendl; + } else { + ret = -ret; + } + + con->mark_down(); + pinger->mc.reset(); + smsgr->shutdown(); + smsgr->wait(); + delete smsgr; + delete pinger; + return ret; +} + +bool MonClient::ms_dispatch(Message *m) +{ + // we only care about these message types + switch (m->get_type()) { + case CEPH_MSG_MON_MAP: + case CEPH_MSG_AUTH_REPLY: + case CEPH_MSG_MON_SUBSCRIBE_ACK: + case CEPH_MSG_MON_GET_VERSION_REPLY: + case MSG_MON_COMMAND_ACK: + case MSG_COMMAND_REPLY: + case MSG_LOGACK: + case MSG_CONFIG: + break; + case CEPH_MSG_PING: + m->put(); + return true; + default: + return false; + } + + std::lock_guard lock(monc_lock); + + if (!m->get_connection()->is_anon() && + m->get_source().type() == CEPH_ENTITY_TYPE_MON) { + if (_hunting()) { + auto p = _find_pending_con(m->get_connection()); + if (p == pending_cons.end()) { + // ignore any messages outside hunting sessions + ldout(cct, 10) << "discarding stray monitor message " << *m << dendl; + m->put(); + return true; + } + } else if (!active_con || active_con->get_con() != m->get_connection()) { + // ignore any messages outside our session(s) + ldout(cct, 10) << "discarding stray monitor message " << *m << dendl; + m->put(); + return true; + } + } + + switch (m->get_type()) { + case CEPH_MSG_MON_MAP: + handle_monmap(static_cast(m)); + if (passthrough_monmap) { + return false; + } else { + m->put(); + } + break; + case CEPH_MSG_AUTH_REPLY: + handle_auth(static_cast(m)); + break; + case CEPH_MSG_MON_SUBSCRIBE_ACK: + handle_subscribe_ack(static_cast(m)); + break; + case CEPH_MSG_MON_GET_VERSION_REPLY: + handle_get_version_reply(static_cast(m)); + break; + case 
MSG_MON_COMMAND_ACK: + handle_mon_command_ack(static_cast(m)); + break; + case MSG_COMMAND_REPLY: + if (m->get_connection()->is_anon() && + m->get_source().type() == CEPH_ENTITY_TYPE_MON) { + // this connection is from 'tell'... ignore everything except our command + // reply. (we'll get misc other message because we authenticated, but we + // don't need them.) + handle_command_reply(static_cast(m)); + return true; + } + // leave the message for another dispatch handler (e.g., Objecter) + return false; + case MSG_LOGACK: + if (log_client) { + log_client->handle_log_ack(static_cast(m)); + m->put(); + if (more_log_pending) { + send_log(); + } + } else { + m->put(); + } + break; + case MSG_CONFIG: + handle_config(static_cast(m)); + break; + } + return true; +} + +void MonClient::send_log(bool flush) +{ + if (log_client) { + auto lm = log_client->get_mon_log_message(flush); + if (lm) + _send_mon_message(std::move(lm)); + more_log_pending = log_client->are_pending(); + } +} + +void MonClient::flush_log() +{ + std::lock_guard l(monc_lock); + send_log(); +} + +/* Unlike all the other message-handling functions, we don't put away a reference +* because we want to support MMonMap passthrough to other Dispatchers. */ +void MonClient::handle_monmap(MMonMap *m) +{ + ldout(cct, 10) << __func__ << " " << *m << dendl; + auto con_addrs = m->get_source_addrs(); + string old_name = monmap.get_name(con_addrs); + const auto old_epoch = monmap.get_epoch(); + + auto p = m->monmapbl.cbegin(); + decode(monmap, p); + + ldout(cct, 10) << " got monmap " << monmap.epoch + << " from mon." 
<< old_name + << " (according to old e" << monmap.get_epoch() << ")" + << dendl; + ldout(cct, 10) << "dump:\n"; + monmap.print(*_dout); + *_dout << dendl; + + if (old_epoch != monmap.get_epoch()) { + tried.clear(); + } + if (old_name.size() == 0) { + ldout(cct,10) << " can't identify which mon we were connected to" << dendl; + _reopen_session(); + } else { + auto new_name = monmap.get_name(con_addrs); + if (new_name.empty()) { + ldout(cct, 10) << "mon." << old_name << " at " << con_addrs + << " went away" << dendl; + // can't find the mon we were talking to (above) + _reopen_session(); + } else if (messenger->should_use_msgr2() && + monmap.get_addrs(new_name).has_msgr2() && + !con_addrs.has_msgr2()) { + ldout(cct,1) << " mon." << new_name << " has (v2) addrs " + << monmap.get_addrs(new_name) << " but i'm connected to " + << con_addrs << ", reconnecting" << dendl; + _reopen_session(); + } + } + + sub.got("monmap", monmap.get_epoch()); + map_cond.notify_all(); + want_monmap = false; + + if (authenticate_err == 1) { + _finish_auth(0); + } +} + +void MonClient::handle_config(MConfig *m) +{ + ldout(cct,10) << __func__ << " " << *m << dendl; + finisher.queue(new LambdaContext([this, m](int r) { + cct->_conf.set_mon_vals(cct, m->config, config_cb); + if (config_notify_cb) { + config_notify_cb(); + } + m->put(); + })); + got_config = true; + map_cond.notify_all(); +} + +// ---------------------- + +int MonClient::init() +{ + ldout(cct, 10) << __func__ << dendl; + + entity_name = cct->_conf->name; + + auth_registry.refresh_config(); + + std::lock_guard l(monc_lock); + keyring.reset(new KeyRing); + if (auth_registry.is_supported_method(messenger->get_mytype(), + CEPH_AUTH_CEPHX)) { + // this should succeed, because auth_registry just checked! 
+ int r = keyring->from_ceph_context(cct); + if (r != 0) { + // but be somewhat graceful in case there was a race condition + lderr(cct) << "keyring not found" << dendl; + return r; + } + } + if (!auth_registry.any_supported_methods(messenger->get_mytype())) { + return -ENOENT; + } + + rotating_secrets.reset( + new RotatingKeyRing(cct, cct->get_module_type(), keyring.get())); + + initialized = true; + + messenger->set_auth_client(this); + messenger->add_dispatcher_head(this); + + timer.init(); + finisher.start(); + schedule_tick(); + + return 0; +} + +void MonClient::shutdown() +{ + ldout(cct, 10) << __func__ << dendl; + monc_lock.lock(); + stopping = true; + while (!version_requests.empty()) { + version_requests.begin()->second->context->complete(-ECANCELED); + ldout(cct, 20) << __func__ << " canceling and discarding version request " + << version_requests.begin()->second << dendl; + delete version_requests.begin()->second; + version_requests.erase(version_requests.begin()); + } + while (!mon_commands.empty()) { + auto tid = mon_commands.begin()->first; + _cancel_mon_command(tid); + } + ldout(cct, 20) << __func__ << " discarding " << waiting_for_session.size() + << " pending message(s)" << dendl; + waiting_for_session.clear(); + + active_con.reset(); + pending_cons.clear(); + auth.reset(); + + monc_lock.unlock(); + + if (initialized) { + finisher.wait_for_empty(); + finisher.stop(); + initialized = false; + } + monc_lock.lock(); + timer.shutdown(); + stopping = false; + monc_lock.unlock(); +} + +int MonClient::authenticate(double timeout) +{ + std::unique_lock lock{monc_lock}; + + if (active_con) { + ldout(cct, 5) << "already authenticated" << dendl; + return 0; + } + sub.want("monmap", monmap.get_epoch() ? 
monmap.get_epoch() + 1 : 0, 0); + sub.want("config", 0, 0); + if (!_opened()) + _reopen_session(); + + auto until = ceph::real_clock::now(); + until += ceph::make_timespan(timeout); + if (timeout > 0.0) + ldout(cct, 10) << "authenticate will time out at " << until << dendl; + authenticate_err = 1; // == in progress + while (!active_con && authenticate_err >= 0) { + if (timeout > 0.0) { + auto r = auth_cond.wait_until(lock, until); + if (r == cv_status::timeout && !active_con) { + ldout(cct, 0) << "authenticate timed out after " << timeout << dendl; + authenticate_err = -ETIMEDOUT; + } + } else { + auth_cond.wait(lock); + } + } + + if (active_con) { + ldout(cct, 5) << __func__ << " success, global_id " + << active_con->get_global_id() << dendl; + // active_con should not have been set if there was an error + ceph_assert(authenticate_err >= 0); + authenticated = true; + } + + if (authenticate_err < 0 && auth_registry.no_keyring_disabled_cephx()) { + lderr(cct) << __func__ << " NOTE: no keyring found; disabled cephx authentication" << dendl; + } + + return authenticate_err; +} + +void MonClient::handle_auth(MAuthReply *m) +{ + ceph_assert(ceph_mutex_is_locked(monc_lock)); + + if (m->get_connection()->is_anon()) { + // anon connection, used for mon tell commands + for (auto& p : mon_commands) { + if (p.second->target_con == m->get_connection()) { + auto& mc = p.second->target_session; + int ret = mc->handle_auth(m, entity_name, + CEPH_ENTITY_TYPE_MON, + rotating_secrets.get()); + (void)ret; // we don't care + break; + } + } + m->put(); + return; + } + + if (!_hunting()) { + std::swap(active_con->get_auth(), auth); + int ret = active_con->authenticate(m); + m->put(); + std::swap(auth, active_con->get_auth()); + if (global_id != active_con->get_global_id()) { + lderr(cct) << __func__ << " peer assigned me a different global_id: " + << active_con->get_global_id() << dendl; + } + if (ret != -EAGAIN) { + _finish_auth(ret); + } + return; + } + + // hunting + auto found = 
_find_pending_con(m->get_connection()); + ceph_assert(found != pending_cons.end()); + int auth_err = found->second.handle_auth(m, entity_name, want_keys, + rotating_secrets.get()); + m->put(); + if (auth_err == -EAGAIN) { + return; + } + if (auth_err) { + pending_cons.erase(found); + if (!pending_cons.empty()) { + // keep trying with pending connections + return; + } + // the last try just failed, give up. + } else { + auto& mc = found->second; + ceph_assert(mc.have_session()); + active_con.reset(new MonConnection(std::move(mc))); + pending_cons.clear(); + } + + _finish_hunting(auth_err); + _finish_auth(auth_err); +} + +void MonClient::_finish_auth(int auth_err) +{ + ldout(cct,10) << __func__ << " " << auth_err << dendl; + authenticate_err = auth_err; + // _resend_mon_commands() could _reopen_session() if the connected mon is not + // the one the MonCommand is targeting. + if (!auth_err && active_con) { + ceph_assert(auth); + _check_auth_tickets(); + } + auth_cond.notify_all(); + + if (!auth_err) { + Context *cb = nullptr; + if (session_established_context) { + cb = session_established_context.release(); + } + if (cb) { + monc_lock.unlock(); + cb->complete(0); + monc_lock.lock(); + } + } +} + +// --------- + +void MonClient::send_mon_message(MessageRef m) +{ + std::lock_guard l{monc_lock}; + _send_mon_message(std::move(m)); +} + +void MonClient::_send_mon_message(MessageRef m) +{ + ceph_assert(ceph_mutex_is_locked(monc_lock)); + if (active_con) { + auto cur_con = active_con->get_con(); + ldout(cct, 10) << "_send_mon_message to mon." 
+ << monmap.get_name(cur_con->get_peer_addr()) + << " at " << cur_con->get_peer_addr() << dendl; + cur_con->send_message2(std::move(m)); + } else { + waiting_for_session.push_back(std::move(m)); + } +} + +void MonClient::_reopen_session(int rank) +{ + ceph_assert(ceph_mutex_is_locked(monc_lock)); + ldout(cct, 10) << __func__ << " rank " << rank << dendl; + + active_con.reset(); + pending_cons.clear(); + + _start_hunting(); + + if (rank >= 0) { + _add_conn(rank, global_id); + } else { + _add_conns(global_id); + } + + // throw out old queued messages + waiting_for_session.clear(); + + // throw out version check requests + while (!version_requests.empty()) { + finisher.queue(version_requests.begin()->second->context, -EAGAIN); + delete version_requests.begin()->second; + version_requests.erase(version_requests.begin()); + } + + for (auto& c : pending_cons) { + c.second.start(monmap.get_epoch(), entity_name); + } + + if (sub.reload()) { + _renew_subs(); + } +} + +MonConnection& MonClient::_add_conn(unsigned rank, uint64_t global_id) +{ + auto peer = monmap.get_addrs(rank); + auto conn = messenger->connect_to_mon(peer); + MonConnection mc(cct, conn, global_id, &auth_registry); + auto inserted = pending_cons.insert(std::make_pair(peer, std::move(mc))); + ldout(cct, 10) << "picked mon." 
<< monmap.get_name(rank) + << " con " << conn + << " addr " << peer + << dendl; + return inserted.first->second; +} + +void MonClient::_add_conns(uint64_t global_id) +{ + // collect the next batch of candidates who are listed right next to the ones + // already tried + auto get_next_batch = [this]() -> std::vector { + std::multimap ranks_by_priority; + boost::copy( + monmap.mon_info | boost::adaptors::filtered( + [this](auto& info) { + auto rank = monmap.get_rank(info.first); + return tried.count(rank) == 0; + }) | boost::adaptors::transformed( + [this](auto& info) { + auto rank = monmap.get_rank(info.first); + return std::make_pair(info.second.priority, rank); + }), std::inserter(ranks_by_priority, end(ranks_by_priority))); + if (ranks_by_priority.empty()) { + return {}; + } + // only choose the monitors with lowest priority + auto cands = boost::make_iterator_range( + ranks_by_priority.equal_range(ranks_by_priority.begin()->first)); + std::vector ranks; + boost::range::copy(cands | boost::adaptors::map_values, + std::back_inserter(ranks)); + return ranks; + }; + auto ranks = get_next_batch(); + if (ranks.empty()) { + tried.clear(); // start over + ranks = get_next_batch(); + } + ceph_assert(!ranks.empty()); + if (ranks.size() > 1) { + std::vector weights; + for (auto i : ranks) { + auto rank_name = monmap.get_name(i); + weights.push_back(monmap.get_weight(rank_name)); + } + std::random_device rd; + if (std::accumulate(begin(weights), end(weights), 0u) == 0) { + std::shuffle(begin(ranks), end(ranks), std::mt19937{rd()}); + } else { + weighted_shuffle(begin(ranks), end(ranks), begin(weights), end(weights), + std::mt19937{rd()}); + } + } + ldout(cct, 10) << __func__ << " ranks=" << ranks << dendl; + unsigned n = cct->_conf->mon_client_hunt_parallel; + if (n == 0 || n > ranks.size()) { + n = ranks.size(); + } + for (unsigned i = 0; i < n; i++) { + _add_conn(ranks[i], global_id); + tried.insert(ranks[i]); + } +} + +bool MonClient::ms_handle_reset(Connection *con) +{ + 
std::lock_guard lock(monc_lock); + + if (con->get_peer_type() != CEPH_ENTITY_TYPE_MON) + return false; + + if (con->is_anon()) { + auto p = mon_commands.begin(); + while (p != mon_commands.end()) { + auto cmd = p->second; + ++p; + if (cmd->target_con == con) { + _send_command(cmd); // may retry or fail + break; + } + } + return true; + } + + if (_hunting()) { + if (pending_cons.count(con->get_peer_addrs())) { + ldout(cct, 10) << __func__ << " hunted mon " << con->get_peer_addrs() + << dendl; + } else { + ldout(cct, 10) << __func__ << " stray mon " << con->get_peer_addrs() + << dendl; + } + return true; + } else { + if (active_con && con == active_con->get_con()) { + ldout(cct, 10) << __func__ << " current mon " << con->get_peer_addrs() + << dendl; + _reopen_session(); + return false; + } else { + ldout(cct, 10) << "ms_handle_reset stray mon " << con->get_peer_addrs() + << dendl; + return true; + } + } +} + +bool MonClient::_opened() const +{ + ceph_assert(ceph_mutex_is_locked(monc_lock)); + return active_con || _hunting(); +} + +bool MonClient::_hunting() const +{ + return !pending_cons.empty(); +} + +void MonClient::_start_hunting() +{ + ceph_assert(!_hunting()); + // adjust timeouts if necessary + if (!had_a_connection) + return; + reopen_interval_multiplier *= cct->_conf->mon_client_hunt_interval_backoff; + if (reopen_interval_multiplier > + cct->_conf->mon_client_hunt_interval_max_multiple) { + reopen_interval_multiplier = + cct->_conf->mon_client_hunt_interval_max_multiple; + } +} + +void MonClient::_finish_hunting(int auth_err) +{ + ldout(cct,10) << __func__ << " " << auth_err << dendl; + ceph_assert(ceph_mutex_is_locked(monc_lock)); + // the pending conns have been cleaned. + ceph_assert(!_hunting()); + if (active_con) { + auto con = active_con->get_con(); + ldout(cct, 1) << "found mon." 
+ << monmap.get_name(con->get_peer_addr()) + << dendl; + } else { + ldout(cct, 1) << "no mon sessions established" << dendl; + } + + had_a_connection = true; + _un_backoff(); + + if (!auth_err) { + last_rotating_renew_sent = utime_t(); + while (!waiting_for_session.empty()) { + _send_mon_message(std::move(waiting_for_session.front())); + waiting_for_session.pop_front(); + } + _resend_mon_commands(); + send_log(true); + if (active_con) { + std::swap(auth, active_con->get_auth()); + if (global_id && global_id != active_con->get_global_id()) { + lderr(cct) << __func__ << " global_id changed from " << global_id + << " to " << active_con->get_global_id() << dendl; + } + global_id = active_con->get_global_id(); + } + } +} + +void MonClient::tick() +{ + ldout(cct, 10) << __func__ << dendl; + + utime_t now = ceph_clock_now(); + + auto reschedule_tick = make_scope_guard([this] { + schedule_tick(); + }); + + _check_auth_tickets(); + _check_tell_commands(); + + if (_hunting()) { + ldout(cct, 1) << "continuing hunt" << dendl; + return _reopen_session(); + } else if (active_con) { + // just renew as needed + auto cur_con = active_con->get_con(); + if (!cur_con->has_feature(CEPH_FEATURE_MON_STATEFUL_SUB)) { + const bool maybe_renew = sub.need_renew(); + ldout(cct, 10) << "renew subs? -- " << (maybe_renew ? 
"yes" : "no") + << dendl; + if (maybe_renew) { + _renew_subs(); + } + } + + if (now > last_keepalive + cct->_conf->mon_client_ping_interval) { + cur_con->send_keepalive(); + last_keepalive = now; + + if (cct->_conf->mon_client_ping_timeout > 0 && + cur_con->has_feature(CEPH_FEATURE_MSGR_KEEPALIVE2)) { + utime_t lk = cur_con->get_last_keepalive_ack(); + utime_t interval = now - lk; + if (interval > cct->_conf->mon_client_ping_timeout) { + ldout(cct, 1) << "no keepalive since " << lk << " (" << interval + << " seconds), reconnecting" << dendl; + return _reopen_session(); + } + } + + _un_backoff(); + } + + if (now > last_send_log + cct->_conf->mon_client_log_interval) { + send_log(); + last_send_log = now; + } + } +} + +void MonClient::_un_backoff() +{ + // un-backoff our reconnect interval + reopen_interval_multiplier = std::max( + cct->_conf.get_val("mon_client_hunt_interval_min_multiple"), + reopen_interval_multiplier / + cct->_conf.get_val("mon_client_hunt_interval_backoff")); + ldout(cct, 20) << __func__ << " reopen_interval_multipler now " + << reopen_interval_multiplier << dendl; +} + +void MonClient::schedule_tick() +{ + auto do_tick = make_lambda_context([this](int) { tick(); }); + if (_hunting()) { + const auto hunt_interval = (cct->_conf->mon_client_hunt_interval * + reopen_interval_multiplier); + timer.add_event_after(hunt_interval, do_tick); + } else { + timer.add_event_after(std::min(cct->_conf->mon_client_ping_interval, + cct->_conf->mon_client_log_interval), + do_tick); + } +} + +// --------- + +void MonClient::_renew_subs() +{ + ceph_assert(ceph_mutex_is_locked(monc_lock)); + if (!sub.have_new()) { + ldout(cct, 10) << __func__ << " - empty" << dendl; + return; + } + + ldout(cct, 10) << __func__ << dendl; + if (!_opened()) + _reopen_session(); + else { + auto m = ceph::make_message(); + m->what = sub.get_subs(); + m->hostname = ceph_get_short_hostname(); + _send_mon_message(std::move(m)); + sub.renewed(); + } +} + +void 
MonClient::handle_subscribe_ack(MMonSubscribeAck *m) +{ + sub.acked(m->interval); + m->put(); +} + +int MonClient::_check_auth_tickets() +{ + ceph_assert(ceph_mutex_is_locked(monc_lock)); + if (active_con && auth) { + if (auth->need_tickets()) { + ldout(cct, 10) << __func__ << " getting new tickets!" << dendl; + auto m = ceph::make_message(); + m->protocol = auth->get_protocol(); + auth->prepare_build_request(); + auth->build_request(m->auth_payload); + _send_mon_message(m); + } + + _check_auth_rotating(); + } + return 0; +} + +int MonClient::_check_auth_rotating() +{ + ceph_assert(ceph_mutex_is_locked(monc_lock)); + if (!rotating_secrets || + !auth_principal_needs_rotating_keys(entity_name)) { + ldout(cct, 20) << "_check_auth_rotating not needed by " << entity_name << dendl; + return 0; + } + + if (!active_con || !auth) { + ldout(cct, 10) << "_check_auth_rotating waiting for auth session" << dendl; + return 0; + } + + utime_t now = ceph_clock_now(); + utime_t cutoff = now; + cutoff -= std::min(30.0, cct->_conf->auth_service_ticket_ttl / 4.0); + utime_t issued_at_lower_bound = now; + issued_at_lower_bound -= cct->_conf->auth_service_ticket_ttl; + if (!rotating_secrets->need_new_secrets(cutoff)) { + ldout(cct, 10) << "_check_auth_rotating have uptodate secrets (they expire after " << cutoff << ")" << dendl; + rotating_secrets->dump_rotating(); + return 0; + } + + ldout(cct, 10) << "_check_auth_rotating renewing rotating keys (they expired before " << cutoff << ")" << dendl; + if (!rotating_secrets->need_new_secrets() && + rotating_secrets->need_new_secrets(issued_at_lower_bound)) { + // the key has expired before it has been issued? 
+ lderr(cct) << __func__ << " possible clock skew, rotating keys expired way too early" + << " (before " << issued_at_lower_bound << ")" << dendl; + } + if ((now > last_rotating_renew_sent) && + double(now - last_rotating_renew_sent) < 1) { + ldout(cct, 10) << __func__ << " called too often (last: " + << last_rotating_renew_sent << "), skipping refresh" << dendl; + return 0; + } + auto m = ceph::make_message(); + m->protocol = auth->get_protocol(); + if (auth->build_rotating_request(m->auth_payload)) { + last_rotating_renew_sent = now; + _send_mon_message(std::move(m)); + } + return 0; +} + +int MonClient::wait_auth_rotating(double timeout) +{ + std::unique_lock l(monc_lock); + + // Must be initialized + ceph_assert(auth != nullptr); + + if (auth->get_protocol() == CEPH_AUTH_NONE) + return 0; + + if (!rotating_secrets) + return 0; + + ldout(cct, 10) << __func__ << " waiting for " << timeout << dendl; + utime_t now = ceph_clock_now(); + if (auth_cond.wait_for(l, ceph::make_timespan(timeout), [now, this] { + return (!auth_principal_needs_rotating_keys(entity_name) || + !rotating_secrets->need_new_secrets(now)); + })) { + ldout(cct, 10) << __func__ << " done" << dendl; + return 0; + } else { + ldout(cct, 0) << __func__ << " timed out after " << timeout << dendl; + return -ETIMEDOUT; + } +} + +// --------- + +void MonClient::_send_command(MonCommand *r) +{ + if (r->is_tell()) { + ++r->send_attempts; + if (r->send_attempts > cct->_conf->mon_client_directed_command_retry) { + _finish_command(r, -ENXIO, "mon unavailable"); + return; + } + + // tell-style command + if (monmap.min_mon_release >= ceph_release_t::octopus) { + if (r->target_con) { + r->target_con->mark_down(); + } + if (r->target_rank >= 0) { + if (r->target_rank >= (int)monmap.size()) { + ldout(cct, 10) << " target " << r->target_rank + << " >= max mon " << monmap.size() << dendl; + _finish_command(r, -ENOENT, "mon rank dne"); + return; + } + r->target_con = messenger->connect_to_mon( + 
monmap.get_addrs(r->target_rank), true /* anon */); + } else { + if (!monmap.contains(r->target_name)) { + ldout(cct, 10) << " target " << r->target_name + << " not present in monmap" << dendl; + _finish_command(r, -ENOENT, "mon dne"); + return; + } + r->target_con = messenger->connect_to_mon( + monmap.get_addrs(r->target_name), true /* anon */); + } + + r->target_session.reset(new MonConnection(cct, r->target_con, 0, + &auth_registry)); + r->target_session->start(monmap.get_epoch(), entity_name); + r->last_send_attempt = ceph_clock_now(); + + MCommand *m = new MCommand(monmap.fsid); + m->set_tid(r->tid); + m->cmd = r->cmd; + m->set_data(r->inbl); + r->target_session->queue_command(m); + return; + } + + // ugly legacy handling of pre-octopus mons + entity_addr_t peer; + if (active_con) { + peer = active_con->get_con()->get_peer_addr(); + } + + if (r->target_rank >= 0 && + r->target_rank != monmap.get_rank(peer)) { + ldout(cct, 10) << __func__ << " " << r->tid << " " << r->cmd + << " wants rank " << r->target_rank + << ", reopening session" + << dendl; + if (r->target_rank >= (int)monmap.size()) { + ldout(cct, 10) << " target " << r->target_rank + << " >= max mon " << monmap.size() << dendl; + _finish_command(r, -ENOENT, "mon rank dne"); + return; + } + _reopen_session(r->target_rank); + return; + } + if (r->target_name.length() && + r->target_name != monmap.get_name(peer)) { + ldout(cct, 10) << __func__ << " " << r->tid << " " << r->cmd + << " wants mon " << r->target_name + << ", reopening session" + << dendl; + if (!monmap.contains(r->target_name)) { + ldout(cct, 10) << " target " << r->target_name + << " not present in monmap" << dendl; + _finish_command(r, -ENOENT, "mon dne"); + return; + } + _reopen_session(monmap.get_rank(r->target_name)); + return; + } + // fall-thru to send 'normal' CLI command + } + + // normal CLI command + ldout(cct, 10) << __func__ << " " << r->tid << " " << r->cmd << dendl; + auto m = ceph::make_message(monmap.fsid); + 
+ m->set_tid(r->tid); + m->cmd = r->cmd; + m->set_data(r->inbl); + _send_mon_message(std::move(m)); + return; +} + +void MonClient::_check_tell_commands() +{ + // resend any requests + auto now = ceph_clock_now(); + auto p = mon_commands.begin(); + while (p != mon_commands.end()) { + auto cmd = p->second; + ++p; + if (cmd->is_tell() && + cmd->last_send_attempt != utime_t() && + now - cmd->last_send_attempt > cct->_conf->mon_client_hunt_interval) { + ldout(cct,5) << __func__ << " timeout tell command " << cmd->tid << dendl; + _send_command(cmd); // might remove cmd from mon_commands + } + } +} + +void MonClient::_resend_mon_commands() +{ + // resend any requests + auto p = mon_commands.begin(); + while (p != mon_commands.end()) { + auto cmd = p->second; + ++p; + if (cmd->is_tell() && monmap.min_mon_release >= ceph_release_t::octopus) { + // starting with octopus, tell commands use their own connection and need no + // special resend when we finish hunting. + } else { + _send_command(cmd); // might remove cmd from mon_commands + } + } +} + +void MonClient::handle_mon_command_ack(MMonCommandAck *ack) +{ + MonCommand *r = NULL; + uint64_t tid = ack->get_tid(); + + if (tid == 0 && !mon_commands.empty()) { + r = mon_commands.begin()->second; + ldout(cct, 10) << __func__ << " has tid 0, assuming it is " << r->tid << dendl; + } else { + auto p = mon_commands.find(tid); + if (p == mon_commands.end()) { + ldout(cct, 10) << __func__ << " " << ack->get_tid() << " not found" << dendl; + ack->put(); + return; + } + r = p->second; + } + + ldout(cct, 10) << __func__ << " " << r->tid << " " << r->cmd << dendl; + if (r->poutbl) + r->poutbl->claim(ack->get_data()); + _finish_command(r, ack->r, ack->rs); + ack->put(); +} + +void MonClient::handle_command_reply(MCommandReply *reply) +{ + MonCommand *r = NULL; + uint64_t tid = reply->get_tid(); + + if (tid == 0 && !mon_commands.empty()) { + r = mon_commands.begin()->second; + ldout(cct, 10) << __func__ << " has tid 0, assuming it is " << 
r->tid + << dendl; + } else { + auto p = mon_commands.find(tid); + if (p == mon_commands.end()) { + ldout(cct, 10) << __func__ << " " << reply->get_tid() << " not found" + << dendl; + reply->put(); + return; + } + r = p->second; + } + + ldout(cct, 10) << __func__ << " " << r->tid << " " << r->cmd << dendl; + if (r->poutbl) + r->poutbl->claim(reply->get_data()); + _finish_command(r, reply->r, reply->rs); + reply->put(); +} + +int MonClient::_cancel_mon_command(uint64_t tid) +{ + ceph_assert(ceph_mutex_is_locked(monc_lock)); + + auto it = mon_commands.find(tid); + if (it == mon_commands.end()) { + ldout(cct, 10) << __func__ << " tid " << tid << " dne" << dendl; + return -ENOENT; + } + + ldout(cct, 10) << __func__ << " tid " << tid << dendl; + + MonCommand *cmd = it->second; + _finish_command(cmd, -ETIMEDOUT, ""); + return 0; +} + +void MonClient::_finish_command(MonCommand *r, int ret, string rs) +{ + ldout(cct, 10) << __func__ << " " << r->tid << " = " << ret << " " << rs << dendl; + if (r->prval) + *(r->prval) = ret; + if (r->prs) + *(r->prs) = rs; + if (r->onfinish) + finisher.queue(r->onfinish, ret); + if (r->target_con) { + r->target_con->mark_down(); + } + mon_commands.erase(r->tid); + delete r; +} + +void MonClient::start_mon_command(const std::vector& cmd, + const ceph::buffer::list& inbl, + ceph::buffer::list *outbl, string *outs, + Context *onfinish) +{ + ldout(cct,10) << __func__ << " cmd=" << cmd << dendl; + std::lock_guard l(monc_lock); + if (!initialized || stopping) { + if (onfinish) { + onfinish->complete(-ECANCELED); + } + return; + } + MonCommand *r = new MonCommand(++last_mon_command_tid); + r->cmd = cmd; + r->inbl = inbl; + r->poutbl = outbl; + r->prs = outs; + r->onfinish = onfinish; + if (cct->_conf->rados_mon_op_timeout > 0) { + class C_CancelMonCommand : public Context + { + uint64_t tid; + MonClient *monc; + public: + C_CancelMonCommand(uint64_t tid, MonClient *monc) : tid(tid), monc(monc) {} + void finish(int r) override { + 
monc->_cancel_mon_command(tid); + } + }; + r->ontimeout = new C_CancelMonCommand(r->tid, this); + timer.add_event_after(cct->_conf->rados_mon_op_timeout, r->ontimeout); + } + mon_commands[r->tid] = r; + _send_command(r); +} + +void MonClient::start_mon_command(const string &mon_name, + const std::vector& cmd, + const ceph::buffer::list& inbl, + ceph::buffer::list *outbl, string *outs, + Context *onfinish) +{ + ldout(cct,10) << __func__ << " mon." << mon_name << " cmd=" << cmd << dendl; + std::lock_guard l(monc_lock); + if (!initialized || stopping) { + if (onfinish) { + onfinish->complete(-ECANCELED); + } + return; + } + MonCommand *r = new MonCommand(++last_mon_command_tid); + + // detect/tolerate mon *rank* passed as a string + string err; + int rank = strict_strtoll(mon_name.c_str(), 10, &err); + if (err.size() == 0 && rank >= 0) { + ldout(cct,10) << __func__ << " interpreting name '" << mon_name + << "' as rank " << rank << dendl; + r->target_rank = rank; + } else { + r->target_name = mon_name; + } + r->cmd = cmd; + r->inbl = inbl; + r->poutbl = outbl; + r->prs = outs; + r->onfinish = onfinish; + mon_commands[r->tid] = r; + _send_command(r); +} + +void MonClient::start_mon_command(int rank, + const std::vector& cmd, + const ceph::buffer::list& inbl, + ceph::buffer::list *outbl, string *outs, + Context *onfinish) +{ + ldout(cct,10) << __func__ << " rank " << rank << " cmd=" << cmd << dendl; + std::lock_guard l(monc_lock); + if (!initialized || stopping) { + if (onfinish) { + onfinish->complete(-ECANCELED); + } + return; + } + MonCommand *r = new MonCommand(++last_mon_command_tid); + r->target_rank = rank; + r->cmd = cmd; + r->inbl = inbl; + r->poutbl = outbl; + r->prs = outs; + r->onfinish = onfinish; + mon_commands[r->tid] = r; + _send_command(r); +} + +// --------- + +void MonClient::get_version(string map, version_t *newest, version_t *oldest, Context *onfinish) +{ + version_req_d *req = new version_req_d(onfinish, newest, oldest); + ldout(cct, 10) << 
"get_version " << map << " req " << req << dendl; + std::lock_guard l(monc_lock); + auto m = ceph::make_message(); + m->what = map; + m->handle = ++version_req_id; + version_requests[m->handle] = req; + _send_mon_message(std::move(m)); +} + +void MonClient::handle_get_version_reply(MMonGetVersionReply* m) +{ + ceph_assert(ceph_mutex_is_locked(monc_lock)); + auto iter = version_requests.find(m->handle); + if (iter == version_requests.end()) { + ldout(cct, 0) << __func__ << " version request with handle " << m->handle + << " not found" << dendl; + } else { + version_req_d *req = iter->second; + ldout(cct, 10) << __func__ << " finishing " << req << " version " << m->version << dendl; + version_requests.erase(iter); + if (req->newest) + *req->newest = m->version; + if (req->oldest) + *req->oldest = m->oldest_version; + finisher.queue(req->context, 0); + delete req; + } + m->put(); +} + +int MonClient::get_auth_request( + Connection *con, + AuthConnectionMeta *auth_meta, + uint32_t *auth_method, + std::vector *preferred_modes, + ceph::buffer::list *bl) +{ + std::lock_guard l(monc_lock); + ldout(cct,10) << __func__ << " con " << con << " auth_method " << *auth_method + << dendl; + + // connection to mon? 
+ if (con->get_peer_type() == CEPH_ENTITY_TYPE_MON) { + ceph_assert(!auth_meta->authorizer); + if (con->is_anon()) { + for (auto& i : mon_commands) { + if (i.second->target_con == con) { + return i.second->target_session->get_auth_request( + auth_method, preferred_modes, bl, + entity_name, want_keys, rotating_secrets.get()); + } + } + } + for (auto& i : pending_cons) { + if (i.second.is_con(con)) { + return i.second.get_auth_request( + auth_method, preferred_modes, bl, + entity_name, want_keys, rotating_secrets.get()); + } + } + return -ENOENT; + } + + // generate authorizer + if (!auth) { + lderr(cct) << __func__ << " but no auth handler is set up" << dendl; + return -EACCES; + } + auth_meta->authorizer.reset(auth->build_authorizer(con->get_peer_type())); + if (!auth_meta->authorizer) { + lderr(cct) << __func__ << " failed to build_authorizer for type " + << ceph_entity_type_name(con->get_peer_type()) << dendl; + return -EACCES; + } + auth_meta->auth_method = auth_meta->authorizer->protocol; + auth_registry.get_supported_modes(con->get_peer_type(), + auth_meta->auth_method, + preferred_modes); + *bl = auth_meta->authorizer->bl; + return 0; +} + +int MonClient::handle_auth_reply_more( + Connection *con, + AuthConnectionMeta *auth_meta, + const ceph::buffer::list& bl, + ceph::buffer::list *reply) +{ + std::lock_guard l(monc_lock); + + if (con->get_peer_type() == CEPH_ENTITY_TYPE_MON) { + if (con->is_anon()) { + for (auto& i : mon_commands) { + if (i.second->target_con == con) { + return i.second->target_session->handle_auth_reply_more( + auth_meta, bl, reply); + } + } + } + for (auto& i : pending_cons) { + if (i.second.is_con(con)) { + return i.second.handle_auth_reply_more(auth_meta, bl, reply); + } + } + return -ENOENT; + } + + // authorizer challenges + if (!auth || !auth_meta->authorizer) { + lderr(cct) << __func__ << " no authorizer?" 
<< dendl; + return -1; + } + auth_meta->authorizer->add_challenge(cct, bl); + *reply = auth_meta->authorizer->bl; + return 0; +} + +int MonClient::handle_auth_done( + Connection *con, + AuthConnectionMeta *auth_meta, + uint64_t global_id, + uint32_t con_mode, + const ceph::buffer::list& bl, + CryptoKey *session_key, + std::string *connection_secret) +{ + if (con->get_peer_type() == CEPH_ENTITY_TYPE_MON) { + std::lock_guard l(monc_lock); + if (con->is_anon()) { + for (auto& i : mon_commands) { + if (i.second->target_con == con) { + return i.second->target_session->handle_auth_done( + auth_meta, global_id, bl, + session_key, connection_secret); + } + } + } + for (auto& i : pending_cons) { + if (i.second.is_con(con)) { + int r = i.second.handle_auth_done( + auth_meta, global_id, bl, + session_key, connection_secret); + if (r) { + pending_cons.erase(i.first); + if (!pending_cons.empty()) { + return r; + } + } else { + active_con.reset(new MonConnection(std::move(i.second))); + pending_cons.clear(); + ceph_assert(active_con->have_session()); + } + + _finish_hunting(r); + if (r || monmap.get_epoch() > 0) { + _finish_auth(r); + } + return r; + } + } + return -ENOENT; + } else { + // verify authorizer reply + auto p = bl.begin(); + if (!auth_meta->authorizer->verify_reply(p, &auth_meta->connection_secret)) { + ldout(cct, 0) << __func__ << " failed verifying authorizer reply" + << dendl; + return -EACCES; + } + auth_meta->session_key = auth_meta->authorizer->session_key; + return 0; + } +} + +int MonClient::handle_auth_bad_method( + Connection *con, + AuthConnectionMeta *auth_meta, + uint32_t old_auth_method, + int result, + const std::vector& allowed_methods, + const std::vector& allowed_modes) +{ + auth_meta->allowed_methods = allowed_methods; + + std::lock_guard l(monc_lock); + if (con->get_peer_type() == CEPH_ENTITY_TYPE_MON) { + if (con->is_anon()) { + for (auto& i : mon_commands) { + if (i.second->target_con == con) { + int r = 
i.second->target_session->handle_auth_bad_method( + old_auth_method, + result, + allowed_methods, + allowed_modes); + if (r < 0) { + _finish_command(i.second, r, "auth failed"); + } + return r; + } + } + } + for (auto& i : pending_cons) { + if (i.second.is_con(con)) { + int r = i.second.handle_auth_bad_method(old_auth_method, + result, + allowed_methods, + allowed_modes); + if (r == 0) { + return r; // try another method on this con + } + pending_cons.erase(i.first); + if (!pending_cons.empty()) { + return r; // fail this con, maybe another con will succeed + } + // fail hunt + _finish_hunting(r); + _finish_auth(r); + return r; + } + } + return -ENOENT; + } else { + // huh... + ldout(cct,10) << __func__ << " hmm, they didn't like " << old_auth_method + << " result " << cpp_strerror(result) + << " and auth is " << (auth ? auth->get_protocol() : 0) + << dendl; + return -EACCES; + } +} + +int MonClient::handle_auth_request( + Connection *con, + AuthConnectionMeta *auth_meta, + bool more, + uint32_t auth_method, + const ceph::buffer::list& payload, + ceph::buffer::list *reply) +{ + if (payload.length() == 0) { + // for some channels prior to nautilus (osd heartbeat), we + // tolerate the lack of an authorizer. 
+ if (!con->get_messenger()->require_authorizer) { + handle_authentication_dispatcher->ms_handle_authentication(con); + return 1; + } + return -EACCES; + } + auth_meta->auth_mode = payload[0]; + if (auth_meta->auth_mode < AUTH_MODE_AUTHORIZER || + auth_meta->auth_mode > AUTH_MODE_AUTHORIZER_MAX) { + return -EACCES; + } + AuthAuthorizeHandler *ah = get_auth_authorize_handler(con->get_peer_type(), + auth_method); + if (!ah) { + lderr(cct) << __func__ << " no AuthAuthorizeHandler found for auth method " + << auth_method << dendl; + return -EOPNOTSUPP; + } + + auto ac = &auth_meta->authorizer_challenge; + if (!HAVE_FEATURE(con->get_features(), CEPHX_V2)) { + if (cct->_conf->cephx_service_require_version >= 2) { + ldout(cct,10) << __func__ << " client missing CEPHX_V2 (" + << "cephx_service_requre_version = " + << cct->_conf->cephx_service_require_version << ")" << dendl; + return -EACCES; + } + ac = nullptr; + } + + bool was_challenge = (bool)auth_meta->authorizer_challenge; + bool isvalid = ah->verify_authorizer( + cct, + *rotating_secrets, + payload, + auth_meta->get_connection_secret_length(), + reply, + &con->peer_name, + &con->peer_global_id, + &con->peer_caps_info, + &auth_meta->session_key, + &auth_meta->connection_secret, + ac); + if (isvalid) { + handle_authentication_dispatcher->ms_handle_authentication(con); + return 1; + } + if (!more && !was_challenge && auth_meta->authorizer_challenge) { + ldout(cct,10) << __func__ << " added challenge on " << con << dendl; + return 0; + } + ldout(cct,10) << __func__ << " bad authorizer on " << con << dendl; + // discard old challenge + auth_meta->authorizer_challenge.reset(); + return -EACCES; +} + +AuthAuthorizer* MonClient::build_authorizer(int service_id) const { + std::lock_guard l(monc_lock); + if (auth) { + return auth->build_authorizer(service_id); + } else { + ldout(cct, 0) << __func__ << " for " << ceph_entity_type_name(service_id) + << ", but no auth is available now" << dendl; + return nullptr; + } +} + 
+#define dout_subsys ceph_subsys_monc +#undef dout_prefix +#define dout_prefix *_dout << "monclient" << (have_session() ? ": " : "(hunting): ") + +MonConnection::MonConnection( + CephContext *cct, ConnectionRef con, uint64_t global_id, + AuthRegistry *ar) + : cct(cct), con(con), global_id(global_id), auth_registry(ar) +{} + +MonConnection::~MonConnection() +{ + if (con) { + con->mark_down(); + con.reset(); + } +} + +bool MonConnection::have_session() const +{ + return state == State::HAVE_SESSION; +} + +void MonConnection::start(epoch_t epoch, + const EntityName& entity_name) +{ + using ceph::encode; + auth_start = ceph_clock_now(); + + if (con->get_peer_addr().is_msgr2()) { + ldout(cct, 10) << __func__ << " opening mon connection" << dendl; + state = State::AUTHENTICATING; + con->send_message(new MMonGetMap()); + return; + } + + // restart authentication handshake + state = State::NEGOTIATING; + + // send an initial keepalive to ensure our timestamp is valid by the + // time we are in an OPENED state (by sequencing this before + // authentication). 
+ con->send_keepalive(); + + auto m = new MAuth; + m->protocol = CEPH_AUTH_UNKNOWN; + m->monmap_epoch = epoch; + __u8 struct_v = 1; + encode(struct_v, m->auth_payload); + std::vector auth_supported; + auth_registry->get_supported_methods(con->get_peer_type(), &auth_supported); + encode(auth_supported, m->auth_payload); + encode(entity_name, m->auth_payload); + encode(global_id, m->auth_payload); + con->send_message(m); +} + +int MonConnection::get_auth_request( + uint32_t *method, + std::vector *preferred_modes, + ceph::buffer::list *bl, + const EntityName& entity_name, + uint32_t want_keys, + RotatingKeyRing* keyring) +{ + using ceph::encode; + // choose method + if (auth_method < 0) { + std::vector as; + auth_registry->get_supported_methods(con->get_peer_type(), &as); + if (as.empty()) { + return -EACCES; + } + auth_method = as.front(); + } + *method = auth_method; + auth_registry->get_supported_modes(con->get_peer_type(), auth_method, + preferred_modes); + ldout(cct,10) << __func__ << " method " << *method + << " preferred_modes " << *preferred_modes << dendl; + if (preferred_modes->empty()) { + return -EACCES; + } + + if (auth) { + auth.reset(); + } + int r = _init_auth(*method, entity_name, want_keys, keyring, true); + ceph_assert(r == 0); + + // initial request includes some boilerplate... 
+ encode((char)AUTH_MODE_MON, *bl); + encode(entity_name, *bl); + encode(global_id, *bl); + + // and (maybe) some method-specific initial payload + auth->build_initial_request(bl); + + return 0; +} + +int MonConnection::handle_auth_reply_more( + AuthConnectionMeta *auth_meta, + const ceph::buffer::list& bl, + ceph::buffer::list *reply) +{ + ldout(cct, 10) << __func__ << " payload " << bl.length() << dendl; + ldout(cct, 30) << __func__ << " got\n"; + bl.hexdump(*_dout); + *_dout << dendl; + + auto p = bl.cbegin(); + ldout(cct, 10) << __func__ << " payload_len " << bl.length() << dendl; + int r = auth->handle_response(0, p, &auth_meta->session_key, + &auth_meta->connection_secret); + if (r == -EAGAIN) { + auth->prepare_build_request(); + auth->build_request(*reply); + ldout(cct, 10) << __func__ << " responding with " << reply->length() + << " bytes" << dendl; + r = 0; + } else if (r < 0) { + lderr(cct) << __func__ << " handle_response returned " << r << dendl; + } else { + ldout(cct, 10) << __func__ << " authenticated!" 
<< dendl; + // FIXME + ceph_abort(cct, "write me"); + } + return r; +} + +int MonConnection::handle_auth_done( + AuthConnectionMeta *auth_meta, + uint64_t new_global_id, + const ceph::buffer::list& bl, + CryptoKey *session_key, + std::string *connection_secret) +{ + ldout(cct,10) << __func__ << " global_id " << new_global_id + << " payload " << bl.length() + << dendl; + global_id = new_global_id; + auth->set_global_id(global_id); + auto p = bl.begin(); + int auth_err = auth->handle_response(0, p, &auth_meta->session_key, + &auth_meta->connection_secret); + if (auth_err >= 0) { + state = State::HAVE_SESSION; + } + con->set_last_keepalive_ack(auth_start); + + if (pending_tell_command) { + con->send_message2(std::move(pending_tell_command)); + } + return auth_err; +} + +int MonConnection::handle_auth_bad_method( + uint32_t old_auth_method, + int result, + const std::vector& allowed_methods, + const std::vector& allowed_modes) +{ + ldout(cct,10) << __func__ << " old_auth_method " << old_auth_method + << " result " << cpp_strerror(result) + << " allowed_methods " << allowed_methods << dendl; + std::vector auth_supported; + auth_registry->get_supported_methods(con->get_peer_type(), &auth_supported); + auto p = std::find(auth_supported.begin(), auth_supported.end(), + old_auth_method); + assert(p != auth_supported.end()); + p = std::find_first_of(std::next(p), auth_supported.end(), + allowed_methods.begin(), allowed_methods.end()); + if (p == auth_supported.end()) { + lderr(cct) << __func__ << " server allowed_methods " << allowed_methods + << " but i only support " << auth_supported << dendl; + return -EACCES; + } + auth_method = *p; + ldout(cct,10) << __func__ << " will try " << auth_method << " next" << dendl; + return 0; +} + +int MonConnection::handle_auth(MAuthReply* m, + const EntityName& entity_name, + uint32_t want_keys, + RotatingKeyRing* keyring) +{ + if (state == State::NEGOTIATING) { + int r = _negotiate(m, entity_name, want_keys, keyring); + if (r) { + 
return r; + } + state = State::AUTHENTICATING; + } + int r = authenticate(m); + if (!r) { + state = State::HAVE_SESSION; + } + return r; +} + +int MonConnection::_negotiate(MAuthReply *m, + const EntityName& entity_name, + uint32_t want_keys, + RotatingKeyRing* keyring) +{ + if (auth && (int)m->protocol == auth->get_protocol()) { + // good, negotiation completed + auth->reset(); + return 0; + } + + int r = _init_auth(m->protocol, entity_name, want_keys, keyring, false); + if (r == -ENOTSUP) { + if (m->result == -ENOTSUP) { + ldout(cct, 10) << "none of our auth protocols are supported by the server" + << dendl; + } + return m->result; + } + return r; +} + +int MonConnection::_init_auth( + uint32_t method, + const EntityName& entity_name, + uint32_t want_keys, + RotatingKeyRing* keyring, + bool msgr2) +{ + ldout(cct,10) << __func__ << " method " << method << dendl; + auth.reset( + AuthClientHandler::create(cct, method, keyring)); + if (!auth) { + ldout(cct, 10) << " no handler for protocol " << method << dendl; + return -ENOTSUP; + } + + // do not request MGR key unless the mon has the SERVER_KRAKEN + // feature. otherwise it will give us an auth error. note that + // we have to use the FEATUREMASK because pre-jewel the kraken + // feature bit was used for something else. 
+ if (!msgr2 && + (want_keys & CEPH_ENTITY_TYPE_MGR) && + !(con->has_features(CEPH_FEATUREMASK_SERVER_KRAKEN))) { + ldout(cct, 1) << __func__ + << " not requesting MGR keys from pre-kraken monitor" + << dendl; + want_keys &= ~CEPH_ENTITY_TYPE_MGR; + } + auth->set_want_keys(want_keys); + auth->init(entity_name); + auth->set_global_id(global_id); + return 0; +} + +int MonConnection::authenticate(MAuthReply *m) +{ + ceph_assert(auth); + if (!m->global_id) { + ldout(cct, 1) << "peer sent an invalid global_id" << dendl; + } + if (m->global_id != global_id) { + // it's a new session + auth->reset(); + global_id = m->global_id; + auth->set_global_id(global_id); + ldout(cct, 10) << "my global_id is " << m->global_id << dendl; + } + auto p = m->result_bl.cbegin(); + int ret = auth->handle_response(m->result, p, nullptr, nullptr); + if (ret == -EAGAIN) { + auto ma = new MAuth; + ma->protocol = auth->get_protocol(); + auth->prepare_build_request(); + auth->build_request(ma->auth_payload); + con->send_message(ma); + } + if (ret == 0 && pending_tell_command) { + con->send_message2(std::move(pending_tell_command)); + } + + return ret; +} + +void MonClient::register_config_callback(md_config_t::config_callback fn) { + ceph_assert(!config_cb); + config_cb = fn; +} + +md_config_t::config_callback MonClient::get_config_callback() { + return config_cb; +} diff --git a/src/mon/Monitor.cc b/src/mon/Monitor.cc index cd541f6bf8360..65bdd2d31cb39 100644 --- a/src/mon/Monitor.cc +++ b/src/mon/Monitor.cc @@ -1970,7 +1970,19 @@ void Monitor::handle_command(MMonCommand *m) return; } - cmd_getval(g_ceph_context, cmdmap, "prefix", prefix); + // check return value. If no prefix parameter provided, + // return value will be false, then return error info. 
+ if(!cmd_getval(g_ceph_context, cmdmap, "prefix", prefix)) { + reply_command(op, -EINVAL, "command prefix not found", 0); + return; + } + + // check prefix is empty + if (prefix.empty()) { + reply_command(op, -EINVAL, "command prefix must not be empty", 0); + return; + } + if (prefix == "get_command_descriptions") { bufferlist rdata; Formatter *f = new_formatter("json"); @@ -1990,6 +2002,15 @@ void Monitor::handle_command(MMonCommand *m) boost::scoped_ptr f(new_formatter(format)); get_str_vec(prefix, fullcmd); + + // make sure fullcmd is not empty. + // invalid prefix will cause empty vector fullcmd. + // such as, prefix=";,,;" + if (fullcmd.empty()) { + reply_command(op, -EINVAL, "command requires a prefix to be valid", 0); + return; + } + module = fullcmd[0]; if (!_allowed_command(session, module, prefix, cmdmap)) { diff --git a/src/mon/Monitor.cc.orig b/src/mon/Monitor.cc.orig new file mode 100644 index 0000000000000..cd541f6bf8360 --- /dev/null +++ b/src/mon/Monitor.cc.orig @@ -0,0 +1,4212 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + + +#include +#include +#include +#include +#include + +#include "Monitor.h" +#include "common/version.h" + +#include "osd/OSDMap.h" + +#include "MonitorStore.h" +#include "MonitorDBStore.h" + +#include "msg/Messenger.h" + +#include "messages/PaxosServiceMessage.h" +#include "messages/MMonMap.h" +#include "messages/MMonGetMap.h" +#include "messages/MMonGetVersion.h" +#include "messages/MMonGetVersionReply.h" +#include "messages/MGenericMessage.h" +#include "messages/MMonCommand.h" +#include "messages/MMonCommandAck.h" +#include "messages/MMonSync.h" +#include "messages/MMonScrub.h" +#include "messages/MMonProbe.h" +#include "messages/MMonJoin.h" +#include "messages/MMonPaxos.h" +#include "messages/MRoute.h" +#include "messages/MForward.h" + +#include "messages/MMonSubscribe.h" +#include "messages/MMonSubscribeAck.h" + +#include "messages/MAuthReply.h" + +#include "messages/MTimeCheck.h" +#include "messages/MMonHealth.h" +#include "messages/MPing.h" + +#include "common/strtol.h" +#include "common/ceph_argparse.h" +#include "common/Timer.h" +#include "common/Clock.h" +#include "common/errno.h" +#include "common/perf_counters.h" +#include "common/admin_socket.h" + +#include "include/color.h" +#include "include/ceph_fs.h" +#include "include/str_list.h" + +#include "OSDMonitor.h" +#include "MDSMonitor.h" +#include "MonmapMonitor.h" +#include "PGMonitor.h" +#include "LogMonitor.h" +#include "AuthMonitor.h" +#include "mon/QuorumService.h" +#include "mon/HealthMonitor.h" +#include "mon/ConfigKeyService.h" + +#include "auth/AuthMethodList.h" +#include "auth/KeyRing.h" + +#include "common/config.h" +#include "common/cmdparse.h" +#include "include/assert.h" + +#define dout_subsys ceph_subsys_mon +#undef dout_prefix +#define dout_prefix _prefix(_dout, this) +static ostream& _prefix(std::ostream *_dout, const Monitor *mon) { + return *_dout << "mon." 
<< mon->name << "@" << mon->rank + << "(" << mon->get_state_name() << ") e" << mon->monmap->get_epoch() << " "; +} + +const string Monitor::MONITOR_NAME = "monitor"; +const string Monitor::MONITOR_STORE_PREFIX = "monitor_store"; + +long parse_pos_long(const char *s, ostream *pss) +{ + if (*s == '-' || *s == '+') { + if (pss) + *pss << "expected numerical value, got: " << s; + return -EINVAL; + } + + string err; + long r = strict_strtol(s, 10, &err); + if ((r == 0) && !err.empty()) { + if (pss) + *pss << err; + return -1; + } + if (r < 0) { + if (pss) + *pss << "unable to parse positive integer '" << s << "'"; + return -1; + } + return r; +} + +Monitor::Monitor(CephContext* cct_, string nm, MonitorDBStore *s, + Messenger *m, MonMap *map) : + Dispatcher(cct_), + name(nm), + rank(-1), + messenger(m), + con_self(m ? m->get_loopback_connection() : NULL), + lock("Monitor::lock"), + timer(cct_, lock), + has_ever_joined(false), + logger(NULL), cluster_logger(NULL), cluster_logger_registered(false), + monmap(map), + clog(cct_, messenger, monmap, LogClient::FLAG_MON), + key_server(cct, &keyring), + auth_cluster_required(cct, + cct->_conf->auth_supported.length() ? + cct->_conf->auth_supported : cct->_conf->auth_cluster_required), + auth_service_required(cct, + cct->_conf->auth_supported.length() ? 
+ cct->_conf->auth_supported : cct->_conf->auth_service_required), + store(s), + + state(STATE_PROBING), + + elector(this), + leader(0), + quorum_features(0), + scrub_version(0), + + // sync state + sync_provider_count(0), + sync_cookie(0), + sync_full(false), + sync_start_version(0), + sync_timeout_event(NULL), + sync_last_committed_floor(0), + + timecheck_round(0), + timecheck_acks(0), + timecheck_event(NULL), + + probe_timeout_event(NULL), + + paxos_service(PAXOS_NUM), + admin_hook(NULL), + routed_request_tid(0) +{ + rank = -1; + + paxos = new Paxos(this, "paxos"); + + paxos_service[PAXOS_MDSMAP] = new MDSMonitor(this, paxos, "mdsmap"); + paxos_service[PAXOS_MONMAP] = new MonmapMonitor(this, paxos, "monmap"); + paxos_service[PAXOS_OSDMAP] = new OSDMonitor(this, paxos, "osdmap"); + paxos_service[PAXOS_PGMAP] = new PGMonitor(this, paxos, "pgmap"); + paxos_service[PAXOS_LOG] = new LogMonitor(this, paxos, "logm"); + paxos_service[PAXOS_AUTH] = new AuthMonitor(this, paxos, "auth"); + + health_monitor = new HealthMonitor(this); + config_key_service = new ConfigKeyService(this, paxos); + + mon_caps = new MonCap(); + bool r = mon_caps->parse("allow *", NULL); + assert(r); + + exited_quorum = ceph_clock_now(g_ceph_context); +} + +PaxosService *Monitor::get_paxos_service_by_name(const string& name) +{ + if (name == "mdsmap") + return paxos_service[PAXOS_MDSMAP]; + if (name == "monmap") + return paxos_service[PAXOS_MONMAP]; + if (name == "osdmap") + return paxos_service[PAXOS_OSDMAP]; + if (name == "pgmap") + return paxos_service[PAXOS_PGMAP]; + if (name == "logm") + return paxos_service[PAXOS_LOG]; + if (name == "auth") + return paxos_service[PAXOS_AUTH]; + + assert(0 == "given name does not match known paxos service"); + return NULL; +} + +Monitor::~Monitor() +{ + for (vector::iterator p = paxos_service.begin(); p != paxos_service.end(); ++p) + delete *p; + delete health_monitor; + delete config_key_service; + delete paxos; + assert(session_map.sessions.empty()); + 
delete mon_caps; +} + + +enum { + l_mon_first = 456000, + l_mon_last, +}; + + +class AdminHook : public AdminSocketHook { + Monitor *mon; +public: + AdminHook(Monitor *m) : mon(m) {} + bool call(std::string command, cmdmap_t& cmdmap, std::string format, + bufferlist& out) { + stringstream ss; + mon->do_admin_command(command, cmdmap, format, ss); + out.append(ss); + return true; + } +}; + +void Monitor::do_admin_command(string command, cmdmap_t& cmdmap, string format, + ostream& ss) +{ + Mutex::Locker l(lock); + + boost::scoped_ptr f(new_formatter(format)); + + if (command == "mon_status") { + _mon_status(f.get(), ss); + if (f) + f->flush(ss); + } else if (command == "quorum_status") + _quorum_status(f.get(), ss); + else if (command == "sync_force") { + string validate; + if ((!cmd_getval(g_ceph_context, cmdmap, "validate", validate)) || + (validate != "--yes-i-really-mean-it")) { + ss << "are you SURE? this will mean the monitor store will be erased " + "the next time the monitor is restarted. 
pass " + "'--yes-i-really-mean-it' if you really do."; + return; + } + sync_force(f.get(), ss); + } else if (command.find("add_bootstrap_peer_hint") == 0) + _add_bootstrap_peer_hint(command, cmdmap, ss); + else + assert(0 == "bad AdminSocket command binding"); +} + +void Monitor::handle_signal(int signum) +{ + assert(signum == SIGINT || signum == SIGTERM); + derr << "*** Got Signal " << sys_siglist[signum] << " ***" << dendl; + shutdown(); +} + +CompatSet Monitor::get_supported_features() +{ + CompatSet::FeatureSet ceph_mon_feature_compat; + CompatSet::FeatureSet ceph_mon_feature_ro_compat; + CompatSet::FeatureSet ceph_mon_feature_incompat; + ceph_mon_feature_incompat.insert(CEPH_MON_FEATURE_INCOMPAT_BASE); + ceph_mon_feature_incompat.insert(CEPH_MON_FEATURE_INCOMPAT_SINGLE_PAXOS); + return CompatSet(ceph_mon_feature_compat, ceph_mon_feature_ro_compat, + ceph_mon_feature_incompat); +} + +CompatSet Monitor::get_legacy_features() +{ + CompatSet::FeatureSet ceph_mon_feature_compat; + CompatSet::FeatureSet ceph_mon_feature_ro_compat; + CompatSet::FeatureSet ceph_mon_feature_incompat; + ceph_mon_feature_incompat.insert(CEPH_MON_FEATURE_INCOMPAT_BASE); + return CompatSet(ceph_mon_feature_compat, ceph_mon_feature_ro_compat, + ceph_mon_feature_incompat); +} + +int Monitor::check_features(MonitorDBStore *store) +{ + CompatSet required = get_supported_features(); + CompatSet ondisk; + + bufferlist features; + store->get(MONITOR_NAME, COMPAT_SET_LOC, features); + if (features.length() == 0) { + generic_dout(0) << "WARNING: mon fs missing feature list.\n" + << "Assuming it is old-style and introducing one." << dendl; + //we only want the baseline ~v.18 features assumed to be on disk. + //If new features are introduced this code needs to disappear or + //be made smarter. 
+ ondisk = get_legacy_features(); + + bufferlist bl; + ondisk.encode(bl); + MonitorDBStore::Transaction t; + t.put(MONITOR_NAME, COMPAT_SET_LOC, bl); + store->apply_transaction(t); + } else { + bufferlist::iterator it = features.begin(); + ondisk.decode(it); + } + + if (!required.writeable(ondisk)) { + CompatSet diff = required.unsupported(ondisk); + generic_derr << "ERROR: on disk data includes unsupported features: " << diff << dendl; + return -EPERM; + } + + return 0; +} + +void Monitor::read_features() +{ + bufferlist bl; + store->get(MONITOR_NAME, COMPAT_SET_LOC, bl); + assert(bl.length()); + + bufferlist::iterator p = bl.begin(); + ::decode(features, p); + dout(10) << "features " << features << dendl; +} + +void Monitor::write_features(MonitorDBStore::Transaction &t) +{ + bufferlist bl; + features.encode(bl); + t.put(MONITOR_NAME, COMPAT_SET_LOC, bl); +} + +int Monitor::preinit() +{ + lock.Lock(); + + dout(1) << "preinit fsid " << monmap->fsid << dendl; + + assert(!logger); + { + PerfCountersBuilder pcb(g_ceph_context, "mon", l_mon_first, l_mon_last); + // ... 
+ logger = pcb.create_perf_counters(); + cct->get_perfcounters_collection()->add(logger); + } + + assert(!cluster_logger); + { + PerfCountersBuilder pcb(g_ceph_context, "cluster", l_cluster_first, l_cluster_last); + pcb.add_u64(l_cluster_num_mon, "num_mon"); + pcb.add_u64(l_cluster_num_mon_quorum, "num_mon_quorum"); + pcb.add_u64(l_cluster_num_osd, "num_osd"); + pcb.add_u64(l_cluster_num_osd_up, "num_osd_up"); + pcb.add_u64(l_cluster_num_osd_in, "num_osd_in"); + pcb.add_u64(l_cluster_osd_epoch, "osd_epoch"); + pcb.add_u64(l_cluster_osd_kb, "osd_kb"); + pcb.add_u64(l_cluster_osd_kb_used, "osd_kb_used"); + pcb.add_u64(l_cluster_osd_kb_avail, "osd_kb_avail"); + pcb.add_u64(l_cluster_num_pool, "num_pool"); + pcb.add_u64(l_cluster_num_pg, "num_pg"); + pcb.add_u64(l_cluster_num_pg_active_clean, "num_pg_active_clean"); + pcb.add_u64(l_cluster_num_pg_active, "num_pg_active"); + pcb.add_u64(l_cluster_num_pg_peering, "num_pg_peering"); + pcb.add_u64(l_cluster_num_object, "num_object"); + pcb.add_u64(l_cluster_num_object_degraded, "num_object_degraded"); + pcb.add_u64(l_cluster_num_object_unfound, "num_object_unfound"); + pcb.add_u64(l_cluster_num_bytes, "num_bytes"); + pcb.add_u64(l_cluster_num_mds_up, "num_mds_up"); + pcb.add_u64(l_cluster_num_mds_in, "num_mds_in"); + pcb.add_u64(l_cluster_num_mds_failed, "num_mds_failed"); + pcb.add_u64(l_cluster_mds_epoch, "mds_epoch"); + cluster_logger = pcb.create_perf_counters(); + } + + // verify cluster_uuid + { + int r = check_fsid(); + if (r == -ENOENT) + r = write_fsid(); + if (r < 0) { + lock.Unlock(); + return r; + } + } + + // open compatset + read_features(); + + // have we ever joined a quorum? + has_ever_joined = (store->get(MONITOR_NAME, "joined") != 0); + dout(10) << "has_ever_joined = " << (int)has_ever_joined << dendl; + + if (!has_ever_joined) { + // impose initial quorum restrictions? 
+ list initial_members; + get_str_list(g_conf->mon_initial_members, initial_members); + + if (!initial_members.empty()) { + dout(1) << " initial_members " << initial_members << ", filtering seed monmap" << dendl; + + monmap->set_initial_members(g_ceph_context, initial_members, name, messenger->get_myaddr(), + &extra_probe_peers); + + dout(10) << " monmap is " << *monmap << dendl; + dout(10) << " extra probe peers " << extra_probe_peers << dendl; + } + } + + { + // We have a potentially inconsistent store state in hands. Get rid of it + // and start fresh. + bool clear_store = false; + if (store->exists("mon_sync", "in_sync")) { + dout(1) << __func__ << " clean up potentially inconsistent store state" + << dendl; + clear_store = true; + } + + if (store->get("mon_sync", "force_sync") > 0) { + dout(1) << __func__ << " force sync by clearing store state" << dendl; + clear_store = true; + } + + if (clear_store) { + set sync_prefixes = get_sync_targets_names(); + store->clear(sync_prefixes); + } + } + + sync_last_committed_floor = store->get("mon_sync", "last_committed_floor"); + dout(10) << "sync_last_committed_floor " << sync_last_committed_floor << dendl; + + init_paxos(); + health_monitor->init(); + + // we need to bootstrap authentication keys so we can form an + // initial quorum. + if (authmon()->get_last_committed() == 0) { + dout(10) << "loading initial keyring to bootstrap authentication for mkfs" << dendl; + bufferlist bl; + store->get("mkfs", "keyring", bl); + KeyRing keyring; + bufferlist::iterator p = bl.begin(); + ::decode(keyring, p); + extract_save_mon_key(keyring); + } + + string keyring_loc = g_conf->mon_data + "/keyring"; + + int r = keyring.load(cct, keyring_loc); + if (r < 0) { + EntityName mon_name; + mon_name.set_type(CEPH_ENTITY_TYPE_MON); + EntityAuth mon_key; + if (key_server.get_auth(mon_name, mon_key)) { + dout(1) << "copying mon. 
key from old db to external keyring" << dendl; + keyring.add(mon_name, mon_key); + bufferlist bl; + keyring.encode_plaintext(bl); + write_default_keyring(bl); + } else { + derr << "unable to load initial keyring " << g_conf->keyring << dendl; + lock.Unlock(); + return r; + } + } + + admin_hook = new AdminHook(this); + AdminSocket* admin_socket = cct->get_admin_socket(); + + // unlock while registering to avoid mon_lock -> admin socket lock dependency. + lock.Unlock(); + r = admin_socket->register_command("mon_status", "mon_status", admin_hook, + "show current monitor status"); + assert(r == 0); + r = admin_socket->register_command("quorum_status", "quorum_status", + admin_hook, "show current quorum status"); + assert(r == 0); + r = admin_socket->register_command("sync_force", + "sync_force name=validate," + "type=CephChoices," + "strings=--yes-i-really-mean-it", + admin_hook, + "force sync of and clear monitor store"); + assert(r == 0); + r = admin_socket->register_command("add_bootstrap_peer_hint", + "add_bootstrap_peer_hint name=addr," + "type=CephIPAddr", + admin_hook, + "add peer address as potential bootstrap" + " peer for cluster bringup"); + assert(r == 0); + lock.Lock(); + + lock.Unlock(); + return 0; +} + +int Monitor::init() +{ + dout(2) << "init" << dendl; + lock.Lock(); + + // start ticker + timer.init(); + new_tick(); + + // i'm ready! 
+ messenger->add_dispatcher_tail(this); + + bootstrap(); + + lock.Unlock(); + return 0; +} + +void Monitor::init_paxos() +{ + dout(10) << __func__ << dendl; + paxos->init(); + + // init services + for (int i = 0; i < PAXOS_NUM; ++i) { + paxos_service[i]->init(); + } + + refresh_from_paxos(NULL); +} + +void Monitor::refresh_from_paxos(bool *need_bootstrap) +{ + dout(10) << __func__ << dendl; + for (int i = 0; i < PAXOS_NUM; ++i) { + paxos_service[i]->refresh(need_bootstrap); + } + for (int i = 0; i < PAXOS_NUM; ++i) { + paxos_service[i]->post_paxos_update(); + } +} + +void Monitor::register_cluster_logger() +{ + if (!cluster_logger_registered) { + dout(10) << "register_cluster_logger" << dendl; + cluster_logger_registered = true; + cct->get_perfcounters_collection()->add(cluster_logger); + } else { + dout(10) << "register_cluster_logger - already registered" << dendl; + } +} + +void Monitor::unregister_cluster_logger() +{ + if (cluster_logger_registered) { + dout(10) << "unregister_cluster_logger" << dendl; + cluster_logger_registered = false; + cct->get_perfcounters_collection()->remove(cluster_logger); + } else { + dout(10) << "unregister_cluster_logger - not registered" << dendl; + } +} + +void Monitor::update_logger() +{ + cluster_logger->set(l_cluster_num_mon, monmap->size()); + cluster_logger->set(l_cluster_num_mon_quorum, quorum.size()); +} + +void Monitor::shutdown() +{ + dout(1) << "shutdown" << dendl; + lock.Lock(); + + state = STATE_SHUTDOWN; + + if (admin_hook) { + AdminSocket* admin_socket = cct->get_admin_socket(); + admin_socket->unregister_command("mon_status"); + admin_socket->unregister_command("quorum_status"); + admin_socket->unregister_command("sync_force"); + admin_socket->unregister_command("add_bootstrap_peer_hint"); + delete admin_hook; + admin_hook = NULL; + } + + elector.shutdown(); + + if (logger) { + cct->get_perfcounters_collection()->remove(logger); + delete logger; + logger = NULL; + } + if (cluster_logger) { + if 
(cluster_logger_registered) + cct->get_perfcounters_collection()->remove(cluster_logger); + delete cluster_logger; + cluster_logger = NULL; + } + + // clean up + paxos->shutdown(); + for (vector::iterator p = paxos_service.begin(); p != paxos_service.end(); ++p) + (*p)->shutdown(); + health_monitor->shutdown(); + + finish_contexts(g_ceph_context, waitfor_quorum, -ECANCELED); + finish_contexts(g_ceph_context, maybe_wait_for_quorum, -ECANCELED); + + timer.shutdown(); + + remove_all_sessions(); + + // unlock before msgr shutdown... + lock.Unlock(); + + messenger->shutdown(); // last thing! ceph_mon.cc will delete mon. +} + +void Monitor::bootstrap() +{ + dout(10) << "bootstrap" << dendl; + + sync_reset_requester(); + unregister_cluster_logger(); + cancel_probe_timeout(); + + // note my rank + int newrank = monmap->get_rank(messenger->get_myaddr()); + if (newrank < 0 && rank >= 0) { + // was i ever part of the quorum? + if (has_ever_joined) { + dout(0) << " removed from monmap, suicide." << dendl; + exit(0); + } + } + if (newrank != rank) { + dout(0) << " my rank is now " << newrank << " (was " << rank << ")" << dendl; + messenger->set_myname(entity_name_t::MON(newrank)); + rank = newrank; + + // reset all connections, or else our peers will think we are someone else. + messenger->mark_down_all(); + } + + // reset + state = STATE_PROBING; + + _reset(); + + // sync store + if (g_conf->mon_compact_on_bootstrap) { + dout(10) << "bootstrap -- triggering compaction" << dendl; + store->compact(); + dout(10) << "bootstrap -- finished compaction" << dendl; + } + + // singleton monitor? 
+ if (monmap->size() == 1 && rank == 0) { + win_standalone_election(); + return; + } + + reset_probe_timeout(); + + // i'm outside the quorum + if (monmap->contains(name)) + outside_quorum.insert(name); + + // probe monitors + dout(10) << "probing other monitors" << dendl; + for (unsigned i = 0; i < monmap->size(); i++) { + if ((int)i != rank) + messenger->send_message(new MMonProbe(monmap->fsid, MMonProbe::OP_PROBE, name, has_ever_joined), + monmap->get_inst(i)); + } + for (set::iterator p = extra_probe_peers.begin(); + p != extra_probe_peers.end(); + ++p) { + if (*p != messenger->get_myaddr()) { + entity_inst_t i; + i.name = entity_name_t::MON(-1); + i.addr = *p; + messenger->send_message(new MMonProbe(monmap->fsid, MMonProbe::OP_PROBE, name, has_ever_joined), i); + } + } +} + +void Monitor::_add_bootstrap_peer_hint(string cmd, cmdmap_t& cmdmap, ostream& ss) +{ + string addrstr; + cmd_getval(g_ceph_context, cmdmap, "addr", addrstr); + dout(10) << "_add_bootstrap_peer_hint '" << cmd << "' '" + << addrstr << "'" << dendl; + + entity_addr_t addr; + const char *end = 0; + if (!addr.parse(addrstr.c_str(), &end)) { + ss << "failed to parse addr '" << addrstr << "'; syntax is 'add_bootstrap_peer_hint ip[:port]'"; + return; + } + + if (is_leader() || is_peon()) { + ss << "mon already active; ignoring bootstrap hint"; + return; + } + + if (addr.get_port() == 0) + addr.set_port(CEPH_MON_PORT); + + extra_probe_peers.insert(addr); + ss << "adding peer " << addr << " to list: " << extra_probe_peers; +} + +// called by bootstrap(), or on leader|peon -> electing +void Monitor::_reset() +{ + dout(10) << __func__ << dendl; + + assert(state == STATE_ELECTING || + state == STATE_PROBING); + + cancel_probe_timeout(); + timecheck_finish(); + + leader_since = utime_t(); + if (!quorum.empty()) { + exited_quorum = ceph_clock_now(g_ceph_context); + } + quorum.clear(); + outside_quorum.clear(); + + scrub_reset(); + + paxos->restart(); + + for (vector::iterator p = paxos_service.begin(); p 
!= paxos_service.end(); ++p) + (*p)->restart(); + health_monitor->finish(); +} + + +// ----------------------------------------------------------- +// sync + +set Monitor::get_sync_targets_names() +{ + set targets; + targets.insert(paxos->get_name()); + for (int i = 0; i < PAXOS_NUM; ++i) + paxos_service[i]->get_store_prefixes(targets); + + return targets; +} + + +void Monitor::sync_timeout() +{ + dout(10) << __func__ << dendl; + assert(state == STATE_SYNCHRONIZING); + bootstrap(); +} + +void Monitor::sync_obtain_latest_monmap(bufferlist &bl) +{ + dout(1) << __func__ << dendl; + + MonMap latest_monmap; + + // Grab latest monmap from MonmapMonitor + bufferlist monmon_bl; + int err = monmon()->get_monmap(monmon_bl); + if (err < 0) { + if (err != -ENOENT) { + derr << __func__ + << " something wrong happened while reading the store: " + << cpp_strerror(err) << dendl; + assert(0 == "error reading the store"); + } + } else { + latest_monmap.decode(monmon_bl); + } + + // Grab last backed up monmap (if any) and compare epochs + if (store->exists("mon_sync", "latest_monmap")) { + bufferlist backup_bl; + int err = store->get("mon_sync", "latest_monmap", backup_bl); + if (err < 0) { + assert(err != -ENOENT); + derr << __func__ + << " something wrong happened while reading the store: " + << cpp_strerror(err) << dendl; + assert(0 == "error reading the store"); + } + assert(backup_bl.length() > 0); + + MonMap backup_monmap; + backup_monmap.decode(backup_bl); + + if (backup_monmap.epoch > latest_monmap.epoch) + latest_monmap = backup_monmap; + } + + // Check if our current monmap's epoch is greater than the one we've + // got so far. 
+ if (monmap->epoch > latest_monmap.epoch) + latest_monmap = *monmap; + + dout(1) << __func__ << " obtained monmap e" << latest_monmap.epoch << dendl; + + latest_monmap.encode(bl, CEPH_FEATURES_ALL); +} + +void Monitor::sync_reset_requester() +{ + dout(10) << __func__ << dendl; + + if (sync_timeout_event) { + timer.cancel_event(sync_timeout_event); + sync_timeout_event = NULL; + } + + sync_provider = entity_inst_t(); + sync_cookie = 0; + sync_full = false; + sync_start_version = 0; +} + +void Monitor::sync_reset_provider() +{ + dout(10) << __func__ << dendl; + sync_providers.clear(); +} + +void Monitor::sync_start(entity_inst_t &other, bool full) +{ + dout(10) << __func__ << " " << other << (full ? " full" : " recent") << dendl; + + assert(state == STATE_PROBING || + state == STATE_SYNCHRONIZING); + state = STATE_SYNCHRONIZING; + + // make sure are not a provider for anyone! + sync_reset_provider(); + + sync_full = full; + + if (sync_full) { + // stash key state, and mark that we are syncing + MonitorDBStore::Transaction t; + sync_stash_critical_state(&t); + t.put("mon_sync", "in_sync", 1); + + sync_last_committed_floor = MAX(sync_last_committed_floor, paxos->get_version()); + dout(10) << __func__ << " marking sync in progress, storing sync_last_committed_floor " + << sync_last_committed_floor << dendl; + t.put("mon_sync", "last_committed_floor", sync_last_committed_floor); + + store->apply_transaction(t); + + assert(g_conf->mon_sync_requester_kill_at != 1); + + // clear the underlying store + set targets = get_sync_targets_names(); + dout(10) << __func__ << " clearing prefixes " << targets << dendl; + store->clear(targets); + + // make sure paxos knows it has been reset. this prevents a + // bootstrap and then different probe reply order from possibly + // deciding a partial or no sync is needed. + paxos->init(); + + assert(g_conf->mon_sync_requester_kill_at != 2); + } + + // assume 'other' as the leader. 
We will update the leader once we receive + // a reply to the sync start. + sync_provider = other; + + sync_reset_timeout(); + + MMonSync *m = new MMonSync(sync_full ? MMonSync::OP_GET_COOKIE_FULL : MMonSync::OP_GET_COOKIE_RECENT); + if (!sync_full) + m->last_committed = paxos->get_version(); + messenger->send_message(m, sync_provider); +} + +void Monitor::sync_stash_critical_state(MonitorDBStore::Transaction *t) +{ + dout(10) << __func__ << dendl; + bufferlist backup_monmap; + sync_obtain_latest_monmap(backup_monmap); + assert(backup_monmap.length() > 0); + t->put("mon_sync", "latest_monmap", backup_monmap); +} + +void Monitor::sync_reset_timeout() +{ + dout(10) << __func__ << dendl; + if (sync_timeout_event) + timer.cancel_event(sync_timeout_event); + sync_timeout_event = new C_SyncTimeout(this); + timer.add_event_after(g_conf->mon_sync_timeout, sync_timeout_event); +} + +void Monitor::sync_finish(version_t last_committed) +{ + dout(10) << __func__ << " lc " << last_committed << " from " << sync_provider << dendl; + + assert(g_conf->mon_sync_requester_kill_at != 7); + + if (sync_full) { + // finalize the paxos commits + MonitorDBStore::Transaction tx; + paxos->read_and_prepare_transactions(&tx, sync_start_version, last_committed); + tx.put(paxos->get_name(), "last_committed", last_committed); + + dout(30) << __func__ << " final tx dump:\n"; + JSONFormatter f(true); + tx.dump(&f); + f.flush(*_dout); + *_dout << dendl; + + store->apply_transaction(tx); + } + + assert(g_conf->mon_sync_requester_kill_at != 8); + + MonitorDBStore::Transaction t; + t.erase("mon_sync", "in_sync"); + t.erase("mon_sync", "force_sync"); + t.erase("mon_sync", "last_committed_floor"); + store->apply_transaction(t); + + assert(g_conf->mon_sync_requester_kill_at != 9); + + init_paxos(); + + assert(g_conf->mon_sync_requester_kill_at != 10); + + bootstrap(); +} + +void Monitor::handle_sync(MMonSync *m) +{ + dout(10) << __func__ << " " << *m << dendl; + switch (m->op) { + + // provider --------- 
+ + case MMonSync::OP_GET_COOKIE_FULL: + case MMonSync::OP_GET_COOKIE_RECENT: + handle_sync_get_cookie(m); + break; + case MMonSync::OP_GET_CHUNK: + handle_sync_get_chunk(m); + break; + + // client ----------- + + case MMonSync::OP_COOKIE: + handle_sync_cookie(m); + break; + + case MMonSync::OP_CHUNK: + case MMonSync::OP_LAST_CHUNK: + handle_sync_chunk(m); + break; + case MMonSync::OP_NO_COOKIE: + handle_sync_no_cookie(m); + break; + + default: + dout(0) << __func__ << " unknown op " << m->op << dendl; + assert(0 == "unknown op"); + } + m->put(); +} + +// leader + +void Monitor::_sync_reply_no_cookie(MMonSync *m) +{ + MMonSync *reply = new MMonSync(MMonSync::OP_NO_COOKIE, m->cookie); + messenger->send_message(reply, m->get_connection()); +} + +void Monitor::handle_sync_get_cookie(MMonSync *m) +{ + if (is_synchronizing()) { + _sync_reply_no_cookie(m); + return; + } + + assert(g_conf->mon_sync_provider_kill_at != 1); + + // make up a unique cookie. include election epoch (which persists + // across restarts for the whole cluster) and a counter for this + // process instance. there is no need to be unique *across* + // monitors, though. 
/**
 * Provider side: answer an OP_GET_CHUNK request.
 *
 * Replies with OP_NO_COOKIE if the cookie is unknown or if the requester's
 * last_committed is older than our first committed paxos version.
 * Otherwise builds a transaction holding paxos versions after
 * sp.last_committed (and, for a full sync, a chunk from the store
 * synchronizer) and sends it as OP_CHUNK, or OP_LAST_CHUNK when nothing
 * more remains.
 *
 * @param m the request; only read here (handle_sync() drops the reference).
 */
void Monitor::handle_sync_get_chunk(MMonSync *m)
{
  dout(10) << __func__ << " " << *m << dendl;

  // unknown cookie: tell the requester so
  if (sync_providers.count(m->cookie) == 0) {
    dout(10) << __func__ << " no cookie " << m->cookie << dendl;
    _sync_reply_no_cookie(m);
    return;
  }

  assert(g_conf->mon_sync_provider_kill_at != 2);

  // sp refers to the map entry; it must not be used after that entry is erased
  SyncProvider& sp = sync_providers[m->cookie];
  sp.reset_timeout(g_ceph_context, g_conf->mon_sync_timeout * 2);

  // the versions the requester still needs are older than our first
  // committed version: drop the provider entry and refuse
  if (sp.last_committed < paxos->get_first_committed() &&
      paxos->get_first_committed() > 1) {
    dout(10) << __func__ << " sync requester fell behind paxos, their lc " << sp.last_committed
	     << " < our fc " << paxos->get_first_committed() << dendl;
    sync_providers.erase(m->cookie);
    _sync_reply_no_cookie(m);
    return;
  }

  MMonSync *reply = new MMonSync(MMonSync::OP_CHUNK, sp.cookie);
  MonitorDBStore::Transaction tx;

  // remaining payload budget; may go negative after the last value added
  int left = g_conf->mon_sync_max_payload_size;
  while (sp.last_committed < paxos->get_version() && left > 0) {
    bufferlist bl;
    sp.last_committed++;
    store->get(paxos->get_name(), sp.last_committed, bl);
    tx.put(paxos->get_name(), sp.last_committed, bl);
    left -= bl.length();
    dout(20) << __func__ << " including paxos state " << sp.last_committed << dendl;
  }
  reply->last_committed = sp.last_committed;

  // full sync: spend whatever budget is left on store keys
  if (sp.full && left > 0) {
    sp.synchronizer->get_chunk_tx(tx, left);
    sp.last_key = sp.synchronizer->get_last_key();
    reply->last_key = sp.last_key;
  }

  if ((sp.full && sp.synchronizer->has_next_chunk()) ||
      sp.last_committed < paxos->get_version()) {
    dout(10) << __func__ << " chunk, through version " << sp.last_committed << " key " << sp.last_key << dendl;
  } else {
    dout(10) << __func__ << " last chunk, through version " << sp.last_committed << " key " << sp.last_key << dendl;
    reply->op = MMonSync::OP_LAST_CHUNK;

    assert(g_conf->mon_sync_provider_kill_at != 3);

    // clean up our local state (sp is not used past this point)
    sync_providers.erase(sp.cookie);
  }

  ::encode(tx, reply->chunk_bl);

  messenger->send_message(reply, m->get_connection());
}
/**
 * Requester side: apply a chunk received from the sync provider.
 *
 * Chunks whose cookie or source does not match the current sync are
 * discarded.  The encoded transaction is applied to the store; for a
 * "recent" (non-full) sync the newly received paxos versions are also
 * applied immediately.  OP_CHUNK requests the next chunk, OP_LAST_CHUNK
 * finishes the sync.
 *
 * @param m the chunk message; only read here (handle_sync() drops the
 *          reference).
 */
void Monitor::handle_sync_chunk(MMonSync *m)
{
  dout(10) << __func__ << " " << *m << dendl;

  if (m->cookie != sync_cookie) {
    dout(10) << __func__ << " cookie does not match, discarding" << dendl;
    return;
  }
  if (m->get_source_inst() != sync_provider) {
    dout(10) << __func__ << " source does not match, discarding" << dendl;
    return;
  }

  assert(state == STATE_SYNCHRONIZING);
  assert(g_conf->mon_sync_requester_kill_at != 5);

  MonitorDBStore::Transaction tx;
  tx.append_from_encoded(m->chunk_bl);

  // NOTE(review): the statements up to dendl write through _dout, which is
  // presumably introduced by the dout() macro -- confirm against its definition.
  dout(30) << __func__ << " tx dump:\n";
  JSONFormatter f(true);
  tx.dump(&f);
  f.flush(*_dout);
  *_dout << dendl;

  store->apply_transaction(tx);

  assert(g_conf->mon_sync_requester_kill_at != 6);

  if (!sync_full) {
    dout(10) << __func__ << " applying recent paxos transactions as we go" << dendl;
    // this tx (and f below) shadow the outer ones, which were already applied
    MonitorDBStore::Transaction tx;
    paxos->read_and_prepare_transactions(&tx, paxos->get_version() + 1, m->last_committed);
    tx.put(paxos->get_name(), "last_committed", m->last_committed);

    dout(30) << __func__ << " tx dump:\n";
    JSONFormatter f(true);
    tx.dump(&f);
    f.flush(*_dout);
    *_dout << dendl;

    store->apply_transaction(tx);
    paxos->init();  // to refresh what we just wrote
  }

  // any op other than OP_CHUNK / OP_LAST_CHUNK falls through with no action
  if (m->op == MMonSync::OP_CHUNK) {
    sync_reset_timeout();
    sync_get_next_chunk();
  } else if (m->op == MMonSync::OP_LAST_CHUNK) {
    sync_finish(m->last_committed);
  }
}
p->second.cookie << " for " << p->second.entity << dendl; + sync_providers.erase(p++); + } else { + ++p; + } + } +} + +// --------------------------------------------------- +// probe + +void Monitor::cancel_probe_timeout() +{ + if (probe_timeout_event) { + dout(10) << "cancel_probe_timeout " << probe_timeout_event << dendl; + timer.cancel_event(probe_timeout_event); + probe_timeout_event = NULL; + } else { + dout(10) << "cancel_probe_timeout (none scheduled)" << dendl; + } +} + +void Monitor::reset_probe_timeout() +{ + cancel_probe_timeout(); + probe_timeout_event = new C_ProbeTimeout(this); + double t = g_conf->mon_probe_timeout; + timer.add_event_after(t, probe_timeout_event); + dout(10) << "reset_probe_timeout " << probe_timeout_event << " after " << t << " seconds" << dendl; +} + +void Monitor::probe_timeout(int r) +{ + dout(4) << "probe_timeout " << probe_timeout_event << dendl; + assert(is_probing() || is_synchronizing()); + assert(probe_timeout_event); + probe_timeout_event = NULL; + bootstrap(); +} + +void Monitor::handle_probe(MMonProbe *m) +{ + dout(10) << "handle_probe " << *m << dendl; + + if (m->fsid != monmap->fsid) { + dout(0) << "handle_probe ignoring fsid " << m->fsid << " != " << monmap->fsid << dendl; + m->put(); + return; + } + + switch (m->op) { + case MMonProbe::OP_PROBE: + handle_probe_probe(m); + break; + + case MMonProbe::OP_REPLY: + handle_probe_reply(m); + break; + + default: + m->put(); + } +} + +/** + * @todo fix this. This is going to cause trouble. 
+ */ +void Monitor::handle_probe_probe(MMonProbe *m) +{ + dout(10) << "handle_probe_probe " << m->get_source_inst() << *m << dendl; + MMonProbe *r = new MMonProbe(monmap->fsid, MMonProbe::OP_REPLY, name, has_ever_joined); + r->name = name; + r->quorum = quorum; + monmap->encode(r->monmap_bl, m->get_connection()->get_features()); + r->paxos_first_version = paxos->get_first_committed(); + r->paxos_last_version = paxos->get_version(); + messenger->send_message(r, m->get_connection()); + + // did we discover a peer here? + if (!monmap->contains(m->get_source_addr())) { + dout(1) << " adding peer " << m->get_source_addr() << " to list of hints" << dendl; + extra_probe_peers.insert(m->get_source_addr()); + } + m->put(); +} + +void Monitor::handle_probe_reply(MMonProbe *m) +{ + dout(10) << "handle_probe_reply " << m->get_source_inst() << *m << dendl; + dout(10) << " monmap is " << *monmap << dendl; + + // discover name and addrs during probing or electing states. + if (!is_probing() && !is_electing()) { + m->put(); + return; + } + + // newer map, or they've joined a quorum and we haven't? + bufferlist mybl; + monmap->encode(mybl, m->get_connection()->get_features()); + // make sure it's actually different; the checks below err toward + // taking the other guy's map, which could cause us to loop. + if (!mybl.contents_equal(m->monmap_bl)) { + MonMap *newmap = new MonMap; + newmap->decode(m->monmap_bl); + if (m->has_ever_joined && (newmap->get_epoch() > monmap->get_epoch() || + !has_ever_joined)) { + dout(10) << " got newer/committed monmap epoch " << newmap->get_epoch() + << ", mine was " << monmap->get_epoch() << dendl; + delete newmap; + monmap->decode(m->monmap_bl); + m->put(); + + bootstrap(); + return; + } + delete newmap; + } + + // rename peer? 
+ string peer_name = monmap->get_name(m->get_source_addr()); + if (monmap->get_epoch() == 0 && peer_name.find("noname-") == 0) { + dout(10) << " renaming peer " << m->get_source_addr() << " " + << peer_name << " -> " << m->name << " in my monmap" + << dendl; + monmap->rename(peer_name, m->name); + + if (is_electing()) { + m->put(); + bootstrap(); + return; + } + } else { + dout(10) << " peer name is " << peer_name << dendl; + } + + // new initial peer? + if (monmap->contains(m->name)) { + if (monmap->get_addr(m->name).is_blank_ip()) { + dout(1) << " learned initial mon " << m->name << " addr " << m->get_source_addr() << dendl; + monmap->set_addr(m->name, m->get_source_addr()); + m->put(); + + bootstrap(); + return; + } + } + + // end discover phase + if (!is_probing()) { + m->put(); + return; + } + + assert(paxos != NULL); + + if (is_synchronizing()) { + dout(10) << " currently syncing" << dendl; + m->put(); + return; + } + + entity_inst_t other = m->get_source_inst(); + + if (m->paxos_last_version < sync_last_committed_floor) { + dout(10) << " peer paxos versions [" << m->paxos_first_version + << "," << m->paxos_last_version << "] < my sync_last_committed_floor " + << sync_last_committed_floor << ", ignoring" + << dendl; + } else { + if (paxos->get_version() < m->paxos_first_version && + m->paxos_first_version > 1) { // no need to sync if we're 0 and they start at 1. 
+ dout(10) << " peer paxos versions [" << m->paxos_first_version + << "," << m->paxos_last_version << "]" + << " vs my version " << paxos->get_version() + << " (too far ahead)" + << dendl; + cancel_probe_timeout(); + sync_start(other, true); + m->put(); + return; + } + if (paxos->get_version() + g_conf->paxos_max_join_drift < m->paxos_last_version) { + dout(10) << " peer paxos version " << m->paxos_last_version + << " vs my version " << paxos->get_version() + << " (too far ahead)" + << dendl; + cancel_probe_timeout(); + sync_start(other, false); + m->put(); + return; + } + } + + // is there an existing quorum? + if (m->quorum.size()) { + dout(10) << " existing quorum " << m->quorum << dendl; + + dout(10) << " peer paxos version " << m->paxos_last_version + << " vs my version " << paxos->get_version() + << " (ok)" + << dendl; + + if (monmap->contains(name) && + !monmap->get_addr(name).is_blank_ip()) { + // i'm part of the cluster; just initiate a new election + start_election(); + } else { + dout(10) << " ready to join, but i'm not in the monmap or my addr is blank, trying to join" << dendl; + messenger->send_message(new MMonJoin(monmap->fsid, name, messenger->get_myaddr()), + monmap->get_inst(*m->quorum.begin())); + } + } else { + if (monmap->contains(m->name)) { + dout(10) << " mon." << m->name << " is outside the quorum" << dendl; + outside_quorum.insert(m->name); + } else { + dout(10) << " mostly ignoring mon." 
<< m->name << ", not part of monmap" << dendl; + m->put(); + return; + } + + unsigned need = monmap->size() / 2 + 1; + dout(10) << " outside_quorum now " << outside_quorum << ", need " << need << dendl; + if (outside_quorum.size() >= need) { + if (outside_quorum.count(name)) { + dout(10) << " that's enough to form a new quorum, calling election" << dendl; + start_election(); + } else { + dout(10) << " that's enough to form a new quorum, but it does not include me; waiting" << dendl; + } + } else { + dout(10) << " that's not yet enough for a new quorum, waiting" << dendl; + } + } + m->put(); +} + +void Monitor::join_election() +{ + dout(10) << __func__ << dendl; + state = STATE_ELECTING; + _reset(); +} + +void Monitor::start_election() +{ + dout(10) << "start_election" << dendl; + state = STATE_ELECTING; + _reset(); + + cancel_probe_timeout(); + + clog.info() << "mon." << name << " calling new monitor election\n"; + elector.call_election(); +} + +void Monitor::win_standalone_election() +{ + dout(1) << "win_standalone_election" << dendl; + + // bump election epoch, in case the previous epoch included other + // monitors; we need to be able to make the distinction. + elector.advance_epoch(); + + rank = monmap->get_rank(name); + assert(rank == 0); + set q; + q.insert(rank); + win_election(1, q, CEPH_FEATURES_ALL); +} + +const utime_t& Monitor::get_leader_since() const +{ + assert(state == STATE_LEADER); + return leader_since; +} + +epoch_t Monitor::get_epoch() +{ + return elector.get_epoch(); +} + +void Monitor::win_election(epoch_t epoch, set& active, uint64_t features) +{ + dout(10) << __func__ << " epoch " << epoch << " quorum " << active + << " features " << features << dendl; + assert(is_electing()); + state = STATE_LEADER; + leader_since = ceph_clock_now(g_ceph_context); + leader = rank; + quorum = active; + quorum_features = features; + outside_quorum.clear(); + + clog.info() << "mon." 
<< name << "@" << rank + << " won leader election with quorum " << quorum << "\n"; + + paxos->leader_init(); + for (vector::iterator p = paxos_service.begin(); p != paxos_service.end(); ++p) + (*p)->election_finished(); + health_monitor->start(epoch); + + finish_election(); + if (monmap->size() > 1) + timecheck_start(); +} + +void Monitor::lose_election(epoch_t epoch, set &q, int l, uint64_t features) +{ + state = STATE_PEON; + leader_since = utime_t(); + leader = l; + quorum = q; + outside_quorum.clear(); + quorum_features = features; + dout(10) << "lose_election, epoch " << epoch << " leader is mon" << leader + << " quorum is " << quorum << " features are " << quorum_features << dendl; + + paxos->peon_init(); + for (vector::iterator p = paxos_service.begin(); p != paxos_service.end(); ++p) + (*p)->election_finished(); + health_monitor->start(epoch); + + finish_election(); +} + +void Monitor::finish_election() +{ + timecheck_finish(); + exited_quorum = utime_t(); + finish_contexts(g_ceph_context, waitfor_quorum); + finish_contexts(g_ceph_context, maybe_wait_for_quorum); + resend_routed_requests(); + update_logger(); + register_cluster_logger(); + + // am i named properly? 
+ string cur_name = monmap->get_name(messenger->get_myaddr()); + if (cur_name != name) { + dout(10) << " renaming myself from " << cur_name << " -> " << name << dendl; + messenger->send_message(new MMonJoin(monmap->fsid, name, messenger->get_myaddr()), + monmap->get_inst(*quorum.begin())); + } +} + + +void Monitor::sync_force(Formatter *f, ostream& ss) +{ + bool free_formatter = false; + + if (!f) { + // louzy/lazy hack: default to json if no formatter has been defined + f = new JSONFormatter(); + free_formatter = true; + } + + MonitorDBStore::Transaction tx; + sync_stash_critical_state(&tx); + tx.put("mon_sync", "force_sync", 1); + store->apply_transaction(tx); + + f->open_object_section("sync_force"); + f->dump_int("ret", 0); + f->dump_stream("msg") << "forcing store sync the next time the monitor starts"; + f->close_section(); // sync_force + f->flush(ss); + if (free_formatter) + delete f; +} + +void Monitor::_quorum_status(Formatter *f, ostream& ss) +{ + bool free_formatter = false; + + if (!f) { + // louzy/lazy hack: default to json if no formatter has been defined + f = new JSONFormatter(); + free_formatter = true; + } + f->open_object_section("quorum_status"); + f->dump_int("election_epoch", get_epoch()); + + f->open_array_section("quorum"); + for (set::iterator p = quorum.begin(); p != quorum.end(); ++p) + f->dump_int("mon", *p); + f->close_section(); // quorum + + set quorum_names = get_quorum_names(); + f->open_array_section("quorum_names"); + for (set::iterator p = quorum_names.begin(); p != quorum_names.end(); ++p) + f->dump_string("mon", *p); + f->close_section(); // quorum_names + + f->dump_string("quorum_leader_name", quorum.empty() ? 
string() : monmap->get_name(*quorum.begin())); + + f->open_object_section("monmap"); + monmap->dump(f); + f->close_section(); // monmap + + f->close_section(); // quorum_status + f->flush(ss); + if (free_formatter) + delete f; +} + +void Monitor::_mon_status(Formatter *f, ostream& ss) +{ + bool free_formatter = false; + + if (!f) { + // louzy/lazy hack: default to json if no formatter has been defined + f = new JSONFormatter(); + free_formatter = true; + } + + f->open_object_section("mon_status"); + f->dump_string("name", name); + f->dump_int("rank", rank); + f->dump_string("state", get_state_name()); + f->dump_int("election_epoch", get_epoch()); + + f->open_array_section("quorum"); + for (set::iterator p = quorum.begin(); p != quorum.end(); ++p) { + f->dump_int("mon", *p); + } + + f->close_section(); // quorum + + f->open_array_section("outside_quorum"); + for (set::iterator p = outside_quorum.begin(); p != outside_quorum.end(); ++p) + f->dump_string("mon", *p); + f->close_section(); // outside_quorum + + f->open_array_section("extra_probe_peers"); + for (set::iterator p = extra_probe_peers.begin(); + p != extra_probe_peers.end(); + ++p) + f->dump_stream("peer") << *p; + f->close_section(); // extra_probe_peers + + f->open_array_section("sync_provider"); + for (map::const_iterator p = sync_providers.begin(); + p != sync_providers.end(); + ++p) { + f->dump_unsigned("cookie", p->second.cookie); + f->dump_stream("entity") << p->second.entity; + f->dump_stream("timeout") << p->second.timeout; + f->dump_unsigned("last_committed", p->second.last_committed); + f->dump_stream("last_key") << p->second.last_key; + } + f->close_section(); + + if (is_synchronizing()) { + f->open_object_section("sync"); + f->dump_stream("sync_provider") << sync_provider; + f->dump_unsigned("sync_cookie", sync_cookie); + f->dump_unsigned("sync_start_version", sync_start_version); + f->close_section(); + } + + if (g_conf->mon_sync_provider_kill_at > 0) + f->dump_int("provider_kill_at", 
g_conf->mon_sync_provider_kill_at); + if (g_conf->mon_sync_requester_kill_at > 0) + f->dump_int("requester_kill_at", g_conf->mon_sync_requester_kill_at); + + f->open_object_section("monmap"); + monmap->dump(f); + f->close_section(); + + f->close_section(); // mon_status + + if (free_formatter) { + // flush formatter to ss and delete it iff we created the formatter + f->flush(ss); + delete f; + } +} + +void Monitor::get_health(string& status, bufferlist *detailbl, Formatter *f) +{ + list > summary; + list > detail; + + if (f) + f->open_object_section("health"); + + for (vector::iterator p = paxos_service.begin(); + p != paxos_service.end(); + ++p) { + PaxosService *s = *p; + s->get_health(summary, detailbl ? &detail : NULL); + } + + if (f) + f->open_array_section("summary"); + stringstream ss; + health_status_t overall = HEALTH_OK; + if (!summary.empty()) { + if (f) { + f->open_object_section("item"); + f->dump_stream("severity") << summary.front().first; + f->dump_string("summary", summary.front().second); + f->close_section(); + } + ss << ' '; + while (!summary.empty()) { + if (overall > summary.front().first) + overall = summary.front().first; + ss << summary.front().second; + summary.pop_front(); + if (!summary.empty()) + ss << "; "; + } + } + if (f) + f->close_section(); + + if (f) { + f->open_object_section("timechecks"); + f->dump_int("epoch", get_epoch()); + f->dump_int("round", timecheck_round); + f->dump_stream("round_status") + << ((timecheck_round%2) ? 
"on-going" : "finished"); + } + + if (!timecheck_skews.empty()) { + list warns; + if (f) + f->open_array_section("mons"); + for (map::iterator i = timecheck_skews.begin(); + i != timecheck_skews.end(); ++i) { + entity_inst_t inst = i->first; + double skew = i->second; + double latency = timecheck_latencies[inst]; + string name = monmap->get_name(inst.addr); + + ostringstream tcss; + health_status_t tcstatus = timecheck_status(tcss, skew, latency); + if (tcstatus != HEALTH_OK) { + if (overall > tcstatus) + overall = tcstatus; + warns.push_back(name); + + ostringstream tmp_ss; + tmp_ss << "mon." << name + << " addr " << inst.addr << " " << tcss.str() + << " (latency " << latency << "s)"; + detail.push_back(make_pair(tcstatus, tmp_ss.str())); + } + + if (f) { + f->open_object_section("mon"); + f->dump_string("name", name.c_str()); + f->dump_float("skew", skew); + f->dump_float("latency", latency); + f->dump_stream("health") << tcstatus; + if (tcstatus != HEALTH_OK) + f->dump_stream("details") << tcss.str(); + f->close_section(); + } + } + if (!warns.empty()) { + if (!ss.str().empty()) + ss << ";"; + ss << " clock skew detected on"; + while (!warns.empty()) { + ss << " mon." << warns.front(); + warns.pop_front(); + if (!warns.empty()) + ss << ","; + } + } + if (f) + f->close_section(); + } + if (f) + f->close_section(); + + health_status_t hmstatus = health_monitor->get_health(f, (detailbl ? 
&detail : NULL)); + if (overall > hmstatus) + overall = hmstatus; + + stringstream fss; + fss << overall; + status = fss.str() + ss.str(); + if (f) + f->dump_stream("overall_status") << overall; + + if (f) + f->open_array_section("detail"); + while (!detail.empty()) { + if (f) + f->dump_string("item", detail.front().second); + else if (detailbl != NULL) { + detailbl->append(detail.front().second); + detailbl->append('\n'); + } + detail.pop_front(); + } + if (f) + f->close_section(); + + if (f) + f->close_section(); +} + +void Monitor::get_status(stringstream &ss, Formatter *f) +{ + if (f) + f->open_object_section("status"); + + // reply with the status for all the components + string health; + get_health(health, NULL, f); + + if (f) { + f->dump_stream("fsid") << monmap->get_fsid(); + f->dump_unsigned("election_epoch", get_epoch()); + { + f->open_array_section("quorum"); + for (set::iterator p = quorum.begin(); p != quorum.end(); ++p) + f->dump_int("rank", *p); + f->close_section(); + f->open_array_section("quorum_names"); + for (set::iterator p = quorum.begin(); p != quorum.end(); ++p) + f->dump_string("id", monmap->get_name(*p)); + f->close_section(); + } + f->open_object_section("monmap"); + monmap->dump(f); + f->close_section(); + f->open_object_section("osdmap"); + osdmon()->osdmap.print_summary(f, cout); + f->close_section(); + f->open_object_section("pgmap"); + pgmon()->pg_map.print_summary(f, NULL); + f->close_section(); + f->open_object_section("mdsmap"); + mdsmon()->mdsmap.print_summary(f, NULL); + f->close_section(); + f->close_section(); + } else { + ss << " cluster " << monmap->get_fsid() << "\n"; + ss << " health " << health << "\n"; + ss << " monmap " << *monmap << ", election epoch " << get_epoch() + << ", quorum " << get_quorum() << " " << get_quorum_names() << "\n"; + if (mdsmon()->mdsmap.get_epoch() > 1) + ss << " mdsmap " << mdsmon()->mdsmap << "\n"; + osdmon()->osdmap.print_summary(NULL, ss); + pgmon()->pg_map.print_summary(NULL, &ss); + } +} + 
+#undef COMMAND +MonCommand mon_commands[] = { +#define COMMAND(parsesig, helptext, modulename, req_perms, avail) \ + {parsesig, helptext, modulename, req_perms, avail}, +#include +}; + +bool Monitor::_allowed_command(MonSession *s, string &module, string &prefix, + map& cmdmap) { + + map strmap; + for (map::const_iterator p = cmdmap.begin(); + p != cmdmap.end(); ++p) { + if (p->first == "prefix") + continue; + if (p->first == "caps") { + vector cv; + if (cmd_getval(g_ceph_context, cmdmap, "caps", cv) && + cv.size() % 2 == 0) { + for (unsigned i = 0; i < cv.size(); i += 2) { + string k = string("caps_") + cv[i]; + strmap[k] = cv[i + 1]; + } + continue; + } + } + strmap[p->first] = cmd_vartype_stringify(p->second); + } + + MonCommand *this_cmd = NULL; + for (MonCommand *cp = mon_commands; + cp < &mon_commands[ARRAY_SIZE(mon_commands)]; cp++) { + if (cp->cmdstring.find(prefix) != string::npos) { + this_cmd = cp; + break; + } + } + assert(this_cmd != NULL); + bool cmd_r = (this_cmd->req_perms.find('r') != string::npos); + bool cmd_w = (this_cmd->req_perms.find('w') != string::npos); + bool cmd_x = (this_cmd->req_perms.find('x') != string::npos); + + bool capable = s->caps.is_capable(g_ceph_context, s->inst.name, + module, prefix, strmap, + cmd_r, cmd_w, cmd_x); + + dout(10) << __func__ << " " << (capable ? 
"" : "not ") << "capable" << dendl; + return capable; +} + +void get_command_descriptions(const MonCommand *commands, + unsigned commands_size, + Formatter *f, + bufferlist *rdata) { + int cmdnum = 0; + f->open_object_section("command_descriptions"); + for (const MonCommand *cp = commands; + cp < &commands[commands_size]; cp++) { + + ostringstream secname; + secname << "cmd" << setfill('0') << std::setw(3) << cmdnum; + dump_cmddesc_to_json(f, secname.str(), + cp->cmdstring, cp->helpstring, cp->module, + cp->req_perms, cp->availability); + cmdnum++; + } + f->close_section(); // command_descriptions + + f->flush(*rdata); +} + +void Monitor::handle_command(MMonCommand *m) +{ + if (m->fsid != monmap->fsid) { + dout(0) << "handle_command on fsid " << m->fsid << " != " << monmap->fsid << dendl; + reply_command(m, -EPERM, "wrong fsid", 0); + return; + } + + MonSession *session = m->get_session(); + if (!session) { + string rs = "Access denied"; + reply_command(m, -EACCES, rs, 0); + return; + } + + if (m->cmd.empty()) { + string rs = "No command supplied"; + reply_command(m, -EINVAL, rs, 0); + return; + } + + string prefix; + vector fullcmd; + map cmdmap; + stringstream ss, ds; + bufferlist rdata; + string rs; + int r = -EINVAL; + rs = "unrecognized command"; + + if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) { + // ss has reason for failure + r = -EINVAL; + rs = ss.str(); + if (!m->get_source().is_mon()) // don't reply to mon->mon commands + reply_command(m, r, rs, 0); + else + m->put(); + return; + } + + cmd_getval(g_ceph_context, cmdmap, "prefix", prefix); + if (prefix == "get_command_descriptions") { + bufferlist rdata; + Formatter *f = new_formatter("json"); + get_command_descriptions(mon_commands, ARRAY_SIZE(mon_commands), f, &rdata); + delete f; + reply_command(m, 0, "", rdata, 0); + return; + } + + string module; + string err; + + dout(0) << "handle_command " << *m << dendl; + + string format; + cmd_getval(g_ceph_context, cmdmap, "format", format, string("plain")); + 
boost::scoped_ptr f(new_formatter(format)); + + get_str_vec(prefix, fullcmd); + module = fullcmd[0]; + + if (!_allowed_command(session, module, prefix, cmdmap)) { + dout(1) << __func__ << " access denied" << dendl; + reply_command(m, -EACCES, "access denied", 0); + return; + } + + if (module == "mds") { + mdsmon()->dispatch(m); + return; + } + if (module == "osd") { + osdmon()->dispatch(m); + return; + } + + if (module == "pg") { + pgmon()->dispatch(m); + return; + } + if (module == "mon") { + monmon()->dispatch(m); + return; + } + if (module == "auth") { + authmon()->dispatch(m); + return; + } + if (module == "log") { + logmon()->dispatch(m); + return; + } + + if (module == "config-key") { + config_key_service->dispatch(m); + return; + } + + if (prefix == "fsid") { + if (f) { + f->open_object_section("fsid"); + f->dump_stream("fsid") << monmap->fsid; + f->close_section(); + f->flush(rdata); + } else { + ds << monmap->fsid; + rdata.append(ds); + } + reply_command(m, 0, "", rdata, 0); + return; + } + + if (prefix == "scrub") { + if (is_leader()) { + int r = scrub(); + reply_command(m, r, "", rdata, 0); + } else if (is_peon()) { + forward_request_leader(m); + } else { + reply_command(m, -EAGAIN, "no quorum", rdata, 0); + } + return; + } + + if (prefix == "compact") { + dout(1) << "triggering manual compaction" << dendl; + utime_t start = ceph_clock_now(g_ceph_context); + store->compact(); + utime_t end = ceph_clock_now(g_ceph_context); + end -= start; + dout(1) << "finished manual compaction in " << end << " seconds" << dendl; + ostringstream oss; + oss << "compacted leveldb in " << end; + rs = oss.str(); + r = 0; + } + else if (prefix == "injectargs") { + vector injected_args; + cmd_getval(g_ceph_context, cmdmap, "injected_args", injected_args); + if (!injected_args.empty()) { + dout(0) << "parsing injected options '" << injected_args << "'" << dendl; + ostringstream oss; + r = g_conf->injectargs(str_join(injected_args, " "), &oss); + ss << "injectargs:" << 
oss.str(); + rs = ss.str(); + goto out; + } else { + rs = "must supply options to be parsed in a single string"; + r = -EINVAL; + } + } else if (prefix == "status" || + prefix == "health" || + prefix == "df") { + string detail; + cmd_getval(g_ceph_context, cmdmap, "detail", detail); + + if (prefix == "status") { + // get_status handles f == NULL + get_status(ds, f.get()); + + if (f) { + f->flush(ds); + ds << '\n'; + } + rdata.append(ds); + } else if (prefix == "health") { + string health_str; + get_health(health_str, detail == "detail" ? &rdata : NULL, f.get()); + if (f) { + f->flush(ds); + ds << '\n'; + } else { + ds << health_str; + } + bufferlist comb; + comb.append(ds); + if (detail == "detail") + comb.append(rdata); + rdata = comb; + r = 0; + } else if (prefix == "df") { + bool verbose = (detail == "detail"); + if (f) + f->open_object_section("stats"); + + pgmon()->dump_fs_stats(ds, f.get(), verbose); + if (!f) + ds << '\n'; + pgmon()->dump_pool_stats(ds, f.get(), verbose); + + if (f) { + f->close_section(); + f->flush(ds); + ds << '\n'; + } + } else { + assert(0 == "We should never get here!"); + return; + } + rdata.append(ds); + rs = ""; + r = 0; + } else if (prefix == "report") { + + // this must be formatted, in its current form + if (!f) + f.reset(new_formatter("json-pretty")); + f->open_object_section("report"); + f->dump_string("version", ceph_version_to_str()); + f->dump_string("commit", git_version_to_str()); + f->dump_stream("timestamp") << ceph_clock_now(NULL); + + vector tagsvec; + cmd_getval(g_ceph_context, cmdmap, "tags", tagsvec); + string tagstr = str_join(tagsvec, " "); + if (!tagstr.empty()) + tagstr = tagstr.substr(0, tagstr.find_last_of(' ')); + f->dump_string("tag", tagstr); + + string hs; + get_health(hs, NULL, f.get()); + + monmon()->dump_info(f.get()); + osdmon()->dump_info(f.get()); + mdsmon()->dump_info(f.get()); + pgmon()->dump_info(f.get()); + authmon()->dump_info(f.get()); + + paxos->dump_info(f.get()); + + f->close_section(); + 
f->flush(rdata); + + ostringstream ss2; + ss2 << "report " << rdata.crc32c(6789); + rs = ss2.str(); + r = 0; + } else if (prefix == "quorum_status") { + // make sure our map is readable and up to date + if (!is_leader() && !is_peon()) { + dout(10) << " waiting for quorum" << dendl; + waitfor_quorum.push_back(new C_RetryMessage(this, m)); + return; + } + _quorum_status(f.get(), ds); + rdata.append(ds); + rs = ""; + r = 0; + } else if (prefix == "mon_status") { + _mon_status(f.get(), ds); + if (f) + f->flush(ds); + rdata.append(ds); + rs = ""; + r = 0; + } else if (prefix == "sync force") { + string validate1, validate2; + cmd_getval(g_ceph_context, cmdmap, "validate1", validate1); + cmd_getval(g_ceph_context, cmdmap, "validate2", validate2); + if (validate1 != "--yes-i-really-mean-it" || + validate2 != "--i-know-what-i-am-doing") { + r = -EINVAL; + rs = "are you SURE? this will mean the monitor store will be " + "erased. pass '--yes-i-really-mean-it " + "--i-know-what-i-am-doing' if you really do."; + goto out; + } + sync_force(f.get(), ds); + rs = ds.str(); + r = 0; + } else if (prefix == "heap") { + if (!ceph_using_tcmalloc()) + rs = "tcmalloc not enabled, can't use heap profiler commands\n"; + else { + string heapcmd; + cmd_getval(g_ceph_context, cmdmap, "heapcmd", heapcmd); + // XXX 1-element vector, change at callee or make vector here? 
+ vector heapcmd_vec; + get_str_vec(heapcmd, heapcmd_vec); + ceph_heap_profiler_handle_command(heapcmd_vec, ds); + rdata.append(ds); + rs = ""; + r = 0; + } + } else if (prefix == "quorum") { + string quorumcmd; + cmd_getval(g_ceph_context, cmdmap, "quorumcmd", quorumcmd); + if (quorumcmd == "exit") { + start_election(); + elector.stop_participating(); + rs = "stopped responding to quorum, initiated new election"; + r = 0; + } else if (quorumcmd == "enter") { + elector.start_participating(); + start_election(); + rs = "started responding to quorum, initiated new election"; + r = 0; + } + } + + out: + if (!m->get_source().is_mon()) // don't reply to mon->mon commands + reply_command(m, r, rs, rdata, 0); + else + m->put(); +} + +void Monitor::reply_command(MMonCommand *m, int rc, const string &rs, version_t version) +{ + bufferlist rdata; + reply_command(m, rc, rs, rdata, version); +} + +void Monitor::reply_command(MMonCommand *m, int rc, const string &rs, bufferlist& rdata, version_t version) +{ + MMonCommandAck *reply = new MMonCommandAck(m->cmd, rc, rs, version); + reply->set_tid(m->get_tid()); + reply->set_data(rdata); + send_reply(m, reply); + m->put(); +} + + +// ------------------------ +// request/reply routing +// +// a client/mds/osd will connect to a random monitor. we need to forward any +// messages requiring state updates to the leader, and then route any replies +// back via the correct monitor and back to them. (the monitor will not +// initiate any connections.) 
+ +void Monitor::forward_request_leader(PaxosServiceMessage *req) +{ + int mon = get_leader(); + MonSession *session = 0; + if (req->get_connection()) + session = static_cast(req->get_connection()->get_priv()); + if (req->get_source().is_mon() && req->get_source_addr() != messenger->get_myaddr()) { + dout(10) << "forward_request won't forward (non-local) mon request " << *req << dendl; + req->put(); + } else if (session && session->proxy_con) { + dout(10) << "forward_request won't double fwd request " << *req << dendl; + req->put(); + } else if (session && !session->closed) { + RoutedRequest *rr = new RoutedRequest; + rr->tid = ++routed_request_tid; + rr->client_inst = req->get_source_inst(); + rr->con = req->get_connection(); + encode_message(req, CEPH_FEATURES_ALL, rr->request_bl); // for my use only; use all features + rr->session = static_cast(session->get()); + routed_requests[rr->tid] = rr; + session->routed_request_tids.insert(rr->tid); + + dout(10) << "forward_request " << rr->tid << " request " << *req << dendl; + + MForward *forward = new MForward(rr->tid, req, rr->session->caps); + forward->set_priority(req->get_priority()); + messenger->send_message(forward, monmap->get_inst(mon)); + } else { + dout(10) << "forward_request no session for request " << *req << dendl; + req->put(); + } + if (session) + session->put(); +} + +//extract the original message and put it into the regular dispatch function +void Monitor::handle_forward(MForward *m) +{ + dout(10) << "received forwarded message from " << m->client + << " via " << m->get_source_inst() << dendl; + MonSession *session = static_cast(m->get_connection()->get_priv()); + assert(session); + + if (!session->is_capable("mon", MON_CAP_X)) { + dout(0) << "forward from entity with insufficient caps! 
" + << session->caps << dendl; + } else { + Connection *c = new Connection(NULL); // msgr must be null; see PaxosService::dispatch() + MonSession *s = new MonSession(m->msg->get_source_inst(), c); + c->set_priv(s); + c->set_peer_addr(m->client.addr); + c->set_peer_type(m->client.name.type()); + + s->caps = m->client_caps; + dout(10) << " caps are " << s->caps << dendl; + s->proxy_con = m->get_connection(); + s->proxy_tid = m->tid; + + PaxosServiceMessage *req = m->msg; + m->msg = NULL; // so ~MForward doesn't delete it + req->set_connection(c); + + // not super accurate, but better than nothing. + req->set_recv_stamp(m->get_recv_stamp()); + + /* + * note which election epoch this is; we will drop the message if + * there is a future election since our peers will resend routed + * requests in that case. + */ + req->rx_election_epoch = get_epoch(); + + /* Because this is a special fake connection, we need to break + the ref loop between Connection and MonSession differently + than we normally do. Here, the Message refers to the Connection + which refers to the Session, and nobody else refers to the Connection + or the Session. And due to the special nature of this message, + nobody refers to the Connection via the Session. So, clear out that + half of the ref loop.*/ + s->con.reset(NULL); + + dout(10) << " mesg " << req << " from " << m->get_source_addr() << dendl; + + _ms_dispatch(req); + } + session->put(); + m->put(); +} + +void Monitor::try_send_message(Message *m, const entity_inst_t& to) +{ + dout(10) << "try_send_message " << *m << " to " << to << dendl; + + bufferlist bl; + encode_message(m, CEPH_FEATURES_ALL, bl); // fixme: assume peers have all features we do. 
+ + messenger->send_message(m, to); + + for (int i=0; i<(int)monmap->size(); i++) { + if (i != rank) + messenger->send_message(new MRoute(bl, to), monmap->get_inst(i)); + } +} + +void Monitor::send_reply(PaxosServiceMessage *req, Message *reply) +{ + MonSession *session = static_cast(req->get_connection()->get_priv()); + if (!session) { + dout(2) << "send_reply no session, dropping reply " << *reply + << " to " << req << " " << *req << dendl; + reply->put(); + return; + } + if (session->proxy_con) { + dout(15) << "send_reply routing reply to " << req->get_connection()->get_peer_addr() + << " via " << session->proxy_con->get_peer_addr() + << " for request " << *req << dendl; + messenger->send_message(new MRoute(session->proxy_tid, reply), + session->proxy_con); + } else { + messenger->send_message(reply, session->con); + } + session->put(); +} + +void Monitor::no_reply(PaxosServiceMessage *req) +{ + MonSession *session = static_cast(req->get_connection()->get_priv()); + if (!session) { + dout(2) << "no_reply no session, dropping non-reply to " << req << " " << *req << dendl; + return; + } + if (session->proxy_con) { + if (get_quorum_features() & CEPH_FEATURE_MON_NULLROUTE) { + dout(10) << "no_reply to " << req->get_source_inst() + << " via " << session->proxy_con->get_peer_addr() + << " for request " << *req << dendl; + messenger->send_message(new MRoute(session->proxy_tid, NULL), + session->proxy_con); + } else { + dout(10) << "no_reply no quorum nullroute feature for " << req->get_source_inst() + << " via " << session->proxy_con->get_peer_addr() + << " for request " << *req << dendl; + } + } else { + dout(10) << "no_reply to " << req->get_source_inst() << " " << *req << dendl; + } + session->put(); +} + +void Monitor::handle_route(MRoute *m) +{ + MonSession *session = static_cast(m->get_connection()->get_priv()); + //check privileges + if (session && !session->is_capable("mon", MON_CAP_X)) { + dout(0) << "MRoute received from entity without appropriate perms! 
" + << dendl; + session->put(); + m->put(); + return; + } + if (m->msg) + dout(10) << "handle_route " << *m->msg << " to " << m->dest << dendl; + else + dout(10) << "handle_route null to " << m->dest << dendl; + + // look it up + if (m->session_mon_tid) { + if (routed_requests.count(m->session_mon_tid)) { + RoutedRequest *rr = routed_requests[m->session_mon_tid]; + + // reset payload, in case encoding is dependent on target features + if (m->msg) { + m->msg->clear_payload(); + messenger->send_message(m->msg, rr->con); + m->msg = NULL; + } + routed_requests.erase(m->session_mon_tid); + rr->session->routed_request_tids.insert(rr->tid); + delete rr; + } else { + dout(10) << " don't have routed request tid " << m->session_mon_tid << dendl; + } + } else { + dout(10) << " not a routed request, trying to send anyway" << dendl; + if (m->msg) { + messenger->lazy_send_message(m->msg, m->dest); + m->msg = NULL; + } + } + m->put(); + if (session) + session->put(); +} + +void Monitor::resend_routed_requests() +{ + dout(10) << "resend_routed_requests" << dendl; + int mon = get_leader(); + list retry; + for (map::iterator p = routed_requests.begin(); + p != routed_requests.end(); + ++p) { + RoutedRequest *rr = p->second; + + bufferlist::iterator q = rr->request_bl.begin(); + PaxosServiceMessage *req = (PaxosServiceMessage *)decode_message(cct, q); + + if (mon == rank) { + dout(10) << " requeue for self tid " << rr->tid << " " << *req << dendl; + req->set_connection(rr->con); + retry.push_back(new C_RetryMessage(this, req)); + delete rr; + } else { + dout(10) << " resend to mon." 
<< mon << " tid " << rr->tid << " " << *req << dendl; + MForward *forward = new MForward(rr->tid, req, rr->session->caps); + forward->client = rr->client_inst; + forward->set_priority(req->get_priority()); + messenger->send_message(forward, monmap->get_inst(mon)); + } + } + if (mon == rank) { + routed_requests.clear(); + finish_contexts(g_ceph_context, retry); + } +} + +void Monitor::remove_session(MonSession *s) +{ + dout(10) << "remove_session " << s << " " << s->inst << dendl; + assert(!s->closed); + for (set::iterator p = s->routed_request_tids.begin(); + p != s->routed_request_tids.end(); + ++p) { + if (routed_requests.count(*p)) { + RoutedRequest *rr = routed_requests[*p]; + dout(10) << " dropping routed request " << rr->tid << dendl; + delete rr; + routed_requests.erase(*p); + } + } + s->con->set_priv(NULL); + session_map.remove_session(s); +} + +void Monitor::remove_all_sessions() +{ + while (!session_map.sessions.empty()) { + MonSession *s = session_map.sessions.front(); + remove_session(s); + } +} + +void Monitor::send_command(const entity_inst_t& inst, + const vector& com) +{ + dout(10) << "send_command " << inst << "" << com << dendl; + MMonCommand *c = new MMonCommand(monmap->fsid); + c->cmd = com; + try_send_message(c, inst); +} + +void Monitor::waitlist_or_zap_client(Message *m) +{ + /** + * Wait list the new session until we're in the quorum, assuming it's + * sufficiently new. + * tick() will periodically send them back through so we can send + * the client elsewhere if we don't think we're getting back in. + * + * But we whitelist a few sorts of messages: + * 1) Monitors can talk to us at any time, of course. + * 2) auth messages. It's unlikely to go through much faster, but + * it's possible we've just lost our quorum status and we want to take... + * 3) command messages. We want to accept these under all possible + * circumstances. 
+ */ + ConnectionRef con = m->get_connection(); + utime_t too_old = ceph_clock_now(g_ceph_context); + too_old -= g_ceph_context->_conf->mon_lease; + if (m->get_recv_stamp() > too_old && + con->is_connected()) { + dout(5) << "waitlisting message " << *m << dendl; + maybe_wait_for_quorum.push_back(new C_RetryMessage(this, m)); + } else { + dout(5) << "discarding message " << *m << " and sending client elsewhere" << dendl; + messenger->mark_down(con); + m->put(); + } +} + +bool Monitor::_ms_dispatch(Message *m) +{ + bool ret = true; + + if (is_shutdown()) { + m->put(); + return true; + } + + ConnectionRef connection = m->get_connection(); + MonSession *s = NULL; + MonCap caps; + EntityName entity_name; + bool src_is_mon; + + // regardless of who we are or who the sender is, the message must + // have a connection associated. If it doesn't then something fishy + // is going on. + assert(connection); + + src_is_mon = (connection->get_peer_type() & CEPH_ENTITY_TYPE_MON); + + bool reuse_caps = false; + dout(20) << "have connection" << dendl; + s = static_cast(connection->get_priv()); + if (s && s->closed) { + caps = s->caps; + reuse_caps = true; + s->put(); + s = NULL; + } + if (!s) { + // if the sender is not a monitor, make sure their first message for a + // session is an MAuth. If it is not, assume it's a stray message, + // and considering that we are creating a new session it is safe to + // assume that the sender hasn't authenticated yet, so we have no way + // of assessing whether we should handle it or not. + if (!src_is_mon && (m->get_type() != CEPH_MSG_AUTH && + m->get_type() != CEPH_MSG_MON_GET_MAP)) { + if (m->get_type() == CEPH_MSG_PING) { + // let it go through and be dispatched immediately! 
+ return dispatch(s, m, false); + } + dout(1) << __func__ << " dropping stray message " << *m + << " from " << m->get_source_inst() << dendl; + return false; + } + + if (!exited_quorum.is_zero() && !src_is_mon) { + waitlist_or_zap_client(m); + return true; + } + + dout(10) << "do not have session, making new one" << dendl; + s = session_map.new_session(m->get_source_inst(), m->get_connection().get()); + m->get_connection()->set_priv(s->get()); + dout(10) << "ms_dispatch new session " << s << " for " << s->inst << dendl; + + if (!src_is_mon) { + dout(10) << "setting timeout on session" << dendl; + // set an initial timeout here, so we will trim this session even if they don't + // do anything. + s->until = ceph_clock_now(g_ceph_context); + s->until += g_conf->mon_subscribe_interval; + } else { + //give it monitor caps; the peer type has been authenticated + reuse_caps = false; + dout(5) << "setting monitor caps on this connection" << dendl; + if (!s->caps.is_allow_all()) //but no need to repeatedly copy + s->caps = *mon_caps; + } + if (reuse_caps) + s->caps = caps; + } else { + dout(20) << "ms_dispatch existing session " << s << " for " << s->inst << dendl; + } + + if (s) { + if (s->auth_handler) { + entity_name = s->auth_handler->get_entity_name(); + } + dout(20) << " caps " << s->caps.get_str() << dendl; + } + + if (is_synchronizing() && !src_is_mon) { + waitlist_or_zap_client(m); + return true; + } + + ret = dispatch(s, m, src_is_mon); + + if (s) { + s->put(); + } + + return ret; +} + +bool Monitor::dispatch(MonSession *s, Message *m, const bool src_is_mon) +{ + bool ret = true; + + assert(m != NULL); + + switch (m->get_type()) { + + case MSG_ROUTE: + handle_route(static_cast(m)); + break; + + // misc + case CEPH_MSG_MON_GET_MAP: + handle_mon_get_map(static_cast(m)); + break; + + case CEPH_MSG_MON_GET_VERSION: + handle_get_version(static_cast(m)); + break; + + case MSG_MON_COMMAND: + handle_command(static_cast(m)); + break; + + case CEPH_MSG_MON_SUBSCRIBE: + /* 
FIXME: check what's being subscribed, filter accordingly */ + handle_subscribe(static_cast(m)); + break; + + case MSG_MON_PROBE: + handle_probe(static_cast(m)); + break; + + // Sync (i.e., the new slurp, but on steroids) + case MSG_MON_SYNC: + handle_sync(static_cast(m)); + break; + case MSG_MON_SCRUB: + handle_scrub(static_cast(m)); + break; + + // OSDs + case MSG_OSD_MARK_ME_DOWN: + case MSG_OSD_FAILURE: + case MSG_OSD_BOOT: + case MSG_OSD_ALIVE: + case MSG_OSD_PGTEMP: + paxos_service[PAXOS_OSDMAP]->dispatch((PaxosServiceMessage*)m); + break; + + case MSG_REMOVE_SNAPS: + paxos_service[PAXOS_OSDMAP]->dispatch((PaxosServiceMessage*)m); + break; + + // MDSs + case MSG_MDS_BEACON: + case MSG_MDS_OFFLOAD_TARGETS: + paxos_service[PAXOS_MDSMAP]->dispatch((PaxosServiceMessage*)m); + break; + + // auth + case MSG_MON_GLOBAL_ID: + case CEPH_MSG_AUTH: + /* no need to check caps here */ + paxos_service[PAXOS_AUTH]->dispatch((PaxosServiceMessage*)m); + break; + + // pg + case CEPH_MSG_STATFS: + case MSG_PGSTATS: + case MSG_GETPOOLSTATS: + paxos_service[PAXOS_PGMAP]->dispatch((PaxosServiceMessage*)m); + break; + + case CEPH_MSG_POOLOP: + paxos_service[PAXOS_OSDMAP]->dispatch((PaxosServiceMessage*)m); + break; + + // log + case MSG_LOG: + paxos_service[PAXOS_LOG]->dispatch((PaxosServiceMessage*)m); + break; + + case MSG_LOGACK: + clog.handle_log_ack((MLogAck*)m); + break; + + // monmap + case MSG_MON_JOIN: + paxos_service[PAXOS_MONMAP]->dispatch((PaxosServiceMessage*)m); + break; + + // paxos + case MSG_MON_PAXOS: + { + MMonPaxos *pm = static_cast(m); + if (!src_is_mon || + !s->is_capable("mon", MON_CAP_X)) { + //can't send these! + pm->put(); + break; + } + + if (state == STATE_SYNCHRONIZING) { + // we are synchronizing. These messages would do us no + // good, thus just drop them and ignore them. 
+ dout(10) << __func__ << " ignore paxos msg from " + << pm->get_source_inst() << dendl; + pm->put(); + break; + } + + // sanitize + if (pm->epoch > get_epoch()) { + bootstrap(); + pm->put(); + break; + } + if (pm->epoch != get_epoch()) { + pm->put(); + break; + } + + paxos->dispatch((PaxosServiceMessage*)m); + } + break; + + // elector messages + case MSG_MON_ELECTION: + //check privileges here for simplicity + if (s && + !s->is_capable("mon", MON_CAP_X)) { + dout(0) << "MMonElection received from entity without enough caps!" + << s->caps << dendl; + m->put(); + break; + } + if (!is_probing() && !is_synchronizing()) { + elector.dispatch(m); + } else { + m->put(); + } + break; + + case MSG_FORWARD: + handle_forward(static_cast(m)); + break; + + case MSG_TIMECHECK: + handle_timecheck(static_cast(m)); + break; + + case MSG_MON_HEALTH: + health_monitor->dispatch(static_cast(m)); + break; + + case CEPH_MSG_PING: + handle_ping(static_cast(m)); + break; + + default: + ret = false; + } + + return ret; +} + +void Monitor::handle_ping(MPing *m) +{ + dout(10) << __func__ << " " << *m << dendl; + MPing *reply = new MPing; + entity_inst_t inst = m->get_source_inst(); + bufferlist payload; + Formatter *f = new JSONFormatter(true); + f->open_object_section("pong"); + + string health_str; + get_health(health_str, NULL, f); + { + stringstream ss; + _mon_status(f, ss); + } + + f->close_section(); + stringstream ss; + f->flush(ss); + ::encode(ss.str(), payload); + reply->set_payload(payload); + dout(10) << __func__ << " reply payload len " << reply->get_payload().length() << dendl; + messenger->send_message(reply, inst); + m->put(); +} + +void Monitor::timecheck_start() +{ + dout(10) << __func__ << dendl; + timecheck_cleanup(); + timecheck_start_round(); +} + +void Monitor::timecheck_finish() +{ + dout(10) << __func__ << dendl; + timecheck_cleanup(); +} + +void Monitor::timecheck_start_round() +{ + dout(10) << __func__ << " curr " << timecheck_round << dendl; + assert(is_leader()); 
+ + if (monmap->size() == 1) { + assert(0 == "We are alone; this shouldn't have been scheduled!"); + return; + } + + if (timecheck_round % 2) { + dout(10) << __func__ << " there's a timecheck going on" << dendl; + utime_t curr_time = ceph_clock_now(g_ceph_context); + double max = g_conf->mon_timecheck_interval*3; + if (curr_time - timecheck_round_start > max) { + dout(10) << __func__ << " keep current round going" << dendl; + goto out; + } else { + dout(10) << __func__ + << " finish current timecheck and start new" << dendl; + timecheck_cancel_round(); + } + } + + assert(timecheck_round % 2 == 0); + timecheck_acks = 0; + timecheck_round ++; + timecheck_round_start = ceph_clock_now(g_ceph_context); + dout(10) << __func__ << " new " << timecheck_round << dendl; + + timecheck(); +out: + dout(10) << __func__ << " setting up next event" << dendl; + timecheck_event = new C_TimeCheck(this); + timer.add_event_after(g_conf->mon_timecheck_interval, timecheck_event); +} + +void Monitor::timecheck_finish_round(bool success) +{ + dout(10) << __func__ << " curr " << timecheck_round << dendl; + assert(timecheck_round % 2); + timecheck_round ++; + timecheck_round_start = utime_t(); + + if (success) { + assert(timecheck_waiting.empty()); + assert(timecheck_acks == quorum.size()); + timecheck_report(); + return; + } + + dout(10) << __func__ << " " << timecheck_waiting.size() + << " peers still waiting:"; + for (map::iterator p = timecheck_waiting.begin(); + p != timecheck_waiting.end(); ++p) { + *_dout << " " << p->first.name; + } + *_dout << dendl; + timecheck_waiting.clear(); + + dout(10) << __func__ << " finished to " << timecheck_round << dendl; +} + +void Monitor::timecheck_cancel_round() +{ + timecheck_finish_round(false); +} + +void Monitor::timecheck_cleanup() +{ + timecheck_round = 0; + timecheck_acks = 0; + timecheck_round_start = utime_t(); + + if (timecheck_event) { + timer.cancel_event(timecheck_event); + timecheck_event = NULL; + } + timecheck_waiting.clear(); + 
timecheck_skews.clear(); + timecheck_latencies.clear(); +} + +void Monitor::timecheck_report() +{ + dout(10) << __func__ << dendl; + assert(is_leader()); + assert((timecheck_round % 2) == 0); + if (monmap->size() == 1) { + assert(0 == "We are alone; we shouldn't have gotten here!"); + return; + } + + assert(timecheck_latencies.size() == timecheck_skews.size()); + bool do_output = true; // only output report once + for (set::iterator q = quorum.begin(); q != quorum.end(); ++q) { + if (monmap->get_name(*q) == name) + continue; + + MTimeCheck *m = new MTimeCheck(MTimeCheck::OP_REPORT); + m->epoch = get_epoch(); + m->round = timecheck_round; + + for (map::iterator it = timecheck_skews.begin(); it != timecheck_skews.end(); ++it) { + double skew = it->second; + double latency = timecheck_latencies[it->first]; + + m->skews[it->first] = skew; + m->latencies[it->first] = latency; + + if (do_output) { + dout(25) << __func__ << " " << it->first + << " latency " << latency + << " skew " << skew << dendl; + } + } + do_output = false; + entity_inst_t inst = monmap->get_inst(*q); + dout(10) << __func__ << " send report to " << inst << dendl; + messenger->send_message(m, inst); + } +} + +void Monitor::timecheck() +{ + dout(10) << __func__ << dendl; + assert(is_leader()); + if (monmap->size() == 1) { + assert(0 == "We are alone; we shouldn't have gotten here!"); + return; + } + assert(timecheck_round % 2 != 0); + + timecheck_acks = 1; // we ack ourselves + + dout(10) << __func__ << " start timecheck epoch " << get_epoch() + << " round " << timecheck_round << dendl; + + // we are at the eye of the storm; the point of reference + timecheck_skews[messenger->get_myinst()] = 0.0; + timecheck_latencies[messenger->get_myinst()] = 0.0; + + for (set::iterator it = quorum.begin(); it != quorum.end(); ++it) { + if (monmap->get_name(*it) == name) + continue; + + entity_inst_t inst = monmap->get_inst(*it); + utime_t curr_time = ceph_clock_now(g_ceph_context); + timecheck_waiting[inst] = 
curr_time; + MTimeCheck *m = new MTimeCheck(MTimeCheck::OP_PING); + m->epoch = get_epoch(); + m->round = timecheck_round; + dout(10) << __func__ << " send " << *m << " to " << inst << dendl; + messenger->send_message(m, inst); + } +} + +health_status_t Monitor::timecheck_status(ostringstream &ss, + const double skew_bound, + const double latency) +{ + health_status_t status = HEALTH_OK; + double abs_skew = (skew_bound > 0 ? skew_bound : -skew_bound); + assert(latency >= 0); + + if (abs_skew > g_conf->mon_clock_drift_allowed) { + status = HEALTH_WARN; + ss << "clock skew " << abs_skew << "s" + << " > max " << g_conf->mon_clock_drift_allowed << "s"; + } + + return status; +} + +void Monitor::handle_timecheck_leader(MTimeCheck *m) +{ + dout(10) << __func__ << " " << *m << dendl; + /* handles PONG's */ + assert(m->op == MTimeCheck::OP_PONG); + + entity_inst_t other = m->get_source_inst(); + if (m->epoch < get_epoch()) { + dout(1) << __func__ << " got old timecheck epoch " << m->epoch + << " from " << other + << " curr " << get_epoch() + << " -- severely lagged? discard" << dendl; + return; + } + assert(m->epoch == get_epoch()); + + if (m->round < timecheck_round) { + dout(1) << __func__ << " got old round " << m->round + << " from " << other + << " curr " << timecheck_round << " -- discard" << dendl; + return; + } + + utime_t curr_time = ceph_clock_now(g_ceph_context); + + assert(timecheck_waiting.count(other) > 0); + utime_t timecheck_sent = timecheck_waiting[other]; + timecheck_waiting.erase(other); + if (curr_time < timecheck_sent) { + // our clock was readjusted -- drop everything until it all makes sense. 
+ dout(1) << __func__ << " our clock was readjusted --" + << " bump round and drop current check" + << dendl; + timecheck_cancel_round(); + return; + } + + /* update peer latencies */ + double latency = (double)(curr_time - timecheck_sent); + + if (timecheck_latencies.count(other) == 0) + timecheck_latencies[other] = latency; + else { + double avg_latency = ((timecheck_latencies[other]*0.8)+(latency*0.2)); + timecheck_latencies[other] = avg_latency; + } + + /* + * update skews + * + * some nasty thing goes on if we were to do 'a - b' between two utime_t, + * and 'a' happens to be lower than 'b'; so we use double instead. + * + * latency is always expected to be >= 0. + * + * delta, the difference between theirs timestamp and ours, may either be + * lower or higher than 0; will hardly ever be 0. + * + * The absolute skew is the absolute delta minus the latency, which is + * taken as a whole instead of an rtt given that there is some queueing + * and dispatch times involved and it's hard to assess how long exactly + * it took for the message to travel to the other side and be handled. So + * we call it a bounded skew, the worst case scenario. + * + * Now, to math! + * + * Given that the latency is always positive, we can establish that the + * bounded skew will be: + * + * 1. positive if the absolute delta is higher than the latency and + * delta is positive + * 2. negative if the absolute delta is higher than the latency and + * delta is negative. + * 3. zero if the absolute delta is lower than the latency. + * + * On 3. we make a judgement call and treat the skew as non-existent. + * This is because that, if the absolute delta is lower than the + * latency, then the apparently existing skew is nothing more than a + * side-effect of the high latency at work. 
+ * + * This may not be entirely true though, as a severely skewed clock + * may be masked by an even higher latency, but with high latencies + * we probably have worse issues to deal with than just skewed clocks. + */ + assert(latency >= 0); + + double delta = ((double) m->timestamp) - ((double) curr_time); + double abs_delta = (delta > 0 ? delta : -delta); + double skew_bound = abs_delta - latency; + if (skew_bound < 0) + skew_bound = 0; + else if (delta < 0) + skew_bound = -skew_bound; + + ostringstream ss; + health_status_t status = timecheck_status(ss, skew_bound, latency); + if (status == HEALTH_ERR) + clog.error() << other << " " << ss.str() << "\n"; + else if (status == HEALTH_WARN) + clog.warn() << other << " " << ss.str() << "\n"; + + dout(10) << __func__ << " from " << other << " ts " << m->timestamp + << " delta " << delta << " skew_bound " << skew_bound + << " latency " << latency << dendl; + + if (timecheck_skews.count(other) == 0) { + timecheck_skews[other] = skew_bound; + } else { + timecheck_skews[other] = (timecheck_skews[other]*0.8)+(skew_bound*0.2); + } + + timecheck_acks++; + if (timecheck_acks == quorum.size()) { + dout(10) << __func__ << " got pongs from everybody (" + << timecheck_acks << " total)" << dendl; + assert(timecheck_skews.size() == timecheck_acks); + assert(timecheck_waiting.empty()); + // everyone has acked, so bump the round to finish it. 
+ timecheck_finish_round(); + } +} + +void Monitor::handle_timecheck_peon(MTimeCheck *m) +{ + dout(10) << __func__ << " " << *m << dendl; + + assert(is_peon()); + assert(m->op == MTimeCheck::OP_PING || m->op == MTimeCheck::OP_REPORT); + + if (m->epoch != get_epoch()) { + dout(1) << __func__ << " got wrong epoch " + << "(ours " << get_epoch() + << " theirs: " << m->epoch << ") -- discarding" << dendl; + return; + } + + if (m->round < timecheck_round) { + dout(1) << __func__ << " got old round " << m->round + << " current " << timecheck_round + << " (epoch " << get_epoch() << ") -- discarding" << dendl; + return; + } + + timecheck_round = m->round; + + if (m->op == MTimeCheck::OP_REPORT) { + assert((timecheck_round % 2) == 0); + timecheck_latencies.swap(m->latencies); + timecheck_skews.swap(m->skews); + return; + } + + assert((timecheck_round % 2) != 0); + MTimeCheck *reply = new MTimeCheck(MTimeCheck::OP_PONG); + utime_t curr_time = ceph_clock_now(g_ceph_context); + reply->timestamp = curr_time; + reply->epoch = m->epoch; + reply->round = m->round; + dout(10) << __func__ << " send " << *m + << " to " << m->get_source_inst() << dendl; + messenger->send_message(reply, m->get_connection()); +} + +void Monitor::handle_timecheck(MTimeCheck *m) +{ + dout(10) << __func__ << " " << *m << dendl; + + if (is_leader()) { + if (m->op != MTimeCheck::OP_PONG) { + dout(1) << __func__ << " drop unexpected msg (not pong)" << dendl; + } else { + handle_timecheck_leader(m); + } + } else if (is_peon()) { + if (m->op != MTimeCheck::OP_PING && m->op != MTimeCheck::OP_REPORT) { + dout(1) << __func__ << " drop unexpected msg (not ping or report)" << dendl; + } else { + handle_timecheck_peon(m); + } + } else { + dout(1) << __func__ << " drop unexpected msg" << dendl; + } + m->put(); +} + +void Monitor::handle_subscribe(MMonSubscribe *m) +{ + dout(10) << "handle_subscribe " << *m << dendl; + + bool reply = false; + + MonSession *s = static_cast(m->get_connection()->get_priv()); + if (!s) { + 
dout(10) << " no session, dropping" << dendl; + m->put(); + return; + } + + s->until = ceph_clock_now(g_ceph_context); + s->until += g_conf->mon_subscribe_interval; + for (map::iterator p = m->what.begin(); + p != m->what.end(); + ++p) { + // if there are any non-onetime subscriptions, we need to reply to start the resubscribe timer + if ((p->second.flags & CEPH_SUBSCRIBE_ONETIME) == 0) + reply = true; + + session_map.add_update_sub(s, p->first, p->second.start, + p->second.flags & CEPH_SUBSCRIBE_ONETIME, + m->get_connection()->has_feature(CEPH_FEATURE_INCSUBOSDMAP)); + + if (p->first == "mdsmap") { + if ((int)s->is_capable("mds", MON_CAP_R)) { + mdsmon()->check_sub(s->sub_map["mdsmap"]); + } + } else if (p->first == "osdmap") { + if ((int)s->is_capable("osd", MON_CAP_R)) { + osdmon()->check_sub(s->sub_map["osdmap"]); + } + } else if (p->first == "osd_pg_creates") { + if ((int)s->is_capable("osd", MON_CAP_W)) { + pgmon()->check_sub(s->sub_map["osd_pg_creates"]); + } + } else if (p->first == "monmap") { + check_sub(s->sub_map["monmap"]); + } else if (logmon()->sub_name_to_id(p->first) >= 0) { + logmon()->check_sub(s->sub_map[p->first]); + } + } + + // ??? 
+ + if (reply) + messenger->send_message(new MMonSubscribeAck(monmap->get_fsid(), (int)g_conf->mon_subscribe_interval), + m->get_source_inst()); + + s->put(); + m->put(); +} + +void Monitor::handle_get_version(MMonGetVersion *m) +{ + dout(10) << "handle_get_version " << *m << dendl; + + MonSession *s = static_cast(m->get_connection()->get_priv()); + if (!s) { + dout(10) << " no session, dropping" << dendl; + m->put(); + return; + } + + MMonGetVersionReply *reply = new MMonGetVersionReply(); + reply->handle = m->handle; + if (m->what == "mdsmap") { + reply->version = mdsmon()->mdsmap.get_epoch(); + reply->oldest_version = mdsmon()->get_first_committed(); + } else if (m->what == "osdmap") { + reply->version = osdmon()->osdmap.get_epoch(); + reply->oldest_version = osdmon()->get_first_committed(); + } else if (m->what == "monmap") { + reply->version = monmap->get_epoch(); + reply->oldest_version = monmon()->get_first_committed(); + } else { + derr << "invalid map type " << m->what << dendl; + } + + messenger->send_message(reply, m->get_source_inst()); + + s->put(); + m->put(); +} + +bool Monitor::ms_handle_reset(Connection *con) +{ + dout(10) << "ms_handle_reset " << con << " " << con->get_peer_addr() << dendl; + + // ignore lossless monitor sessions + if (con->get_peer_type() == CEPH_ENTITY_TYPE_MON) + return false; + + MonSession *s = static_cast(con->get_priv()); + if (!s) + return false; + + // break any con <-> session ref cycle + s->con->set_priv(NULL); + + if (is_shutdown()) + return false; + + Mutex::Locker l(lock); + + dout(10) << "reset/close on session " << s->inst << dendl; + if (!s->closed) + remove_session(s); + s->put(); + return true; +} + +void Monitor::check_subs() +{ + string type = "monmap"; + if (session_map.subs.count(type) == 0) + return; + xlist::iterator p = session_map.subs[type]->begin(); + while (!p.end()) { + Subscription *sub = *p; + ++p; + check_sub(sub); + } +} + +void Monitor::check_sub(Subscription *sub) +{ + dout(10) << "check_sub 
monmap next " << sub->next << " have " << monmap->get_epoch() << dendl; + if (sub->next <= monmap->get_epoch()) { + send_latest_monmap(sub->session->con.get()); + if (sub->onetime) + session_map.remove_sub(sub); + else + sub->next = monmap->get_epoch() + 1; + } +} + + +// ----- + +void Monitor::send_latest_monmap(Connection *con) +{ + bufferlist bl; + monmap->encode(bl, con->get_features()); + messenger->send_message(new MMonMap(bl), con); +} + +void Monitor::handle_mon_get_map(MMonGetMap *m) +{ + dout(10) << "handle_mon_get_map" << dendl; + send_latest_monmap(m->get_connection().get()); + m->put(); +} + + + +// ---------------------------------------------- +// scrub + +int Monitor::scrub() +{ + dout(10) << __func__ << dendl; + assert(is_leader()); + + if ((get_quorum_features() & CEPH_FEATURE_MON_SCRUB) == 0) { + clog.warn() << "scrub not supported by entire quorum\n"; + return -EOPNOTSUPP; + } + + if (!scrub_result.empty()) { + clog.info() << "scrub already in progress\n"; + return -EBUSY; + } + + scrub_result.clear(); + scrub_version = paxos->get_version(); + + for (set::iterator p = quorum.begin(); + p != quorum.end(); + ++p) { + if (*p == rank) + continue; + MMonScrub *r = new MMonScrub(MMonScrub::OP_SCRUB, scrub_version); + messenger->send_message(r, monmap->get_inst(*p)); + } + + // scrub my keys + _scrub(&scrub_result[rank]); + + if (scrub_result.size() == quorum.size()) + scrub_finish(); + + return 0; +} + +void Monitor::handle_scrub(MMonScrub *m) +{ + dout(10) << __func__ << " " << *m << dendl; + switch (m->op) { + case MMonScrub::OP_SCRUB: + { + if (!is_peon()) + break; + if (m->version != paxos->get_version()) + break; + MMonScrub *reply = new MMonScrub(MMonScrub::OP_RESULT, m->version); + _scrub(&reply->result); + messenger->send_message(reply, m->get_connection()); + } + break; + + case MMonScrub::OP_RESULT: + { + if (!is_leader()) + break; + if (m->version != scrub_version) + break; + int from = m->get_source().num(); + 
assert(scrub_result.count(from) == 0); + scrub_result[from] = m->result; + + if (scrub_result.size() == quorum.size()) + scrub_finish(); + } + break; + } + m->put(); +} + +void Monitor::_scrub(ScrubResult *r) +{ + set prefixes = get_sync_targets_names(); + prefixes.erase("paxos"); // exclude paxos, as this one may have extra states for proposals, etc. + + dout(10) << __func__ << " prefixes " << prefixes << dendl; + + pair start; + MonitorDBStore::Synchronizer synchronizer = store->get_synchronizer(start, prefixes); + + while (synchronizer->has_next_chunk()) { + pair k = synchronizer->get_next_key(); + bufferlist bl; + store->get(k.first, k.second, bl); + dout(30) << __func__ << " " << k << " bl " << bl.length() << " bytes crc " << bl.crc32c(0) << dendl; + r->prefix_keys[k.first]++; + if (r->prefix_crc.count(k.first) == 0) + r->prefix_crc[k.first] = 0; + r->prefix_crc[k.first] = bl.crc32c(r->prefix_crc[k.first]); + } +} + +void Monitor::scrub_finish() +{ + dout(10) << __func__ << dendl; + + // compare + int errors = 0; + ScrubResult& mine = scrub_result[rank]; + for (map::iterator p = scrub_result.begin(); + p != scrub_result.end(); + ++p) { + if (p->first == rank) + continue; + if (p->second != mine) { + ++errors; + clog.error() << "scrub mismatch" << "\n"; + clog.error() << " mon." << rank << " " << mine << "\n"; + clog.error() << " mon." << p->first << " " << p->second << "\n"; + } + } + if (!errors) + clog.info() << "scrub ok on " << quorum << ": " << mine << "\n"; + + scrub_reset(); +} + +void Monitor::scrub_reset() +{ + dout(10) << __func__ << dendl; + scrub_version = 0; + scrub_result.clear(); +} + + + +/************ TICK ***************/ + +class C_Mon_Tick : public Context { + Monitor *mon; +public: + C_Mon_Tick(Monitor *m) : mon(m) {} + void finish(int r) { + mon->tick(); + } +}; + +void Monitor::new_tick() +{ + C_Mon_Tick *ctx = new C_Mon_Tick(this); + timer.add_event_after(g_conf->mon_tick_interval, ctx); +} + +void Monitor::tick() +{ + // ok go. 
+ dout(11) << "tick" << dendl; + + for (vector::iterator p = paxos_service.begin(); p != paxos_service.end(); ++p) { + (*p)->tick(); + (*p)->maybe_trim(); + } + + // trim sessions + utime_t now = ceph_clock_now(g_ceph_context); + xlist::iterator p = session_map.sessions.begin(); + while (!p.end()) { + MonSession *s = *p; + ++p; + + // don't trim monitors + if (s->inst.name.is_mon()) + continue; + + if (!s->until.is_zero() && s->until < now) { + dout(10) << " trimming session " << s->con << " " << s->inst + << " (until " << s->until << " < now " << now << ")" << dendl; + messenger->mark_down(s->con); + remove_session(s); + } else if (!exited_quorum.is_zero()) { + if (now > (exited_quorum + 2 * g_conf->mon_lease)) { + // boot the client Session because we've taken too long getting back in + dout(10) << " trimming session " << s->con << " " << s->inst + << " because we've been out of quorum too long" << dendl; + messenger->mark_down(s->con); + remove_session(s); + } + } + } + + sync_trim_providers(); + + if (!maybe_wait_for_quorum.empty()) { + finish_contexts(g_ceph_context, maybe_wait_for_quorum); + } + + new_tick(); +} + +int Monitor::check_fsid() +{ + if (!store->exists(MONITOR_NAME, "cluster_uuid")) + return -ENOENT; + + bufferlist ebl; + int r = store->get(MONITOR_NAME, "cluster_uuid", ebl); + assert(r == 0); + + string es(ebl.c_str(), ebl.length()); + + // only keep the first line + size_t pos = es.find_first_of('\n'); + if (pos != string::npos) + es.resize(pos); + + dout(10) << "check_fsid cluster_uuid contains '" << es << "'" << dendl; + uuid_d ondisk; + if (!ondisk.parse(es.c_str())) { + derr << "error: unable to parse uuid" << dendl; + return -EINVAL; + } + + if (monmap->get_fsid() != ondisk) { + derr << "error: cluster_uuid file exists with value " << ondisk + << ", != our uuid " << monmap->get_fsid() << dendl; + return -EEXIST; + } + + return 0; +} + +int Monitor::write_fsid() +{ + MonitorDBStore::Transaction t; + int r = write_fsid(t); + 
store->apply_transaction(t); + return r; +} + +int Monitor::write_fsid(MonitorDBStore::Transaction &t) +{ + ostringstream ss; + ss << monmap->get_fsid() << "\n"; + string us = ss.str(); + + bufferlist b; + b.append(us); + + t.put(MONITOR_NAME, "cluster_uuid", b); + return 0; +} + +/* + * this is the closest thing to a traditional 'mkfs' for ceph. + * initialize the monitor state machines to their initial values. + */ +int Monitor::mkfs(bufferlist& osdmapbl) +{ + MonitorDBStore::Transaction t; + + // verify cluster fsid + int r = check_fsid(); + if (r < 0 && r != -ENOENT) + return r; + + bufferlist magicbl; + magicbl.append(CEPH_MON_ONDISK_MAGIC); + magicbl.append("\n"); + t.put(MONITOR_NAME, "magic", magicbl); + + + features = get_supported_features(); + write_features(t); + + // save monmap, osdmap, keyring. + bufferlist monmapbl; + monmap->encode(monmapbl, CEPH_FEATURES_ALL); + monmap->set_epoch(0); // must be 0 to avoid confusing first MonmapMonitor::update_from_paxos() + t.put("mkfs", "monmap", monmapbl); + + if (osdmapbl.length()) { + // make sure it's a valid osdmap + try { + OSDMap om; + om.decode(osdmapbl); + } + catch (buffer::error& e) { + derr << "error decoding provided osdmap: " << e.what() << dendl; + return -EINVAL; + } + t.put("mkfs", "osdmap", osdmapbl); + } + + KeyRing keyring; + string keyring_filename; + if (!ceph_resolve_file_search(g_conf->keyring, keyring_filename)) { + derr << "unable to find a keyring file on " << g_conf->keyring << dendl; + return -ENOENT; + } + + r = keyring.load(g_ceph_context, keyring_filename); + if (r < 0) { + derr << "unable to load initial keyring " << g_conf->keyring << dendl; + return r; + } + + // put mon. key in external keyring; seed with everything else. 
+ extract_save_mon_key(keyring); + + bufferlist keyringbl; + keyring.encode_plaintext(keyringbl); + t.put("mkfs", "keyring", keyringbl); + write_fsid(t); + store->apply_transaction(t); + + return 0; +} + +int Monitor::write_default_keyring(bufferlist& bl) +{ + ostringstream os; + os << g_conf->mon_data << "/keyring"; + + int err = 0; + int fd = ::open(os.str().c_str(), O_WRONLY|O_CREAT, 0644); + if (fd < 0) { + err = -errno; + dout(0) << __func__ << " failed to open " << os.str() + << ": " << cpp_strerror(err) << dendl; + return err; + } + + err = bl.write_fd(fd); + if (!err) + ::fsync(fd); + ::close(fd); + + return err; +} + +void Monitor::extract_save_mon_key(KeyRing& keyring) +{ + EntityName mon_name; + mon_name.set_type(CEPH_ENTITY_TYPE_MON); + EntityAuth mon_key; + if (keyring.get_auth(mon_name, mon_key)) { + dout(10) << "extract_save_mon_key moving mon. key to separate keyring" << dendl; + KeyRing pkey; + pkey.add(mon_name, mon_key); + bufferlist bl; + pkey.encode_plaintext(bl); + write_default_keyring(bl); + keyring.remove(mon_name); + } +} + +bool Monitor::ms_get_authorizer(int service_id, AuthAuthorizer **authorizer, bool force_new) +{ + dout(10) << "ms_get_authorizer for " << ceph_entity_type_name(service_id) << dendl; + + if (is_shutdown()) + return false; + + // we only connect to other monitors; every else connects to us. 
+ if (service_id != CEPH_ENTITY_TYPE_MON) + return false; + + if (!auth_cluster_required.is_supported_auth(CEPH_AUTH_CEPHX)) + return false; + + CephXServiceTicketInfo auth_ticket_info; + CephXSessionAuthInfo info; + int ret; + EntityName name; + name.set_type(CEPH_ENTITY_TYPE_MON); + + auth_ticket_info.ticket.name = name; + auth_ticket_info.ticket.global_id = 0; + + CryptoKey secret; + if (!keyring.get_secret(name, secret) && + !key_server.get_secret(name, secret)) { + dout(0) << " couldn't get secret for mon service from keyring or keyserver" << dendl; + stringstream ss, ds; + int err = key_server.list_secrets(ds); + if (err < 0) + ss << "no installed auth entries!"; + else + ss << "installed auth entries:"; + dout(0) << ss.str() << "\n" << ds.str() << dendl; + return false; + } + + /* mon to mon authentication uses the private monitor shared key and not the + rotating key */ + ret = key_server.build_session_auth_info(service_id, auth_ticket_info, info, secret, (uint64_t)-1); + if (ret < 0) { + dout(0) << "ms_get_authorizer failed to build session auth_info for use with mon ret " << ret << dendl; + return false; + } + + CephXTicketBlob blob; + if (!cephx_build_service_ticket_blob(cct, info, blob)) { + dout(0) << "ms_get_authorizer failed to build service ticket use with mon" << dendl; + return false; + } + bufferlist ticket_data; + ::encode(blob, ticket_data); + + bufferlist::iterator iter = ticket_data.begin(); + CephXTicketHandler handler(g_ceph_context, service_id); + ::decode(handler.ticket, iter); + + handler.session_key = info.session_key; + + *authorizer = handler.build_authorizer(0); + + return true; +} + +bool Monitor::ms_verify_authorizer(Connection *con, int peer_type, + int protocol, bufferlist& authorizer_data, bufferlist& authorizer_reply, + bool& isvalid, CryptoKey& session_key) +{ + dout(10) << "ms_verify_authorizer " << con->get_peer_addr() + << " " << ceph_entity_type_name(peer_type) + << " protocol " << protocol << dendl; + + if (is_shutdown()) 
+ return false; + + if (peer_type == CEPH_ENTITY_TYPE_MON && + auth_cluster_required.is_supported_auth(CEPH_AUTH_CEPHX)) { + // monitor, and cephx is enabled + isvalid = false; + if (protocol == CEPH_AUTH_CEPHX) { + bufferlist::iterator iter = authorizer_data.begin(); + CephXServiceTicketInfo auth_ticket_info; + + if (authorizer_data.length()) { + int ret = cephx_verify_authorizer(g_ceph_context, &keyring, iter, + auth_ticket_info, authorizer_reply); + if (ret >= 0) { + session_key = auth_ticket_info.session_key; + isvalid = true; + } else { + dout(0) << "ms_verify_authorizer bad authorizer from mon " << con->get_peer_addr() << dendl; + } + } + } else { + dout(0) << "ms_verify_authorizer cephx enabled, but no authorizer (required for mon)" << dendl; + } + } else { + // who cares. + isvalid = true; + } + return true; +}; + +#undef dout_prefix +#define dout_prefix *_dout + +void Monitor::StoreConverter::_convert_finish_features( + MonitorDBStore::Transaction &t) +{ + dout(20) << __func__ << dendl; + + assert(db->exists(MONITOR_NAME, COMPAT_SET_LOC)); + bufferlist features_bl; + db->get(MONITOR_NAME, COMPAT_SET_LOC, features_bl); + assert(features_bl.length()); + + CompatSet features; + bufferlist::iterator p = features_bl.begin(); + features.decode(p); + + assert(features.incompat.contains(CEPH_MON_FEATURE_INCOMPAT_GV)); + features.incompat.remove(CEPH_MON_FEATURE_INCOMPAT_GV); + assert(!features.incompat.contains(CEPH_MON_FEATURE_INCOMPAT_GV)); + + features.incompat.insert(CEPH_MON_FEATURE_INCOMPAT_SINGLE_PAXOS); + assert(features.incompat.contains(CEPH_MON_FEATURE_INCOMPAT_SINGLE_PAXOS)); + + features_bl.clear(); + features.encode(features_bl); + + dout(20) << __func__ << " new features " << features << dendl; + t.put(MONITOR_NAME, COMPAT_SET_LOC, features_bl); +} + + +bool Monitor::StoreConverter::_check_gv_store() +{ + dout(20) << __func__ << dendl; + if (!store->exists_bl_ss(COMPAT_SET_LOC, 0)) + return false; + + bufferlist features_bl; + 
store->get_bl_ss_safe(features_bl, COMPAT_SET_LOC, 0); + if (!features_bl.length()) { + dout(20) << __func__ << " on-disk features length is zero" << dendl; + return false; + } + CompatSet features; + bufferlist::iterator p = features_bl.begin(); + features.decode(p); + return (features.incompat.contains(CEPH_MON_FEATURE_INCOMPAT_GV)); +} + +int Monitor::StoreConverter::needs_conversion() +{ + bufferlist magicbl; + int ret = 0; + + dout(10) << "check if store needs conversion from legacy format" << dendl; + _init(); + + int err = store->mount(); + if (err < 0) { + if (err == -ENOENT) { + derr << "unable to mount monitor store: " + << cpp_strerror(err) << dendl; + } else { + derr << "it appears that another monitor is running: " + << cpp_strerror(err) << dendl; + } + ret = err; + goto out; + } + assert(err == 0); + + if (store->exists_bl_ss("magic", 0)) { + if (_check_gv_store()) { + dout(1) << "found old GV monitor store format " + << "-- should convert!" << dendl; + ret = 1; + } else { + dout(0) << "Existing monitor store has not been converted " + << "to 0.52 (bobtail) format" << dendl; + assert(0 == "Existing store has not been converted to 0.52 format"); + } + } + assert(!store->umount()); + +out: + _deinit(); + return ret; +} + +int Monitor::StoreConverter::convert() +{ + _init(); + assert(!store->mount()); + if (db->exists("mon_convert", "on_going")) { + dout(0) << __func__ << " found a mon store in mid-convertion; abort!" 
+ << dendl; + return -EEXIST; + } + + _mark_convert_start(); + _convert_monitor(); + _convert_machines(); + _convert_paxos(); + _mark_convert_finish(); + + store->umount(); + _deinit(); + + dout(0) << __func__ << " finished conversion" << dendl; + + return 0; +} + +void Monitor::StoreConverter::_convert_monitor() +{ + dout(10) << __func__ << dendl; + + assert(store->exists_bl_ss("magic")); + assert(store->exists_bl_ss("keyring")); + assert(store->exists_bl_ss("feature_set")); + assert(store->exists_bl_ss("election_epoch")); + + MonitorDBStore::Transaction tx; + + if (store->exists_bl_ss("joined")) { + version_t joined = store->get_int("joined"); + tx.put(MONITOR_NAME, "joined", joined); + } + + vector<string> keys; + keys.push_back("magic"); + keys.push_back("feature_set"); + keys.push_back("cluster_uuid"); + + vector<string>::iterator it; + for (it = keys.begin(); it != keys.end(); ++it) { + if (!store->exists_bl_ss((*it).c_str())) + continue; + + bufferlist bl; + int r = store->get_bl_ss(bl, (*it).c_str(), 0); + assert(r > 0); + tx.put(MONITOR_NAME, *it, bl); + } + version_t election_epoch = store->get_int("election_epoch"); + tx.put(MONITOR_NAME, "election_epoch", election_epoch); + + assert(!tx.empty()); + db->apply_transaction(tx); + dout(10) << __func__ << " finished" << dendl; +} + +void Monitor::StoreConverter::_convert_machines(string machine) +{ + dout(10) << __func__ << " " << machine << dendl; + + version_t first_committed = + store->get_int(machine.c_str(), "first_committed"); + version_t last_committed = + store->get_int(machine.c_str(), "last_committed"); + + version_t accepted_pn = store->get_int(machine.c_str(), "accepted_pn"); + version_t last_pn = store->get_int(machine.c_str(), "last_pn"); + + if (accepted_pn > highest_accepted_pn) + highest_accepted_pn = accepted_pn; + if (last_pn > highest_last_pn) + highest_last_pn = last_pn; + + string machine_gv(machine); + machine_gv.append("_gv"); + bool has_gv = true; + + if (!store->exists_bl_ss(machine_gv.c_str())) { 
+ dout(1) << __func__ << " " << machine + << " no gv dir '" << machine_gv << "'" << dendl; + has_gv = false; + } + + for (version_t ver = first_committed; ver <= last_committed; ver++) { + if (!store->exists_bl_sn(machine.c_str(), ver)) { + dout(20) << __func__ << " " << machine + << " ver " << ver << " dne" << dendl; + continue; + } + + bufferlist bl; + int r = store->get_bl_sn(bl, machine.c_str(), ver); + assert(r >= 0); + dout(20) << __func__ << " " << machine + << " ver " << ver << " bl " << bl.length() << dendl; + + MonitorDBStore::Transaction tx; + tx.put(machine, ver, bl); + tx.put(machine, "last_committed", ver); + + if (has_gv && store->exists_bl_sn(machine_gv.c_str(), ver)) { + stringstream s; + s << ver; + string ver_str = s.str(); + + version_t gv = store->get_int(machine_gv.c_str(), ver_str.c_str()); + dout(20) << __func__ << " " << machine + << " ver " << ver << " -> " << gv << dendl; + + MonitorDBStore::Transaction paxos_tx; + + if (gvs.count(gv) == 0) { + gvs.insert(gv); + } else { + dout(0) << __func__ << " " << machine + << " gv " << gv << " already exists" + << dendl; + + // Duplicates aren't supposed to happen, but an old bug introduced + // them and the mds state machine wasn't ever trimmed, so many users + // will see them. So we'll just merge them all in one + // single paxos version. + // We know that they are either from another paxos machine or + // they are from the same paxos machine but their version is + // lower than ours -- given that we are iterating all versions + // from the lowest to the highest, duh! + // We'll just append our stuff to the existing paxos transaction + // as if nothing had happened. + + // Just make sure we are correct. This shouldn't take long and + // should never be triggered! 
+ set<pair<string,version_t> >& s = gv_map[gv]; + for (set<pair<string,version_t> >::iterator it = s.begin(); + it != s.end(); ++it) { + if (it->first == machine) + assert(it->second + 1 == ver); + } + + bufferlist paxos_bl; + int r = db->get("paxos", gv, paxos_bl); + assert(r >= 0); + paxos_tx.append_from_encoded(paxos_bl); + } + gv_map[gv].insert(make_pair(machine,ver)); + + bufferlist tx_bl; + tx.encode(tx_bl); + paxos_tx.append_from_encoded(tx_bl); + bufferlist paxos_bl; + paxos_tx.encode(paxos_bl); + tx.put("paxos", gv, paxos_bl); + } + db->apply_transaction(tx); + } + + version_t lc = db->get(machine, "last_committed"); + dout(20) << __func__ << " lc " << lc << " last_committed " << last_committed << dendl; + assert(lc == last_committed); + + MonitorDBStore::Transaction tx; + tx.put(machine, "first_committed", first_committed); + tx.put(machine, "last_committed", last_committed); + tx.put(machine, "conversion_first", first_committed); + + if (store->exists_bl_ss(machine.c_str(), "latest")) { + bufferlist latest_bl_raw; + int r = store->get_bl_ss(latest_bl_raw, machine.c_str(), "latest"); + assert(r >= 0); + if (!latest_bl_raw.length()) { + dout(20) << __func__ << " machine " << machine + << " skip latest with size 0" << dendl; + goto out; + } + + tx.put(machine, "latest", latest_bl_raw); + + bufferlist::iterator lbl_it = latest_bl_raw.begin(); + bufferlist latest_bl; + version_t latest_ver; + ::decode(latest_ver, lbl_it); + ::decode(latest_bl, lbl_it); + + dout(20) << __func__ << " machine " << machine + << " latest ver " << latest_ver << dendl; + + tx.put(machine, "full_latest", latest_ver); + stringstream os; + os << "full_" << latest_ver; + tx.put(machine, os.str(), latest_bl); + } +out: + db->apply_transaction(tx); + dout(10) << __func__ << " machine " << machine << " finished" << dendl; +} + +void Monitor::StoreConverter::_convert_osdmap_full() +{ + dout(10) << __func__ << dendl; + version_t first_committed = + store->get_int("osdmap", "first_committed"); + version_t last_committed = + 
store->get_int("osdmap", "last_committed"); + + int err = 0; + for (version_t ver = first_committed; ver <= last_committed; ver++) { + if (!store->exists_bl_sn("osdmap_full", ver)) { + dout(20) << __func__ << " osdmap_full ver " << ver << " dne" << dendl; + err++; + continue; + } + + bufferlist bl; + int r = store->get_bl_sn(bl, "osdmap_full", ver); + assert(r >= 0); + dout(20) << __func__ << " osdmap_full ver " << ver + << " bl " << bl.length() << " bytes" << dendl; + + string full_key = "full_" + stringify(ver); + MonitorDBStore::Transaction tx; + tx.put("osdmap", full_key, bl); + db->apply_transaction(tx); + } + dout(10) << __func__ << " found " << err << " conversion errors!" << dendl; + assert(err == 0); +} + +void Monitor::StoreConverter::_convert_paxos() +{ + dout(10) << __func__ << dendl; + assert(!gvs.empty()); + + set<version_t>::reverse_iterator rit = gvs.rbegin(); + version_t highest_gv = *rit; + version_t last_gv = highest_gv; + + int n = 0; + int max_versions = (g_conf->paxos_max_join_drift*2); + for (; (rit != gvs.rend()) && (n < max_versions); ++rit, ++n) { + version_t gv = *rit; + + if (last_gv == gv) + continue; + if ((last_gv - gv) > 1) { + // we are done; we found a gap and we are only interested in keeping + // contiguous paxos versions. + break; + } + last_gv = gv; + } + + // erase all paxos versions between [first, last_gv[, with first being the + // first gv in the map. 
+ MonitorDBStore::Transaction tx; + set<version_t>::iterator it = gvs.begin(); + dout(1) << __func__ << " first gv " << (*it) + << " last gv " << last_gv << dendl; + for (; it != gvs.end() && (*it < last_gv); ++it) { + tx.erase("paxos", *it); + } + tx.put("paxos", "first_committed", last_gv); + tx.put("paxos", "last_committed", highest_gv); + tx.put("paxos", "accepted_pn", highest_accepted_pn); + tx.put("paxos", "last_pn", highest_last_pn); + tx.put("paxos", "conversion_first", last_gv); + db->apply_transaction(tx); + + dout(10) << __func__ << " finished" << dendl; +} + +void Monitor::StoreConverter::_convert_machines() +{ + dout(10) << __func__ << dendl; + set<string> machine_names = _get_machines_names(); + set<string>::iterator it = machine_names.begin(); + + for (; it != machine_names.end(); ++it) { + _convert_machines(*it); + } + // convert osdmap full versions + // this stays here as these aren't really an independent paxos + // machine, but rather machine-specific and don't fit on the + // _convert_machines(string) function. 
+ _convert_osdmap_full(); + + dout(10) << __func__ << " finished" << dendl; +} diff --git a/src/msg/async/ProtocolV1.cc.orig b/src/msg/async/ProtocolV1.cc.orig new file mode 100644 index 0000000000000..4560cb5a031d1 --- /dev/null +++ b/src/msg/async/ProtocolV1.cc.orig @@ -0,0 +1,2596 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "ProtocolV1.h" + +#include "common/errno.h" + +#include "AsyncConnection.h" +#include "AsyncMessenger.h" +#include "common/EventTrace.h" +#include "include/random.h" +#include "auth/AuthClient.h" +#include "auth/AuthServer.h" + +#define dout_subsys ceph_subsys_ms +#undef dout_prefix +#define dout_prefix _conn_prefix(_dout) +ostream &ProtocolV1::_conn_prefix(std::ostream *_dout) { + return *_dout << "--1- " << messenger->get_myaddrs() << " >> " + << *connection->peer_addrs + << " conn(" + << connection << " " << this + << " :" << connection->port << " s=" << get_state_name(state) + << " pgs=" << peer_global_seq << " cs=" << connect_seq + << " l=" << connection->policy.lossy << ")."; +} + +#define WRITE(B, C) write(CONTINUATION(C), B) + +#define READ(L, C) read(CONTINUATION(C), L) + +#define READB(L, B, C) read(CONTINUATION(C), L, B) + +// Constant to limit starting sequence number to 2^31. Nothing special about +// it, just a big number. 
PLR +#define SEQ_MASK 0x7fffffff + +const int ASYNC_COALESCE_THRESHOLD = 256; + +using namespace std; + +static void alloc_aligned_buffer(bufferlist &data, unsigned len, unsigned off) { + // create a buffer to read into that matches the data alignment + unsigned alloc_len = 0; + unsigned left = len; + unsigned head = 0; + if (off & ~CEPH_PAGE_MASK) { + // head + alloc_len += CEPH_PAGE_SIZE; + head = std::min(CEPH_PAGE_SIZE - (off & ~CEPH_PAGE_MASK), left); + left -= head; + } + alloc_len += left; + bufferptr ptr(buffer::create_small_page_aligned(alloc_len)); + if (head) ptr.set_offset(CEPH_PAGE_SIZE - head); + data.push_back(std::move(ptr)); +} + +/** + * Protocol V1 + **/ + +ProtocolV1::ProtocolV1(AsyncConnection *connection) + : Protocol(1, connection), + temp_buffer(nullptr), + can_write(WriteStatus::NOWRITE), + keepalive(false), + connect_seq(0), + peer_global_seq(0), + msg_left(0), + cur_msg_size(0), + replacing(false), + is_reset_from_peer(false), + once_ready(false), + state(NONE), + global_seq(0), + wait_for_seq(false) { + temp_buffer = new char[4096]; +} + +ProtocolV1::~ProtocolV1() { + ceph_assert(out_q.empty()); + ceph_assert(sent.empty()); + + delete[] temp_buffer; +} + +void ProtocolV1::connect() { + this->state = START_CONNECT; + + // reset connect state variables + authorizer_buf.clear(); + // FIPS zeroization audit 20191115: these memsets are not security related. 
+ memset(&connect_msg, 0, sizeof(connect_msg)); + memset(&connect_reply, 0, sizeof(connect_reply)); + + global_seq = messenger->get_global_seq(); +} + +void ProtocolV1::accept() { this->state = START_ACCEPT; } + +bool ProtocolV1::is_connected() { + return can_write.load() == WriteStatus::CANWRITE; +} + +void ProtocolV1::stop() { + ldout(cct, 20) << __func__ << dendl; + if (state == CLOSED) { + return; + } + + if (connection->delay_state) connection->delay_state->flush(); + + ldout(cct, 2) << __func__ << dendl; + std::lock_guard l(connection->write_lock); + + reset_recv_state(); + discard_out_queue(); + + connection->_stop(); + + can_write = WriteStatus::CLOSED; + state = CLOSED; +} + +void ProtocolV1::fault() { + ldout(cct, 20) << __func__ << dendl; + + if (state == CLOSED || state == NONE) { + ldout(cct, 10) << __func__ << " connection is already closed" << dendl; + return; + } + + if (connection->policy.lossy && state != START_CONNECT && + state != CONNECTING) { + ldout(cct, 1) << __func__ << " on lossy channel, failing" << dendl; + stop(); + connection->dispatch_queue->queue_reset(connection); + return; + } + + connection->write_lock.lock(); + can_write = WriteStatus::NOWRITE; + is_reset_from_peer = false; + + // requeue sent items + requeue_sent(); + + if (!once_ready && out_q.empty() && state >= START_ACCEPT && + state <= ACCEPTING_WAIT_CONNECT_MSG_AUTH && !replacing) { + ldout(cct, 10) << __func__ << " with nothing to send and in the half " + << " accept state just closed" << dendl; + connection->write_lock.unlock(); + stop(); + connection->dispatch_queue->queue_reset(connection); + return; + } + replacing = false; + + connection->fault(); + + reset_recv_state(); + + if (connection->policy.standby && out_q.empty() && !keepalive && + state != WAIT) { + ldout(cct, 10) << __func__ << " with nothing to send, going to standby" + << dendl; + state = STANDBY; + connection->write_lock.unlock(); + return; + } + + connection->write_lock.unlock(); + + if ((state >= 
START_CONNECT && state <= CONNECTING_SEND_CONNECT_MSG) || + state == WAIT) { + // backoff! + if (state == WAIT) { + backoff.set_from_double(cct->_conf->ms_max_backoff); + } else if (backoff == utime_t()) { + backoff.set_from_double(cct->_conf->ms_initial_backoff); + } else { + backoff += backoff; + if (backoff > cct->_conf->ms_max_backoff) + backoff.set_from_double(cct->_conf->ms_max_backoff); + } + + global_seq = messenger->get_global_seq(); + state = START_CONNECT; + connection->state = AsyncConnection::STATE_CONNECTING; + ldout(cct, 10) << __func__ << " waiting " << backoff << dendl; + // woke up again; + connection->register_time_events.insert( + connection->center->create_time_event(backoff.to_nsec() / 1000, + connection->wakeup_handler)); + } else { + // policy maybe empty when state is in accept + if (connection->policy.server) { + ldout(cct, 0) << __func__ << " server, going to standby" << dendl; + state = STANDBY; + } else { + ldout(cct, 0) << __func__ << " initiating reconnect" << dendl; + connect_seq++; + global_seq = messenger->get_global_seq(); + state = START_CONNECT; + connection->state = AsyncConnection::STATE_CONNECTING; + } + backoff = utime_t(); + connection->center->dispatch_event_external(connection->read_handler); + } +} + +void ProtocolV1::send_message(Message *m) { + bufferlist bl; + uint64_t f = connection->get_features(); + + // TODO: Currently not all messages supports reencode like MOSDMap, so here + // only let fast dispatch support messages prepare message + bool can_fast_prepare = messenger->ms_can_fast_dispatch(m); + if (can_fast_prepare) { + prepare_send_message(f, m, bl); + } + + std::lock_guard l(connection->write_lock); + // "features" changes will change the payload encoding + if (can_fast_prepare && + (can_write == WriteStatus::NOWRITE || connection->get_features() != f)) { + // ensure the correctness of message encoding + bl.clear(); + m->clear_payload(); + ldout(cct, 5) << __func__ << " clear encoded buffer previous " << f + 
<< " != " << connection->get_features() << dendl; + } + if (can_write == WriteStatus::CLOSED) { + ldout(cct, 10) << __func__ << " connection closed." + << " Drop message " << m << dendl; + m->put(); + } else { + m->queue_start = ceph::mono_clock::now(); + m->trace.event("async enqueueing message"); + out_q[m->get_priority()].emplace_back(std::move(bl), m); + ldout(cct, 15) << __func__ << " inline write is denied, reschedule m=" << m + << dendl; + if (can_write != WriteStatus::REPLACING && !write_in_progress) { + write_in_progress = true; + connection->center->dispatch_event_external(connection->write_handler); + } + } +} + +void ProtocolV1::prepare_send_message(uint64_t features, Message *m, + bufferlist &bl) { + ldout(cct, 20) << __func__ << " m " << *m << dendl; + + // associate message with Connection (for benefit of encode_payload) + ldout(cct, 20) << __func__ << (m->empty_payload() ? " encoding features " : " half-reencoding features ") + << features << " " << m << " " << *m << dendl; + + // encode and copy out of *m + // in write_message we update header.seq and need recalc crc + // so skip calc header in encode function. 
+ m->encode(features, messenger->crcflags, true); + + bl.append(m->get_payload()); + bl.append(m->get_middle()); + bl.append(m->get_data()); +} + +void ProtocolV1::send_keepalive() { + ldout(cct, 10) << __func__ << dendl; + std::lock_guard l(connection->write_lock); + if (can_write != WriteStatus::CLOSED) { + keepalive = true; + connection->center->dispatch_event_external(connection->write_handler); + } +} + +void ProtocolV1::read_event() { + ldout(cct, 20) << __func__ << dendl; + switch (state) { + case START_CONNECT: + CONTINUATION_RUN(CONTINUATION(send_client_banner)); + break; + case START_ACCEPT: + CONTINUATION_RUN(CONTINUATION(send_server_banner)); + break; + case OPENED: + CONTINUATION_RUN(CONTINUATION(wait_message)); + break; + case THROTTLE_MESSAGE: + CONTINUATION_RUN(CONTINUATION(throttle_message)); + break; + case THROTTLE_BYTES: + CONTINUATION_RUN(CONTINUATION(throttle_bytes)); + break; + case THROTTLE_DISPATCH_QUEUE: + CONTINUATION_RUN(CONTINUATION(throttle_dispatch_queue)); + break; + default: + break; + } +} + +void ProtocolV1::write_event() { + ldout(cct, 10) << __func__ << dendl; + ssize_t r = 0; + + connection->write_lock.lock(); + if (can_write == WriteStatus::CANWRITE) { + if (keepalive) { + append_keepalive_or_ack(); + keepalive = false; + } + + auto start = ceph::mono_clock::now(); + bool more; + do { + bufferlist data; + Message *m = _get_next_outgoing(&data); + if (!m) { + break; + } + + if (!connection->policy.lossy) { + // put on sent list + sent.push_back(m); + m->get(); + } + more = !out_q.empty(); + connection->write_lock.unlock(); + + // send_message or requeue messages may not encode message + if (!data.length()) { + prepare_send_message(connection->get_features(), m, data); + } + + if (m->queue_start != ceph::mono_time()) { + connection->logger->tinc(l_msgr_send_messages_queue_lat, + ceph::mono_clock::now() - m->queue_start); + } + + r = write_message(m, data, more); + + connection->write_lock.lock(); + if (r == 0) { + ; + } else if 
(r < 0) { + ldout(cct, 1) << __func__ << " send msg failed" << dendl; + break; + } else if (r > 0) { + // Outbound message in-progress, thread will be re-awoken + // when the outbound socket is writeable again + break; + } + } while (can_write == WriteStatus::CANWRITE); + write_in_progress = false; + connection->write_lock.unlock(); + + // if r > 0 mean data still lefted, so no need _try_send. + if (r == 0) { + uint64_t left = ack_left; + if (left) { + ceph_le64 s; + s = in_seq; + connection->outgoing_bl.append(CEPH_MSGR_TAG_ACK); + connection->outgoing_bl.append((char *)&s, sizeof(s)); + ldout(cct, 10) << __func__ << " try send msg ack, acked " << left + << " messages" << dendl; + ack_left -= left; + left = ack_left; + r = connection->_try_send(left); + } else if (is_queued()) { + r = connection->_try_send(); + } + } + + connection->logger->tinc(l_msgr_running_send_time, + ceph::mono_clock::now() - start); + if (r < 0) { + ldout(cct, 1) << __func__ << " send msg failed" << dendl; + connection->lock.lock(); + fault(); + connection->lock.unlock(); + return; + } + } else { + write_in_progress = false; + connection->write_lock.unlock(); + connection->lock.lock(); + connection->write_lock.lock(); + if (state == STANDBY && !connection->policy.server && is_queued()) { + ldout(cct, 10) << __func__ << " policy.server is false" << dendl; + connection->_connect(); + } else if (connection->cs && state != NONE && state != CLOSED && + state != START_CONNECT) { + r = connection->_try_send(); + if (r < 0) { + ldout(cct, 1) << __func__ << " send outcoming bl failed" << dendl; + connection->write_lock.unlock(); + fault(); + connection->lock.unlock(); + return; + } + } + connection->write_lock.unlock(); + connection->lock.unlock(); + } +} + +bool ProtocolV1::is_queued() { + return !out_q.empty() || connection->is_queued(); +} + +void ProtocolV1::run_continuation(CtPtr pcontinuation) { + if (pcontinuation) { + CONTINUATION_RUN(*pcontinuation); + } +} + +CtPtr 
ProtocolV1::read(CONTINUATION_RX_TYPE &next, + int len, char *buffer) { + if (!buffer) { + buffer = temp_buffer; + } + ssize_t r = connection->read(len, buffer, + [&next, this](char *buffer, int r) { + next.setParams(buffer, r); + CONTINUATION_RUN(next); + }); + if (r <= 0) { + next.setParams(buffer, r); + return &next; + } + + return nullptr; +} + +CtPtr ProtocolV1::write(CONTINUATION_TX_TYPE &next, + bufferlist &buffer) { + ssize_t r = connection->write(buffer, [&next, this](int r) { + next.setParams(r); + CONTINUATION_RUN(next); + }); + if (r <= 0) { + next.setParams(r); + return &next; + } + + return nullptr; +} + +CtPtr ProtocolV1::ready() { + ldout(cct, 25) << __func__ << dendl; + + // make sure no pending tick timer + if (connection->last_tick_id) { + connection->center->delete_time_event(connection->last_tick_id); + } + connection->last_tick_id = connection->center->create_time_event( + connection->inactive_timeout_us, connection->tick_handler); + + connection->write_lock.lock(); + can_write = WriteStatus::CANWRITE; + if (is_queued()) { + connection->center->dispatch_event_external(connection->write_handler); + } + connection->write_lock.unlock(); + connection->maybe_start_delay_thread(); + + state = OPENED; + return wait_message(); +} + +CtPtr ProtocolV1::wait_message() { + if (state != OPENED) { // must have changed due to a replace + return nullptr; + } + + ldout(cct, 20) << __func__ << dendl; + + return READ(sizeof(char), handle_message); +} + +CtPtr ProtocolV1::handle_message(char *buffer, int r) { + ldout(cct, 20) << __func__ << " r=" << r << dendl; + + if (r < 0) { + ldout(cct, 1) << __func__ << " read tag failed" << dendl; + return _fault(); + } + + char tag = buffer[0]; + ldout(cct, 20) << __func__ << " process tag " << (int)tag << dendl; + + if (tag == CEPH_MSGR_TAG_KEEPALIVE) { + ldout(cct, 20) << __func__ << " got KEEPALIVE" << dendl; + connection->set_last_keepalive(ceph_clock_now()); + } else if (tag == CEPH_MSGR_TAG_KEEPALIVE2) { + return 
READ(sizeof(ceph_timespec), handle_keepalive2); + } else if (tag == CEPH_MSGR_TAG_KEEPALIVE2_ACK) { + return READ(sizeof(ceph_timespec), handle_keepalive2_ack); + } else if (tag == CEPH_MSGR_TAG_ACK) { + return READ(sizeof(ceph_le64), handle_tag_ack); + } else if (tag == CEPH_MSGR_TAG_MSG) { + recv_stamp = ceph_clock_now(); + ldout(cct, 20) << __func__ << " begin MSG" << dendl; + return READ(sizeof(ceph_msg_header), handle_message_header); + } else if (tag == CEPH_MSGR_TAG_CLOSE) { + ldout(cct, 20) << __func__ << " got CLOSE" << dendl; + stop(); + } else { + ldout(cct, 0) << __func__ << " bad tag " << (int)tag << dendl; + return _fault(); + } + return nullptr; +} + +CtPtr ProtocolV1::handle_keepalive2(char *buffer, int r) { + ldout(cct, 20) << __func__ << " r=" << r << dendl; + + if (r < 0) { + ldout(cct, 1) << __func__ << " read keeplive timespec failed" << dendl; + return _fault(); + } + + ldout(cct, 30) << __func__ << " got KEEPALIVE2 tag ..." << dendl; + + ceph_timespec *t; + t = (ceph_timespec *)buffer; + utime_t kp_t = utime_t(*t); + connection->write_lock.lock(); + append_keepalive_or_ack(true, &kp_t); + connection->write_lock.unlock(); + + ldout(cct, 20) << __func__ << " got KEEPALIVE2 " << kp_t << dendl; + connection->set_last_keepalive(ceph_clock_now()); + + if (is_connected()) { + connection->center->dispatch_event_external(connection->write_handler); + } + + return CONTINUE(wait_message); +} + +void ProtocolV1::append_keepalive_or_ack(bool ack, utime_t *tp) { + ldout(cct, 10) << __func__ << dendl; + if (ack) { + ceph_assert(tp); + struct ceph_timespec ts; + tp->encode_timeval(&ts); + connection->outgoing_bl.append(CEPH_MSGR_TAG_KEEPALIVE2_ACK); + connection->outgoing_bl.append((char *)&ts, sizeof(ts)); + } else if (connection->has_feature(CEPH_FEATURE_MSGR_KEEPALIVE2)) { + struct ceph_timespec ts; + utime_t t = ceph_clock_now(); + t.encode_timeval(&ts); + connection->outgoing_bl.append(CEPH_MSGR_TAG_KEEPALIVE2); + connection->outgoing_bl.append((char 
*)&ts, sizeof(ts)); + } else { + connection->outgoing_bl.append(CEPH_MSGR_TAG_KEEPALIVE); + } +} + +CtPtr ProtocolV1::handle_keepalive2_ack(char *buffer, int r) { + ldout(cct, 20) << __func__ << " r=" << r << dendl; + + if (r < 0) { + ldout(cct, 1) << __func__ << " read keeplive timespec failed" << dendl; + return _fault(); + } + + ceph_timespec *t; + t = (ceph_timespec *)buffer; + connection->set_last_keepalive_ack(utime_t(*t)); + ldout(cct, 20) << __func__ << " got KEEPALIVE_ACK" << dendl; + + return CONTINUE(wait_message); +} + +CtPtr ProtocolV1::handle_tag_ack(char *buffer, int r) { + ldout(cct, 20) << __func__ << " r=" << r << dendl; + + if (r < 0) { + ldout(cct, 1) << __func__ << " read ack seq failed" << dendl; + return _fault(); + } + + ceph_le64 seq; + seq = *(ceph_le64 *)buffer; + ldout(cct, 20) << __func__ << " got ACK" << dendl; + + ldout(cct, 15) << __func__ << " got ack seq " << seq << dendl; + // trim sent list + static const int max_pending = 128; + int i = 0; + auto now = ceph::mono_clock::now(); + Message *pending[max_pending]; + connection->write_lock.lock(); + while (!sent.empty() && sent.front()->get_seq() <= seq && i < max_pending) { + Message *m = sent.front(); + sent.pop_front(); + pending[i++] = m; + ldout(cct, 10) << __func__ << " got ack seq " << seq + << " >= " << m->get_seq() << " on " << m << " " << *m + << dendl; + } + connection->write_lock.unlock(); + connection->logger->tinc(l_msgr_handle_ack_lat, ceph::mono_clock::now() - now); + for (int k = 0; k < i; k++) { + pending[k]->put(); + } + + return CONTINUE(wait_message); +} + +CtPtr ProtocolV1::handle_message_header(char *buffer, int r) { + ldout(cct, 20) << __func__ << " r=" << r << dendl; + + if (r < 0) { + ldout(cct, 1) << __func__ << " read message header failed" << dendl; + return _fault(); + } + + ldout(cct, 20) << __func__ << " got MSG header" << dendl; + + current_header = *((ceph_msg_header *)buffer); + + ldout(cct, 20) << __func__ << " got envelope type=" << 
current_header.type << " src " + << entity_name_t(current_header.src) << " front=" << current_header.front_len + << " data=" << current_header.data_len << " off " << current_header.data_off + << dendl; + + if (messenger->crcflags & MSG_CRC_HEADER) { + __u32 header_crc = 0; + header_crc = ceph_crc32c(0, (unsigned char *)&current_header, + sizeof(current_header) - sizeof(current_header.crc)); + // verify header crc + if (header_crc != current_header.crc) { + ldout(cct, 0) << __func__ << " got bad header crc " << header_crc + << " != " << current_header.crc << dendl; + return _fault(); + } + } + + // Reset state + data_buf.clear(); + front.clear(); + middle.clear(); + data.clear(); + + state = THROTTLE_MESSAGE; + return CONTINUE(throttle_message); +} + +CtPtr ProtocolV1::throttle_message() { + ldout(cct, 20) << __func__ << dendl; + + if (connection->policy.throttler_messages) { + ldout(cct, 10) << __func__ << " wants " << 1 + << " message from policy throttler " + << connection->policy.throttler_messages->get_current() + << "/" << connection->policy.throttler_messages->get_max() + << dendl; + if (!connection->policy.throttler_messages->get_or_fail()) { + ldout(cct, 10) << __func__ << " wants 1 message from policy throttle " + << connection->policy.throttler_messages->get_current() + << "/" << connection->policy.throttler_messages->get_max() + << " failed, just wait." << dendl; + // following thread pool deal with th full message queue isn't a + // short time, so we can wait a ms. 
+ if (connection->register_time_events.empty()) { + connection->register_time_events.insert( + connection->center->create_time_event(1000, + connection->wakeup_handler)); + } + return nullptr; + } + } + + state = THROTTLE_BYTES; + return CONTINUE(throttle_bytes); +} + +CtPtr ProtocolV1::throttle_bytes() { + ldout(cct, 20) << __func__ << dendl; + + cur_msg_size = current_header.front_len + current_header.middle_len + + current_header.data_len; + if (cur_msg_size) { + if (connection->policy.throttler_bytes) { + ldout(cct, 10) << __func__ << " wants " << cur_msg_size + << " bytes from policy throttler " + << connection->policy.throttler_bytes->get_current() << "/" + << connection->policy.throttler_bytes->get_max() << dendl; + if (!connection->policy.throttler_bytes->get_or_fail(cur_msg_size)) { + ldout(cct, 10) << __func__ << " wants " << cur_msg_size + << " bytes from policy throttler " + << connection->policy.throttler_bytes->get_current() + << "/" << connection->policy.throttler_bytes->get_max() + << " failed, just wait." << dendl; + // following thread pool deal with th full message queue isn't a + // short time, so we can wait a ms. + if (connection->register_time_events.empty()) { + connection->register_time_events.insert( + connection->center->create_time_event( + 1000, connection->wakeup_handler)); + } + return nullptr; + } + } + } + + state = THROTTLE_DISPATCH_QUEUE; + return CONTINUE(throttle_dispatch_queue); +} + +CtPtr ProtocolV1::throttle_dispatch_queue() { + ldout(cct, 20) << __func__ << dendl; + + if (cur_msg_size) { + if (!connection->dispatch_queue->dispatch_throttler.get_or_fail( + cur_msg_size)) { + ldout(cct, 10) + << __func__ << " wants " << cur_msg_size + << " bytes from dispatch throttle " + << connection->dispatch_queue->dispatch_throttler.get_current() << "/" + << connection->dispatch_queue->dispatch_throttler.get_max() + << " failed, just wait." 
<< dendl; + // following thread pool deal with th full message queue isn't a + // short time, so we can wait a ms. + if (connection->register_time_events.empty()) { + connection->register_time_events.insert( + connection->center->create_time_event(1000, + connection->wakeup_handler)); + } + return nullptr; + } + } + + throttle_stamp = ceph_clock_now(); + + state = READ_MESSAGE_FRONT; + return read_message_front(); +} + +CtPtr ProtocolV1::read_message_front() { + ldout(cct, 20) << __func__ << dendl; + + unsigned front_len = current_header.front_len; + if (front_len) { + if (!front.length()) { + front.push_back(buffer::create(front_len)); + } + return READB(front_len, front.c_str(), handle_message_front); + } + return read_message_middle(); +} + +CtPtr ProtocolV1::handle_message_front(char *buffer, int r) { + ldout(cct, 20) << __func__ << " r=" << r << dendl; + + if (r < 0) { + ldout(cct, 1) << __func__ << " read message front failed" << dendl; + return _fault(); + } + + ldout(cct, 20) << __func__ << " got front " << front.length() << dendl; + + return read_message_middle(); +} + +CtPtr ProtocolV1::read_message_middle() { + ldout(cct, 20) << __func__ << dendl; + + if (current_header.middle_len) { + if (!middle.length()) { + middle.push_back(buffer::create(current_header.middle_len)); + } + return READB(current_header.middle_len, middle.c_str(), + handle_message_middle); + } + + return read_message_data_prepare(); +} + +CtPtr ProtocolV1::handle_message_middle(char *buffer, int r) { + ldout(cct, 20) << __func__ << " r" << r << dendl; + + if (r < 0) { + ldout(cct, 1) << __func__ << " read message middle failed" << dendl; + return _fault(); + } + + ldout(cct, 20) << __func__ << " got middle " << middle.length() << dendl; + + return read_message_data_prepare(); +} + +CtPtr ProtocolV1::read_message_data_prepare() { + ldout(cct, 20) << __func__ << dendl; + + unsigned data_len = current_header.data_len; + unsigned data_off = current_header.data_off; + + if (data_len) { + // 
get a buffer +#if 0 + // rx_buffers is broken by design... see + // http://tracker.ceph.com/issues/22480 + map >::iterator p = + connection->rx_buffers.find(current_header.tid); + if (p != connection->rx_buffers.end()) { + ldout(cct, 10) << __func__ << " seleting rx buffer v " << p->second.second + << " at offset " << data_off << " len " + << p->second.first.length() << dendl; + data_buf = p->second.first; + // make sure it's big enough + if (data_buf.length() < data_len) + data_buf.push_back(buffer::create(data_len - data_buf.length())); + data_blp = data_buf.begin(); + } else { + ldout(cct, 20) << __func__ << " allocating new rx buffer at offset " + << data_off << dendl; + alloc_aligned_buffer(data_buf, data_len, data_off); + data_blp = data_buf.begin(); + } +#else + ldout(cct, 20) << __func__ << " allocating new rx buffer at offset " + << data_off << dendl; + alloc_aligned_buffer(data_buf, data_len, data_off); + data_blp = data_buf.begin(); +#endif + } + + msg_left = data_len; + + return CONTINUE(read_message_data); +} + +CtPtr ProtocolV1::read_message_data() { + ldout(cct, 20) << __func__ << " msg_left=" << msg_left << dendl; + + if (msg_left > 0) { + bufferptr bp = data_blp.get_current_ptr(); + unsigned read_len = std::min(bp.length(), msg_left); + + return READB(read_len, bp.c_str(), handle_message_data); + } + + return read_message_footer(); +} + +CtPtr ProtocolV1::handle_message_data(char *buffer, int r) { + ldout(cct, 20) << __func__ << " r=" << r << dendl; + + if (r < 0) { + ldout(cct, 1) << __func__ << " read data error " << dendl; + return _fault(); + } + + bufferptr bp = data_blp.get_current_ptr(); + unsigned read_len = std::min(bp.length(), msg_left); + ceph_assert(read_len < + static_cast(std::numeric_limits::max())); + data_blp += read_len; + data.append(bp, 0, read_len); + msg_left -= read_len; + + return CONTINUE(read_message_data); +} + +CtPtr ProtocolV1::read_message_footer() { + ldout(cct, 20) << __func__ << dendl; + + state = 
READ_FOOTER_AND_DISPATCH; + + unsigned len; + if (connection->has_feature(CEPH_FEATURE_MSG_AUTH)) { + len = sizeof(ceph_msg_footer); + } else { + len = sizeof(ceph_msg_footer_old); + } + + return READ(len, handle_message_footer); +} + +CtPtr ProtocolV1::handle_message_footer(char *buffer, int r) { + ldout(cct, 20) << __func__ << " r=" << r << dendl; + + if (r < 0) { + ldout(cct, 1) << __func__ << " read footer data error " << dendl; + return _fault(); + } + + ceph_msg_footer footer; + ceph_msg_footer_old old_footer; + + if (connection->has_feature(CEPH_FEATURE_MSG_AUTH)) { + footer = *((ceph_msg_footer *)buffer); + } else { + old_footer = *((ceph_msg_footer_old *)buffer); + footer.front_crc = old_footer.front_crc; + footer.middle_crc = old_footer.middle_crc; + footer.data_crc = old_footer.data_crc; + footer.sig = 0; + footer.flags = old_footer.flags; + } + + int aborted = (footer.flags & CEPH_MSG_FOOTER_COMPLETE) == 0; + ldout(cct, 10) << __func__ << " aborted = " << aborted << dendl; + if (aborted) { + ldout(cct, 0) << __func__ << " got " << front.length() << " + " + << middle.length() << " + " << data.length() + << " byte message.. ABORTED" << dendl; + return _fault(); + } + + ldout(cct, 20) << __func__ << " got " << front.length() << " + " + << middle.length() << " + " << data.length() << " byte message" + << dendl; + Message *message = decode_message(cct, messenger->crcflags, current_header, + footer, front, middle, data, connection); + if (!message) { + ldout(cct, 1) << __func__ << " decode message failed " << dendl; + return _fault(); + } + + // + // Check the signature if one should be present. A zero return indicates + // success. 
PLR + // + + if (session_security.get() == NULL) { + ldout(cct, 10) << __func__ << " no session security set" << dendl; + } else { + if (session_security->check_message_signature(message)) { + ldout(cct, 0) << __func__ << " Signature check failed" << dendl; + message->put(); + return _fault(); + } + } + message->set_byte_throttler(connection->policy.throttler_bytes); + message->set_message_throttler(connection->policy.throttler_messages); + + // store reservation size in message, so we don't get confused + // by messages entering the dispatch queue through other paths. + message->set_dispatch_throttle_size(cur_msg_size); + + message->set_recv_stamp(recv_stamp); + message->set_throttle_stamp(throttle_stamp); + message->set_recv_complete_stamp(ceph_clock_now()); + + // check received seq#. if it is old, drop the message. + // note that incoming messages may skip ahead. this is convenient for the + // client side queueing because messages can't be renumbered, but the (kernel) + // client will occasionally pull a message out of the sent queue to send + // elsewhere. in that case it doesn't matter if we "got" it or not. + uint64_t cur_seq = in_seq; + if (message->get_seq() <= cur_seq) { + ldout(cct, 0) << __func__ << " got old message " << message->get_seq() + << " <= " << cur_seq << " " << message << " " << *message + << ", discarding" << dendl; + message->put(); + if (connection->has_feature(CEPH_FEATURE_RECONNECT_SEQ) && + cct->_conf->ms_die_on_old_message) { + ceph_assert(0 == "old msgs despite reconnect_seq feature"); + } + return nullptr; + } + if (message->get_seq() > cur_seq + 1) { + ldout(cct, 0) << __func__ << " missed message? 
skipped from seq " + << cur_seq << " to " << message->get_seq() << dendl; + if (cct->_conf->ms_die_on_skipped_message) { + ceph_assert(0 == "skipped incoming seq"); + } + } + +#if defined(WITH_EVENTTRACE) + if (message->get_type() == CEPH_MSG_OSD_OP || + message->get_type() == CEPH_MSG_OSD_OPREPLY) { + utime_t ltt_processed_stamp = ceph_clock_now(); + double usecs_elapsed = + ((double)(ltt_processed_stamp.to_nsec() - recv_stamp.to_nsec())) / 1000; + ostringstream buf; + if (message->get_type() == CEPH_MSG_OSD_OP) + OID_ELAPSED_WITH_MSG(message, usecs_elapsed, "TIME_TO_DECODE_OSD_OP", + false); + else + OID_ELAPSED_WITH_MSG(message, usecs_elapsed, "TIME_TO_DECODE_OSD_OPREPLY", + false); + } +#endif + + // note last received message. + in_seq = message->get_seq(); + ldout(cct, 5) << " rx " << message->get_source() << " seq " + << message->get_seq() << " " << message << " " << *message + << dendl; + + bool need_dispatch_writer = false; + if (!connection->policy.lossy) { + ack_left++; + need_dispatch_writer = true; + } + + state = OPENED; + + ceph::mono_time fast_dispatch_time; + + if (connection->is_blackhole()) { + ldout(cct, 10) << __func__ << " blackhole " << *message << dendl; + message->put(); + goto out; + } + + connection->logger->inc(l_msgr_recv_messages); + connection->logger->inc( + l_msgr_recv_bytes, + cur_msg_size + sizeof(ceph_msg_header) + sizeof(ceph_msg_footer)); + + messenger->ms_fast_preprocess(message); + fast_dispatch_time = ceph::mono_clock::now(); + connection->logger->tinc(l_msgr_running_recv_time, + fast_dispatch_time - connection->recv_start_time); + if (connection->delay_state) { + double delay_period = 0; + if (rand() % 10000 < cct->_conf->ms_inject_delay_probability * 10000.0) { + delay_period = + cct->_conf->ms_inject_delay_max * (double)(rand() % 10000) / 10000.0; + ldout(cct, 1) << "queue_received will delay after " + << (ceph_clock_now() + delay_period) << " on " << message + << " " << *message << dendl; + } + 
connection->delay_state->queue(delay_period, message); + } else if (messenger->ms_can_fast_dispatch(message)) { + connection->lock.unlock(); + connection->dispatch_queue->fast_dispatch(message); + connection->recv_start_time = ceph::mono_clock::now(); + connection->logger->tinc(l_msgr_running_fast_dispatch_time, + connection->recv_start_time - fast_dispatch_time); + connection->lock.lock(); + } else { + connection->dispatch_queue->enqueue(message, message->get_priority(), + connection->conn_id); + } + + out: + // clean up local buffer references + data_buf.clear(); + front.clear(); + middle.clear(); + data.clear(); + + if (need_dispatch_writer && connection->is_connected()) { + connection->center->dispatch_event_external(connection->write_handler); + } + + return CONTINUE(wait_message); +} + +void ProtocolV1::session_reset() { + ldout(cct, 10) << __func__ << " started" << dendl; + + std::lock_guard l(connection->write_lock); + if (connection->delay_state) { + connection->delay_state->discard(); + } + + connection->dispatch_queue->discard_queue(connection->conn_id); + discard_out_queue(); + // note: we need to clear outgoing_bl here, but session_reset may be + // called by other thread, so let caller clear this itself! + // outgoing_bl.clear(); + + connection->dispatch_queue->queue_remote_reset(connection); + + randomize_out_seq(); + + in_seq = 0; + connect_seq = 0; + // it's safe to directly set 0, double locked + ack_left = 0; + once_ready = false; + can_write = WriteStatus::NOWRITE; +} + +void ProtocolV1::randomize_out_seq() { + if (connection->get_features() & CEPH_FEATURE_MSG_AUTH) { + // Set out_seq to a random value, so CRC won't be predictable. + auto rand_seq = ceph::util::generate_random_number(0, SEQ_MASK); + ldout(cct, 10) << __func__ << " randomize_out_seq " << rand_seq << dendl; + out_seq = rand_seq; + } else { + // previously, seq #'s always started at 0. 
+ out_seq = 0; + } +} + +ssize_t ProtocolV1::write_message(Message *m, bufferlist &bl, bool more) { + FUNCTRACE(cct); + ceph_assert(connection->center->in_thread()); + m->set_seq(++out_seq); + + if (messenger->crcflags & MSG_CRC_HEADER) { + m->calc_header_crc(); + } + + ceph_msg_header &header = m->get_header(); + ceph_msg_footer &footer = m->get_footer(); + + // TODO: let sign_message could be reentry? + // Now that we have all the crcs calculated, handle the + // digital signature for the message, if the AsyncConnection has session + // security set up. Some session security options do not + // actually calculate and check the signature, but they should + // handle the calls to sign_message and check_signature. PLR + if (session_security.get() == NULL) { + ldout(cct, 20) << __func__ << " no session security" << dendl; + } else { + if (session_security->sign_message(m)) { + ldout(cct, 20) << __func__ << " failed to sign m=" << m + << "): sig = " << footer.sig << dendl; + } else { + ldout(cct, 20) << __func__ << " signed m=" << m + << "): sig = " << footer.sig << dendl; + } + } + + connection->outgoing_bl.append(CEPH_MSGR_TAG_MSG); + connection->outgoing_bl.append((char *)&header, sizeof(header)); + + ldout(cct, 20) << __func__ << " sending message type=" << header.type + << " src " << entity_name_t(header.src) + << " front=" << header.front_len << " data=" << header.data_len + << " off " << header.data_off << dendl; + + if ((bl.length() <= ASYNC_COALESCE_THRESHOLD) && (bl.get_num_buffers() > 1)) { + for (const auto &pb : bl.buffers()) { + connection->outgoing_bl.append((char *)pb.c_str(), pb.length()); + } + } else { + connection->outgoing_bl.claim_append(bl); + } + + // send footer; if receiver doesn't support signatures, use the old footer + // format + ceph_msg_footer_old old_footer; + if (connection->has_feature(CEPH_FEATURE_MSG_AUTH)) { + connection->outgoing_bl.append((char *)&footer, sizeof(footer)); + } else { + if (messenger->crcflags & MSG_CRC_HEADER) { 
+ old_footer.front_crc = footer.front_crc; + old_footer.middle_crc = footer.middle_crc; + } else { + old_footer.front_crc = old_footer.middle_crc = 0; + } + old_footer.data_crc = + messenger->crcflags & MSG_CRC_DATA ? footer.data_crc : 0; + old_footer.flags = footer.flags; + connection->outgoing_bl.append((char *)&old_footer, sizeof(old_footer)); + } + + m->trace.event("async writing message"); + ldout(cct, 20) << __func__ << " sending " << m->get_seq() << " " << m + << dendl; + ssize_t total_send_size = connection->outgoing_bl.length(); + ssize_t rc = connection->_try_send(more); + if (rc < 0) { + ldout(cct, 1) << __func__ << " error sending " << m << ", " + << cpp_strerror(rc) << dendl; + } else { + connection->logger->inc( + l_msgr_send_bytes, total_send_size - connection->outgoing_bl.length()); + ldout(cct, 10) << __func__ << " sending " << m + << (rc ? " continuely." : " done.") << dendl; + } + +#if defined(WITH_EVENTTRACE) + if (m->get_type() == CEPH_MSG_OSD_OP) + OID_EVENT_TRACE_WITH_MSG(m, "SEND_MSG_OSD_OP_END", false); + else if (m->get_type() == CEPH_MSG_OSD_OPREPLY) + OID_EVENT_TRACE_WITH_MSG(m, "SEND_MSG_OSD_OPREPLY_END", false); +#endif + m->put(); + + return rc; +} + +void ProtocolV1::requeue_sent() { + write_in_progress = false; + if (sent.empty()) { + return; + } + + list > &rq = out_q[CEPH_MSG_PRIO_HIGHEST]; + out_seq -= sent.size(); + while (!sent.empty()) { + Message *m = sent.back(); + sent.pop_back(); + ldout(cct, 10) << __func__ << " " << *m << " for resend " + << " (" << m->get_seq() << ")" << dendl; + m->clear_payload(); + rq.push_front(make_pair(bufferlist(), m)); + } +} + +uint64_t ProtocolV1::discard_requeued_up_to(uint64_t out_seq, uint64_t seq) { + ldout(cct, 10) << __func__ << " " << seq << dendl; + std::lock_guard l(connection->write_lock); + if (out_q.count(CEPH_MSG_PRIO_HIGHEST) == 0) { + return seq; + } + list > &rq = out_q[CEPH_MSG_PRIO_HIGHEST]; + uint64_t count = out_seq; + while (!rq.empty()) { + pair p = rq.front(); + if 
(p.second->get_seq() == 0 || p.second->get_seq() > seq) break; + ldout(cct, 10) << __func__ << " " << *(p.second) << " for resend seq " + << p.second->get_seq() << " <= " << seq << ", discarding" + << dendl; + p.second->put(); + rq.pop_front(); + count++; + } + if (rq.empty()) out_q.erase(CEPH_MSG_PRIO_HIGHEST); + return count; +} + +/* + * Tears down the message queues, and removes them from the + * DispatchQueue Must hold write_lock prior to calling. + */ +void ProtocolV1::discard_out_queue() { + ldout(cct, 10) << __func__ << " started" << dendl; + + for (list::iterator p = sent.begin(); p != sent.end(); ++p) { + ldout(cct, 20) << __func__ << " discard " << *p << dendl; + (*p)->put(); + } + sent.clear(); + for (map > >::iterator p = + out_q.begin(); + p != out_q.end(); ++p) { + for (list >::iterator r = p->second.begin(); + r != p->second.end(); ++r) { + ldout(cct, 20) << __func__ << " discard " << r->second << dendl; + r->second->put(); + } + } + out_q.clear(); + write_in_progress = false; +} + +void ProtocolV1::reset_security() +{ + ldout(cct, 5) << __func__ << dendl; + + auth_meta.reset(new AuthConnectionMeta); + authorizer_more.clear(); + session_security.reset(); +} + +void ProtocolV1::reset_recv_state() +{ + ldout(cct, 5) << __func__ << dendl; + + // execute in the same thread that uses the `session_security`. + // We need to do the warp because holding `write_lock` is not + // enough as `write_event()` releases it just before calling + // `write_message()`. `submit_to()` here is NOT blocking. + if (!connection->center->in_thread()) { + connection->center->submit_to(connection->center->get_id(), [this] { + ldout(cct, 5) << "reset_recv_state (warped) reseting security handlers" + << dendl; + // Possibly unnecessary. See the comment in `deactivate_existing`. 
+ std::lock_guard l(connection->lock); + std::lock_guard wl(connection->write_lock); + reset_security(); + }, /* always_async = */true); + } else { + reset_security(); + } + + // clean read and write callbacks + connection->pendingReadLen.reset(); + connection->writeCallback.reset(); + + if (state > THROTTLE_MESSAGE && state <= READ_FOOTER_AND_DISPATCH && + connection->policy.throttler_messages) { + ldout(cct, 10) << __func__ << " releasing " << 1 + << " message to policy throttler " + << connection->policy.throttler_messages->get_current() + << "/" << connection->policy.throttler_messages->get_max() + << dendl; + connection->policy.throttler_messages->put(); + } + if (state > THROTTLE_BYTES && state <= READ_FOOTER_AND_DISPATCH) { + if (connection->policy.throttler_bytes) { + ldout(cct, 10) << __func__ << " releasing " << cur_msg_size + << " bytes to policy throttler " + << connection->policy.throttler_bytes->get_current() << "/" + << connection->policy.throttler_bytes->get_max() << dendl; + connection->policy.throttler_bytes->put(cur_msg_size); + } + } + if (state > THROTTLE_DISPATCH_QUEUE && state <= READ_FOOTER_AND_DISPATCH) { + ldout(cct, 10) + << __func__ << " releasing " << cur_msg_size + << " bytes to dispatch_queue throttler " + << connection->dispatch_queue->dispatch_throttler.get_current() << "/" + << connection->dispatch_queue->dispatch_throttler.get_max() << dendl; + connection->dispatch_queue->dispatch_throttle_release(cur_msg_size); + } +} + +Message *ProtocolV1::_get_next_outgoing(bufferlist *bl) { + Message *m = 0; + if (!out_q.empty()) { + map > >::reverse_iterator it = + out_q.rbegin(); + ceph_assert(!it->second.empty()); + list >::iterator p = it->second.begin(); + m = p->second; + if (p->first.length() && bl) { + assert(bl->length() == 0); + bl->swap(p->first); + } + it->second.erase(p); + if (it->second.empty()) out_q.erase(it->first); + } + return m; +} + +/** + * Client Protocol V1 + **/ + +CtPtr ProtocolV1::send_client_banner() { + 
ldout(cct, 20) << __func__ << dendl; + state = CONNECTING; + + bufferlist bl; + bl.append(CEPH_BANNER, strlen(CEPH_BANNER)); + return WRITE(bl, handle_client_banner_write); +} + +CtPtr ProtocolV1::handle_client_banner_write(int r) { + ldout(cct, 20) << __func__ << " r=" << r << dendl; + + if (r < 0) { + ldout(cct, 1) << __func__ << " write client banner failed" << dendl; + return _fault(); + } + ldout(cct, 10) << __func__ << " connect write banner done: " + << connection->get_peer_addr() << dendl; + + return wait_server_banner(); +} + +CtPtr ProtocolV1::wait_server_banner() { + state = CONNECTING_WAIT_BANNER_AND_IDENTIFY; + + ldout(cct, 20) << __func__ << dendl; + + bufferlist myaddrbl; + unsigned banner_len = strlen(CEPH_BANNER); + unsigned need_len = banner_len + sizeof(ceph_entity_addr) * 2; + return READ(need_len, handle_server_banner_and_identify); +} + +CtPtr ProtocolV1::handle_server_banner_and_identify(char *buffer, int r) { + ldout(cct, 20) << __func__ << " r=" << r << dendl; + + if (r < 0) { + ldout(cct, 1) << __func__ << " read banner and identify addresses failed" + << dendl; + return _fault(); + } + + unsigned banner_len = strlen(CEPH_BANNER); + if (memcmp(buffer, CEPH_BANNER, banner_len)) { + ldout(cct, 0) << __func__ << " connect protocol error (bad banner) on peer " + << connection->get_peer_addr() << dendl; + return _fault(); + } + + bufferlist bl; + entity_addr_t paddr, peer_addr_for_me; + + bl.append(buffer + banner_len, sizeof(ceph_entity_addr) * 2); + auto p = bl.cbegin(); + try { + decode(paddr, p); + decode(peer_addr_for_me, p); + } catch (const buffer::error &e) { + lderr(cct) << __func__ << " decode peer addr failed " << dendl; + return _fault(); + } + ldout(cct, 20) << __func__ << " connect read peer addr " << paddr + << " on socket " << connection->cs.fd() << dendl; + + entity_addr_t peer_addr = connection->peer_addrs->legacy_addr(); + if (peer_addr != paddr) { + if (paddr.is_blank_ip() && peer_addr.get_port() == paddr.get_port() && + 
peer_addr.get_nonce() == paddr.get_nonce()) { + ldout(cct, 0) << __func__ << " connect claims to be " << paddr << " not " + << peer_addr << " - presumably this is the same node!" + << dendl; + } else { + ldout(cct, 10) << __func__ << " connect claims to be " << paddr << " not " + << peer_addr << dendl; + return _fault(); + } + } + + ldout(cct, 20) << __func__ << " connect peer addr for me is " + << peer_addr_for_me << dendl; + if (messenger->get_myaddrs().empty() || + messenger->get_myaddrs().front().is_blank_ip()) { + sockaddr_storage ss; + socklen_t len = sizeof(ss); + getsockname(connection->cs.fd(), (sockaddr *)&ss, &len); + entity_addr_t a; + if (cct->_conf->ms_learn_addr_from_peer) { + ldout(cct, 1) << __func__ << " peer " << connection->target_addr + << " says I am " << peer_addr_for_me << " (socket says " + << (sockaddr*)&ss << ")" << dendl; + a = peer_addr_for_me; + } else { + ldout(cct, 1) << __func__ << " socket to " << connection->target_addr + << " says I am " << (sockaddr*)&ss + << " (peer says " << peer_addr_for_me << ")" << dendl; + a.set_sockaddr((sockaddr *)&ss); + } + a.set_type(entity_addr_t::TYPE_LEGACY); // anything but NONE; learned_addr ignores this + a.set_port(0); + connection->lock.unlock(); + messenger->learned_addr(a); + if (cct->_conf->ms_inject_internal_delays && + cct->_conf->ms_inject_socket_failures) { + if (rand() % cct->_conf->ms_inject_socket_failures == 0) { + ldout(cct, 10) << __func__ << " sleep for " + << cct->_conf->ms_inject_internal_delays << dendl; + utime_t t; + t.set_from_double(cct->_conf->ms_inject_internal_delays); + t.sleep(); + } + } + connection->lock.lock(); + if (state != CONNECTING_WAIT_BANNER_AND_IDENTIFY) { + ldout(cct, 1) << __func__ + << " state changed while learned_addr, mark_down or " + << " replacing must be happened just now" << dendl; + return nullptr; + } + } + + bufferlist myaddrbl; + encode(messenger->get_myaddr_legacy(), myaddrbl, 0); // legacy + return WRITE(myaddrbl, handle_my_addr_write); +} + 
+CtPtr ProtocolV1::handle_my_addr_write(int r) { + ldout(cct, 20) << __func__ << " r=" << r << dendl; + + if (r < 0) { + ldout(cct, 2) << __func__ << " connect couldn't write my addr, " + << cpp_strerror(r) << dendl; + return _fault(); + } + ldout(cct, 10) << __func__ << " connect sent my addr " + << messenger->get_myaddr_legacy() << dendl; + + return CONTINUE(send_connect_message); +} + +CtPtr ProtocolV1::send_connect_message() +{ + state = CONNECTING_SEND_CONNECT_MSG; + + ldout(cct, 20) << __func__ << dendl; + ceph_assert(messenger->auth_client); + + bufferlist auth_bl; + vector preferred_modes; + + if (connection->peer_type != CEPH_ENTITY_TYPE_MON || + messenger->get_myname().type() == CEPH_ENTITY_TYPE_MON) { + if (authorizer_more.length()) { + ldout(cct,10) << __func__ << " using augmented (challenge) auth payload" + << dendl; + auth_bl = authorizer_more; + } else { + auto am = auth_meta; + authorizer_more.clear(); + connection->lock.unlock(); + int r = messenger->auth_client->get_auth_request( + connection, am.get(), + &am->auth_method, &preferred_modes, &auth_bl); + connection->lock.lock(); + if (r < 0) { + return _fault(); + } + if (state != CONNECTING_SEND_CONNECT_MSG) { + ldout(cct, 1) << __func__ << " state changed!" 
<< dendl; + return _fault(); + } + } + } + + ceph_msg_connect connect; + connect.features = connection->policy.features_supported; + connect.host_type = messenger->get_myname().type(); + connect.global_seq = global_seq; + connect.connect_seq = connect_seq; + connect.protocol_version = + messenger->get_proto_version(connection->peer_type, true); + if (auth_bl.length()) { + ldout(cct, 10) << __func__ + << " connect_msg.authorizer_len=" << auth_bl.length() + << " protocol=" << auth_meta->auth_method << dendl; + connect.authorizer_protocol = auth_meta->auth_method; + connect.authorizer_len = auth_bl.length(); + } else { + connect.authorizer_protocol = 0; + connect.authorizer_len = 0; + } + + connect.flags = 0; + if (connection->policy.lossy) { + connect.flags |= + CEPH_MSG_CONNECT_LOSSY; // this is fyi, actually, server decides! + } + + bufferlist bl; + bl.append((char *)&connect, sizeof(connect)); + if (auth_bl.length()) { + bl.append(auth_bl.c_str(), auth_bl.length()); + } + + ldout(cct, 10) << __func__ << " connect sending gseq=" << global_seq + << " cseq=" << connect_seq + << " proto=" << connect.protocol_version << dendl; + + return WRITE(bl, handle_connect_message_write); +} + +CtPtr ProtocolV1::handle_connect_message_write(int r) { + ldout(cct, 20) << __func__ << " r=" << r << dendl; + + if (r < 0) { + ldout(cct, 2) << __func__ << " connect couldn't send reply " + << cpp_strerror(r) << dendl; + return _fault(); + } + + ldout(cct, 20) << __func__ + << " connect wrote (self +) cseq, waiting for reply" << dendl; + + return wait_connect_reply(); +} + +CtPtr ProtocolV1::wait_connect_reply() { + ldout(cct, 20) << __func__ << dendl; + + // FIPS zeroization audit 20191115: this memset is not security related. 
+ memset(&connect_reply, 0, sizeof(connect_reply)); + return READ(sizeof(connect_reply), handle_connect_reply_1); +} + +CtPtr ProtocolV1::handle_connect_reply_1(char *buffer, int r) { + ldout(cct, 20) << __func__ << " r=" << r << dendl; + + if (r < 0) { + ldout(cct, 1) << __func__ << " read connect reply failed" << dendl; + return _fault(); + } + + connect_reply = *((ceph_msg_connect_reply *)buffer); + + ldout(cct, 20) << __func__ << " connect got reply tag " + << (int)connect_reply.tag << " connect_seq " + << connect_reply.connect_seq << " global_seq " + << connect_reply.global_seq << " proto " + << connect_reply.protocol_version << " flags " + << (int)connect_reply.flags << " features " + << connect_reply.features << dendl; + + if (connect_reply.authorizer_len) { + return wait_connect_reply_auth(); + } + + return handle_connect_reply_2(); +} + +CtPtr ProtocolV1::wait_connect_reply_auth() { + ldout(cct, 20) << __func__ << dendl; + + ldout(cct, 10) << __func__ + << " reply.authorizer_len=" << connect_reply.authorizer_len + << dendl; + + ceph_assert(connect_reply.authorizer_len < 4096); + + return READ(connect_reply.authorizer_len, handle_connect_reply_auth); +} + +CtPtr ProtocolV1::handle_connect_reply_auth(char *buffer, int r) { + ldout(cct, 20) << __func__ << " r=" << r << dendl; + + if (r < 0) { + ldout(cct, 1) << __func__ << " read connect reply authorizer failed" + << dendl; + return _fault(); + } + + bufferlist authorizer_reply; + authorizer_reply.append(buffer, connect_reply.authorizer_len); + + if (connection->peer_type != CEPH_ENTITY_TYPE_MON || + messenger->get_myname().type() == CEPH_ENTITY_TYPE_MON) { + auto am = auth_meta; + bool more = (connect_reply.tag == CEPH_MSGR_TAG_CHALLENGE_AUTHORIZER); + bufferlist auth_retry_bl; + int r; + connection->lock.unlock(); + if (more) { + r = messenger->auth_client->handle_auth_reply_more( + connection, am.get(), authorizer_reply, &auth_retry_bl); + } else { + // these aren't used for v1 + CryptoKey skey; + string 
con_secret; + r = messenger->auth_client->handle_auth_done( + connection, am.get(), + 0 /* global id */, 0 /* con mode */, + authorizer_reply, + &skey, &con_secret); + } + connection->lock.lock(); + if (state != CONNECTING_SEND_CONNECT_MSG) { + ldout(cct, 1) << __func__ << " state changed" << dendl; + return _fault(); + } + if (r < 0) { + return _fault(); + } + if (more && r == 0) { + authorizer_more = auth_retry_bl; + return CONTINUE(send_connect_message); + } + } + + return handle_connect_reply_2(); +} + +CtPtr ProtocolV1::handle_connect_reply_2() { + ldout(cct, 20) << __func__ << dendl; + + if (connect_reply.tag == CEPH_MSGR_TAG_FEATURES) { + ldout(cct, 0) << __func__ << " connect protocol feature mismatch, my " + << std::hex << connection->policy.features_supported + << " < peer " << connect_reply.features << " missing " + << (connect_reply.features & + ~connection->policy.features_supported) + << std::dec << dendl; + return _fault(); + } + + if (connect_reply.tag == CEPH_MSGR_TAG_BADPROTOVER) { + ldout(cct, 0) << __func__ << " connect protocol version mismatch, my " + << messenger->get_proto_version(connection->peer_type, true) + << " != " << connect_reply.protocol_version << dendl; + return _fault(); + } + + if (connect_reply.tag == CEPH_MSGR_TAG_BADAUTHORIZER) { + ldout(cct, 0) << __func__ << " connect got BADAUTHORIZER" << dendl; + authorizer_more.clear(); + return _fault(); + } + + if (connect_reply.tag == CEPH_MSGR_TAG_RESETSESSION) { + ldout(cct, 0) << __func__ << " connect got RESETSESSION" << dendl; + session_reset(); + connect_seq = 0; + + // see session_reset + connection->outgoing_bl.clear(); + + return CONTINUE(send_connect_message); + } + + if (connect_reply.tag == CEPH_MSGR_TAG_RETRY_GLOBAL) { + global_seq = messenger->get_global_seq(connect_reply.global_seq); + ldout(cct, 5) << __func__ << " connect got RETRY_GLOBAL " + << connect_reply.global_seq << " chose new " << global_seq + << dendl; + return CONTINUE(send_connect_message); + } + + if 
(connect_reply.tag == CEPH_MSGR_TAG_RETRY_SESSION) { + ceph_assert(connect_reply.connect_seq > connect_seq); + ldout(cct, 5) << __func__ << " connect got RETRY_SESSION " << connect_seq + << " -> " << connect_reply.connect_seq << dendl; + connect_seq = connect_reply.connect_seq; + return CONTINUE(send_connect_message); + } + + if (connect_reply.tag == CEPH_MSGR_TAG_WAIT) { + ldout(cct, 1) << __func__ << " connect got WAIT (connection race)" << dendl; + state = WAIT; + return _fault(); + } + + uint64_t feat_missing; + feat_missing = + connection->policy.features_required & ~(uint64_t)connect_reply.features; + if (feat_missing) { + ldout(cct, 1) << __func__ << " missing required features " << std::hex + << feat_missing << std::dec << dendl; + return _fault(); + } + + if (connect_reply.tag == CEPH_MSGR_TAG_SEQ) { + ldout(cct, 10) + << __func__ + << " got CEPH_MSGR_TAG_SEQ, reading acked_seq and writing in_seq" + << dendl; + + return wait_ack_seq(); + } + + if (connect_reply.tag == CEPH_MSGR_TAG_READY) { + ldout(cct, 10) << __func__ << " got CEPH_MSGR_TAG_READY " << dendl; + } + + return client_ready(); +} + +CtPtr ProtocolV1::wait_ack_seq() { + ldout(cct, 20) << __func__ << dendl; + + return READ(sizeof(uint64_t), handle_ack_seq); +} + +CtPtr ProtocolV1::handle_ack_seq(char *buffer, int r) { + ldout(cct, 20) << __func__ << " r=" << r << dendl; + + if (r < 0) { + ldout(cct, 1) << __func__ << " read connect ack seq failed" << dendl; + return _fault(); + } + + uint64_t newly_acked_seq = 0; + + newly_acked_seq = *((uint64_t *)buffer); + ldout(cct, 2) << __func__ << " got newly_acked_seq " << newly_acked_seq + << " vs out_seq " << out_seq << dendl; + out_seq = discard_requeued_up_to(out_seq, newly_acked_seq); + + bufferlist bl; + uint64_t s = in_seq; + bl.append((char *)&s, sizeof(s)); + + return WRITE(bl, handle_in_seq_write); +} + +CtPtr ProtocolV1::handle_in_seq_write(int r) { + ldout(cct, 20) << __func__ << " r=" << r << dendl; + + if (r < 0) { + ldout(cct, 10) << 
__func__ << " failed to send in_seq " << dendl; + return _fault(); + } + + ldout(cct, 10) << __func__ << " send in_seq done " << dendl; + + return client_ready(); +} + +CtPtr ProtocolV1::client_ready() { + ldout(cct, 20) << __func__ << dendl; + + // hooray! + peer_global_seq = connect_reply.global_seq; + connection->policy.lossy = connect_reply.flags & CEPH_MSG_CONNECT_LOSSY; + + once_ready = true; + connect_seq += 1; + ceph_assert(connect_seq == connect_reply.connect_seq); + backoff = utime_t(); + connection->set_features((uint64_t)connect_reply.features & + (uint64_t)connection->policy.features_supported); + ldout(cct, 10) << __func__ << " connect success " << connect_seq + << ", lossy = " << connection->policy.lossy << ", features " + << connection->get_features() << dendl; + + // If we have an authorizer, get a new AuthSessionHandler to deal with + // ongoing security of the connection. PLR + if (auth_meta->authorizer) { + ldout(cct, 10) << __func__ << " setting up session_security with auth " + << auth_meta->authorizer.get() << dendl; + session_security.reset(get_auth_session_handler( + cct, auth_meta->authorizer->protocol, + auth_meta->session_key, + connection->get_features())); + } else { + // We have no authorizer, so we shouldn't be applying security to messages + // in this AsyncConnection. PLR + ldout(cct, 10) << __func__ << " no authorizer, clearing session_security" + << dendl; + session_security.reset(); + } + + if (connection->delay_state) { + ceph_assert(connection->delay_state->ready()); + } + connection->dispatch_queue->queue_connect(connection); + messenger->ms_deliver_handle_fast_connect(connection); + + return ready(); +} + +/** + * Server Protocol V1 + **/ + +CtPtr ProtocolV1::send_server_banner() { + ldout(cct, 20) << __func__ << dendl; + state = ACCEPTING; + + bufferlist bl; + + bl.append(CEPH_BANNER, strlen(CEPH_BANNER)); + + // as a server, we should have a legacy addr if we accepted this connection. 
+ auto legacy = messenger->get_myaddrs().legacy_addr(); + encode(legacy, bl, 0); // legacy + connection->port = legacy.get_port(); + encode(connection->target_addr, bl, 0); // legacy + + ldout(cct, 1) << __func__ << " sd=" << connection->cs.fd() + << " legacy " << legacy + << " socket_addr " << connection->socket_addr + << " target_addr " << connection->target_addr + << dendl; + + return WRITE(bl, handle_server_banner_write); +} + +CtPtr ProtocolV1::handle_server_banner_write(int r) { + ldout(cct, 20) << __func__ << " r=" << r << dendl; + + if (r < 0) { + ldout(cct, 1) << " write server banner failed" << dendl; + return _fault(); + } + ldout(cct, 10) << __func__ << " write banner and addr done: " + << connection->get_peer_addr() << dendl; + + return wait_client_banner(); +} + +CtPtr ProtocolV1::wait_client_banner() { + ldout(cct, 20) << __func__ << dendl; + + return READ(strlen(CEPH_BANNER) + sizeof(ceph_entity_addr), + handle_client_banner); +} + +CtPtr ProtocolV1::handle_client_banner(char *buffer, int r) { + ldout(cct, 20) << __func__ << " r=" << r << dendl; + + if (r < 0) { + ldout(cct, 1) << __func__ << " read peer banner and addr failed" << dendl; + return _fault(); + } + + if (memcmp(buffer, CEPH_BANNER, strlen(CEPH_BANNER))) { + ldout(cct, 1) << __func__ << " accept peer sent bad banner '" << buffer + << "' (should be '" << CEPH_BANNER << "')" << dendl; + return _fault(); + } + + bufferlist addr_bl; + entity_addr_t peer_addr; + + addr_bl.append(buffer + strlen(CEPH_BANNER), sizeof(ceph_entity_addr)); + try { + auto ti = addr_bl.cbegin(); + decode(peer_addr, ti); + } catch (const buffer::error &e) { + lderr(cct) << __func__ << " decode peer_addr failed " << dendl; + return _fault(); + } + + ldout(cct, 10) << __func__ << " accept peer addr is " << peer_addr << dendl; + if (peer_addr.is_blank_ip()) { + // peer apparently doesn't know what ip they have; figure it out for them. 
+ int port = peer_addr.get_port(); + peer_addr.set_sockaddr(connection->target_addr.get_sockaddr()); + peer_addr.set_port(port); + + ldout(cct, 0) << __func__ << " accept peer addr is really " << peer_addr + << " (socket is " << connection->target_addr << ")" << dendl; + } + connection->set_peer_addr(peer_addr); // so that connection_state gets set up + connection->target_addr = peer_addr; + + return CONTINUE(wait_connect_message); +} + +CtPtr ProtocolV1::wait_connect_message() { + ldout(cct, 20) << __func__ << dendl; + + // FIPS zeroization audit 20191115: this memset is not security related. + memset(&connect_msg, 0, sizeof(connect_msg)); + return READ(sizeof(connect_msg), handle_connect_message_1); +} + +CtPtr ProtocolV1::handle_connect_message_1(char *buffer, int r) { + ldout(cct, 20) << __func__ << " r=" << r << dendl; + + if (r < 0) { + ldout(cct, 1) << __func__ << " read connect msg failed" << dendl; + return _fault(); + } + + connect_msg = *((ceph_msg_connect *)buffer); + + state = ACCEPTING_WAIT_CONNECT_MSG_AUTH; + + if (connect_msg.authorizer_len) { + return wait_connect_message_auth(); + } + + return handle_connect_message_2(); +} + +CtPtr ProtocolV1::wait_connect_message_auth() { + ldout(cct, 20) << __func__ << dendl; + authorizer_buf.clear(); + authorizer_buf.push_back(buffer::create(connect_msg.authorizer_len)); + return READB(connect_msg.authorizer_len, authorizer_buf.c_str(), + handle_connect_message_auth); +} + +CtPtr ProtocolV1::handle_connect_message_auth(char *buffer, int r) { + ldout(cct, 20) << __func__ << " r=" << r << dendl; + + if (r < 0) { + ldout(cct, 1) << __func__ << " read connect authorizer failed" << dendl; + return _fault(); + } + + return handle_connect_message_2(); +} + +CtPtr ProtocolV1::handle_connect_message_2() { + ldout(cct, 20) << __func__ << dendl; + + ldout(cct, 20) << __func__ << " accept got peer connect_seq " + << connect_msg.connect_seq << " global_seq " + << connect_msg.global_seq << dendl; + + 
connection->set_peer_type(connect_msg.host_type); + connection->policy = messenger->get_policy(connect_msg.host_type); + + ldout(cct, 10) << __func__ << " accept of host_type " << connect_msg.host_type + << ", policy.lossy=" << connection->policy.lossy + << " policy.server=" << connection->policy.server + << " policy.standby=" << connection->policy.standby + << " policy.resetcheck=" << connection->policy.resetcheck + << " features 0x" << std::hex << (uint64_t)connect_msg.features + << std::dec + << dendl; + + ceph_msg_connect_reply reply; + bufferlist authorizer_reply; + + // FIPS zeroization audit 20191115: this memset is not security related. + memset(&reply, 0, sizeof(reply)); + reply.protocol_version = + messenger->get_proto_version(connection->peer_type, false); + + // mismatch? + ldout(cct, 10) << __func__ << " accept my proto " << reply.protocol_version + << ", their proto " << connect_msg.protocol_version << dendl; + + if (connect_msg.protocol_version != reply.protocol_version) { + return send_connect_message_reply(CEPH_MSGR_TAG_BADPROTOVER, reply, + authorizer_reply); + } + + // require signatures for cephx? 
+ if (connect_msg.authorizer_protocol == CEPH_AUTH_CEPHX) { + if (connection->peer_type == CEPH_ENTITY_TYPE_OSD || + connection->peer_type == CEPH_ENTITY_TYPE_MDS) { + if (cct->_conf->cephx_require_signatures || + cct->_conf->cephx_cluster_require_signatures) { + ldout(cct, 10) + << __func__ + << " using cephx, requiring MSG_AUTH feature bit for cluster" + << dendl; + connection->policy.features_required |= CEPH_FEATURE_MSG_AUTH; + } + } else { + if (cct->_conf->cephx_require_signatures || + cct->_conf->cephx_service_require_signatures) { + ldout(cct, 10) + << __func__ + << " using cephx, requiring MSG_AUTH feature bit for service" + << dendl; + connection->policy.features_required |= CEPH_FEATURE_MSG_AUTH; + } + } + } + + uint64_t feat_missing = + connection->policy.features_required & ~(uint64_t)connect_msg.features; + if (feat_missing) { + ldout(cct, 1) << __func__ << " peer missing required features " << std::hex + << feat_missing << std::dec << dendl; + return send_connect_message_reply(CEPH_MSGR_TAG_FEATURES, reply, + authorizer_reply); + } + + bufferlist auth_bl_copy = authorizer_buf; + auto am = auth_meta; + am->auth_method = connect_msg.authorizer_protocol; + connection->lock.unlock(); + ldout(cct,10) << __func__ << " authorizor_protocol " + << connect_msg.authorizer_protocol + << " len " << auth_bl_copy.length() + << dendl; + bool more = (bool)auth_meta->authorizer_challenge; + int r = messenger->auth_server->handle_auth_request( + connection, + am.get(), + more, + am->auth_method, + auth_bl_copy, + &authorizer_reply); + if (r < 0) { + connection->lock.lock(); + if (state != ACCEPTING_WAIT_CONNECT_MSG_AUTH) { + ldout(cct, 1) << __func__ << " state changed" << dendl; + return _fault(); + } + ldout(cct, 0) << __func__ << ": got bad authorizer, auth_reply_len=" + << authorizer_reply.length() << dendl; + session_security.reset(); + return send_connect_message_reply(CEPH_MSGR_TAG_BADAUTHORIZER, reply, + authorizer_reply); + } + if (r == 0) { + 
connection->lock.lock(); + if (state != ACCEPTING_WAIT_CONNECT_MSG_AUTH) { + ldout(cct, 1) << __func__ << " state changed" << dendl; + return _fault(); + } + ldout(cct, 10) << __func__ << ": challenging authorizer" << dendl; + ceph_assert(authorizer_reply.length()); + return send_connect_message_reply(CEPH_MSGR_TAG_CHALLENGE_AUTHORIZER, + reply, authorizer_reply); + } + + // We've verified the authorizer for this AsyncConnection, so set up the + // session security structure. PLR + ldout(cct, 10) << __func__ << " accept setting up session_security." << dendl; + + if (connection->policy.server && + connection->policy.lossy && + !connection->policy.register_lossy_clients) { + // incoming lossy client, no need to register this connection + // new session + ldout(cct, 10) << __func__ << " accept new session" << dendl; + connection->lock.lock(); + return open(reply, authorizer_reply); + } + + AsyncConnectionRef existing = messenger->lookup_conn(*connection->peer_addrs); + + connection->inject_delay(); + + connection->lock.lock(); + if (state != ACCEPTING_WAIT_CONNECT_MSG_AUTH) { + ldout(cct, 1) << __func__ << " state changed" << dendl; + return _fault(); + } + + if (existing == connection) { + existing = nullptr; + } + if (existing && existing->protocol->proto_type != 1) { + ldout(cct,1) << __func__ << " existing " << existing << " proto " + << existing->protocol.get() << " version is " + << existing->protocol->proto_type << ", marking down" << dendl; + existing->mark_down(); + existing = nullptr; + } + + if (existing) { + // There is no possible that existing connection will acquire this + // connection's lock + existing->lock.lock(); // skip lockdep check (we are locking a second + // AsyncConnection here) + + ldout(cct,10) << __func__ << " existing=" << existing << " exproto=" + << existing->protocol.get() << dendl; + ProtocolV1 *exproto = dynamic_cast(existing->protocol.get()); + ceph_assert(exproto); + ceph_assert(exproto->proto_type == 1); + + if (exproto->state 
== CLOSED) { + ldout(cct, 1) << __func__ << " existing " << existing + << " already closed." << dendl; + existing->lock.unlock(); + existing = nullptr; + + return open(reply, authorizer_reply); + } + + if (exproto->replacing) { + ldout(cct, 1) << __func__ + << " existing racing replace happened while replacing." + << " existing_state=" + << connection->get_state_name(existing->state) << dendl; + reply.global_seq = exproto->peer_global_seq; + existing->lock.unlock(); + return send_connect_message_reply(CEPH_MSGR_TAG_RETRY_GLOBAL, reply, + authorizer_reply); + } + + if (connect_msg.global_seq < exproto->peer_global_seq) { + ldout(cct, 10) << __func__ << " accept existing " << existing << ".gseq " + << exproto->peer_global_seq << " > " + << connect_msg.global_seq << ", RETRY_GLOBAL" << dendl; + reply.global_seq = exproto->peer_global_seq; // so we can send it below.. + existing->lock.unlock(); + return send_connect_message_reply(CEPH_MSGR_TAG_RETRY_GLOBAL, reply, + authorizer_reply); + } else { + ldout(cct, 10) << __func__ << " accept existing " << existing << ".gseq " + << exproto->peer_global_seq + << " <= " << connect_msg.global_seq << ", looks ok" + << dendl; + } + + if (existing->policy.lossy) { + ldout(cct, 0) + << __func__ + << " accept replacing existing (lossy) channel (new one lossy=" + << connection->policy.lossy << ")" << dendl; + exproto->session_reset(); + return replace(existing, reply, authorizer_reply); + } + + ldout(cct, 1) << __func__ << " accept connect_seq " + << connect_msg.connect_seq + << " vs existing csq=" << exproto->connect_seq + << " existing_state=" + << connection->get_state_name(existing->state) << dendl; + + if (connect_msg.connect_seq == 0 && exproto->connect_seq > 0) { + ldout(cct, 0) + << __func__ + << " accept peer reset, then tried to connect to us, replacing" + << dendl; + // this is a hard reset from peer + is_reset_from_peer = true; + if (connection->policy.resetcheck) { + exproto->session_reset(); // this resets out_queue, 
msg_ and + // connect_seq #'s + } + return replace(existing, reply, authorizer_reply); + } + + if (connect_msg.connect_seq < exproto->connect_seq) { + // old attempt, or we sent READY but they didn't get it. + ldout(cct, 10) << __func__ << " accept existing " << existing << ".cseq " + << exproto->connect_seq << " > " << connect_msg.connect_seq + << ", RETRY_SESSION" << dendl; + reply.connect_seq = exproto->connect_seq + 1; + existing->lock.unlock(); + return send_connect_message_reply(CEPH_MSGR_TAG_RETRY_SESSION, reply, + authorizer_reply); + } + + if (connect_msg.connect_seq == exproto->connect_seq) { + // if the existing connection successfully opened, and/or + // subsequently went to standby, then the peer should bump + // their connect_seq and retry: this is not a connection race + // we need to resolve here. + if (exproto->state == OPENED || exproto->state == STANDBY) { + ldout(cct, 10) << __func__ << " accept connection race, existing " + << existing << ".cseq " << exproto->connect_seq + << " == " << connect_msg.connect_seq + << ", OPEN|STANDBY, RETRY_SESSION " << dendl; + // if connect_seq both zero, dont stuck into dead lock. it's ok to + // replace + if (connection->policy.resetcheck && exproto->connect_seq == 0) { + return replace(existing, reply, authorizer_reply); + } + + reply.connect_seq = exproto->connect_seq + 1; + existing->lock.unlock(); + return send_connect_message_reply(CEPH_MSGR_TAG_RETRY_SESSION, reply, + authorizer_reply); + } + + // connection race? 
+ if (connection->peer_addrs->legacy_addr() < messenger->get_myaddr_legacy() || + existing->policy.server) { + // incoming wins + ldout(cct, 10) << __func__ << " accept connection race, existing " + << existing << ".cseq " << exproto->connect_seq + << " == " << connect_msg.connect_seq + << ", or we are server, replacing my attempt" << dendl; + return replace(existing, reply, authorizer_reply); + } else { + // our existing outgoing wins + ldout(messenger->cct, 10) + << __func__ << " accept connection race, existing " << existing + << ".cseq " << exproto->connect_seq + << " == " << connect_msg.connect_seq << ", sending WAIT" << dendl; + ceph_assert(connection->peer_addrs->legacy_addr() > + messenger->get_myaddr_legacy()); + existing->lock.unlock(); + // make sure we follow through with opening the existing + // connection (if it isn't yet open) since we know the peer + // has something to send to us. + existing->send_keepalive(); + return send_connect_message_reply(CEPH_MSGR_TAG_WAIT, reply, + authorizer_reply); + } + } + + ceph_assert(connect_msg.connect_seq > exproto->connect_seq); + ceph_assert(connect_msg.global_seq >= exproto->peer_global_seq); + if (connection->policy.resetcheck && // RESETSESSION only used by servers; + // peers do not reset each other + exproto->connect_seq == 0) { + ldout(cct, 0) << __func__ << " accept we reset (peer sent cseq " + << connect_msg.connect_seq << ", " << existing + << ".cseq = " << exproto->connect_seq + << "), sending RESETSESSION " << dendl; + existing->lock.unlock(); + return send_connect_message_reply(CEPH_MSGR_TAG_RESETSESSION, reply, + authorizer_reply); + } + + // reconnect + ldout(cct, 10) << __func__ << " accept peer sent cseq " + << connect_msg.connect_seq << " > " << exproto->connect_seq + << dendl; + return replace(existing, reply, authorizer_reply); + } // existing + else if (!replacing && connect_msg.connect_seq > 0) { + // we reset, and they are opening a new session + ldout(cct, 0) << __func__ << " accept we 
reset (peer sent cseq " + << connect_msg.connect_seq << "), sending RESETSESSION" + << dendl; + return send_connect_message_reply(CEPH_MSGR_TAG_RESETSESSION, reply, + authorizer_reply); + } else { + // new session + ldout(cct, 10) << __func__ << " accept new session" << dendl; + existing = nullptr; + return open(reply, authorizer_reply); + } +} + +CtPtr ProtocolV1::send_connect_message_reply(char tag, + ceph_msg_connect_reply &reply, + bufferlist &authorizer_reply) { + ldout(cct, 20) << __func__ << dendl; + bufferlist reply_bl; + reply.tag = tag; + reply.features = + ((uint64_t)connect_msg.features & connection->policy.features_supported) | + connection->policy.features_required; + reply.authorizer_len = authorizer_reply.length(); + reply_bl.append((char *)&reply, sizeof(reply)); + + ldout(cct, 10) << __func__ << " reply features 0x" << std::hex + << reply.features << " = (policy sup 0x" + << connection->policy.features_supported + << " & connect 0x" << (uint64_t)connect_msg.features + << ") | policy req 0x" + << connection->policy.features_required + << dendl; + + if (reply.authorizer_len) { + reply_bl.append(authorizer_reply.c_str(), authorizer_reply.length()); + authorizer_reply.clear(); + } + + return WRITE(reply_bl, handle_connect_message_reply_write); +} + +CtPtr ProtocolV1::handle_connect_message_reply_write(int r) { + ldout(cct, 20) << __func__ << " r=" << r << dendl; + + if (r < 0) { + ldout(cct, 1) << " write connect message reply failed" << dendl; + connection->inject_delay(); + return _fault(); + } + + return CONTINUE(wait_connect_message); +} + +CtPtr ProtocolV1::replace(const AsyncConnectionRef& existing, + ceph_msg_connect_reply &reply, + bufferlist &authorizer_reply) { + ldout(cct, 10) << __func__ << " accept replacing " << existing << dendl; + + connection->inject_delay(); + if (existing->policy.lossy) { + // disconnect from the Connection + ldout(cct, 1) << __func__ << " replacing on lossy channel, failing existing" + << dendl; + 
existing->protocol->stop(); + existing->dispatch_queue->queue_reset(existing.get()); + } else { + ceph_assert(can_write == WriteStatus::NOWRITE); + existing->write_lock.lock(); + + ProtocolV1 *exproto = dynamic_cast(existing->protocol.get()); + + // reset the in_seq if this is a hard reset from peer, + // otherwise we respect our original connection's value + if (is_reset_from_peer) { + exproto->is_reset_from_peer = true; + } + + connection->center->delete_file_event(connection->cs.fd(), + EVENT_READABLE | EVENT_WRITABLE); + + if (existing->delay_state) { + existing->delay_state->flush(); + ceph_assert(!connection->delay_state); + } + exproto->reset_recv_state(); + + exproto->connect_msg.features = connect_msg.features; + + auto temp_cs = std::move(connection->cs); + EventCenter *new_center = connection->center; + Worker *new_worker = connection->worker; + // avoid _stop shutdown replacing socket + // queue a reset on the new connection, which we're dumping for the old + stop(); + + connection->dispatch_queue->queue_reset(connection); + ldout(messenger->cct, 1) + << __func__ << " stop myself to swap existing" << dendl; + exproto->can_write = WriteStatus::REPLACING; + exproto->replacing = true; + exproto->write_in_progress = false; + existing->state_offset = 0; + // avoid previous thread modify event + exproto->state = NONE; + existing->state = AsyncConnection::STATE_NONE; + // Discard existing prefetch buffer in `recv_buf` + existing->recv_start = existing->recv_end = 0; + // there shouldn't exist any buffer + ceph_assert(connection->recv_start == connection->recv_end); + + auto deactivate_existing = std::bind( + [existing, new_worker, new_center, exproto, reply, + authorizer_reply](ConnectedSocket &cs) mutable { + // we need to delete time event in original thread + { + std::lock_guard l(existing->lock); + existing->write_lock.lock(); + exproto->requeue_sent(); + existing->outgoing_bl.clear(); + existing->open_write = false; + existing->write_lock.unlock(); + if 
(exproto->state == NONE) { + existing->shutdown_socket(); + existing->cs = std::move(cs); + existing->worker->references--; + new_worker->references++; + existing->logger = new_worker->get_perf_counter(); + existing->worker = new_worker; + existing->center = new_center; + if (existing->delay_state) + existing->delay_state->set_center(new_center); + } else if (exproto->state == CLOSED) { + auto back_to_close = + std::bind([](ConnectedSocket &cs) mutable { cs.close(); }, + std::move(cs)); + new_center->submit_to(new_center->get_id(), + std::move(back_to_close), true); + return; + } else { + ceph_abort(); + } + } + + // Before changing existing->center, it may already exists some + // events in existing->center's queue. Then if we mark down + // `existing`, it will execute in another thread and clean up + // connection. Previous event will result in segment fault + auto transfer_existing = [existing, exproto, reply, + authorizer_reply]() mutable { + std::lock_guard l(existing->lock); + if (exproto->state == CLOSED) return; + ceph_assert(exproto->state == NONE); + + // we have called shutdown_socket above + ceph_assert(existing->last_tick_id == 0); + // restart timer since we are going to re-build connection + existing->last_connect_started = ceph::coarse_mono_clock::now(); + existing->last_tick_id = existing->center->create_time_event( + existing->connect_timeout_us, existing->tick_handler); + existing->state = AsyncConnection::STATE_CONNECTION_ESTABLISHED; + exproto->state = ACCEPTING; + + existing->center->create_file_event( + existing->cs.fd(), EVENT_READABLE, existing->read_handler); + reply.global_seq = exproto->peer_global_seq; + exproto->run_continuation(exproto->send_connect_message_reply( + CEPH_MSGR_TAG_RETRY_GLOBAL, reply, authorizer_reply)); + }; + if (existing->center->in_thread()) + transfer_existing(); + else + existing->center->submit_to(existing->center->get_id(), + std::move(transfer_existing), true); + }, + std::move(temp_cs)); + + 
existing->center->submit_to(existing->center->get_id(), + std::move(deactivate_existing), true); + existing->write_lock.unlock(); + existing->lock.unlock(); + return nullptr; + } + existing->lock.unlock(); + + return open(reply, authorizer_reply); +} + +CtPtr ProtocolV1::open(ceph_msg_connect_reply &reply, + bufferlist &authorizer_reply) { + ldout(cct, 20) << __func__ << dendl; + + connect_seq = connect_msg.connect_seq + 1; + peer_global_seq = connect_msg.global_seq; + ldout(cct, 10) << __func__ << " accept success, connect_seq = " << connect_seq + << " in_seq=" << in_seq << ", sending READY" << dendl; + + // if it is a hard reset from peer, we don't need a round-trip to negotiate + // in/out sequence + if ((connect_msg.features & CEPH_FEATURE_RECONNECT_SEQ) && + !is_reset_from_peer) { + reply.tag = CEPH_MSGR_TAG_SEQ; + wait_for_seq = true; + } else { + reply.tag = CEPH_MSGR_TAG_READY; + wait_for_seq = false; + out_seq = discard_requeued_up_to(out_seq, 0); + is_reset_from_peer = false; + in_seq = 0; + } + + // send READY reply + reply.features = connection->policy.features_supported; + reply.global_seq = messenger->get_global_seq(); + reply.connect_seq = connect_seq; + reply.flags = 0; + reply.authorizer_len = authorizer_reply.length(); + if (connection->policy.lossy) { + reply.flags = reply.flags | CEPH_MSG_CONNECT_LOSSY; + } + + connection->set_features((uint64_t)reply.features & + (uint64_t)connect_msg.features); + ldout(cct, 10) << __func__ << " accept features " + << connection->get_features() + << " authorizer_protocol " + << connect_msg.authorizer_protocol << dendl; + + session_security.reset( + get_auth_session_handler(cct, auth_meta->auth_method, + auth_meta->session_key, + connection->get_features())); + + bufferlist reply_bl; + reply_bl.append((char *)&reply, sizeof(reply)); + + if (reply.authorizer_len) { + reply_bl.append(authorizer_reply.c_str(), authorizer_reply.length()); + } + + if (reply.tag == CEPH_MSGR_TAG_SEQ) { + uint64_t s = in_seq; + 
reply_bl.append((char *)&s, sizeof(s)); + } + + connection->lock.unlock(); + // Because "replacing" will prevent other connections preempt this addr, + // it's safe that here we don't acquire Connection's lock + ssize_t r = messenger->accept_conn(connection); + + connection->inject_delay(); + + connection->lock.lock(); + replacing = false; + if (r < 0) { + ldout(cct, 1) << __func__ << " existing race replacing process for addr = " + << connection->peer_addrs->legacy_addr() + << " just fail later one(this)" << dendl; + ldout(cct, 10) << "accept fault after register" << dendl; + connection->inject_delay(); + return _fault(); + } + if (state != ACCEPTING_WAIT_CONNECT_MSG_AUTH) { + ldout(cct, 1) << __func__ + << " state changed while accept_conn, it must be mark_down" + << dendl; + ceph_assert(state == CLOSED || state == NONE); + ldout(cct, 10) << "accept fault after register" << dendl; + messenger->unregister_conn(connection); + connection->inject_delay(); + return _fault(); + } + + return WRITE(reply_bl, handle_ready_connect_message_reply_write); +} + +CtPtr ProtocolV1::handle_ready_connect_message_reply_write(int r) { + ldout(cct, 20) << __func__ << " r=" << r << dendl; + + if (r < 0) { + ldout(cct, 1) << __func__ << " write ready connect message reply failed" + << dendl; + return _fault(); + } + + // notify + connection->dispatch_queue->queue_accept(connection); + messenger->ms_deliver_handle_fast_accept(connection); + once_ready = true; + + state = ACCEPTING_HANDLED_CONNECT_MSG; + + if (wait_for_seq) { + return wait_seq(); + } + + return server_ready(); +} + +CtPtr ProtocolV1::wait_seq() { + ldout(cct, 20) << __func__ << dendl; + + return READ(sizeof(uint64_t), handle_seq); +} + +CtPtr ProtocolV1::handle_seq(char *buffer, int r) { + ldout(cct, 20) << __func__ << " r=" << r << dendl; + + if (r < 0) { + ldout(cct, 1) << __func__ << " read ack seq failed" << dendl; + return _fault(); + } + + uint64_t newly_acked_seq = *(uint64_t *)buffer; + ldout(cct, 2) << __func__ 
<< " accept get newly_acked_seq " << newly_acked_seq + << dendl; + out_seq = discard_requeued_up_to(out_seq, newly_acked_seq); + + return server_ready(); +} + +CtPtr ProtocolV1::server_ready() { + ldout(cct, 20) << __func__ << " session_security is " + << session_security + << dendl; + + ldout(cct, 20) << __func__ << " accept done" << dendl; + // FIPS zeroization audit 20191115: this memset is not security related. + memset(&connect_msg, 0, sizeof(connect_msg)); + + if (connection->delay_state) { + ceph_assert(connection->delay_state->ready()); + } + + return ready(); +} diff --git a/src/rgw/rgw_acl_s3.cc b/src/rgw/rgw_acl_s3.cc index 6007d41077a0e..b3702961894d4 100644 --- a/src/rgw/rgw_acl_s3.cc +++ b/src/rgw/rgw_acl_s3.cc @@ -545,7 +545,7 @@ bool RGWAccessControlPolicy_S3::compare_group_name(string& id, ACLGroupTypeEnum { switch (group) { case ACL_GROUP_ALL_USERS: - return (id.compare(rgw_uri_all_users) == 0); + return (id.compare(RGW_USER_ANON_ID) == 0); case ACL_GROUP_AUTHENTICATED_USERS: return (id.compare(rgw_uri_auth_users) == 0); default: diff --git a/src/rgw/rgw_acl_s3.cc.orig b/src/rgw/rgw_acl_s3.cc.orig new file mode 100644 index 0000000000000..6007d41077a0e --- /dev/null +++ b/src/rgw/rgw_acl_s3.cc.orig @@ -0,0 +1,586 @@ +#include + +#include +#include + +#include "include/types.h" + +#include "rgw_acl_s3.h" +#include "rgw_user.h" + +#define dout_subsys ceph_subsys_rgw + +using namespace std; + + +#define RGW_URI_ALL_USERS "http://acs.amazonaws.com/groups/global/AllUsers" +#define RGW_URI_AUTH_USERS "http://acs.amazonaws.com/groups/global/AuthenticatedUsers" + +static string rgw_uri_all_users = RGW_URI_ALL_USERS; +static string rgw_uri_auth_users = RGW_URI_AUTH_USERS; + +void ACLPermission_S3::to_xml(ostream& out) +{ + if ((flags & RGW_PERM_FULL_CONTROL) == RGW_PERM_FULL_CONTROL) { + out << "FULL_CONTROL"; + } else { + if (flags & RGW_PERM_READ) + out << "READ"; + if (flags & RGW_PERM_WRITE) + out << "WRITE"; + if (flags & RGW_PERM_READ_ACP) + out << 
"READ_ACP"; + if (flags & RGW_PERM_WRITE_ACP) + out << "WRITE_ACP"; + } +} + +bool ACLPermission_S3:: +xml_end(const char *el) +{ + const char *s = data.c_str(); + if (strcasecmp(s, "READ") == 0) { + flags |= RGW_PERM_READ; + return true; + } else if (strcasecmp(s, "WRITE") == 0) { + flags |= RGW_PERM_WRITE; + return true; + } else if (strcasecmp(s, "READ_ACP") == 0) { + flags |= RGW_PERM_READ_ACP; + return true; + } else if (strcasecmp(s, "WRITE_ACP") == 0) { + flags |= RGW_PERM_WRITE_ACP; + return true; + } else if (strcasecmp(s, "FULL_CONTROL") == 0) { + flags |= RGW_PERM_FULL_CONTROL; + return true; + } + return false; +} + + +class ACLGranteeType_S3 { +public: + static const char *to_string(ACLGranteeType& type) { + switch (type.get_type()) { + case ACL_TYPE_CANON_USER: + return "CanonicalUser"; + case ACL_TYPE_EMAIL_USER: + return "AmazonCustomerByEmail"; + case ACL_TYPE_GROUP: + return "Group"; + default: + return "unknown"; + } + } + + static void set(const char *s, ACLGranteeType& type) { + if (!s) { + type.set(ACL_TYPE_UNKNOWN); + return; + } + if (strcmp(s, "CanonicalUser") == 0) + type.set(ACL_TYPE_CANON_USER); + else if (strcmp(s, "AmazonCustomerByEmail") == 0) + type.set(ACL_TYPE_EMAIL_USER); + else if (strcmp(s, "Group") == 0) + type.set(ACL_TYPE_GROUP); + else + type.set(ACL_TYPE_UNKNOWN); + } +}; + +class ACLID_S3 : public XMLObj +{ +public: + ACLID_S3() {} + ~ACLID_S3() {} + string& to_str() { return data; } +}; + +class ACLURI_S3 : public XMLObj +{ +public: + ACLURI_S3() {} + ~ACLURI_S3() {} +}; + +class ACLEmail_S3 : public XMLObj +{ +public: + ACLEmail_S3() {} + ~ACLEmail_S3() {} +}; + +class ACLDisplayName_S3 : public XMLObj +{ +public: + ACLDisplayName_S3() {} + ~ACLDisplayName_S3() {} +}; + +bool ACLOwner_S3::xml_end(const char *el) { + ACLID_S3 *acl_id = static_cast(find_first("ID")); + ACLID_S3 *acl_name = static_cast(find_first("DisplayName")); + + // ID is mandatory + if (!acl_id) + return false; + id = acl_id->get_data(); + + // 
DisplayName is optional + if (acl_name) + display_name = acl_name->get_data(); + else + display_name = ""; + + return true; +} + +bool ACLGrant_S3::xml_end(const char *el) { + ACLGrantee_S3 *acl_grantee; + ACLID_S3 *acl_id; + ACLURI_S3 *acl_uri; + ACLEmail_S3 *acl_email; + ACLPermission_S3 *acl_permission; + ACLDisplayName_S3 *acl_name; + string uri; + + acl_grantee = static_cast(find_first("Grantee")); + if (!acl_grantee) + return false; + string type_str; + if (!acl_grantee->get_attr("xsi:type", type_str)) + return false; + ACLGranteeType_S3::set(type_str.c_str(), type); + + acl_permission = static_cast(find_first("Permission")); + if (!acl_permission) + return false; + + permission = *acl_permission; + + id.clear(); + name.clear(); + email.clear(); + + switch (type.get_type()) { + case ACL_TYPE_CANON_USER: + acl_id = static_cast(acl_grantee->find_first("ID")); + if (!acl_id) + return false; + id = acl_id->to_str(); + acl_name = static_cast(acl_grantee->find_first("DisplayName")); + if (acl_name) + name = acl_name->get_data(); + break; + case ACL_TYPE_GROUP: + acl_uri = static_cast(acl_grantee->find_first("URI")); + if (!acl_uri) + return false; + uri = acl_uri->get_data(); + group = uri_to_group(uri); + break; + case ACL_TYPE_EMAIL_USER: + acl_email = static_cast(acl_grantee->find_first("EmailAddress")); + if (!acl_email) + return false; + email = acl_email->get_data(); + break; + default: + // unknown user type + return false; + }; + return true; +} + +void ACLGrant_S3::to_xml(CephContext *cct, ostream& out) { + ACLPermission_S3& perm = static_cast(permission); + + /* only show s3 compatible permissions */ + if (!(perm.get_permissions() & RGW_PERM_ALL_S3)) + return; + + string uri; + + out << "" << + ""; + switch (type.get_type()) { + case ACL_TYPE_CANON_USER: + out << "" << id << ""; + if (name.size()) { + out << "" << name << ""; + } + break; + case ACL_TYPE_EMAIL_USER: + out << "" << email << ""; + break; + case ACL_TYPE_GROUP: + if (!group_to_uri(group, 
uri)) { + ldout(cct, 0) << "ERROR: group_to_uri failed with group=" << (int)group << dendl; + break; + } + out << "" << uri << ""; + break; + default: + break; + } + out << ""; + perm.to_xml(out); + out << ""; +} + +bool ACLGrant_S3::group_to_uri(ACLGroupTypeEnum group, string& uri) +{ + switch (group) { + case ACL_GROUP_ALL_USERS: + uri = rgw_uri_all_users; + return true; + case ACL_GROUP_AUTHENTICATED_USERS: + uri = rgw_uri_auth_users; + return true; + default: + return false; + } +} + +ACLGroupTypeEnum ACLGrant_S3::uri_to_group(string& uri) +{ + if (uri.compare(rgw_uri_all_users) == 0) + return ACL_GROUP_ALL_USERS; + else if (uri.compare(rgw_uri_auth_users) == 0) + return ACL_GROUP_AUTHENTICATED_USERS; + + return ACL_GROUP_NONE; +} + +bool RGWAccessControlList_S3::xml_end(const char *el) { + XMLObjIter iter = find("Grant"); + ACLGrant_S3 *grant = static_cast(iter.get_next()); + while (grant) { + add_grant(grant); + grant = static_cast(iter.get_next()); + } + return true; +} + +struct s3_acl_header { + int rgw_perm; + const char *http_header; +}; + +static const char *get_acl_header(RGWEnv *env, + const struct s3_acl_header *perm) +{ + const char *header = perm->http_header; + + return env->get(header, NULL); +} + +static int parse_grantee_str(RGWRados *store, string& grantee_str, + const struct s3_acl_header *perm, ACLGrant& grant) +{ + string id_type, id_val_quoted; + int rgw_perm = perm->rgw_perm; + int ret; + + RGWUserInfo info; + + ret = parse_key_value(grantee_str, id_type, id_val_quoted); + if (ret < 0) + return ret; + + string id_val = rgw_trim_quotes(id_val_quoted); + + if (strcasecmp(id_type.c_str(), "emailAddress") == 0) { + ret = rgw_get_user_info_by_email(store, id_val, info); + if (ret < 0) + return ret; + + grant.set_canon(info.user_id, info.display_name, rgw_perm); + } else if (strcasecmp(id_type.c_str(), "id") == 0) { + ret = rgw_get_user_info_by_uid(store, id_val, info); + if (ret < 0) + return ret; + + grant.set_canon(info.user_id, 
info.display_name, rgw_perm); + } else if (strcasecmp(id_type.c_str(), "uri") == 0) { + ACLGroupTypeEnum gid = grant.uri_to_group(id_val); + if (gid == ACL_GROUP_NONE) + return -EINVAL; + + grant.set_group(gid, rgw_perm); + } else { + return -EINVAL; + } + + return 0; +} + +static int parse_acl_header(RGWRados *store, RGWEnv *env, + const struct s3_acl_header *perm, std::list& _grants) +{ + std::list grantees; + std::string hacl_str; + + const char *hacl = get_acl_header(env, perm); + if (hacl == NULL) + return 0; + + hacl_str = hacl; + get_str_list(hacl_str, ",", grantees); + + for (list::iterator it = grantees.begin(); it != grantees.end(); ++it) { + ACLGrant grant; + int ret = parse_grantee_str(store, *it, perm, grant); + if (ret < 0) + return ret; + + _grants.push_back(grant); + } + + return 0; +} + +int RGWAccessControlList_S3::create_canned(ACLOwner& owner, ACLOwner& bucket_owner, const string& canned_acl) +{ + acl_user_map.clear(); + grant_map.clear(); + + ACLGrant owner_grant; + + string bid = bucket_owner.get_id(); + string bname = bucket_owner.get_display_name(); + + /* owner gets full control */ + owner_grant.set_canon(owner.get_id(), owner.get_display_name(), RGW_PERM_FULL_CONTROL); + add_grant(&owner_grant); + + if (canned_acl.size() == 0 || canned_acl.compare("private") == 0) { + return 0; + } + + ACLGrant bucket_owner_grant; + ACLGrant group_grant; + if (canned_acl.compare("public-read") == 0) { + group_grant.set_group(ACL_GROUP_ALL_USERS, RGW_PERM_READ); + add_grant(&group_grant); + } else if (canned_acl.compare("public-read-write") == 0) { + group_grant.set_group(ACL_GROUP_ALL_USERS, RGW_PERM_READ); + add_grant(&group_grant); + group_grant.set_group(ACL_GROUP_ALL_USERS, RGW_PERM_WRITE); + add_grant(&group_grant); + } else if (canned_acl.compare("authenticated-read") == 0) { + group_grant.set_group(ACL_GROUP_AUTHENTICATED_USERS, RGW_PERM_READ); + add_grant(&group_grant); + } else if (canned_acl.compare("bucket-owner-read") == 0) { + 
bucket_owner_grant.set_canon(bid, bname, RGW_PERM_READ); + if (bid.compare(owner.get_id()) != 0) + add_grant(&bucket_owner_grant); + } else if (canned_acl.compare("bucket-owner-full-control") == 0) { + bucket_owner_grant.set_canon(bid, bname, RGW_PERM_FULL_CONTROL); + if (bid.compare(owner.get_id()) != 0) + add_grant(&bucket_owner_grant); + } else { + return -EINVAL; + } + + return 0; +} + +int RGWAccessControlList_S3::create_from_grants(std::list& grants) +{ + if (grants.empty()) + return -EINVAL; + + acl_user_map.clear(); + grant_map.clear(); + + for (std::list::iterator it = grants.begin(); it != grants.end(); ++it) { + ACLGrant g = *it; + add_grant(&g); + } + + return 0; +} + +bool RGWAccessControlPolicy_S3::xml_end(const char *el) { + RGWAccessControlList_S3 *s3acl = + static_cast(find_first("AccessControlList")); + if (!s3acl) + return false; + + acl = *s3acl; + + ACLOwner *owner_p = static_cast(find_first("Owner")); + if (!owner_p) + return false; + owner = *owner_p; + return true; +} + +static const s3_acl_header acl_header_perms[] = { + {RGW_PERM_READ, "HTTP_X_AMZ_GRANT_READ"}, + {RGW_PERM_WRITE, "HTTP_X_AMZ_GRANT_WRITE"}, + {RGW_PERM_READ_ACP,"HTTP_X_AMZ_GRANT_READ_ACP"}, + {RGW_PERM_WRITE_ACP, "HTTP_X_AMZ_GRANT_WRITE_ACP"}, + {RGW_PERM_FULL_CONTROL, "HTTP_X_AMZ_GRANT_FULL_CONTROL"}, + {0, NULL} +}; + +int RGWAccessControlPolicy_S3::create_from_headers(RGWRados *store, RGWEnv *env, ACLOwner& _owner) +{ + std::list grants; + + for (const struct s3_acl_header *p = acl_header_perms; p->rgw_perm; p++) { + if (parse_acl_header(store, env, p, grants) < 0) + return false; + } + + RGWAccessControlList_S3& _acl = static_cast(acl); + int r = _acl.create_from_grants(grants); + + owner = _owner; + + return r; +} + +/* + can only be called on object that was parsed + */ +int RGWAccessControlPolicy_S3::rebuild(RGWRados *store, ACLOwner *owner, RGWAccessControlPolicy& dest) +{ + if (!owner) + return -EINVAL; + + ACLOwner *requested_owner = 
static_cast(find_first("Owner")); + if (requested_owner && requested_owner->get_id().compare(owner->get_id()) != 0) { + return -EPERM; + } + + RGWUserInfo owner_info; + if (rgw_get_user_info_by_uid(store, owner->get_id(), owner_info) < 0) { + ldout(cct, 10) << "owner info does not exist" << dendl; + return -EINVAL; + } + ACLOwner& dest_owner = dest.get_owner(); + dest_owner.set_id(owner->get_id()); + dest_owner.set_name(owner_info.display_name); + + ldout(cct, 20) << "owner id=" << owner->get_id() << dendl; + ldout(cct, 20) << "dest owner id=" << dest.get_owner().get_id() << dendl; + + RGWAccessControlList& dst_acl = dest.get_acl(); + + multimap& grant_map = acl.get_grant_map(); + multimap::iterator iter; + for (iter = grant_map.begin(); iter != grant_map.end(); ++iter) { + ACLGrant& src_grant = iter->second; + ACLGranteeType& type = src_grant.get_type(); + ACLGrant new_grant; + bool grant_ok = false; + string uid; + RGWUserInfo grant_user; + switch (type.get_type()) { + case ACL_TYPE_EMAIL_USER: + { + string email; + if (!src_grant.get_id(email)) { + ldout(cct, 0) << "ERROR: src_grant.get_id() failed" << dendl; + return -EINVAL; + } + ldout(cct, 10) << "grant user email=" << email << dendl; + if (rgw_get_user_info_by_email(store, email, grant_user) < 0) { + ldout(cct, 10) << "grant user email not found or other error" << dendl; + return -ERR_UNRESOLVABLE_EMAIL; + } + uid = grant_user.user_id; + } + case ACL_TYPE_CANON_USER: + { + if (type.get_type() == ACL_TYPE_CANON_USER) { + if (!src_grant.get_id(uid)) { + ldout(cct, 0) << "ERROR: src_grant.get_id() failed" << dendl; + return -EINVAL; + } + } + + if (grant_user.user_id.empty() && rgw_get_user_info_by_uid(store, uid, grant_user) < 0) { + ldout(cct, 10) << "grant user does not exist:" << uid << dendl; + return -EINVAL; + } else { + ACLPermission& perm = src_grant.get_permission(); + new_grant.set_canon(uid, grant_user.display_name, perm.get_permissions()); + grant_ok = true; + string new_id; + 
new_grant.get_id(new_id); + ldout(cct, 10) << "new grant: " << new_id << ":" << grant_user.display_name << dendl; + } + } + break; + case ACL_TYPE_GROUP: + { + string uri; + if (ACLGrant_S3::group_to_uri(src_grant.get_group(), uri)) { + new_grant = src_grant; + grant_ok = true; + ldout(cct, 10) << "new grant: " << uri << dendl; + } else { + ldout(cct, 10) << "bad grant group:" << (int)src_grant.get_group() << dendl; + return -EINVAL; + } + } + default: + break; + } + if (grant_ok) { + dst_acl.add_grant(&new_grant); + } + } + + return 0; +} + +bool RGWAccessControlPolicy_S3::compare_group_name(string& id, ACLGroupTypeEnum group) +{ + switch (group) { + case ACL_GROUP_ALL_USERS: + return (id.compare(rgw_uri_all_users) == 0); + case ACL_GROUP_AUTHENTICATED_USERS: + return (id.compare(rgw_uri_auth_users) == 0); + default: + return id.empty(); + } + + // shouldn't get here + return false; +} + +XMLObj *RGWACLXMLParser_S3::alloc_obj(const char *el) +{ + XMLObj * obj = NULL; + if (strcmp(el, "AccessControlPolicy") == 0) { + obj = new RGWAccessControlPolicy_S3(cct); + } else if (strcmp(el, "Owner") == 0) { + obj = new ACLOwner_S3(); + } else if (strcmp(el, "AccessControlList") == 0) { + obj = new RGWAccessControlList_S3(cct); + } else if (strcmp(el, "ID") == 0) { + obj = new ACLID_S3(); + } else if (strcmp(el, "DisplayName") == 0) { + obj = new ACLDisplayName_S3(); + } else if (strcmp(el, "Grant") == 0) { + obj = new ACLGrant_S3(); + } else if (strcmp(el, "Grantee") == 0) { + obj = new ACLGrantee_S3(); + } else if (strcmp(el, "Permission") == 0) { + obj = new ACLPermission_S3(); + } else if (strcmp(el, "URI") == 0) { + obj = new ACLURI_S3(); + } else if (strcmp(el, "EmailAddress") == 0) { + obj = new ACLEmail_S3(); + } + + return obj; +} + diff --git a/src/rgw/rgw_cors.cc b/src/rgw/rgw_cors.cc index 4be83605b502e..a155fd7f27d87 100644 --- a/src/rgw/rgw_cors.cc +++ b/src/rgw/rgw_cors.cc @@ -101,11 +101,13 @@ bool RGWCORSRule::is_header_allowed(const char *h, size_t len) { 
void RGWCORSRule::format_exp_headers(string& s) { s = ""; - for(list::iterator it = exposable_hdrs.begin(); - it != exposable_hdrs.end(); ++it) { - if (s.length() > 0) - s.append(","); - s.append((*it)); + for (const auto& header : exposable_hdrs) { + if (s.length() > 0) + s.append(","); + // these values are sent to clients in a 'Access-Control-Expose-Headers' + // response header, so we escape '\n' and '\r' to avoid header injection + std::string tmp = boost::replace_all_copy(header, "\n", "\\n"); + boost::replace_all_copy(std::back_inserter(s), tmp, "\r", "\\r"); } } diff --git a/src/rgw/rgw_cors.cc.orig b/src/rgw/rgw_cors.cc.orig new file mode 100644 index 0000000000000..51cdf0f2639f0 --- /dev/null +++ b/src/rgw/rgw_cors.cc.orig @@ -0,0 +1,150 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2013 eNovance SAS + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ +#include + +#include +#include + +#include + +#include "include/types.h" +#include "common/debug.h" +#include "include/str_list.h" +#include "common/Formatter.h" + +#include "rgw_cors.h" + +#define dout_subsys ceph_subsys_rgw +using namespace std; + +void RGWCORSRule::dump_origins() { + unsigned num_origins = allowed_origins.size(); + dout(10) << "Allowed origins : " << num_origins << dendl; + for(set::iterator it = allowed_origins.begin(); + it != allowed_origins.end(); + ++it) { + dout(10) << *it << "," << dendl; + } +} + +void RGWCORSRule::erase_origin_if_present(string& origin, bool *rule_empty) { + set::iterator it = allowed_origins.find(origin); + if (!rule_empty) + return; + *rule_empty = false; + if (it != allowed_origins.end()) { + dout(10) << "Found origin " << origin << ", set size:" << + allowed_origins.size() << dendl; + allowed_origins.erase(it); + *rule_empty = (allowed_origins.empty()); + } +} + +static bool is_string_in_set(set& s, string h) { + if ((s.find("*") != s.end()) || + (s.find(h) != s.end())) { + return true; + } + /* The header can be Content-*-type, or Content-* */ + for(set::iterator it = s.begin(); + it != s.end(); ++it) { + size_t off; + if ((off = (*it).find("*"))!=string::npos) { + list ssplit; + unsigned flen = 0; + + get_str_list((*it), "* \t", ssplit); + if (off != 0) { + string sl = ssplit.front(); + flen = sl.length(); + dout(10) << "Finding " << sl << ", in " << h << ", at offset 0" << dendl; + if (!boost::algorithm::starts_with(h,sl)) + continue; + ssplit.pop_front(); + } + if (off != ((*it).length() - 1)) { + string sl = ssplit.front(); + dout(10) << "Finding " << sl << ", in " << h + << ", at offset not less than " << flen << dendl; + if (h.compare((h.size() - sl.size()), sl.size(), sl) != 0) + continue; + ssplit.pop_front(); + } + if (!ssplit.empty()) + continue; + return true; + } + } + return false; +} + +bool RGWCORSRule::is_origin_present(const char *o) { + string origin = o; + return 
is_string_in_set(allowed_origins, origin); +} + +bool RGWCORSRule::is_header_allowed(const char *h, size_t len) { + string hdr(h, len); + return is_string_in_set(allowed_hdrs, hdr); +} + +void RGWCORSRule::format_exp_headers(string& s) { + s = ""; + for (const auto& header : exposable_hdrs) { + if (s.length() > 0) + s.append(","); + // these values are sent to clients in a 'Access-Control-Expose-Headers' + // response header, so we escape '\n' to avoid header injection + boost::replace_all_copy(std::back_inserter(s), header, "\n", "\\n"); + } +} + +RGWCORSRule * RGWCORSConfiguration::host_name_rule(const char *origin) { + for(list::iterator it_r = rules.begin(); + it_r != rules.end(); ++it_r) { + RGWCORSRule& r = (*it_r); + if (r.is_origin_present(origin)) + return &r; + } + return NULL; +} + +void RGWCORSConfiguration::erase_host_name_rule(string& origin) { + bool rule_empty; + unsigned loop = 0; + /*Erase the host name from that rule*/ + dout(10) << "Num of rules : " << rules.size() << dendl; + for(list::iterator it_r = rules.begin(); + it_r != rules.end(); ++it_r, loop++) { + RGWCORSRule& r = (*it_r); + r.erase_origin_if_present(origin, &rule_empty); + dout(10) << "Origin:" << origin << ", rule num:" + << loop << ", emptying now:" << rule_empty << dendl; + if (rule_empty) { + rules.erase(it_r); + break; + } + } +} + +void RGWCORSConfiguration::dump() { + unsigned loop = 1; + unsigned num_rules = rules.size(); + dout(10) << "Number of rules: " << num_rules << dendl; + for(list::iterator it = rules.begin(); + it!= rules.end(); ++it, loop++) { + dout(10) << " <<<<<<< Rule " << loop << " >>>>>>> " << dendl; + (*it).dump_origins(); + } +} diff --git a/src/rgw/rgw_op.cc b/src/rgw/rgw_op.cc index bd73a239a4bf1..82ec4ba9ce9f4 100644 --- a/src/rgw/rgw_op.cc +++ b/src/rgw/rgw_op.cc @@ -15,6 +15,7 @@ #include "rgw_rest.h" #include "rgw_acl.h" #include "rgw_acl_s3.h" +#include "rgw_acl_swift.h" #include "rgw_user.h" #include "rgw_bucket.h" #include "rgw_log.h" @@ -319,7 
+320,13 @@ static int rgw_build_policies(RGWRados *store, struct req_state *s, bool only_bu s->bucket_instance_id = s->info.args.get(RGW_SYS_PARAM_PREFIX "bucket-instance"); - s->bucket_acl = new RGWAccessControlPolicy(s->cct); + if(s->dialect.compare("s3") == 0) { + s->bucket_acl = new RGWAccessControlPolicy_S3(s->cct); + } else if(s->dialect.compare("swift") == 0) { + s->bucket_acl = new RGWAccessControlPolicy_SWIFT(s->cct); + } else { + s->bucket_acl = new RGWAccessControlPolicy(s->cct); + } if (s->copy_source) { /* check if copy source is within the current domain */ const char *src = s->copy_source; diff --git a/src/rgw/rgw_op.cc.orig b/src/rgw/rgw_op.cc.orig new file mode 100644 index 0000000000000..bd73a239a4bf1 --- /dev/null +++ b/src/rgw/rgw_op.cc.orig @@ -0,0 +1,2678 @@ + +#include +#include + +#include + +#include "common/Clock.h" +#include "common/armor.h" +#include "common/mime.h" +#include "common/utf8.h" +#include "common/ceph_json.h" + +#include "rgw_rados.h" +#include "rgw_op.h" +#include "rgw_rest.h" +#include "rgw_acl.h" +#include "rgw_acl_s3.h" +#include "rgw_user.h" +#include "rgw_bucket.h" +#include "rgw_log.h" +#include "rgw_multi.h" +#include "rgw_multi_del.h" +#include "rgw_cors.h" +#include "rgw_cors_s3.h" + +#include "rgw_client_io.h" + +#define dout_subsys ceph_subsys_rgw + +using namespace std; +using ceph::crypto::MD5; + +static string mp_ns = RGW_OBJ_NS_MULTIPART; +static string shadow_ns = RGW_OBJ_NS_SHADOW; + +class MultipartMetaFilter : public RGWAccessListFilter { +public: + MultipartMetaFilter() {} + bool filter(string& name, string& key) { + int len = name.size(); + if (len < 6) + return false; + + int pos = name.find(MP_META_SUFFIX, len - 5); + if (pos <= 0) + return false; + + pos = name.rfind('.', pos - 1); + if (pos < 0) + return false; + + key = name.substr(0, pos); + + return true; + } +}; + +static MultipartMetaFilter mp_filter; + +static int parse_range(const char *range, off_t& ofs, off_t& end, bool *partial_content) +{ 
+ int r = -ERANGE; + string s(range); + string ofs_str; + string end_str; + + *partial_content = false; + + int pos = s.find("bytes="); + if (pos < 0) { + pos = 0; + while (isspace(s[pos])) + pos++; + int end = pos; + while (isalpha(s[end])) + end++; + if (strncasecmp(s.c_str(), "bytes", end - pos) != 0) + return 0; + while (isspace(s[end])) + end++; + if (s[end] != '=') + return 0; + s = s.substr(end + 1); + } else { + s = s.substr(pos + 6); /* size of("bytes=") */ + } + pos = s.find('-'); + if (pos < 0) + goto done; + + *partial_content = true; + + ofs_str = s.substr(0, pos); + end_str = s.substr(pos + 1); + if (end_str.length()) { + end = atoll(end_str.c_str()); + if (end < 0) + goto done; + } + + if (ofs_str.length()) { + ofs = atoll(ofs_str.c_str()); + } else { // RFC2616 suffix-byte-range-spec + ofs = -end; + end = -1; + } + + if (end >= 0 && end < ofs) + goto done; + + r = 0; +done: + return r; +} + +static void format_xattr(std::string &xattr) +{ + /* If the extended attribute is not valid UTF-8, we encode it using quoted-printable + * encoding. 
+ */ + if ((check_utf8(xattr.c_str(), xattr.length()) != 0) || + (check_for_control_characters(xattr.c_str(), xattr.length()) != 0)) { + static const char MIME_PREFIX_STR[] = "=?UTF-8?Q?"; + static const int MIME_PREFIX_LEN = sizeof(MIME_PREFIX_STR) - 1; + static const char MIME_SUFFIX_STR[] = "?="; + static const int MIME_SUFFIX_LEN = sizeof(MIME_SUFFIX_STR) - 1; + int mlen = mime_encode_as_qp(xattr.c_str(), NULL, 0); + char *mime = new char[MIME_PREFIX_LEN + mlen + MIME_SUFFIX_LEN + 1]; + strcpy(mime, MIME_PREFIX_STR); + mime_encode_as_qp(xattr.c_str(), mime + MIME_PREFIX_LEN, mlen); + strcpy(mime + MIME_PREFIX_LEN + (mlen - 1), MIME_SUFFIX_STR); + xattr.assign(mime); + delete [] mime; + } +} + +/** + * Get the HTTP request metadata out of the req_state as a + * map(, where attr_name is RGW_ATTR_PREFIX.HTTP_NAME) + * s: The request state + * attrs: will be filled up with attrs mapped as + * + */ +static void rgw_get_request_metadata(CephContext *cct, struct req_info& info, map& attrs) +{ + map::iterator iter; + for (iter = info.x_meta_map.begin(); iter != info.x_meta_map.end(); ++iter) { + const string &name(iter->first); + string &xattr(iter->second); + ldout(cct, 10) << "x>> " << name << ":" << xattr << dendl; + format_xattr(xattr); + string attr_name(RGW_ATTR_PREFIX); + attr_name.append(name); + map::value_type v(attr_name, bufferlist()); + std::pair < map::iterator, bool > rval(attrs.insert(v)); + bufferlist& bl(rval.first->second); + bl.append(xattr.c_str(), xattr.size() + 1); + } +} + +static int decode_policy(CephContext *cct, bufferlist& bl, RGWAccessControlPolicy *policy) +{ + bufferlist::iterator iter = bl.begin(); + try { + policy->decode(iter); + } catch (buffer::error& err) { + ldout(cct, 0) << "ERROR: could not decode policy, caught buffer::error" << dendl; + return -EIO; + } + if (cct->_conf->subsys.should_gather(ceph_subsys_rgw, 15)) { + RGWAccessControlPolicy_S3 *s3policy = static_cast(policy); + ldout(cct, 15) << "Read AccessControlPolicy"; + 
s3policy->to_xml(*_dout); + *_dout << dendl; + } + return 0; +} + +static int get_bucket_policy_from_attr(CephContext *cct, RGWRados *store, void *ctx, + RGWBucketInfo& bucket_info, map& bucket_attrs, + RGWAccessControlPolicy *policy, rgw_obj& obj) +{ + map::iterator aiter = bucket_attrs.find(RGW_ATTR_ACL); + + if (aiter != bucket_attrs.end()) { + int ret = decode_policy(cct, aiter->second, policy); + if (ret < 0) + return ret; + } else { + ldout(cct, 0) << "WARNING: couldn't find acl header for bucket, generating default" << dendl; + RGWUserInfo uinfo; + /* object exists, but policy is broken */ + int r = rgw_get_user_info_by_uid(store, bucket_info.owner, uinfo); + if (r < 0) + return r; + + policy->create_default(bucket_info.owner, uinfo.display_name); + } + return 0; +} + +static int get_obj_policy_from_attr(CephContext *cct, RGWRados *store, void *ctx, + RGWBucketInfo& bucket_info, map& bucket_attrs, + RGWAccessControlPolicy *policy, rgw_obj& obj) +{ + bufferlist bl; + int ret = 0; + + ret = store->get_attr(ctx, obj, RGW_ATTR_ACL, bl); + if (ret >= 0) { + ret = decode_policy(cct, bl, policy); + if (ret < 0) + return ret; + } else if (ret == -ENODATA) { + /* object exists, but policy is broken */ + ldout(cct, 0) << "WARNING: couldn't find acl header for object, generating default" << dendl; + RGWUserInfo uinfo; + ret = rgw_get_user_info_by_uid(store, bucket_info.owner, uinfo); + if (ret < 0) + return ret; + + policy->create_default(bucket_info.owner, uinfo.display_name); + } + return ret; +} + + +/** + * Get the AccessControlPolicy for an object off of disk. + * policy: must point to a valid RGWACL, and will be filled upon return. + * bucket: name of the bucket containing the object. + * object: name of the object to get the ACL for. + * Returns: 0 on success, -ERR# otherwise. 
+ */ +static int get_policy_from_attr(CephContext *cct, RGWRados *store, void *ctx, + RGWBucketInfo& bucket_info, map& bucket_attrs, + RGWAccessControlPolicy *policy, rgw_obj& obj) +{ + if (obj.bucket.name.empty()) { + return 0; + } + + if (obj.object.empty()) { + rgw_obj instance_obj; + store->get_bucket_instance_obj(bucket_info.bucket, instance_obj); + return get_bucket_policy_from_attr(cct, store, ctx, bucket_info, bucket_attrs, + policy, instance_obj); + } + return get_obj_policy_from_attr(cct, store, ctx, bucket_info, bucket_attrs, + policy, obj); +} + +static int get_obj_attrs(RGWRados *store, struct req_state *s, rgw_obj& obj, map& attrs, + uint64_t *obj_size, RGWObjVersionTracker *objv_tracker) +{ + void *handle; + int ret = store->prepare_get_obj(s->obj_ctx, obj, NULL, NULL, &attrs, NULL, + NULL, NULL, NULL, NULL, NULL, obj_size, objv_tracker, &handle, &s->err); + store->finish_get_obj(&handle); + return ret; +} + +static int read_policy(RGWRados *store, struct req_state *s, + RGWBucketInfo& bucket_info, map& bucket_attrs, + RGWAccessControlPolicy *policy, rgw_bucket& bucket, string& object) +{ + string upload_id; + upload_id = s->info.args.get("uploadId"); + string oid = object; + rgw_obj obj; + + if (!s->system_request && bucket_info.flags & BUCKET_SUSPENDED) { + ldout(s->cct, 0) << "NOTICE: bucket " << bucket_info.bucket.name << " is suspended" << dendl; + return -ERR_USER_SUSPENDED; + } + + if (!oid.empty() && !upload_id.empty()) { + RGWMPObj mp(oid, upload_id); + oid = mp.get_meta(); + obj.init_ns(bucket, oid, mp_ns); + } else { + obj.init(bucket, oid); + } + int ret = get_policy_from_attr(s->cct, store, s->obj_ctx, bucket_info, bucket_attrs, policy, obj); + if (ret == -ENOENT && object.size()) { + /* object does not exist checking the bucket's ACL to make sure + that we send a proper error code */ + RGWAccessControlPolicy bucket_policy(s->cct); + string no_object; + rgw_obj no_obj(bucket, no_object); + ret = get_policy_from_attr(s->cct, store, 
s->obj_ctx, bucket_info, bucket_attrs, &bucket_policy, no_obj); + if (ret < 0) + return ret; + string& owner = bucket_policy.get_owner().get_id(); + if (!s->system_request && owner.compare(s->user.user_id) != 0 && + !bucket_policy.verify_permission(s->user.user_id, s->perm_mask, RGW_PERM_READ)) + ret = -EACCES; + else + ret = -ENOENT; + + } else if (ret == -ENOENT) { + ret = -ERR_NO_SUCH_BUCKET; + } + + return ret; +} + +/** + * Get the AccessControlPolicy for a bucket or object off of disk. + * s: The req_state to draw information from. + * only_bucket: If true, reads the bucket ACL rather than the object ACL. + * Returns: 0 on success, -ERR# otherwise. + */ +static int rgw_build_policies(RGWRados *store, struct req_state *s, bool only_bucket, bool prefetch_data) +{ + int ret = 0; + string obj_str; + RGWUserInfo bucket_owner_info; + + s->bucket_instance_id = s->info.args.get(RGW_SYS_PARAM_PREFIX "bucket-instance"); + + s->bucket_acl = new RGWAccessControlPolicy(s->cct); + + if (s->copy_source) { /* check if copy source is within the current domain */ + const char *src = s->copy_source; + if (*src == '/') + ++src; + string copy_source_str(src); + + int pos = copy_source_str.find('/'); + if (pos > 0) + copy_source_str = copy_source_str.substr(0, pos); + + RGWBucketInfo source_info; + + ret = store->get_bucket_info(s->obj_ctx, copy_source_str, source_info, NULL); + if (ret == 0) { + string& region = source_info.region; + s->local_source = store->region.equals(region); + } + } + + if (s->bucket_name_str.size()) { + s->bucket_exists = true; + if (s->bucket_instance_id.empty()) { + ret = store->get_bucket_info(s->obj_ctx, s->bucket_name_str, s->bucket_info, NULL, &s->bucket_attrs); + } else { + ret = store->get_bucket_instance_info(s->obj_ctx, s->bucket_instance_id, s->bucket_info, NULL, &s->bucket_attrs); + } + if (ret < 0) { + if (ret != -ENOENT) { + ldout(s->cct, 0) << "NOTICE: couldn't get bucket from bucket_name (name=" << s->bucket_name_str << ")" << dendl; + 
return ret; + } + s->bucket_exists = false; + } + s->bucket = s->bucket_info.bucket; + + string no_obj; + RGWAccessControlPolicy bucket_acl(s->cct); + ret = read_policy(store, s, s->bucket_info, s->bucket_attrs, s->bucket_acl, s->bucket, no_obj); + + s->bucket_owner = s->bucket_acl->get_owner(); + + string& region = s->bucket_info.region; + if (s->bucket_exists && !store->region.equals(region)) { + ldout(s->cct, 0) << "NOTICE: request for data in a different region (" << region << " != " << store->region.name << ")" << dendl; + /* we now need to make sure that the operation actually requires copy source, that is + * it's a copy operation + */ + if (store->region.is_master && s->op == OP_DELETE && s->system_request) { + /*If the operation is delete and if this is the master, don't redirect*/ + } else if (!s->local_source || + (s->op != OP_PUT && s->op != OP_COPY) || + s->object_str.empty()) { + return -ERR_PERMANENT_REDIRECT; + } + } + } + + /* we're passed only_bucket = true when we specifically need the bucket's + acls, that happens on write operations */ + if (!only_bucket) { + s->object_acl = new RGWAccessControlPolicy(s->cct); + + obj_str = s->object_str; + rgw_obj obj(s->bucket, obj_str); + store->set_atomic(s->obj_ctx, obj); + if (prefetch_data) { + store->set_prefetch_data(s->obj_ctx, obj); + } + ret = read_policy(store, s, s->bucket_info, s->bucket_attrs, s->object_acl, s->bucket, obj_str); + } + + return ret; +} + +int RGWGetObj::verify_permission() +{ + obj.init(s->bucket, s->object_str); + store->set_atomic(s->obj_ctx, obj); + store->set_prefetch_data(s->obj_ctx, obj); + + if (!verify_object_permission(s, RGW_PERM_READ)) + return -EACCES; + + return 0; +} + + +int RGWOp::verify_op_mask() +{ + uint32_t required_mask = op_mask(); + + ldout(s->cct, 20) << "required_mask= " << required_mask << " user.op_mask=" << s->user.op_mask << dendl; + + if ((s->user.op_mask & required_mask) != required_mask) { + return -EPERM; + } + + if (!s->system_request && 
(required_mask & RGW_OP_TYPE_MODIFY) && !store->zone.is_master) { + ldout(s->cct, 5) << "NOTICE: modify request to a non-master zone by a non-system user, permission denied" << dendl; + return -EPERM; + } + + return 0; +} + +int RGWOp::init_quota() +{ + /* no quota enforcement for system requests */ + if (s->system_request) + return 0; + + /* init quota related stuff */ + if (!(s->user.op_mask & RGW_OP_TYPE_MODIFY)) { + return 0; + } + + /* only interested in object related ops */ + if (s->object_str.empty()) { + return 0; + } + + if (s->bucket_info.quota.enabled) { + bucket_quota = s->bucket_info.quota; + return 0; + } + if (s->user.user_id == s->bucket_owner.get_id()) { + if (s->user.bucket_quota.enabled) { + bucket_quota = s->user.bucket_quota; + return 0; + } + } else { + RGWUserInfo owner_info; + int r = rgw_get_user_info_by_uid(store, s->bucket_info.owner, owner_info); + if (r < 0) + return r; + + if (owner_info.bucket_quota.enabled) { + bucket_quota = owner_info.bucket_quota; + return 0; + } + } + + bucket_quota = store->region_map.bucket_quota; + return 0; +} + +static bool validate_cors_rule_method(RGWCORSRule *rule, const char *req_meth) { + uint8_t flags = 0; + if (strcmp(req_meth, "GET") == 0) flags = RGW_CORS_GET; + else if (strcmp(req_meth, "POST") == 0) flags = RGW_CORS_POST; + else if (strcmp(req_meth, "PUT") == 0) flags = RGW_CORS_PUT; + else if (strcmp(req_meth, "DELETE") == 0) flags = RGW_CORS_DELETE; + else if (strcmp(req_meth, "HEAD") == 0) flags = RGW_CORS_HEAD; + + if ((rule->get_allowed_methods() & flags) == flags) { + dout(10) << "Method " << req_meth << " is supported" << dendl; + } else { + dout(5) << "Method " << req_meth << " is not supported" << dendl; + return false; + } + + return true; +} + +int RGWOp::read_bucket_cors() +{ + bufferlist bl; + + map::iterator aiter = s->bucket_attrs.find(RGW_ATTR_CORS); + if (aiter == s->bucket_attrs.end()) { + ldout(s->cct, 20) << "no CORS configuration attr found" << dendl; + cors_exist = false; + 
return 0; /* no CORS configuration found */ + } + + cors_exist = true; + + bl = aiter->second; + + bufferlist::iterator iter = bl.begin(); + try { + bucket_cors.decode(iter); + } catch (buffer::error& err) { + ldout(s->cct, 0) << "ERROR: could not decode policy, caught buffer::error" << dendl; + return -EIO; + } + if (s->cct->_conf->subsys.should_gather(ceph_subsys_rgw, 15)) { + RGWCORSConfiguration_S3 *s3cors = static_cast(&bucket_cors); + ldout(s->cct, 15) << "Read RGWCORSConfiguration"; + s3cors->to_xml(*_dout); + *_dout << dendl; + } + return 0; +} + +static void get_cors_response_headers(RGWCORSRule *rule, const char *req_hdrs, string& hdrs, string& exp_hdrs, unsigned *max_age) { + if (req_hdrs) { + list hl; + get_str_list(req_hdrs, hl); + for(list::iterator it = hl.begin(); it != hl.end(); ++it) { + if (!rule->is_header_allowed((*it).c_str(), (*it).length())) { + dout(5) << "Header " << (*it) << " is not registered in this rule" << dendl; + } else { + if (hdrs.length() > 0) hdrs.append(","); + hdrs.append((*it)); + } + } + } + rule->format_exp_headers(exp_hdrs); + *max_age = rule->get_max_age(); +} + +bool RGWOp::generate_cors_headers(string& origin, string& method, string& headers, string& exp_headers, unsigned *max_age) +{ + const char *orig = s->info.env->get("HTTP_ORIGIN"); + if (!orig) { + return false; + } + origin = orig; + int ret = read_bucket_cors(); + if (ret < 0) { + return false; + } + + if (!cors_exist) { + dout(2) << "No CORS configuration set yet for this bucket" << dendl; + return false; + } + + RGWCORSRule *rule = bucket_cors.host_name_rule(orig); + if (!rule) + return false; + + const char *req_meth = s->info.env->get("HTTP_ACCESS_CONTROL_REQUEST_METHOD"); + if (!req_meth) { + req_meth = s->info.method; + } + + if (req_meth) + method = req_meth; + + if (!validate_cors_rule_method(rule, req_meth)) { + return false; + } + + const char *req_hdrs = s->info.env->get("HTTP_ACCESS_CONTROL_ALLOW_HEADERS"); + + get_cors_response_headers(rule, 
req_hdrs, headers, exp_headers, max_age); + + return true; +} + +int RGWGetObj::read_user_manifest_part(rgw_bucket& bucket, RGWObjEnt& ent, RGWAccessControlPolicy *bucket_policy, off_t start_ofs, off_t end_ofs) +{ + ldout(s->cct, 0) << "user manifest obj=" << ent.name << dendl; + + void *handle = NULL; + off_t cur_ofs = start_ofs; + off_t cur_end = end_ofs; + utime_t start_time = s->time; + + rgw_obj part(bucket, ent.name); + + map attrs; + + uint64_t obj_size; + void *obj_ctx = store->create_context(s); + RGWAccessControlPolicy obj_policy(s->cct); + + ldout(s->cct, 20) << "reading obj=" << part << " ofs=" << cur_ofs << " end=" << cur_end << dendl; + + store->set_atomic(obj_ctx, part); + store->set_prefetch_data(obj_ctx, part); + ret = store->prepare_get_obj(obj_ctx, part, &cur_ofs, &cur_end, &attrs, NULL, + NULL, NULL, NULL, NULL, NULL, &obj_size, NULL, &handle, &s->err); + if (ret < 0) + goto done_err; + + if (obj_size != ent.size) { + // hmm.. something wrong, object not as expected, abort! 
+ ldout(s->cct, 0) << "ERROR: expected obj_size=" << obj_size << ", actual read size=" << ent.size << dendl; + ret = -EIO; + goto done_err; + } + + ret = rgw_policy_from_attrset(s->cct, attrs, &obj_policy); + if (ret < 0) + goto done_err; + + if (!verify_object_permission(s, bucket_policy, &obj_policy, RGW_PERM_READ)) { + ret = -EPERM; + goto done_err; + } + + perfcounter->inc(l_rgw_get_b, cur_end - cur_ofs); + while (cur_ofs <= cur_end) { + bufferlist bl; + ret = store->get_obj(obj_ctx, NULL, &handle, part, bl, cur_ofs, cur_end); + if (ret < 0) + goto done_err; + + off_t len = bl.length(); + cur_ofs += len; + ofs += len; + ret = 0; + perfcounter->tinc(l_rgw_get_lat, + (ceph_clock_now(s->cct) - start_time)); + send_response_data(bl, 0, len); + + start_time = ceph_clock_now(s->cct); + } + + store->destroy_context(obj_ctx); + obj_ctx = NULL; + + store->finish_get_obj(&handle); + + return 0; + +done_err: + if (obj_ctx) + store->destroy_context(obj_ctx); + return ret; +} + +int RGWGetObj::iterate_user_manifest_parts(rgw_bucket& bucket, string& obj_prefix, RGWAccessControlPolicy *bucket_policy, + uint64_t *ptotal_len, bool read_data) +{ + uint64_t obj_ofs = 0, len_count = 0; + bool found_start = false, found_end = false; + string delim; + string marker; + bool is_truncated; + string no_ns; + map common_prefixes; + vector objs; + + utime_t start_time = ceph_clock_now(s->cct); + + do { +#define MAX_LIST_OBJS 100 + int r = store->list_objects(bucket, MAX_LIST_OBJS, obj_prefix, delim, marker, + objs, common_prefixes, + true, no_ns, true, &is_truncated, NULL); + if (r < 0) + return r; + + vector::iterator viter; + + for (viter = objs.begin(); viter != objs.end() && !found_end; ++viter) { + RGWObjEnt& ent = *viter; + uint64_t cur_total_len = obj_ofs; + uint64_t start_ofs = 0, end_ofs = ent.size; + + if (!found_start && cur_total_len + ent.size > (uint64_t)ofs) { + start_ofs = ofs - obj_ofs; + found_start = true; + } + + obj_ofs += ent.size; + + if (!found_end && obj_ofs > 
(uint64_t)end) { + end_ofs = end - cur_total_len + 1; + found_end = true; + } + + perfcounter->tinc(l_rgw_get_lat, + (ceph_clock_now(s->cct) - start_time)); + + if (found_start) { + len_count += end_ofs - start_ofs; + + if (read_data) { + r = read_user_manifest_part(bucket, ent, bucket_policy, start_ofs, end_ofs); + if (r < 0) + return r; + } + } + marker = ent.name; + + start_time = ceph_clock_now(s->cct); + } + } while (is_truncated && !found_end); + + if (ptotal_len) + *ptotal_len = len_count; + + return 0; +} + +int RGWGetObj::handle_user_manifest(const char *prefix) +{ + ldout(s->cct, 2) << "RGWGetObj::handle_user_manifest() prefix=" << prefix << dendl; + + string prefix_str = prefix; + int pos = prefix_str.find('/'); + if (pos < 0) + return -EINVAL; + + string bucket_name = prefix_str.substr(0, pos); + string obj_prefix = prefix_str.substr(pos + 1); + + rgw_bucket bucket; + + RGWAccessControlPolicy _bucket_policy(s->cct); + RGWAccessControlPolicy *bucket_policy; + + if (bucket_name.compare(s->bucket.name) != 0) { + RGWBucketInfo bucket_info; + map bucket_attrs; + int r = store->get_bucket_info(NULL, bucket_name, bucket_info, NULL, &bucket_attrs); + if (r < 0) { + ldout(s->cct, 0) << "could not get bucket info for bucket=" << bucket_name << dendl; + return r; + } + bucket = bucket_info.bucket; + string no_obj; + bucket_policy = &_bucket_policy; + r = read_policy(store, s, bucket_info, bucket_attrs, bucket_policy, bucket, no_obj); + if (r < 0) { + ldout(s->cct, 0) << "failed to read bucket policy" << dendl; + return r; + } + } else { + bucket = s->bucket; + bucket_policy = s->bucket_acl; + } + + /* dry run to find out total length */ + int r = iterate_user_manifest_parts(bucket, obj_prefix, bucket_policy, &total_len, false); + if (r < 0) + return r; + + s->obj_size = total_len; + + r = iterate_user_manifest_parts(bucket, obj_prefix, bucket_policy, NULL, true); + if (r < 0) + return r; + + return 0; +} + +class RGWGetObj_CB : public RGWGetDataCB +{ + RGWGetObj 
*op; +public: + RGWGetObj_CB(RGWGetObj *_op) : op(_op) {} + virtual ~RGWGetObj_CB() {} + + int handle_data(bufferlist& bl, off_t bl_ofs, off_t bl_len) { + return op->get_data_cb(bl, bl_ofs, bl_len); + } +}; + +int RGWGetObj::get_data_cb(bufferlist& bl, off_t bl_ofs, off_t bl_len) +{ + /* garbage collection related handling */ + utime_t start_time = ceph_clock_now(s->cct); + if (start_time > gc_invalidate_time) { + int r = store->defer_gc(s->obj_ctx, obj); + if (r < 0) { + dout(0) << "WARNING: could not defer gc entry for obj" << dendl; + } + gc_invalidate_time = start_time; + gc_invalidate_time += (s->cct->_conf->rgw_gc_obj_min_wait / 2); + } + return send_response_data(bl, bl_ofs, bl_len); +} + +void RGWGetObj::execute() +{ + void *handle = NULL; + utime_t start_time = s->time; + bufferlist bl; + gc_invalidate_time = ceph_clock_now(s->cct); + gc_invalidate_time += (s->cct->_conf->rgw_gc_obj_min_wait / 2); + + RGWGetObj_CB cb(this); + + map::iterator attr_iter; + + perfcounter->inc(l_rgw_get); + off_t new_ofs, new_end; + + ret = get_params(); + if (ret < 0) + goto done_err; + + ret = init_common(); + if (ret < 0) + goto done_err; + + new_ofs = ofs; + new_end = end; + + ret = store->prepare_get_obj(s->obj_ctx, obj, &new_ofs, &new_end, &attrs, mod_ptr, + unmod_ptr, &lastmod, if_match, if_nomatch, &total_len, &s->obj_size, NULL, &handle, &s->err); + if (ret < 0) + goto done_err; + + attr_iter = attrs.find(RGW_ATTR_USER_MANIFEST); + if (attr_iter != attrs.end()) { + ret = handle_user_manifest(attr_iter->second.c_str()); + if (ret < 0) { + ldout(s->cct, 0) << "ERROR: failed to handle user manifest ret=" << ret << dendl; + } + return; + } + + ofs = new_ofs; + end = new_end; + + start = ofs; + + if (!get_data || ofs > end) + goto done_err; + + perfcounter->inc(l_rgw_get_b, end - ofs); + + ret = store->get_obj_iterate(s->obj_ctx, &handle, obj, ofs, end, &cb); + + perfcounter->tinc(l_rgw_get_lat, + (ceph_clock_now(s->cct) - start_time)); + if (ret < 0) { + goto done_err; + 
} + + store->finish_get_obj(&handle); + +done_err: + send_response_data(bl, 0, 0); + store->finish_get_obj(&handle); +} + +int RGWGetObj::init_common() +{ + if (range_str) { + int r = parse_range(range_str, ofs, end, &partial_content); + if (r < 0) + return r; + } + if (if_mod) { + if (parse_time(if_mod, &mod_time) < 0) + return -EINVAL; + mod_ptr = &mod_time; + } + + if (if_unmod) { + if (parse_time(if_unmod, &unmod_time) < 0) + return -EINVAL; + unmod_ptr = &unmod_time; + } + + return 0; +} + +int RGWListBuckets::verify_permission() +{ + return 0; +} + +void RGWListBuckets::execute() +{ + bool done; + bool started = false; + uint64_t total_count = 0; + + uint64_t max_buckets = s->cct->_conf->rgw_list_buckets_max_chunk; + + ret = get_params(); + if (ret < 0) + goto send_end; + + do { + RGWUserBuckets buckets; + uint64_t read_count; + if (limit > 0) + read_count = min(limit - total_count, (uint64_t)max_buckets); + else + read_count = max_buckets; + + ret = rgw_read_user_buckets(store, s->user.user_id, buckets, + marker, read_count, should_get_stats()); + + if (!started) { + send_response_begin(buckets.count() > 0); + started = true; + } + + if (ret < 0) { + /* hmm.. something wrong here.. 
the user was authenticated, so it + should exist */ + ldout(s->cct, 10) << "WARNING: failed on rgw_get_user_buckets uid=" << s->user.user_id << dendl; + break; + } + map& m = buckets.get_buckets(); + + total_count += m.size(); + + done = (m.size() < read_count || (limit > 0 && total_count == limit)); + + if (!m.empty()) { + send_response_data(buckets); + + map::reverse_iterator riter = m.rbegin(); + marker = riter->first; + } + } while (!done); + +send_end: + if (!started) { + send_response_begin(false); + } + send_response_end(); +} + +int RGWStatAccount::verify_permission() +{ + return 0; +} + +void RGWStatAccount::execute() +{ + string marker; + bool done; + uint64_t max_buckets = s->cct->_conf->rgw_list_buckets_max_chunk; + + do { + RGWUserBuckets buckets; + + ret = rgw_read_user_buckets(store, s->user.user_id, buckets, marker, max_buckets, true); + if (ret < 0) { + /* hmm.. something wrong here.. the user was authenticated, so it + should exist */ + ldout(s->cct, 10) << "WARNING: failed on rgw_get_user_buckets uid=" << s->user.user_id << dendl; + break; + } else { + map& m = buckets.get_buckets(); + map::iterator iter; + for (iter = m.begin(); iter != m.end(); ++iter) { + RGWBucketEnt& bucket = iter->second; + buckets_size += bucket.size; + buckets_size_rounded += bucket.size_rounded; + buckets_objcount += bucket.count; + + marker = iter->first; + } + buckets_count += m.size(); + + done = (m.size() < max_buckets); + } + } while (!done); +} + +int RGWStatBucket::verify_permission() +{ + if (!verify_bucket_permission(s, RGW_PERM_READ)) + return -EACCES; + + return 0; +} + +void RGWStatBucket::execute() +{ + RGWUserBuckets buckets; + bucket.bucket = s->bucket; + buckets.add(bucket); + map& m = buckets.get_buckets(); + ret = store->update_containers_stats(m); + if (!ret) + ret = -EEXIST; + if (ret > 0) { + ret = 0; + map::iterator iter = m.find(bucket.bucket.name); + if (iter != m.end()) { + bucket = iter->second; + } else { + ret = -EINVAL; + } + } +} + +int 
RGWListBucket::verify_permission() +{ + if (!verify_bucket_permission(s, RGW_PERM_READ)) + return -EACCES; + + return 0; +} + +int RGWListBucket::parse_max_keys() +{ + if (!max_keys.empty()) { + char *endptr; + max = strtol(max_keys.c_str(), &endptr, 10); + if (endptr) { + while (*endptr && isspace(*endptr)) // ignore white space + endptr++; + if (*endptr) { + return -EINVAL; + } + } + } else { + max = default_max; + } + + return 0; +} + +void RGWListBucket::execute() +{ + string no_ns; + + ret = get_params(); + if (ret < 0) + return; + + ret = store->list_objects(s->bucket, max, prefix, delimiter, marker, objs, common_prefixes, + !!(s->prot_flags & RGW_REST_SWIFT), no_ns, true, &is_truncated, NULL); +} + +int RGWGetBucketLogging::verify_permission() +{ + if (s->user.user_id.compare(s->bucket_owner.get_id()) != 0) + return -EACCES; + + return 0; +} + +int RGWCreateBucket::verify_permission() +{ + if (!rgw_user_is_authenticated(s->user)) + return -EACCES; + + if (s->user.max_buckets) { + RGWUserBuckets buckets; + string marker; + int ret = rgw_read_user_buckets(store, s->user.user_id, buckets, marker, s->user.max_buckets, false); + if (ret < 0) + return ret; + + map& m = buckets.get_buckets(); + if (m.size() >= s->user.max_buckets) { + return -ERR_TOO_MANY_BUCKETS; + } + } + + return 0; +} + +static int forward_request_to_master(struct req_state *s, obj_version *objv, RGWRados *store, bufferlist& in_data, JSONParser *jp) +{ + if (!store->rest_master_conn) { + ldout(s->cct, 0) << "rest connection is invalid" << dendl; + return -EINVAL; + } + ldout(s->cct, 0) << "sending create_bucket request to master region" << dendl; + bufferlist response; +#define MAX_REST_RESPONSE (128 * 1024) // we expect a very small response + int ret = store->rest_master_conn->forward(s->user.user_id, s->info, objv, MAX_REST_RESPONSE, &in_data, &response); + if (ret < 0) + return ret; + + ldout(s->cct, 20) << "response: " << response.c_str() << dendl; + ret = jp->parse(response.c_str(), 
response.length()); + if (ret < 0) { + ldout(s->cct, 0) << "failed parsing response from master region" << dendl; + return ret; + } + + return 0; +} + +void RGWCreateBucket::execute() +{ + RGWAccessControlPolicy old_policy(s->cct); + map attrs; + bufferlist aclbl; + bool existed; + int r; + rgw_obj obj(store->zone.domain_root, s->bucket_name_str); + obj_version objv, *pobjv = NULL; + + ret = get_params(); + if (ret < 0) + return; + + if (!store->region.is_master && + store->region.api_name != location_constraint) { + ldout(s->cct, 0) << "location constraint (" << location_constraint << ") doesn't match region" << " (" << store->region.api_name << ")" << dendl; + ret = -EINVAL; + return; + } + + /* we need to make sure we read bucket info, it's not read before for this specific request */ + ret = store->get_bucket_info(s->obj_ctx, s->bucket_name_str, s->bucket_info, NULL, &s->bucket_attrs); + if (ret < 0 && ret != -ENOENT) + return; + s->bucket_exists = (ret != -ENOENT); + + s->bucket_owner.set_id(s->user.user_id); + s->bucket_owner.set_name(s->user.display_name); + if (s->bucket_exists) { + r = get_policy_from_attr(s->cct, store, s->obj_ctx, s->bucket_info, s->bucket_attrs, + &old_policy, obj); + if (r >= 0) { + if (old_policy.get_owner().get_id().compare(s->user.user_id) != 0) { + ret = -EEXIST; + return; + } + } + } + + RGWBucketInfo master_info; + rgw_bucket *pmaster_bucket; + time_t creation_time; + + if (!store->region.is_master) { + JSONParser jp; + ret = forward_request_to_master(s, NULL, store, in_data, &jp); + if (ret < 0) + return; + + JSONDecoder::decode_json("entry_point_object_ver", ep_objv, &jp); + JSONDecoder::decode_json("object_ver", objv, &jp); + JSONDecoder::decode_json("bucket_info", master_info, &jp); + ldout(s->cct, 20) << "parsed: objv.tag=" << objv.tag << " objv.ver=" << objv.ver << dendl; + ldout(s->cct, 20) << "got creation time: << " << master_info.creation_time << dendl; + pmaster_bucket= &master_info.bucket; + creation_time = 
master_info.creation_time; + pobjv = &objv; + } else { + pmaster_bucket = NULL; + creation_time = 0; + } + + string region_name; + + if (s->system_request) { + region_name = s->info.args.get(RGW_SYS_PARAM_PREFIX "region"); + if (region_name.empty()) { + region_name = store->region.name; + } + } else { + region_name = store->region.name; + } + + policy.encode(aclbl); + + attrs[RGW_ATTR_ACL] = aclbl; + + s->bucket.name = s->bucket_name_str; + ret = store->create_bucket(s->user, s->bucket, region_name, placement_rule, attrs, info, pobjv, + &ep_objv, creation_time, pmaster_bucket, true); + /* continue if EEXIST and create_bucket will fail below. this way we can recover + * from a partial create by retrying it. */ + ldout(s->cct, 20) << "rgw_create_bucket returned ret=" << ret << " bucket=" << s->bucket << dendl; + + if (ret && ret != -EEXIST) + return; + + existed = (ret == -EEXIST); + + if (existed) { + /* bucket already existed, might have raced with another bucket creation, or + * might be partial bucket creation that never completed. Read existing bucket + * info, verify that the reported bucket owner is the current user. + * If all is ok then update the user's list of buckets + */ + if (info.owner.compare(s->user.user_id) != 0) { + ret = -ERR_BUCKET_EXISTS; + return; + } + s->bucket = info.bucket; + } + + ret = rgw_link_bucket(store, s->user.user_id, s->bucket, info.creation_time, false); + if (ret && !existed && ret != -EEXIST) /* if it exists (or previously existed), don't remove it! 
*/ + rgw_unlink_bucket(store, s->user.user_id, s->bucket.name); + + if (ret == -EEXIST) + ret = -ERR_BUCKET_EXISTS; +} + +int RGWDeleteBucket::verify_permission() +{ + if (!verify_bucket_permission(s, RGW_PERM_WRITE)) + return -EACCES; + + return 0; +} + +void RGWDeleteBucket::execute() +{ + ret = -EINVAL; + + if (s->bucket_name_str.empty()) + return; + + RGWObjVersionTracker ot; + ot.read_version = s->bucket_info.ep_objv; + + if (s->system_request) { + string tag = s->info.args.get(RGW_SYS_PARAM_PREFIX "tag"); + string ver_str = s->info.args.get(RGW_SYS_PARAM_PREFIX "ver"); + if (!tag.empty()) { + ot.read_version.tag = tag; + uint64_t ver; + string err; + ver = strict_strtol(ver_str.c_str(), 10, &err); + if (!err.empty()) { + ldout(s->cct, 0) << "failed to parse ver param" << dendl; + ret = -EINVAL; + return; + } + ot.read_version.ver = ver; + } + } + + ret = store->delete_bucket(s->bucket, ot); + + if (ret == 0) { + ret = rgw_unlink_bucket(store, s->user.user_id, s->bucket.name, false); + if (ret < 0) { + ldout(s->cct, 0) << "WARNING: failed to unlink bucket: ret=" << ret << dendl; + } + } + + if (ret < 0) { + return; + } + + if (!store->region.is_master) { + bufferlist in_data; + JSONParser jp; + ret = forward_request_to_master(s, &ot.read_version, store, in_data, &jp); + if (ret < 0) { + if (ret == -ENOENT) { /* adjust error, + we want to return with NoSuchBucket and not NoSuchKey */ + ret = -ERR_NO_SUCH_BUCKET; + } + return; + } + } + +} + +int RGWPutObj::verify_permission() +{ + if (!verify_bucket_permission(s, RGW_PERM_WRITE)) + return -EACCES; + + return 0; +} + +class RGWPutObjProcessor_Multipart : public RGWPutObjProcessor_Atomic +{ + string part_num; + RGWMPObj mp; + req_state *s; + +protected: + bool immutable_head() { return true; } + int prepare(RGWRados *store, void *obj_ctx); + int do_complete(string& etag, time_t *mtime, time_t set_mtime, map& attrs); + +public: + RGWPutObjProcessor_Multipart(uint64_t _p, req_state *_s) : 
RGWPutObjProcessor_Atomic(_s->bucket, _s->object_str, _p, _s->req_id), s(_s) {} +}; + +int RGWPutObjProcessor_Multipart::prepare(RGWRados *store, void *obj_ctx) +{ + RGWPutObjProcessor::prepare(store, obj_ctx); + + string oid = obj_str; + string upload_id; + upload_id = s->info.args.get("uploadId"); + mp.init(oid, upload_id); + + part_num = s->info.args.get("partNumber"); + if (part_num.empty()) { + return -EINVAL; + } + + oid = mp.get_part(part_num); + + head_obj.init_ns(bucket, oid, mp_ns); + oid_prefix = oid; + oid_prefix.append("_"); + cur_obj = head_obj; + add_obj(head_obj); + return 0; +} + +int RGWPutObjProcessor_Multipart::do_complete(string& etag, time_t *mtime, time_t set_mtime, map& attrs) +{ + complete_writing_data(); + + RGWRados::PutObjMetaExtraParams params; + params.set_mtime = set_mtime; + params.mtime = mtime; + + int r = store->put_obj_meta(obj_ctx, head_obj, s->obj_size, attrs, RGW_OBJ_CATEGORY_MAIN, 0, params); + if (r < 0) + return r; + + bufferlist bl; + RGWUploadPartInfo info; + string p = "part."; + p.append(part_num); + info.num = atoi(part_num.c_str()); + info.etag = etag; + info.size = s->obj_size; + info.modified = ceph_clock_now(store->ctx()); + info.manifest = manifest; + ::encode(info, bl); + + string multipart_meta_obj = mp.get_meta(); + + rgw_obj meta_obj; + meta_obj.init_ns(bucket, multipart_meta_obj, mp_ns); + + r = store->omap_set(meta_obj, p, bl); + + return r; +} + + +RGWPutObjProcessor *RGWPutObj::select_processor() +{ + RGWPutObjProcessor *processor; + + bool multipart = s->info.args.exists("uploadId"); + + uint64_t part_size = s->cct->_conf->rgw_obj_stripe_size; + + if (!multipart) { + processor = new RGWPutObjProcessor_Atomic(s->bucket, s->object_str, part_size, s->req_id); + } else { + processor = new RGWPutObjProcessor_Multipart(part_size, s); + } + + return processor; +} + +void RGWPutObj::dispose_processor(RGWPutObjProcessor *processor) +{ + delete processor; +} + +void RGWPutObj::execute() +{ + RGWPutObjProcessor 
*processor = NULL; + char supplied_md5_bin[CEPH_CRYPTO_MD5_DIGESTSIZE + 1]; + char supplied_md5[CEPH_CRYPTO_MD5_DIGESTSIZE * 2 + 1]; + char calc_md5[CEPH_CRYPTO_MD5_DIGESTSIZE * 2 + 1]; + unsigned char m[CEPH_CRYPTO_MD5_DIGESTSIZE]; + MD5 hash; + bufferlist bl, aclbl; + map attrs; + int len; + map::iterator iter; + + + perfcounter->inc(l_rgw_put); + ret = -EINVAL; + if (!s->object) { + goto done; + } + + ret = get_params(); + if (ret < 0) + goto done; + + if (supplied_md5_b64) { + ldout(s->cct, 15) << "supplied_md5_b64=" << supplied_md5_b64 << dendl; + ret = ceph_unarmor(supplied_md5_bin, &supplied_md5_bin[CEPH_CRYPTO_MD5_DIGESTSIZE + 1], + supplied_md5_b64, supplied_md5_b64 + strlen(supplied_md5_b64)); + ldout(s->cct, 15) << "ceph_armor ret=" << ret << dendl; + if (ret != CEPH_CRYPTO_MD5_DIGESTSIZE) { + ret = -ERR_INVALID_DIGEST; + goto done; + } + + buf_to_hex((const unsigned char *)supplied_md5_bin, CEPH_CRYPTO_MD5_DIGESTSIZE, supplied_md5); + ldout(s->cct, 15) << "supplied_md5=" << supplied_md5 << dendl; + } + + if (!chunked_upload) { /* with chunked upload we don't know how big is the upload. 
+ we also check sizes at the end anyway */ + ret = store->check_quota(s->bucket, bucket_quota, s->content_length); + if (ret < 0) { + goto done; + } + } + + if (supplied_etag) { + strncpy(supplied_md5, supplied_etag, sizeof(supplied_md5) - 1); + supplied_md5[sizeof(supplied_md5) - 1] = '\0'; + } + + processor = select_processor(); + + ret = processor->prepare(store, s->obj_ctx); + if (ret < 0) + goto done; + + do { + bufferlist data; + len = get_data(data); + if (len < 0) { + ret = len; + goto done; + } + if (!len) + break; + + void *handle; + const unsigned char *data_ptr = (const unsigned char *)data.c_str(); + + ret = processor->handle_data(data, ofs, &handle); + if (ret < 0) + goto done; + + hash.Update(data_ptr, len); + + ret = processor->throttle_data(handle); + if (ret < 0) + goto done; + + ofs += len; + } while (len > 0); + + if (!chunked_upload && (uint64_t)ofs != s->content_length) { + ret = -ERR_REQUEST_TIMEOUT; + goto done; + } + s->obj_size = ofs; + perfcounter->inc(l_rgw_put_b, s->obj_size); + + ret = store->check_quota(s->bucket, bucket_quota, s->obj_size); + if (ret < 0) { + goto done; + } + + hash.Final(m); + + buf_to_hex(m, CEPH_CRYPTO_MD5_DIGESTSIZE, calc_md5); + + if (supplied_md5_b64 && strcmp(calc_md5, supplied_md5)) { + ret = -ERR_BAD_DIGEST; + goto done; + } + policy.encode(aclbl); + + etag = calc_md5; + + if (supplied_etag && etag.compare(supplied_etag) != 0) { + ret = -ERR_UNPROCESSABLE_ENTITY; + goto done; + } + bl.append(etag.c_str(), etag.size() + 1); + attrs[RGW_ATTR_ETAG] = bl; + attrs[RGW_ATTR_ACL] = aclbl; + if (obj_manifest) { + bufferlist manifest_bl; + manifest_bl.append(obj_manifest, strlen(obj_manifest) + 1); + attrs[RGW_ATTR_USER_MANIFEST] = manifest_bl; + } + + for (iter = s->generic_attrs.begin(); iter != s->generic_attrs.end(); ++iter) { + bufferlist& attrbl = attrs[iter->first]; + const string& val = iter->second; + attrbl.append(val.c_str(), val.size() + 1); + } + + rgw_get_request_metadata(s->cct, s->info, attrs); + + 
ret = processor->complete(etag, &mtime, 0, attrs); +done: + dispose_processor(processor); + perfcounter->tinc(l_rgw_put_lat, + (ceph_clock_now(s->cct) - s->time)); +} + +int RGWPostObj::verify_permission() +{ + return 0; +} + +RGWPutObjProcessor *RGWPostObj::select_processor() +{ + RGWPutObjProcessor *processor; + + uint64_t part_size = s->cct->_conf->rgw_obj_stripe_size; + + processor = new RGWPutObjProcessor_Atomic(s->bucket, s->object_str, part_size, s->req_id); + + return processor; +} + +void RGWPostObj::dispose_processor(RGWPutObjProcessor *processor) +{ + delete processor; +} + +void RGWPostObj::execute() +{ + RGWPutObjProcessor *processor = NULL; + char calc_md5[CEPH_CRYPTO_MD5_DIGESTSIZE * 2 + 1]; + unsigned char m[CEPH_CRYPTO_MD5_DIGESTSIZE]; + MD5 hash; + bufferlist bl, aclbl; + int len = 0; + + // read in the data from the POST form + ret = get_params(); + if (ret < 0) + goto done; + + ret = verify_params(); + if (ret < 0) + goto done; + + if (!verify_bucket_permission(s, RGW_PERM_WRITE)) { + ret = -EACCES; + goto done; + } + + processor = select_processor(); + + ret = processor->prepare(store, s->obj_ctx); + if (ret < 0) + goto done; + + while (data_pending) { + bufferlist data; + len = get_data(data); + + if (len < 0) { + ret = len; + goto done; + } + + if (!len) + break; + + void *handle; + const unsigned char *data_ptr = (const unsigned char *)data.c_str(); + + ret = processor->handle_data(data, ofs, &handle); + if (ret < 0) + goto done; + + hash.Update(data_ptr, len); + + ret = processor->throttle_data(handle); + if (ret < 0) + goto done; + + ofs += len; + + if (ofs > max_len) { + ret = -ERR_TOO_LARGE; + goto done; + } + } + + if (len < min_len) { + ret = -ERR_TOO_SMALL; + goto done; + } + + s->obj_size = ofs; + + hash.Final(m); + buf_to_hex(m, CEPH_CRYPTO_MD5_DIGESTSIZE, calc_md5); + + policy.encode(aclbl); + etag = calc_md5; + + bl.append(etag.c_str(), etag.size() + 1); + attrs[RGW_ATTR_ETAG] = bl; + attrs[RGW_ATTR_ACL] = aclbl; + + if 
(content_type.size()) { + bufferlist ct_bl; + ct_bl.append(content_type.c_str(), content_type.size() + 1); + attrs[RGW_ATTR_CONTENT_TYPE] = ct_bl; + } + + ret = processor->complete(etag, NULL, 0, attrs); + +done: + dispose_processor(processor); +} + + +int RGWPutMetadata::verify_permission() +{ + if (!verify_object_permission(s, RGW_PERM_WRITE)) + return -EACCES; + + return 0; +} + +void RGWPutMetadata::execute() +{ + const char *meta_prefix = RGW_ATTR_META_PREFIX; + int meta_prefix_len = sizeof(RGW_ATTR_META_PREFIX) - 1; + map attrs, orig_attrs, rmattrs; + map::iterator iter; + bufferlist bl, cors_bl; + + rgw_obj obj(s->bucket, s->object_str); + + store->set_atomic(s->obj_ctx, obj); + + ret = get_params(); + if (ret < 0) + return; + + rgw_get_request_metadata(s->cct, s->info, attrs); + + /* no need to track object versioning, need it for bucket's data only */ + RGWObjVersionTracker *ptracker = (s->object ? NULL : &s->bucket_info.objv_tracker); + + /* check if obj exists, read orig attrs */ + ret = get_obj_attrs(store, s, obj, orig_attrs, NULL, ptracker); + if (ret < 0) + return; + + /* only remove meta attrs */ + for (iter = orig_attrs.begin(); iter != orig_attrs.end(); ++iter) { + const string& name = iter->first; + if (name.compare(0, meta_prefix_len, meta_prefix) == 0) { + rmattrs[name] = iter->second; + } else if (attrs.find(name) == attrs.end()) { + attrs[name] = iter->second; + } + } + + map::iterator giter; + for (giter = s->generic_attrs.begin(); giter != s->generic_attrs.end(); ++giter) { + bufferlist& attrbl = attrs[giter->first]; + const string& val = giter->second; + attrbl.append(val.c_str(), val.size() + 1); + } + + if (has_policy) { + policy.encode(bl); + attrs[RGW_ATTR_ACL] = bl; + } + if (has_cors) { + cors_config.encode(cors_bl); + attrs[RGW_ATTR_CORS] = cors_bl; + } + if (s->object) { + ret = store->set_attrs(s->obj_ctx, obj, attrs, &rmattrs, ptracker); + } else { + ret = rgw_bucket_set_attrs(store, obj.bucket, attrs, &rmattrs, ptracker); + } +} 
+ +int RGWDeleteObj::verify_permission() +{ + if (!verify_bucket_permission(s, RGW_PERM_WRITE)) + return -EACCES; + + return 0; +} + +void RGWDeleteObj::execute() +{ + ret = -EINVAL; + rgw_obj obj(s->bucket, s->object_str); + if (s->object) { + store->set_atomic(s->obj_ctx, obj); + ret = store->delete_obj(s->obj_ctx, obj); + } +} + +bool RGWCopyObj::parse_copy_location(const char *src, string& bucket_name, string& object) +{ + string url_src(src); + string dec_src; + + url_decode(url_src, dec_src); + src = dec_src.c_str(); + + if (*src == '/') ++src; + + string str(src); + + int pos = str.find("/"); + if (pos <= 0) + return false; + + bucket_name = str.substr(0, pos); + object = str.substr(pos + 1); + + if (object.size() == 0) + return false; + + return true; +} + +int RGWCopyObj::verify_permission() +{ + string empty_str; + RGWAccessControlPolicy src_policy(s->cct); + ret = get_params(); + if (ret < 0) + return ret; + + map src_attrs; + + ret = store->get_bucket_info(s->obj_ctx, src_bucket_name, src_bucket_info, NULL, &src_attrs); + if (ret < 0) + return ret; + + src_bucket = src_bucket_info.bucket; + + /* get buckets info (source and dest) */ + if (s->local_source && source_zone.empty()) { + rgw_obj src_obj(src_bucket, src_object); + store->set_atomic(s->obj_ctx, src_obj); + store->set_prefetch_data(s->obj_ctx, src_obj); + + /* check source object permissions */ + ret = read_policy(store, s, src_bucket_info, src_attrs, &src_policy, src_bucket, src_object); + if (ret < 0) + return ret; + + if (!s->system_request && /* system request overrides permission checks */ + !src_policy.verify_permission(s->user.user_id, s->perm_mask, RGW_PERM_READ)) + return -EACCES; + } + + RGWAccessControlPolicy dest_bucket_policy(s->cct); + map dest_attrs; + + if (src_bucket_name.compare(dest_bucket_name) == 0) { /* will only happen if s->local_source */ + dest_bucket_info = src_bucket_info; + } else { + ret = store->get_bucket_info(s->obj_ctx, dest_bucket_name, dest_bucket_info, NULL, 
&dest_attrs); + if (ret < 0) + return ret; + } + + dest_bucket = dest_bucket_info.bucket; + + rgw_obj dest_obj(dest_bucket, dest_object); + store->set_atomic(s->obj_ctx, dest_obj); + + /* check dest bucket permissions */ + ret = read_policy(store, s, dest_bucket_info, dest_attrs, &dest_bucket_policy, dest_bucket, empty_str); + if (ret < 0) + return ret; + + if (!s->system_request && /* system request overrides permission checks */ + !dest_bucket_policy.verify_permission(s->user.user_id, s->perm_mask, RGW_PERM_WRITE)) + return -EACCES; + + ret = init_dest_policy(); + if (ret < 0) + return ret; + + return 0; +} + + +int RGWCopyObj::init_common() +{ + if (if_mod) { + if (parse_time(if_mod, &mod_time) < 0) { + ret = -EINVAL; + return ret; + } + mod_ptr = &mod_time; + } + + if (if_unmod) { + if (parse_time(if_unmod, &unmod_time) < 0) { + ret = -EINVAL; + return ret; + } + unmod_ptr = &unmod_time; + } + + bufferlist aclbl; + dest_policy.encode(aclbl); + + attrs[RGW_ATTR_ACL] = aclbl; + rgw_get_request_metadata(s->cct, s->info, attrs); + + map::iterator iter; + for (iter = s->generic_attrs.begin(); iter != s->generic_attrs.end(); ++iter) { + bufferlist& attrbl = attrs[iter->first]; + const string& val = iter->second; + attrbl.append(val.c_str(), val.size() + 1); + } + return 0; +} + +static void copy_obj_progress_cb(off_t ofs, void *param) +{ + RGWCopyObj *op = static_cast(param); + op->progress_cb(ofs); +} + +void RGWCopyObj::progress_cb(off_t ofs) +{ + if (!s->cct->_conf->rgw_copy_obj_progress) + return; + + if (ofs - last_ofs < s->cct->_conf->rgw_copy_obj_progress_every_bytes) + return; + + send_partial_response(ofs); + + last_ofs = ofs; +} + +void RGWCopyObj::execute() +{ + rgw_obj src_obj, dst_obj; + + if (init_common() < 0) + return; + + src_obj.init(src_bucket, src_object); + dst_obj.init(dest_bucket, dest_object); + store->set_atomic(s->obj_ctx, src_obj); + + store->set_atomic(s->obj_ctx, dst_obj); + + ret = store->copy_obj(s->obj_ctx, + s->user.user_id, + 
client_id, + op_id, + &s->info, + source_zone, + dst_obj, + src_obj, + dest_bucket_info, + src_bucket_info, + &mtime, + mod_ptr, + unmod_ptr, + if_match, + if_nomatch, + replace_attrs, + attrs, RGW_OBJ_CATEGORY_MAIN, + &s->req_id, /* use req_id as tag */ + &s->err, + copy_obj_progress_cb, (void *)this + ); +} + +int RGWGetACLs::verify_permission() +{ + bool perm; + if (s->object) { + perm = verify_object_permission(s, RGW_PERM_READ_ACP); + } else { + perm = verify_bucket_permission(s, RGW_PERM_READ_ACP); + } + if (!perm) + return -EACCES; + + return 0; +} + +void RGWGetACLs::execute() +{ + stringstream ss; + RGWAccessControlPolicy *acl = (s->object ? s->object_acl : s->bucket_acl); + RGWAccessControlPolicy_S3 *s3policy = static_cast(acl); + s3policy->to_xml(ss); + acls = ss.str(); +} + + + +int RGWPutACLs::verify_permission() +{ + bool perm; + if (s->object) { + perm = verify_object_permission(s, RGW_PERM_WRITE_ACP); + } else { + perm = verify_bucket_permission(s, RGW_PERM_WRITE_ACP); + } + if (!perm) + return -EACCES; + + return 0; +} + +void RGWPutACLs::execute() +{ + bufferlist bl; + + RGWAccessControlPolicy_S3 *policy = NULL; + RGWACLXMLParser_S3 parser(s->cct); + RGWAccessControlPolicy_S3 new_policy(s->cct); + stringstream ss; + char *new_data = NULL; + ACLOwner owner; + rgw_obj obj; + + ret = 0; + + if (!parser.init()) { + ret = -EINVAL; + return; + } + + owner.set_id(s->user.user_id); + owner.set_name(s->user.display_name); + + ret = get_params(); + if (ret < 0) + return; + + ldout(s->cct, 15) << "read len=" << len << " data=" << (data ? 
data : "") << dendl; + + if (!s->canned_acl.empty() && len) { + ret = -EINVAL; + return; + } + + if (!s->canned_acl.empty() || s->has_acl_header) { + ret = get_policy_from_state(store, s, ss); + if (ret < 0) + return; + + new_data = strdup(ss.str().c_str()); + free(data); + data = new_data; + len = ss.str().size(); + } + + if (!parser.parse(data, len, 1)) { + ret = -EACCES; + return; + } + policy = static_cast(parser.find_first("AccessControlPolicy")); + if (!policy) { + ret = -EINVAL; + return; + } + + if (s->cct->_conf->subsys.should_gather(ceph_subsys_rgw, 15)) { + ldout(s->cct, 15) << "Old AccessControlPolicy"; + policy->to_xml(*_dout); + *_dout << dendl; + } + + ret = policy->rebuild(store, &owner, new_policy); + if (ret < 0) + return; + + if (s->cct->_conf->subsys.should_gather(ceph_subsys_rgw, 15)) { + ldout(s->cct, 15) << "New AccessControlPolicy:"; + new_policy.to_xml(*_dout); + *_dout << dendl; + } + + RGWObjVersionTracker *ptracker = (s->object ? NULL : &s->bucket_info.objv_tracker); + + new_policy.encode(bl); + obj.init(s->bucket, s->object_str); + map attrs; + attrs[RGW_ATTR_ACL] = bl; + store->set_atomic(s->obj_ctx, obj); + if (s->object) { + ret = store->set_attrs(s->obj_ctx, obj, attrs, NULL, ptracker); + } else { + ret = rgw_bucket_set_attrs(store, obj.bucket, attrs, NULL, ptracker); + } +} + +int RGWGetCORS::verify_permission() +{ + if (s->user.user_id.compare(s->bucket_owner.get_id()) != 0) + return -EACCES; + + return 0; +} + +void RGWGetCORS::execute() +{ + ret = read_bucket_cors(); + if (ret < 0) + return ; + + if (!cors_exist) { + dout(2) << "No CORS configuration set yet for this bucket" << dendl; + ret = -ENOENT; + return; + } +} + +int RGWPutCORS::verify_permission() +{ + if (s->user.user_id.compare(s->bucket_owner.get_id()) != 0) + return -EACCES; + + return 0; +} + +void RGWPutCORS::execute() +{ + rgw_obj obj; + + ret = get_params(); + if (ret < 0) + return; + + RGWObjVersionTracker *ptracker = (s->object ? 
NULL : &s->bucket_info.objv_tracker); + + store->get_bucket_instance_obj(s->bucket, obj); + store->set_atomic(s->obj_ctx, obj); + ret = store->set_attr(s->obj_ctx, obj, RGW_ATTR_CORS, cors_bl, ptracker); +} + +int RGWDeleteCORS::verify_permission() +{ + if (s->user.user_id.compare(s->bucket_owner.get_id()) != 0) + return -EACCES; + + return 0; +} + +void RGWDeleteCORS::execute() +{ + ret = read_bucket_cors(); + if (ret < 0) + return; + + bufferlist bl; + rgw_obj obj; + if (!cors_exist) { + dout(2) << "No CORS configuration set yet for this bucket" << dendl; + ret = -ENOENT; + return; + } + store->get_bucket_instance_obj(s->bucket, obj); + store->set_atomic(s->obj_ctx, obj); + map orig_attrs, attrs, rmattrs; + map::iterator iter; + + RGWObjVersionTracker *ptracker = (s->object ? NULL : &s->bucket_info.objv_tracker); + + /* check if obj exists, read orig attrs */ + ret = get_obj_attrs(store, s, obj, orig_attrs, NULL, ptracker); + if (ret < 0) + return; + + /* only remove meta attrs */ + for (iter = orig_attrs.begin(); iter != orig_attrs.end(); ++iter) { + const string& name = iter->first; + dout(10) << "DeleteCORS : attr: " << name << dendl; + if (name.compare(0, (sizeof(RGW_ATTR_CORS) - 1), RGW_ATTR_CORS) == 0) { + rmattrs[name] = iter->second; + } else if (attrs.find(name) == attrs.end()) { + attrs[name] = iter->second; + } + } + ret = store->set_attrs(s->obj_ctx, obj, attrs, &rmattrs, ptracker); +} + +void RGWOptionsCORS::get_response_params(string& hdrs, string& exp_hdrs, unsigned *max_age) { + get_cors_response_headers(rule, req_hdrs, hdrs, exp_hdrs, max_age); +} + +int RGWOptionsCORS::validate_cors_request(RGWCORSConfiguration *cc) { + rule = cc->host_name_rule(origin); + if (!rule) { + dout(10) << "There is no cors rule present for " << origin << dendl; + return -ENOENT; + } + + if (!validate_cors_rule_method(rule, req_meth)) { + return -ENOENT; + } + return 0; +} + +void RGWOptionsCORS::execute() +{ + ret = read_bucket_cors(); + if (ret < 0) + return; + + 
origin = s->info.env->get("HTTP_ORIGIN"); + if (!origin) { + dout(0) << + "Preflight request without mandatory Origin header" + << dendl; + ret = -EINVAL; + return; + } + req_meth = s->info.env->get("HTTP_ACCESS_CONTROL_REQUEST_METHOD"); + if (!req_meth) { + dout(0) << + "Preflight request without mandatory Access-control-request-method header" + << dendl; + ret = -EINVAL; + return; + } + if (!cors_exist) { + dout(2) << "No CORS configuration set yet for this bucket" << dendl; + ret = -ENOENT; + return; + } + req_hdrs = s->info.env->get("HTTP_ACCESS_CONTROL_ALLOW_HEADERS"); + ret = validate_cors_request(&bucket_cors); + if (!rule) { + origin = req_meth = NULL; + return; + } + return; +} + +int RGWInitMultipart::verify_permission() +{ + if (!verify_bucket_permission(s, RGW_PERM_WRITE)) + return -EACCES; + + return 0; +} + +void RGWInitMultipart::execute() +{ + bufferlist aclbl; + map attrs; + rgw_obj obj; + map::iterator iter; + + if (get_params() < 0) + return; + ret = -EINVAL; + if (!s->object) + return; + + policy.encode(aclbl); + + attrs[RGW_ATTR_ACL] = aclbl; + + for (iter = s->generic_attrs.begin(); iter != s->generic_attrs.end(); ++iter) { + bufferlist& attrbl = attrs[iter->first]; + const string& val = iter->second; + attrbl.append(val.c_str(), val.size() + 1); + } + + rgw_get_request_metadata(s->cct, s->info, attrs); + + do { + char buf[33]; + gen_rand_alphanumeric(s->cct, buf, sizeof(buf) - 1); + upload_id = buf; + + string tmp_obj_name; + RGWMPObj mp(s->object_str, upload_id); + tmp_obj_name = mp.get_meta(); + + obj.init_ns(s->bucket, tmp_obj_name, mp_ns); + // the meta object will be indexed with 0 size, we c + ret = store->put_obj_meta(s->obj_ctx, obj, 0, NULL, attrs, RGW_OBJ_CATEGORY_MULTIMETA, PUT_OBJ_CREATE_EXCL); + } while (ret == -EEXIST); +} + +static int get_multiparts_info(RGWRados *store, struct req_state *s, string& meta_oid, map& parts, + RGWAccessControlPolicy& policy, map& attrs) +{ + map parts_map; + map::iterator iter; + bufferlist 
header; + + rgw_obj obj; + obj.init_ns(s->bucket, meta_oid, mp_ns); + + int ret = get_obj_attrs(store, s, obj, attrs, NULL, NULL); + if (ret < 0) + return ret; + + ret = store->omap_get_all(obj, header, parts_map); + if (ret < 0) + return ret; + + for (iter = attrs.begin(); iter != attrs.end(); ++iter) { + string name = iter->first; + if (name.compare(RGW_ATTR_ACL) == 0) { + bufferlist& bl = iter->second; + bufferlist::iterator bli = bl.begin(); + try { + ::decode(policy, bli); + } catch (buffer::error& err) { + ldout(s->cct, 0) << "ERROR: could not decode policy, caught buffer::error" << dendl; + return -EIO; + } + break; + } + } + + + for (iter = parts_map.begin(); iter != parts_map.end(); ++iter) { + bufferlist& bl = iter->second; + bufferlist::iterator bli = bl.begin(); + RGWUploadPartInfo info; + try { + ::decode(info, bli); + } catch (buffer::error& err) { + ldout(s->cct, 0) << "ERROR: could not part info, caught buffer::error" << dendl; + return -EIO; + } + parts[info.num] = info; + } + return 0; +} + +int RGWCompleteMultipart::verify_permission() +{ + if (!verify_bucket_permission(s, RGW_PERM_WRITE)) + return -EACCES; + + return 0; +} + +void RGWCompleteMultipart::execute() +{ + RGWMultiCompleteUpload *parts; + map::iterator iter; + RGWMultiXMLParser parser; + string meta_oid; + map obj_parts; + map::iterator obj_iter; + RGWAccessControlPolicy policy(s->cct); + map attrs; + off_t ofs = 0; + MD5 hash; + char final_etag[CEPH_CRYPTO_MD5_DIGESTSIZE]; + char final_etag_str[CEPH_CRYPTO_MD5_DIGESTSIZE * 2 + 16]; + bufferlist etag_bl; + rgw_obj meta_obj; + rgw_obj target_obj; + RGWMPObj mp; + RGWObjManifest manifest; + + ret = get_params(); + if (ret < 0) + return; + + if (!data) { + ret = -EINVAL; + return; + } + + if (!parser.init()) { + ret = -EINVAL; + return; + } + + if (!parser.parse(data, len, 1)) { + ret = -EINVAL; + return; + } + + parts = static_cast(parser.find_first("CompleteMultipartUpload")); + if (!parts) { + ret = -EINVAL; + return; + } + + // 
ensure that each part if of the minimum size + for (obj_iter = obj_parts.begin(); obj_iter != obj_parts.end(); ++obj_iter) { + if ((obj_iter->second).size < min_part_size) { + ret = -ERR_TOO_SMALL; + return; + } + } + + mp.init(s->object_str, upload_id); + meta_oid = mp.get_meta(); + + ret = get_multiparts_info(store, s, meta_oid, obj_parts, policy, attrs); + if (ret == -ENOENT) + ret = -ERR_NO_SUCH_UPLOAD; + if (parts->parts.size() != obj_parts.size()) + ret = -ERR_INVALID_PART; + if (ret < 0) + return; + + for (iter = parts->parts.begin(), obj_iter = obj_parts.begin(); + iter != parts->parts.end() && obj_iter != obj_parts.end(); + ++iter, ++obj_iter) { + char etag[CEPH_CRYPTO_MD5_DIGESTSIZE]; + if (iter->first != (int)obj_iter->first) { + ldout(s->cct, 0) << "NOTICE: parts num mismatch: next requested: " << iter->first << " next uploaded: " << obj_iter->first << dendl; + ret = -ERR_INVALID_PART; + return; + } + string part_etag = rgw_string_unquote(iter->second); + if (part_etag.compare(obj_iter->second.etag) != 0) { + ldout(s->cct, 0) << "NOTICE: etag mismatch: part: " << iter->first << " etag: " << iter->second << dendl; + ret = -ERR_INVALID_PART; + return; + } + + hex_to_buf(obj_iter->second.etag.c_str(), etag, CEPH_CRYPTO_MD5_DIGESTSIZE); + hash.Update((const byte *)etag, sizeof(etag)); + } + hash.Final((byte *)final_etag); + + buf_to_hex((unsigned char *)final_etag, sizeof(final_etag), final_etag_str); + snprintf(&final_etag_str[CEPH_CRYPTO_MD5_DIGESTSIZE * 2], sizeof(final_etag_str) - CEPH_CRYPTO_MD5_DIGESTSIZE * 2, + "-%lld", (long long)parts->parts.size()); + ldout(s->cct, 10) << "calculated etag: " << final_etag_str << dendl; + + etag_bl.append(final_etag_str, strlen(final_etag_str) + 1); + + attrs[RGW_ATTR_ETAG] = etag_bl; + + target_obj.init(s->bucket, s->object_str); + + list remove_objs; /* objects to be removed from index listing */ + + for (obj_iter = obj_parts.begin(); obj_iter != obj_parts.end(); ++obj_iter) { + RGWUploadPartInfo& obj_part = 
obj_iter->second; + string oid = mp.get_part(obj_iter->second.num); + rgw_obj src_obj; + src_obj.init_ns(s->bucket, oid, mp_ns); + + if (obj_part.manifest.empty()) { + RGWObjManifestPart& part = manifest.objs[ofs]; + + part.loc = src_obj; + part.loc_ofs = 0; + part.size = obj_iter->second.size; + } else { + manifest.append(obj_part.manifest); + } + + remove_objs.push_back(src_obj.object); + + ofs += obj_part.size; + } + + manifest.obj_size = ofs; + + store->set_atomic(s->obj_ctx, target_obj); + + RGWRados::PutObjMetaExtraParams extra_params; + + extra_params.manifest = &manifest; + extra_params.remove_objs = &remove_objs; + + extra_params.ptag = &s->req_id; /* use req_id as operation tag */ + + ret = store->put_obj_meta(s->obj_ctx, target_obj, ofs, attrs, + RGW_OBJ_CATEGORY_MAIN, PUT_OBJ_CREATE, + extra_params); + if (ret < 0) + return; + + // remove the upload obj + meta_obj.init_ns(s->bucket, meta_oid, mp_ns); + store->delete_obj(s->obj_ctx, meta_obj); +} + +int RGWAbortMultipart::verify_permission() +{ + if (!verify_bucket_permission(s, RGW_PERM_WRITE)) + return -EACCES; + + return 0; +} + +void RGWAbortMultipart::execute() +{ + ret = -EINVAL; + string upload_id; + string meta_oid; + upload_id = s->info.args.get("uploadId"); + map obj_parts; + map::iterator obj_iter; + RGWAccessControlPolicy policy(s->cct); + map attrs; + rgw_obj meta_obj; + RGWMPObj mp; + + if (upload_id.empty() || s->object_str.empty()) + return; + + mp.init(s->object_str, upload_id); + meta_oid = mp.get_meta(); + + ret = get_multiparts_info(store, s, meta_oid, obj_parts, policy, attrs); + if (ret < 0) + return; + + for (obj_iter = obj_parts.begin(); obj_iter != obj_parts.end(); ++obj_iter) { + RGWUploadPartInfo& obj_part = obj_iter->second; + + if (obj_part.manifest.empty()) { + string oid = mp.get_part(obj_iter->second.num); + rgw_obj obj; + obj.init_ns(s->bucket, oid, mp_ns); + ret = store->delete_obj(s->obj_ctx, obj); + if (ret < 0 && ret != -ENOENT) + return; + } else { + RGWObjManifest& 
manifest = obj_part.manifest; + map::iterator oiter; + for (oiter = manifest.objs.begin(); oiter != manifest.objs.end(); ++oiter) { + RGWObjManifestPart& part = oiter->second; + ret = store->delete_obj(s->obj_ctx, part.loc); + if (ret < 0 && ret != -ENOENT) + return; + } + } + } + // and also remove the metadata obj + meta_obj.init_ns(s->bucket, meta_oid, mp_ns); + ret = store->delete_obj(s->obj_ctx, meta_obj); + if (ret == -ENOENT) { + ret = -ERR_NO_SUCH_BUCKET; + } +} + +int RGWListMultipart::verify_permission() +{ + if (!verify_object_permission(s, RGW_PERM_READ)) + return -EACCES; + + return 0; +} + +void RGWListMultipart::execute() +{ + map xattrs; + string meta_oid; + RGWMPObj mp; + + ret = get_params(); + if (ret < 0) + return; + + mp.init(s->object_str, upload_id); + meta_oid = mp.get_meta(); + + ret = get_multiparts_info(store, s, meta_oid, parts, policy, xattrs); +} + +int RGWListBucketMultiparts::verify_permission() +{ + if (!verify_bucket_permission(s, RGW_PERM_READ)) + return -EACCES; + + return 0; +} + +void RGWListBucketMultiparts::execute() +{ + vector objs; + string marker_meta; + + ret = get_params(); + if (ret < 0) + return; + + if (s->prot_flags & RGW_REST_SWIFT) { + string path_args; + path_args = s->info.args.get("path"); + if (!path_args.empty()) { + if (!delimiter.empty() || !prefix.empty()) { + ret = -EINVAL; + return; + } + prefix = path_args; + delimiter="/"; + } + } + marker_meta = marker.get_meta(); + ret = store->list_objects(s->bucket, max_uploads, prefix, delimiter, marker_meta, objs, common_prefixes, + !!(s->prot_flags & RGW_REST_SWIFT), mp_ns, true, &is_truncated, &mp_filter); + if (!objs.empty()) { + vector::iterator iter; + RGWMultipartUploadEntry entry; + for (iter = objs.begin(); iter != objs.end(); ++iter) { + string name = iter->name; + if (!entry.mp.from_meta(name)) + continue; + entry.obj = *iter; + uploads.push_back(entry); + } + next_marker = entry; + } +} + +int RGWDeleteMultiObj::verify_permission() +{ + if 
(!verify_bucket_permission(s, RGW_PERM_WRITE)) + return -EACCES; + + return 0; +} + +void RGWDeleteMultiObj::execute() +{ + RGWMultiDelDelete *multi_delete; + vector::iterator iter; + RGWMultiDelXMLParser parser; + pair result; + int num_processed = 0; + + ret = get_params(); + if (ret < 0) { + goto error; + } + + if (!data) { + ret = -EINVAL; + goto error; + } + + if (!parser.init()) { + ret = -EINVAL; + goto error; + } + + if (!parser.parse(data, len, 1)) { + ret = -EINVAL; + goto error; + } + + multi_delete = static_cast(parser.find_first("Delete")); + if (!multi_delete) { + ret = -EINVAL; + goto error; + } + + if (multi_delete->is_quiet()) + quiet = true; + + begin_response(); + if (multi_delete->objects.empty()) { + goto done; + } + + for (iter = multi_delete->objects.begin(); + iter != multi_delete->objects.end() && num_processed < max_to_delete; + ++iter, num_processed++) { + + rgw_obj obj(bucket,(*iter)); + store->set_atomic(s->obj_ctx, obj); + ret = store->delete_obj(s->obj_ctx, obj); + result = make_pair(*iter, ret); + + send_partial_response(result); + } + + /* set the return code to zero, errors at this point will be + dumped to the response */ + ret = 0; + +done: + // will likely segfault if begin_response() has not been called + end_response(); + free(data); + return; + +error: + send_status(); + free(data); + return; + +} + +RGWHandler::~RGWHandler() +{ +} + +int RGWHandler::init(RGWRados *_store, struct req_state *_s, RGWClientIO *cio) +{ + store = _store; + s = _s; + + if (s->cct->_conf->subsys.should_gather(ceph_subsys_rgw, 20)) { + const char *p; + const char **envp = cio->envp(); + for (int i=0; (p = envp[i]); ++i) { + ldout(s->cct, 20) << p << dendl; + } + } + + return 0; +} + +int RGWHandler::do_read_permissions(RGWOp *op, bool only_bucket) +{ + int ret = rgw_build_policies(store, s, only_bucket, op->prefetch_data()); + + if (ret < 0) { + ldout(s->cct, 10) << "read_permissions on " << s->bucket << ":" <object_str << " only_bucket=" << 
only_bucket << " ret=" << ret << dendl; + if (ret == -ENODATA) + ret = -EACCES; + } + + return ret; +} + + +RGWOp *RGWHandler::get_op(RGWRados *store) +{ + RGWOp *op; + switch (s->op) { + case OP_GET: + op = op_get(); + break; + case OP_PUT: + op = op_put(); + break; + case OP_DELETE: + op = op_delete(); + break; + case OP_HEAD: + op = op_head(); + break; + case OP_POST: + op = op_post(); + break; + case OP_COPY: + op = op_copy(); + break; + case OP_OPTIONS: + op = op_options(); + break; + default: + return NULL; + } + + if (op) { + op->init(store, s, this); + } + return op; +} + +void RGWHandler::put_op(RGWOp *op) +{ + delete op; +} + diff --git a/src/rgw/rgw_policy_s3.cc b/src/rgw/rgw_policy_s3.cc index fac05f18884b6..9bf20f2c57649 100644 --- a/src/rgw/rgw_policy_s3.cc +++ b/src/rgw/rgw_policy_s3.cc @@ -284,11 +284,13 @@ int RGWPolicy::from_json(bufferlist& bl, string& err_msg) int r = add_condition(v[0], v[1], v[2], err_msg); if (r < 0) return r; - } else { + } else if (!citer.end()) { JSONObj *c = *citer; dout(0) << "adding simple_check: " << c->get_name() << " : " << c->get_data() << dendl; add_simple_check(c->get_name(), c->get_data()); + } else { + return -EINVAL; } } return 0; diff --git a/src/rgw/rgw_policy_s3.cc.orig b/src/rgw/rgw_policy_s3.cc.orig new file mode 100644 index 0000000000000..fac05f18884b6 --- /dev/null +++ b/src/rgw/rgw_policy_s3.cc.orig @@ -0,0 +1,295 @@ + +#include + +#include "common/ceph_json.h" +#include "rgw_policy_s3.h" +#include "rgw_common.h" + + +#define dout_subsys ceph_subsys_rgw + +class RGWPolicyCondition { +protected: + string v1; + string v2; + + virtual bool check(const string& first, const string& second, string& err_msg) = 0; + +public: + virtual ~RGWPolicyCondition() {} + + void set_vals(const string& _v1, const string& _v2) { + v1 = _v1; + v2 = _v2; + } + + bool check(RGWPolicyEnv *env, map& checked_vars, string& err_msg) { + string first, second; + env->get_value(v1, first, checked_vars); + env->get_value(v2, second, 
checked_vars); + + dout(1) << "policy condition check " << v1 << " [" << first << "] " << v2 << " [" << second << "]" << dendl; + bool ret = check(first, second, err_msg); + if (!ret) { + err_msg.append(": "); + err_msg.append(v1); + err_msg.append(", "); + err_msg.append(v2); + } + return ret; + } + +}; + + +class RGWPolicyCondition_StrEqual : public RGWPolicyCondition { +protected: + bool check(const string& first, const string& second, string& msg) { + bool ret = first.compare(second) == 0; + if (!ret) { + msg = "Policy condition failed: eq"; + } + return ret; + } +}; + +class RGWPolicyCondition_StrStartsWith : public RGWPolicyCondition { +protected: + bool check(const string& first, const string& second, string& msg) { + bool ret = first.compare(0, second.size(), second) == 0; + if (!ret) { + msg = "Policy condition failed: starts-with"; + } + return ret; + } +}; + +void RGWPolicyEnv::add_var(const string& name, const string& value) +{ + vars[name] = value; +} + +bool RGWPolicyEnv::get_var(const string& name, string& val) +{ + map::iterator iter = vars.find(name); + if (iter == vars.end()) + return false; + + val = iter->second; + + return true; +} + +bool RGWPolicyEnv::get_value(const string& s, string& val, map& checked_vars) +{ + if (s.empty() || s[0] != '$') { + val = s; + return true; + } + + const string& var = s.substr(1); + checked_vars[var] = true; + + return get_var(var, val); +} + + +bool RGWPolicyEnv::match_policy_vars(map& policy_vars, string& err_msg) +{ + map::iterator iter; + string ignore_prefix = "x-ignore-"; + for (iter = vars.begin(); iter != vars.end(); ++iter) { + const string& var = iter->first; + if (strncasecmp(ignore_prefix.c_str(), var.c_str(), ignore_prefix.size()) == 0) + continue; + if (policy_vars.count(var) == 0) { + err_msg = "Policy missing condition: "; + err_msg.append(iter->first); + dout(1) << "env var missing in policy: " << iter->first << dendl; + return false; + } + } + return true; +} + +RGWPolicy::~RGWPolicy() +{ + 
list::iterator citer; + for (citer = conditions.begin(); citer != conditions.end(); ++citer) { + RGWPolicyCondition *cond = *citer; + delete cond; + } +} + +int RGWPolicy::set_expires(const string& e) +{ + struct tm t; + if (!parse_iso8601(e.c_str(), &t)) + return -EINVAL; + + expires = timegm(&t); + + return 0; +} + +int RGWPolicy::add_condition(const string& op, const string& first, const string& second, string& err_msg) +{ + RGWPolicyCondition *cond = NULL; + if (stringcasecmp(op, "eq") == 0) { + cond = new RGWPolicyCondition_StrEqual; + } else if (stringcasecmp(op, "starts-with") == 0) { + cond = new RGWPolicyCondition_StrStartsWith; + } else if (stringcasecmp(op, "content-length-range") == 0) { + off_t min, max; + int r = stringtoll(first, &min); + if (r < 0) { + err_msg = "Bad content-length-range param"; + dout(0) << "bad content-length-range param: " << first << dendl; + return r; + } + + r = stringtoll(second, &max); + if (r < 0) { + err_msg = "Bad content-length-range param"; + dout(0) << "bad content-length-range param: " << second << dendl; + return r; + } + + if (min > min_length) + min_length = min; + + if (max < max_length) + max_length = max; + + return 0; + } + + if (!cond) { + err_msg = "Invalid condition: "; + err_msg.append(op); + dout(0) << "invalid condition: " << op << dendl; + return -EINVAL; + } + + cond->set_vals(first, second); + + conditions.push_back(cond); + + return 0; +} + +int RGWPolicy::check(RGWPolicyEnv *env, string& err_msg) +{ + uint64_t now = ceph_clock_now(NULL).sec(); + if (expires <= now) { + dout(0) << "NOTICE: policy calculated as expired: " << expiration_str << dendl; + err_msg = "Policy expired"; + return -EACCES; // change to condition about expired policy following S3 + } + + list >::iterator viter; + for (viter = var_checks.begin(); viter != var_checks.end(); ++viter) { + pair& p = *viter; + const string& name = p.first; + const string& check_val = p.second; + string val; + if (!env->get_var(name, val)) { + dout(20) 
<< " policy check failed, variable not found: '" << name << "'" << dendl; + err_msg = "Policy check failed, variable not found: "; + err_msg.append(name); + return -EACCES; + } + + set_var_checked(name); + + dout(20) << "comparing " << name << " [" << val << "], " << check_val << dendl; + if (val.compare(check_val) != 0) { + err_msg = "Policy check failed, variable not met condition: "; + err_msg.append(name); + dout(1) << "policy check failed, val=" << val << " != " << check_val << dendl; + return -EACCES; + } + } + + list::iterator citer; + for (citer = conditions.begin(); citer != conditions.end(); ++citer) { + RGWPolicyCondition *cond = *citer; + if (!cond->check(env, checked_vars, err_msg)) { + return -EACCES; + } + } + + if (!env->match_policy_vars(checked_vars, err_msg)) { + dout(1) << "missing policy condition" << dendl; + return -EACCES; + } + return 0; +} + + +int RGWPolicy::from_json(bufferlist& bl, string& err_msg) +{ + JSONParser parser; + + if (!parser.parse(bl.c_str(), bl.length())) { + err_msg = "Malformed JSON"; + dout(0) << "malformed json" << dendl; + return -EINVAL; + } + + // as no time was included in the request, we hope that the user has included a short timeout + JSONObjIter iter = parser.find_first("expiration"); + if (iter.end()) { + err_msg = "Policy missing expiration"; + dout(0) << "expiration not found" << dendl; + return -EINVAL; // change to a "no expiration" error following S3 + } + + JSONObj *obj = *iter; + expiration_str = obj->get_data(); + int r = set_expires(expiration_str); + if (r < 0) { + err_msg = "Failed to parse policy expiration"; + return r; + } + + iter = parser.find_first("conditions"); + if (iter.end()) { + err_msg = "Policy missing conditions"; + dout(0) << "conditions not found" << dendl; + return -EINVAL; // change to a "no conditions" error following S3 + } + + obj = *iter; + + iter = obj->find_first(); + for (; !iter.end(); ++iter) { + JSONObj *child = *iter; + dout(20) << "data=" << child->get_data() << dendl; 
+ dout(20) << "is_object=" << child->is_object() << dendl; + dout(20) << "is_array=" << child->is_array() << dendl; + JSONObjIter citer = child->find_first(); + if (child->is_array()) { + vector v; + int i; + for (i = 0; !citer.end() && i < 3; ++citer, ++i) { + JSONObj *o = *citer; + v.push_back(o->get_data()); + } + if (i != 3 || !citer.end()) { /* we expect exactly 3 arguments here */ + err_msg = "Bad condition array, expecting 3 arguments"; + return -EINVAL; + } + + int r = add_condition(v[0], v[1], v[2], err_msg); + if (r < 0) + return r; + } else { + JSONObj *c = *citer; + dout(0) << "adding simple_check: " << c->get_name() << " : " << c->get_data() << dendl; + + add_simple_check(c->get_name(), c->get_data()); + } + } + return 0; +} diff --git a/src/rgw/rgw_rest_s3.cc b/src/rgw/rgw_rest_s3.cc index 12bd377d8a064..1909188b5f6ab 100644 --- a/src/rgw/rgw_rest_s3.cc +++ b/src/rgw/rgw_rest_s3.cc @@ -124,6 +124,11 @@ int RGWGetObj_ObjStore_S3::send_response_data(bufferlist& bl, off_t bl_ofs, off_ bool exists; string val = s->info.args.get(p->param, &exists); if (exists) { + /* reject unauthenticated response header manipulation, see + * https://docs.aws.amazon.com/AmazonS3/latest/API/API_GetObject.html */ + if (s->auth.identity->is_anonymous()) { + return -ERR_INVALID_REQUEST; + } if (strcmp(p->param, "response-content-type") != 0) { response_attrs[p->http_attr] = val; } else { diff --git a/src/rgw/rgw_rest_s3.cc.orig b/src/rgw/rgw_rest_s3.cc.orig new file mode 100644 index 0000000000000..a130db7c8d496 --- /dev/null +++ b/src/rgw/rgw_rest_s3.cc.orig @@ -0,0 +1,2226 @@ +#include +#include + +#include "common/ceph_crypto.h" +#include "common/Formatter.h" +#include "common/utf8.h" +#include "common/ceph_json.h" + +#include "rgw_rest.h" +#include "rgw_rest_s3.h" +#include "rgw_auth_s3.h" +#include "rgw_acl.h" +#include "rgw_policy_s3.h" +#include "rgw_user.h" +#include "rgw_cors.h" +#include "rgw_cors_s3.h" + +#include "rgw_client_io.h" + +#define dout_subsys 
ceph_subsys_rgw + +using namespace ceph::crypto; + +void list_all_buckets_start(struct req_state *s) +{ + s->formatter->open_array_section_in_ns("ListAllMyBucketsResult", + "http://s3.amazonaws.com/doc/2006-03-01/"); +} + +void list_all_buckets_end(struct req_state *s) +{ + s->formatter->close_section(); +} + +void dump_bucket(struct req_state *s, RGWBucketEnt& obj) +{ + s->formatter->open_object_section("Bucket"); + s->formatter->dump_string("Name", obj.bucket.name); + dump_time(s, "CreationDate", &obj.creation_time); + s->formatter->close_section(); +} + +void rgw_get_errno_s3(rgw_http_errors *e , int err_no) +{ + const struct rgw_http_errors *r; + r = search_err(err_no, RGW_HTTP_ERRORS, ARRAY_LEN(RGW_HTTP_ERRORS)); + + if (r) { + e->http_ret = r->http_ret; + e->s3_code = r->s3_code; + } else { + e->http_ret = 500; + e->s3_code = "UnknownError"; + } +} + +struct response_attr_param { + const char *param; + const char *http_attr; +}; + +static struct response_attr_param resp_attr_params[] = { + {"response-content-type", "Content-Type"}, + {"response-content-language", "Content-Language"}, + {"response-expires", "Expires"}, + {"response-cache-control", "Cache-Control"}, + {"response-content-disposition", "Content-Disposition"}, + {"response-content-encoding", "Content-Encoding"}, + {NULL, NULL}, +}; + +int RGWGetObj_ObjStore_S3::send_response_data(bufferlist& bl, off_t bl_ofs, off_t bl_len) +{ + const char *content_type = NULL; + string content_type_str; + map response_attrs; + map::iterator riter; + bufferlist metadata_bl; + + if (ret) + goto done; + + if (sent_header) + goto send_data; + + if (range_str) + dump_range(s, start, end, s->obj_size); + + if (s->system_request && + s->info.args.exists(RGW_SYS_PARAM_PREFIX "prepend-metadata")) { + + /* JSON encode object metadata */ + JSONFormatter jf; + jf.open_object_section("obj_metadata"); + encode_json("attrs", attrs, &jf); + encode_json("mtime", lastmod, &jf); + jf.close_section(); + stringstream ss; + 
jf.flush(ss); + metadata_bl.append(ss.str()); + s->cio->print("Rgwx-Embedded-Metadata-Len: %lld\r\n", (long long)metadata_bl.length()); + total_len += metadata_bl.length(); + } + + if (s->system_request && lastmod) { + /* we end up dumping mtime in two different methods, a bit redundant */ + dump_epoch_header(s, "Rgwx-Mtime", lastmod); + } + + dump_content_length(s, total_len); + dump_last_modified(s, lastmod); + + if (!ret) { + map::iterator iter = attrs.find(RGW_ATTR_ETAG); + if (iter != attrs.end()) { + bufferlist& bl = iter->second; + if (bl.length()) { + char *etag = bl.c_str(); + dump_etag(s, etag); + } + } + + for (struct response_attr_param *p = resp_attr_params; p->param; p++) { + bool exists; + string val = s->info.args.get(p->param, &exists); + if (exists) { + /* reject unauthenticated response header manipulation, see + * https://docs.aws.amazon.com/AmazonS3/latest/API/API_GetObject.html */ + if (s->auth.identity->is_anonymous()) { + return -EPERM; + } + if (strcmp(p->param, "response-content-type") != 0) { + response_attrs[p->http_attr] = val; + } else { + content_type_str = val; + content_type = content_type_str.c_str(); + } + } + } + + for (iter = attrs.begin(); iter != attrs.end(); ++iter) { + const char *name = iter->first.c_str(); + map::iterator aiter = rgw_to_http_attrs.find(name); + if (aiter != rgw_to_http_attrs.end()) { + if (response_attrs.count(aiter->second) > 0) // was already overridden by a response param + continue; + + if (aiter->first.compare(RGW_ATTR_CONTENT_TYPE) == 0) { // special handling for content_type + if (!content_type) + content_type = iter->second.c_str(); + continue; + } + response_attrs[aiter->second] = iter->second.c_str(); + } else { + if (strncmp(name, RGW_ATTR_META_PREFIX, sizeof(RGW_ATTR_META_PREFIX)-1) == 0) { + name += sizeof(RGW_ATTR_PREFIX) - 1; + s->cio->print("%s: %s\r\n", name, iter->second.c_str()); + } + } + } + } + +done: + set_req_state_err(s, (partial_content && !ret) ? 
STATUS_PARTIAL_CONTENT : ret); + + dump_errno(s); + + for (riter = response_attrs.begin(); riter != response_attrs.end(); ++riter) { + s->cio->print("%s: %s\n", riter->first.c_str(), riter->second.c_str()); + } + + if (!content_type) + content_type = "binary/octet-stream"; + + end_header(s, this, content_type); + + if (metadata_bl.length()) { + s->cio->write(metadata_bl.c_str(), metadata_bl.length()); + } + sent_header = true; + +send_data: + if (get_data && !ret) { + int r = s->cio->write(bl.c_str() + bl_ofs, bl_len); + if (r < 0) + return r; + } + + return 0; +} + +void RGWListBuckets_ObjStore_S3::send_response_begin(bool has_buckets) +{ + if (ret) + set_req_state_err(s, ret); + dump_errno(s); + dump_start(s); + end_header(s, NULL, "application/xml"); + + if (!ret) { + list_all_buckets_start(s); + dump_owner(s, s->user.user_id, s->user.display_name); + s->formatter->open_array_section("Buckets"); + sent_data = true; + } +} + +void RGWListBuckets_ObjStore_S3::send_response_data(RGWUserBuckets& buckets) +{ + if (!sent_data) + return; + + map& m = buckets.get_buckets(); + map::iterator iter; + + for (iter = m.begin(); iter != m.end(); ++iter) { + RGWBucketEnt obj = iter->second; + dump_bucket(s, obj); + } + rgw_flush_formatter(s, s->formatter); +} + +void RGWListBuckets_ObjStore_S3::send_response_end() +{ + if (sent_data) { + s->formatter->close_section(); + list_all_buckets_end(s); + rgw_flush_formatter_and_reset(s, s->formatter); + } +} + +int RGWListBucket_ObjStore_S3::get_params() +{ + prefix = s->info.args.get("prefix"); + marker = s->info.args.get("marker"); + max_keys = s->info.args.get("max-keys"); + ret = parse_max_keys(); + if (ret < 0) { + return ret; + } + delimiter = s->info.args.get("delimiter"); + return 0; +} + +void RGWListBucket_ObjStore_S3::send_response() +{ + if (ret < 0) + set_req_state_err(s, ret); + dump_errno(s); + + end_header(s, this, "application/xml"); + dump_start(s); + if (ret < 0) + return; + + 
s->formatter->open_object_section_in_ns("ListBucketResult", + "http://s3.amazonaws.com/doc/2006-03-01/"); + s->formatter->dump_string("Name", s->bucket_name_str); + if (!prefix.empty()) + s->formatter->dump_string("Prefix", prefix); + s->formatter->dump_string("Marker", marker); + s->formatter->dump_int("MaxKeys", max); + if (!delimiter.empty()) + s->formatter->dump_string("Delimiter", delimiter); + + s->formatter->dump_string("IsTruncated", (max && is_truncated ? "true" : "false")); + + if (ret >= 0) { + vector::iterator iter; + for (iter = objs.begin(); iter != objs.end(); ++iter) { + s->formatter->open_array_section("Contents"); + s->formatter->dump_string("Key", iter->name); + time_t mtime = iter->mtime.sec(); + dump_time(s, "LastModified", &mtime); + s->formatter->dump_format("ETag", "\"%s\"", iter->etag.c_str()); + s->formatter->dump_int("Size", iter->size); + s->formatter->dump_string("StorageClass", "STANDARD"); + dump_owner(s, iter->owner, iter->owner_display_name); + s->formatter->close_section(); + } + if (common_prefixes.size() > 0) { + map::iterator pref_iter; + for (pref_iter = common_prefixes.begin(); pref_iter != common_prefixes.end(); ++pref_iter) { + s->formatter->open_array_section("CommonPrefixes"); + s->formatter->dump_string("Prefix", pref_iter->first); + s->formatter->close_section(); + } + } + } + s->formatter->close_section(); + rgw_flush_formatter_and_reset(s, s->formatter); +} + +void RGWGetBucketLogging_ObjStore_S3::send_response() +{ + dump_errno(s); + end_header(s, this, "application/xml"); + dump_start(s); + + s->formatter->open_object_section_in_ns("BucketLoggingStatus", + "http://doc.s3.amazonaws.com/doc/2006-03-01/"); + s->formatter->close_section(); + rgw_flush_formatter_and_reset(s, s->formatter); +} + +static void dump_bucket_metadata(struct req_state *s, RGWBucketEnt& bucket) +{ + char buf[32]; + snprintf(buf, sizeof(buf), "%lld", (long long)bucket.count); + s->cio->print("X-RGW-Object-Count: %s\n", buf); + snprintf(buf, 
sizeof(buf), "%lld", (long long)bucket.size); + s->cio->print("X-RGW-Bytes-Used: %s\n", buf); +} + +void RGWStatBucket_ObjStore_S3::send_response() +{ + if (ret >= 0) { + dump_bucket_metadata(s, bucket); + } + + set_req_state_err(s, ret); + dump_errno(s); + + end_header(s, this); + dump_start(s); +} + +static int create_s3_policy(struct req_state *s, RGWRados *store, RGWAccessControlPolicy_S3& s3policy) +{ + if (s->has_acl_header) { + if (!s->canned_acl.empty()) + return -ERR_INVALID_REQUEST; + + return s3policy.create_from_headers(store, s->info.env, s->owner); + } + + return s3policy.create_canned(s->owner, s->bucket_owner, s->canned_acl); +} + +class RGWLocationConstraint : public XMLObj +{ +public: + RGWLocationConstraint() {} + ~RGWLocationConstraint() {} + bool xml_end(const char *el) { + if (!el) + return false; + + location_constraint = get_data(); + + return true; + } + + string location_constraint; +}; + +class RGWCreateBucketConfig : public XMLObj +{ +public: + RGWCreateBucketConfig() {} + ~RGWCreateBucketConfig() {} +}; + +class RGWCreateBucketParser : public RGWXMLParser +{ + XMLObj *alloc_obj(const char *el) { + return new XMLObj; + } + +public: + RGWCreateBucketParser() {} + ~RGWCreateBucketParser() {} + + bool get_location_constraint(string& region) { + XMLObj *config = find_first("CreateBucketConfiguration"); + if (!config) + return false; + + XMLObj *constraint = config->find_first("LocationConstraint"); + if (!constraint) + return false; + + region = constraint->get_data(); + + return true; + } +}; + +int RGWCreateBucket_ObjStore_S3::get_params() +{ + RGWAccessControlPolicy_S3 s3policy(s->cct); + + int r = create_s3_policy(s, store, s3policy); + if (r < 0) + return r; + + policy = s3policy; + + int len = 0; + char *data; +#define CREATE_BUCKET_MAX_REQ_LEN (512 * 1024) /* this is way more than enough */ + ret = rgw_rest_read_all_input(s, &data, &len, CREATE_BUCKET_MAX_REQ_LEN); + if ((ret < 0) && (ret != -ERR_LENGTH_REQUIRED)) + return ret; + + 
bufferptr in_ptr(data, len); + in_data.append(in_ptr); + + if (len) { + RGWCreateBucketParser parser; + + if (!parser.init()) { + ldout(s->cct, 0) << "ERROR: failed to initialize parser" << dendl; + return -EIO; + } + + bool success = parser.parse(data, len, 1); + ldout(s->cct, 20) << "create bucket input data=" << data << dendl; + + if (!success) { + ldout(s->cct, 0) << "failed to parse input: " << data << dendl; + free(data); + return -EINVAL; + } + free(data); + + if (!parser.get_location_constraint(location_constraint)) { + ldout(s->cct, 0) << "provided input did not specify location constraint correctly" << dendl; + return -EINVAL; + } + + ldout(s->cct, 10) << "create bucket location constraint: " << location_constraint << dendl; + } + + int pos = location_constraint.find(':'); + if (pos >= 0) { + placement_rule = location_constraint.substr(pos + 1); + location_constraint = location_constraint.substr(0, pos); + } + + return 0; +} + +void RGWCreateBucket_ObjStore_S3::send_response() +{ + if (ret == -ERR_BUCKET_EXISTS) + ret = 0; + if (ret) + set_req_state_err(s, ret); + dump_errno(s); + end_header(s); + + if (ret < 0) + return; + + if (s->system_request) { + JSONFormatter f; /* use json formatter for system requests output */ + + f.open_object_section("info"); + encode_json("entry_point_object_ver", ep_objv, &f); + encode_json("object_ver", info.objv_tracker.read_version, &f); + encode_json("bucket_info", info, &f); + f.close_section(); + rgw_flush_formatter_and_reset(s, &f); + } +} + +void RGWDeleteBucket_ObjStore_S3::send_response() +{ + int r = ret; + if (!r) + r = STATUS_NO_CONTENT; + + set_req_state_err(s, r); + dump_errno(s); + end_header(s, this); + + if (s->system_request) { + JSONFormatter f; /* use json formatter for system requests output */ + + f.open_object_section("info"); + encode_json("object_ver", objv_tracker.read_version, &f); + f.close_section(); + rgw_flush_formatter_and_reset(s, &f); + } +} + +int RGWPutObj_ObjStore_S3::get_params() +{ + 
RGWAccessControlPolicy_S3 s3policy(s->cct); + if (!s->length) + return -ERR_LENGTH_REQUIRED; + + int r = create_s3_policy(s, store, s3policy); + if (r < 0) + return r; + + policy = s3policy; + + return RGWPutObj_ObjStore::get_params(); +} + +static int get_success_retcode(int code) +{ + switch (code) { + case 201: + return STATUS_CREATED; + case 204: + return STATUS_NO_CONTENT; + } + return 0; +} + +void RGWPutObj_ObjStore_S3::send_response() +{ + if (ret) { + set_req_state_err(s, ret); + } else { + if (s->cct->_conf->rgw_s3_success_create_obj_status) { + ret = get_success_retcode(s->cct->_conf->rgw_s3_success_create_obj_status); + set_req_state_err(s, ret); + } + dump_etag(s, etag.c_str()); + dump_content_length(s, 0); + } + if (s->system_request && mtime) { + dump_epoch_header(s, "Rgwx-Mtime", mtime); + } + dump_errno(s); + end_header(s, this); +} + +/* + * parses params in the format: 'first; param1=foo; param2=bar' + */ +static void parse_params(const string& params_str, string& first, map& params) +{ + int pos = params_str.find(';'); + if (pos < 0) { + first = rgw_trim_whitespace(params_str); + return; + } + + first = rgw_trim_whitespace(params_str.substr(0, pos)); + + pos++; + + while (pos < (int)params_str.size()) { + ssize_t end = params_str.find(';', pos); + if (end < 0) + end = params_str.size(); + + string param = params_str.substr(pos, end - pos); + + int eqpos = param.find('='); + if (eqpos > 0) { + string param_name = rgw_trim_whitespace(param.substr(0, eqpos)); + string val = rgw_trim_quotes(param.substr(eqpos + 1)); + params[param_name] = val; + } else { + params[rgw_trim_whitespace(param)] = ""; + } + + pos = end + 1; + } +} + +static int parse_part_field(const string& line, string& field_name, struct post_part_field& field) +{ + int pos = line.find(':'); + if (pos < 0) + return -EINVAL; + + field_name = line.substr(0, pos); + if (pos >= (int)line.size() - 1) + return 0; + + parse_params(line.substr(pos + 1), field.val, field.params); + + return 0; 
+} +/* True when s points at a CR that is immediately followed by an LF; reads s[0] and s[1]. */ +bool is_crlf(const char *s) +{ + return (*s == '\r' && *(s + 1) == '\n'); +} + +/* + * find the index of the boundary, if exists, or optionally the next end of line + * also returns how many bytes to skip + * + * Only the first max_len bytes of bl are searched. On a boundary match + * *reached_boundary is set, *skip holds the boundary length, and the returned + * offset is backed up over a directly preceding crlf (which is then added to + * *skip). With check_crlf set, the offset just past the first crlf is returned + * instead if that crlf comes first. Returns -1 when nothing is found and + * -EINVAL when str is shorter than two characters. + */ +static int index_of(bufferlist& bl, int max_len, const string& str, bool check_crlf, + bool *reached_boundary, int *skip) +{ + *reached_boundary = false; + *skip = 0; + + if (str.size() < 2) // we assume boundary is at least 2 chars (makes it easier with crlf checks) + return -EINVAL; + + if (bl.length() < str.size()) + return -1; + + const char *buf = bl.c_str(); + const char *s = str.c_str(); + + if (max_len > (int)bl.length()) + max_len = bl.length(); + + int i; + for (i = 0; i < max_len; i++, buf++) { + if (check_crlf && + i >= 1 && + is_crlf(buf - 1)) { + return i + 1; // skip the crlf + } + if ((i < max_len - (int)str.size() + 1) && + (buf[0] == s[0] && buf[1] == s[1]) && + (strncmp(buf, s, str.size()) == 0)) { + *reached_boundary = true; + *skip = str.size(); + + /* oh, great, now we need to swallow the preceding crlf + * if exists + */ + if ((i >= 2) && + is_crlf(buf - 2)) { + i -= 2; + *skip += 2; + } + return i; + } + } + + return -1; +} +/* Fill bl with up to max bytes of request body, stopping at the multipart boundary (or, when check_crlf is set, at the end of the current line). Input that was read but not consumed stays in in_data for the next call. *reached_boundary reports whether the boundary was hit, and *done whether it was the closing boundary (the one followed by two dashes). Always returns 0. */ +int RGWPostObj_ObjStore_S3::read_with_boundary(bufferlist& bl, uint64_t max, bool check_crlf, + bool *reached_boundary, bool *done) +{ + uint64_t cl = max + 2 + boundary.size(); + + if (max > in_data.length()) { + uint64_t need_to_read = cl - in_data.length(); + + bufferptr bp(need_to_read); + + int read_len = 0; /* stays 0 if the read fails, so nothing is appended below */ + s->cio->read(bp.c_str(), need_to_read, &read_len); + + in_data.append(bp, 0, read_len); + } + + *done = false; + int skip; + int index = index_of(in_data, cl, boundary, check_crlf, reached_boundary, &skip); + if (index >= 0) + max = index; + + if (max > in_data.length()) + max = in_data.length(); + + bl.substr_of(in_data, 0, max); + + bufferlist new_read_data; + + /* + * now we need to skip boundary for next time, also skip any crlf, or + * check to see if it's the last final boundary (marked with 
"--" at the end + */ + if (*reached_boundary) { + int left = in_data.length() - max; + if (left < skip + 2) { + int need = skip + 2 - left; + bufferptr boundary_bp(need); + int actual = 0; /* stays 0 if the read fails */ + s->cio->read(boundary_bp.c_str(), need, &actual); + /* append only the bytes that were really read; appending the whole buffer would feed uninitialized bytes to the crlf / "--" check below after a short read */ + in_data.append(boundary_bp, 0, actual); + } + max += skip; // skip boundary for next time + if (in_data.length() >= max + 2) { + const char *data = in_data.c_str(); + if (is_crlf(data + max)) { + max += 2; + } else { + if (*(data + max) == '-' && + *(data + max + 1) == '-') { + *done = true; + max += 2; + } + } + } + } + + new_read_data.substr_of(in_data, max, in_data.length() - max); + in_data = new_read_data; + + return 0; +} +/* Read up to the end of the current line or the boundary, whichever comes first. */ +int RGWPostObj_ObjStore_S3::read_line(bufferlist& bl, uint64_t max, + bool *reached_boundary, bool *done) +{ + return read_with_boundary(bl, max, true, reached_boundary, done); +} +/* Read raw part data up to the boundary, without stopping at line ends. */ +int RGWPostObj_ObjStore_S3::read_data(bufferlist& bl, uint64_t max, + bool *reached_boundary, bool *done) +{ + return read_with_boundary(bl, max, false, reached_boundary, done); +} + +/* Read the header lines of the next form part into part->fields, taking the part name from the "name" parameter of Content-Disposition. Stops at the first empty line or once the boundary has been reached; *done is set when the closing boundary was seen. */ +int RGWPostObj_ObjStore_S3::read_form_part_header(struct post_form_part *part, + bool *done) +{ + bufferlist bl; + bool reached_boundary; + int r = read_line(bl, RGW_MAX_CHUNK_SIZE, &reached_boundary, done); + if (r < 0) + return r; + + if (*done) { + return 0; + } + + if (reached_boundary) { // skip the first boundary + r = read_line(bl, RGW_MAX_CHUNK_SIZE, &reached_boundary, done); + if (r < 0) + return r; + if (*done) + return 0; + } + + while (true) { + /* + * iterate through fields + */ + string line = rgw_trim_whitespace(string(bl.c_str(), bl.length())); + + if (line.empty()) + break; + + struct post_part_field field; + + string field_name; + r = parse_part_field(line, field_name, field); + if (r < 0) + return r; + + part->fields[field_name] = field; + + if (stringcasecmp(field_name, "Content-Disposition") == 0) { + part->name = field.params["name"]; + } + + if (reached_boundary) + break; + + r = read_line(bl, RGW_MAX_CHUNK_SIZE, 
&reached_boundary, done); + } + + return 0; +} +/* Look up the form part called name and store its data, whitespace-trimmed, in *val. Returns false (leaving *val untouched) when there is no such part. */ +bool RGWPostObj_ObjStore_S3::part_str(const string& name, string *val) +{ + map::iterator iter = parts.find(name); + if (iter == parts.end()) + return false; + + bufferlist& data = iter->second.data; + string str = string(data.c_str(), data.length()); + *val = rgw_trim_whitespace(str); + return true; +} +/* Look up the form part called name and copy its raw, untrimmed data into *pbl. Returns false when there is no such part. */ +bool RGWPostObj_ObjStore_S3::part_bl(const string& name, bufferlist *pbl) +{ + map::iterator iter = parts.find(name); + if (iter == parts.end()) + return false; + + *pbl = iter->second.data; + return true; +} +/* Replace the first occurrence of the literal ${filename} in key with the filename member; key is left untouched when the placeholder is absent. */ +void RGWPostObj_ObjStore_S3::rebuild_key(string& key) +{ + static string var = "${filename}"; + int pos = key.find(var); + if (pos < 0) + return; + + string new_key = key.substr(0, pos); + new_key.append(filename); + new_key.append(key.substr(pos + var.size())); + + key = new_key; +} +/* Parse the multipart/form-data POST body up to the start of the "file" part: check the Content-Type, extract the boundary, collect the form fields into parts and the policy env, derive the object key and metadata attrs, then evaluate the POST policy through get_policy(). Returns 0 or a negative error, with err_msg describing most failures. */ +int RGWPostObj_ObjStore_S3::get_params() +{ + // get the part boundary + string req_content_type_str = s->info.env->get("CONTENT_TYPE", ""); + string req_content_type; + map params; + + if (s->expect_cont) { + /* ok, here it really gets ugly. With POST, the params are embedded in the + * request body, so we need to continue before being able to actually look + * at them. This diverts from the usual request flow. 
+ */ + dump_continue(s); + s->expect_cont = false; + } +/* split the Content-Type header into its media type and parameters */ + parse_params(req_content_type_str, req_content_type, params); + + if (req_content_type.compare("multipart/form-data") != 0) { + err_msg = "Request Content-Type is not multipart/form-data"; + return -EINVAL; + } + + if (s->cct->_conf->subsys.should_gather(ceph_subsys_rgw, 20)) { + ldout(s->cct, 20) << "request content_type_str=" << req_content_type_str << dendl; + ldout(s->cct, 20) << "request content_type params:" << dendl; + map::iterator iter; + for (iter = params.begin(); iter != params.end(); ++iter) { + ldout(s->cct, 20) << " " << iter->first << " -> " << iter->second << dendl; + } + } + + ldout(s->cct, 20) << "adding bucket to policy env: " << s->bucket.name << dendl; + env.add_var("bucket", s->bucket.name); + + map::iterator iter = params.find("boundary"); + if (iter == params.end()) { + err_msg = "Missing multipart boundary specification"; + return -EINVAL; + } + + // create the boundary + boundary = "--"; + boundary.append(iter->second); +/* read form parts one at a time until the "file" part, which starts the object data, is reached */ + bool done; + do { + struct post_form_part part; + int r = read_form_part_header(&part, &done); + if (r < 0) + return r; + + if (s->cct->_conf->subsys.should_gather(ceph_subsys_rgw, 20)) { + map::iterator piter; + for (piter = part.fields.begin(); piter != part.fields.end(); ++piter) { + ldout(s->cct, 20) << "read part header: name=" << part.name << " content_type=" << part.content_type << dendl; + ldout(s->cct, 20) << "name=" << piter->first << dendl; + ldout(s->cct, 20) << "val=" << piter->second.val << dendl; + ldout(s->cct, 20) << "params:" << dendl; + map& params = piter->second.params; + for (iter = params.begin(); iter != params.end(); ++iter) { + ldout(s->cct, 20) << " " << iter->first << " -> " << iter->second << dendl; + } + } + } + + if (done) { /* unexpected here */ + err_msg = "Malformed request"; + return -EINVAL; + } + + if (stringcasecmp(part.name, "file") == 0) { /* beginning of data transfer */ + struct post_part_field& field = 
part.fields["Content-Disposition"]; + map::iterator iter = field.params.find("filename"); + if (iter != field.params.end()) { + filename = iter->second; + } + parts[part.name] = part; + data_pending = true; + break; + } +/* NOTE(review): this local shadows the boundary member that read_with_boundary() uses, and the r returned by read_data() is not checked here */ + bool boundary; + r = read_data(part.data, RGW_MAX_CHUNK_SIZE, &boundary, &done); + if (!boundary) { + err_msg = "Couldn't find boundary"; + return -EINVAL; + } + parts[part.name] = part; + string part_str(part.data.c_str(), part.data.length()); + env.add_var(part.name, part_str); + } while (!done); +/* the "key" form field names the object; a ${filename} placeholder in it is expanded by rebuild_key() */ + if (!part_str("key", &s->object_str)) { + err_msg = "Key not specified"; + return -EINVAL; + } + + rebuild_key(s->object_str); + + if (s->object_str.empty()) { + err_msg = "Empty object name"; + return -EINVAL; + } + + env.add_var("key", s->object_str); + + part_str("Content-Type", &content_type); + env.add_var("Content-Type", content_type); +/* turn every form field whose name starts (case-insensitively) with RGW_AMZ_META_PREFIX into an object attr, stored with a trailing NUL */ + map::iterator piter = parts.upper_bound(RGW_AMZ_META_PREFIX); + for (; piter != parts.end(); ++piter) { + string n = piter->first; + if (strncasecmp(n.c_str(), RGW_AMZ_META_PREFIX, sizeof(RGW_AMZ_META_PREFIX) - 1) != 0) + break; + + string attr_name = RGW_ATTR_PREFIX; + attr_name.append(n); + + /* need to null terminate it */ + bufferlist& data = piter->second.data; + string str = string(data.c_str(), data.length()); + + bufferlist attr_bl; + attr_bl.append(str.c_str(), str.size() + 1); + + attrs[attr_name] = attr_bl; + } + + int r = get_policy(); + if (r < 0) + return r; + + min_len = post_policy.min_length; + max_len = post_policy.max_length; + + return 0; +} +/* Authenticate and authorize the POST. When a "policy" part is present, its HMAC-SHA1 signature is verified against the "signature" part, the policy is base64-decoded, parsed and checked against the collected form variables, and the signing user becomes the request owner. With or without a policy, the object ACL is then built from the optional "acl" part. Returns 0 or a negative error, with err_msg set on failure. */ +int RGWPostObj_ObjStore_S3::get_policy() +{ + bufferlist encoded_policy; + + if (part_bl("policy", &encoded_policy)) { + + // check that the signature matches the encoded policy + string s3_access_key; + if (!part_str("AWSAccessKeyId", &s3_access_key)) { + ldout(s->cct, 0) << "No S3 access key found!" 
<< dendl; + err_msg = "Missing access key"; + return -EINVAL; + } + string signature_str; + if (!part_str("signature", &signature_str)) { + ldout(s->cct, 0) << "No signature found!" << dendl; + err_msg = "Missing signature"; + return -EINVAL; + } + + RGWUserInfo user_info; + + ret = rgw_get_user_info_by_access_key(store, s3_access_key, user_info); + if (ret < 0) { + ldout(s->cct, 0) << "User lookup failed!" << dendl; + err_msg = "Bad access key / signature"; + return -EACCES; + } + + map access_keys = user_info.access_keys; +/* sign with the secret that belongs to the access key the client supplied; a user may hold several keys, so the first map entry is not necessarily the right one */ + map::const_iterator iter = access_keys.find(s3_access_key); + /* the user was looked up by this access key, so it should be present, but do not dereference end() if it is not */ + if (iter == access_keys.end()) { + ldout(s->cct, 0) << "Secret key lookup failed!" << dendl; + err_msg = "No secret key for matching access key"; + return -EACCES; + } + string s3_secret_key = (iter->second).key; + + char calc_signature[CEPH_CRYPTO_HMACSHA1_DIGESTSIZE]; +/* expected signature = base64(HMAC-SHA1(secret key, policy exactly as sent, i.e. still base64-encoded)) */ + calc_hmac_sha1(s3_secret_key.c_str(), s3_secret_key.size(), encoded_policy.c_str(), encoded_policy.length(), calc_signature); + bufferlist encoded_hmac; + bufferlist raw_hmac; + raw_hmac.append(calc_signature, CEPH_CRYPTO_HMACSHA1_DIGESTSIZE); + raw_hmac.encode_base64(encoded_hmac); + encoded_hmac.append((char)0); /* null terminate */ + + if (signature_str.compare(encoded_hmac.c_str()) != 0) { + ldout(s->cct, 0) << "Signature verification failed!" << dendl; + ldout(s->cct, 0) << "expected: " << signature_str.c_str() << dendl; + ldout(s->cct, 0) << "got: " << encoded_hmac.c_str() << dendl; + err_msg = "Bad access key / signature"; + return -EACCES; + } + ldout(s->cct, 0) << "Successful Signature Verification!" 
<< dendl; + bufferlist decoded_policy; + try { + decoded_policy.decode_base64(encoded_policy); + } catch (buffer::error& err) { + ldout(s->cct, 0) << "failed to decode_base64 policy" << dendl; + err_msg = "Could not decode policy"; + return -EINVAL; + } +/* the decoded policy is handed on as a C string for logging, hence the terminator */ + decoded_policy.append('\0'); // NULL terminate + + ldout(s->cct, 0) << "POST policy: " << decoded_policy.c_str() << dendl; + + int r = post_policy.from_json(decoded_policy, err_msg); + if (r < 0) { + if (err_msg.empty()) { + err_msg = "Failed to parse policy"; + } + ldout(s->cct, 0) << "failed to parse policy" << dendl; + return -EINVAL; + } +/* NOTE(review): presumably marks these three fields as already validated so that post_policy.check() does not require a policy condition for them - confirm against the policy class */ + post_policy.set_var_checked("AWSAccessKeyId"); + post_policy.set_var_checked("policy"); + post_policy.set_var_checked("signature"); + + r = post_policy.check(&env, err_msg); + if (r < 0) { + if (err_msg.empty()) { + err_msg = "Policy check failed"; + } + ldout(s->cct, 0) << "policy check failed" << dendl; + return r; + } +/* from here on the request runs as the user whose key signed the policy */ + s->user = user_info; + s->owner.set_id(user_info.user_id); + s->owner.set_name(user_info.display_name); + } else { + ldout(s->cct, 0) << "No attached policy found!" 
<< dendl; + } +/* build the object ACL from the optional "acl" form field; canned_acl stays empty when the field is absent */ + string canned_acl; + part_str("acl", &canned_acl); + + RGWAccessControlPolicy_S3 s3policy(s->cct); + ldout(s->cct, 20) << "canned_acl=" << canned_acl << dendl; + if (s3policy.create_canned(s->owner, s->bucket_owner, canned_acl) < 0) { + err_msg = "Bad canned ACLs"; + return -EINVAL; + } + + policy = s3policy; + + return 0; +} +/* Consume the form parts that follow the file data, storing each one in parts, until the closing boundary is seen. Returns -EINVAL when a part is not terminated by a boundary, or the error from read_form_part_header(). */ +int RGWPostObj_ObjStore_S3::complete_get_params() +{ + bool done; + do { + struct post_form_part part; + int r = read_form_part_header(&part, &done); + if (r < 0) + return r; + + bufferlist part_data; + bool boundary; + r = read_data(part.data, RGW_MAX_CHUNK_SIZE, &boundary, &done); + if (!boundary) { + return -EINVAL; + } + + parts[part.name] = part; + } while (!done); + + return 0; +} +/* Read the next chunk of the uploaded file into bl. When the boundary that ends the file part is reached, data_pending is cleared and, unless that was the closing boundary, the remaining form parts are drained. Returns the number of bytes placed in bl, or a negative error. */ +int RGWPostObj_ObjStore_S3::get_data(bufferlist& bl) +{ + bool boundary; + bool done; + + int r = read_data(bl, RGW_MAX_CHUNK_SIZE, &boundary, &done); + if (r < 0) + return r; + + if (boundary) { + data_pending = false; + + if (!done) { /* reached end of data, let's drain the rest of the params */ + r = complete_get_params(); + if (r < 0) + return r; + } + } + + return bl.length(); +} +/* Send the POST-object response. A success_action_redirect part produces a redirect with bucket, key and etag appended as query parameters. Otherwise a success_action_status part selects the status: 200 leaves ret at 0, 201 gives STATUS_CREATED, any other value gives STATUS_NO_CONTENT. Without either part, success is reported as STATUS_NO_CONTENT. Only a STATUS_CREATED response carries a PostResponse body. */ +void RGWPostObj_ObjStore_S3::send_response() +{ + if (ret == 0 && parts.count("success_action_redirect")) { + string redirect; + + part_str("success_action_redirect", &redirect); + + string bucket; + string key; + string etag_str = "\""; + + etag_str.append(etag); + etag_str.append("\""); + + string etag_url; + + url_encode(s->bucket_name_str, bucket); + url_encode(s->object_str, key); + url_encode(etag_str, etag_url); + + redirect.append("?bucket="); + redirect.append(bucket); + redirect.append("&key="); + redirect.append(key); + redirect.append("&etag="); + redirect.append(etag_url); + + int r = check_utf8(redirect.c_str(), redirect.size()); + if (r < 0) { + ret = r; + goto done; + } + dump_redirect(s, redirect); + ret = STATUS_REDIRECT; + } else if (ret == 0 && parts.count("success_action_status")) { + string status_string; + uint32_t status_int; + + 
part_str("success_action_status", &status_string); + + int r = stringtoul(status_string, &status_int); + if (r < 0) { + ret = r; + goto done; + } +/* 200 keeps ret at 0, 201 asks for a body, every other requested status is answered with STATUS_NO_CONTENT */ + switch (status_int) { + case 200: + break; + case 201: + ret = STATUS_CREATED; + break; + default: + ret = STATUS_NO_CONTENT; + break; + } + } else if (!ret) { + ret = STATUS_NO_CONTENT; + } + +done: + if (ret == STATUS_CREATED) { + s->formatter->open_object_section("PostResponse"); + if (g_conf->rgw_dns_name.length()) + s->formatter->dump_format("Location", "%s/%s", s->info.script_uri.c_str(), s->object_str.c_str()); + s->formatter->dump_string("Bucket", s->bucket_name_str); + s->formatter->dump_string("Key", s->object_str); + s->formatter->close_section(); + } + s->err.message = err_msg; + set_req_state_err(s, ret); + dump_errno(s); + dump_content_length(s, s->formatter->get_len()); + end_header(s, this); + if (ret != STATUS_CREATED) + return; + + rgw_flush_formatter_and_reset(s, s->formatter); +} +/* Send the DELETE-object response. A missing object (-ENOENT) is treated like a successful delete, and success is reported as STATUS_NO_CONTENT. */ + +void RGWDeleteObj_ObjStore_S3::send_response() +{ + int r = ret; + if (r == -ENOENT) + r = 0; + if (!r) + r = STATUS_NO_CONTENT; + + set_req_state_err(s, r); + dump_errno(s); + end_header(s, this); +} +/* Build the ACL policy for the copy destination from the request via create_s3_policy() and store it in dest_policy. Returns 0 or the negative error from create_s3_policy(). */ +int RGWCopyObj_ObjStore_S3::init_dest_policy() +{ + RGWAccessControlPolicy_S3 s3policy(s->cct); + + /* build a policy for the target object */ + int r = create_s3_policy(s, store, s3policy); + if (r < 0) + return r; + + dest_policy = s3policy; + + return 0; +} +/* Collect the copy parameters: the conditional-copy headers, source and destination bucket/object names, the source zone with its client-id and op-id (system requests only), and the metadata directive. Copying an object onto itself is rejected unless its attrs are being replaced. */ +int RGWCopyObj_ObjStore_S3::get_params() +{ + if_mod = s->info.env->get("HTTP_X_AMZ_COPY_IF_MODIFIED_SINCE"); + if_unmod = s->info.env->get("HTTP_X_AMZ_COPY_IF_UNMODIFIED_SINCE"); + if_match = s->info.env->get("HTTP_X_AMZ_COPY_IF_MATCH"); + if_nomatch = s->info.env->get("HTTP_X_AMZ_COPY_IF_NONE_MATCH"); + + src_bucket_name = s->src_bucket_name; + src_object = s->src_object; + dest_bucket_name = s->bucket.name; + dest_object = s->object_str; + + if (s->system_request) { + source_zone = s->info.args.get(RGW_SYS_PARAM_PREFIX "source-zone"); + if 
(!source_zone.empty()) { + client_id = s->info.args.get(RGW_SYS_PARAM_PREFIX "client-id"); + op_id = s->info.args.get(RGW_SYS_PARAM_PREFIX "op-id"); + + if (client_id.empty() || op_id.empty()) { + ldout(s->cct, 0) << RGW_SYS_PARAM_PREFIX "client-id or " RGW_SYS_PARAM_PREFIX "op-id were not provided, required for intra-region copy" << dendl; + return -EINVAL; + } + } + } +/* the metadata directive decides whether the destination gets the source attrs (COPY) or new ones from the request (REPLACE); replace_attrs is left as it was when the header is absent */ + const char *md_directive = s->info.env->get("HTTP_X_AMZ_METADATA_DIRECTIVE"); + if (md_directive) { + if (strcasecmp(md_directive, "COPY") == 0) { + replace_attrs = false; + } else if (strcasecmp(md_directive, "REPLACE") == 0) { + replace_attrs = true; + } else if (!source_zone.empty()) { + replace_attrs = false; // default for intra-region copy + } else { + ldout(s->cct, 0) << "invalid metadata directive" << dendl; + return -EINVAL; + } + } + + if (source_zone.empty() && + (dest_bucket_name.compare(src_bucket_name) == 0) && + (dest_object.compare(src_object) == 0) && + !replace_attrs) { + /* can only copy object into itself if replacing attrs */ + ldout(s->cct, 0) << "can't copy object into itself if not replacing attrs" << dendl; + return -ERR_INVALID_REQUEST; + } + return 0; +} +/* Report copy progress. The first call sends the response headers and, on success, opens the CopyObjectResult section; every later call emits a Progress element holding ofs. The formatter is flushed on each call. */ +void RGWCopyObj_ObjStore_S3::send_partial_response(off_t ofs) +{ + if (!sent_header) { + if (ret) + set_req_state_err(s, ret); + dump_errno(s); + + end_header(s, this, "binary/octet-stream"); + if (ret == 0) { + s->formatter->open_object_section("CopyObjectResult"); + } + sent_header = true; + } else { + /* Send progress field. Note that this diverges from the original S3 + * spec. We do this in order to keep connection alive. 
+ */ + s->formatter->dump_int("Progress", (uint64_t)ofs); + } + rgw_flush_formatter(s, s->formatter); +} +/* Finish the copy response. The headers are sent first if no partial response has gone out yet; on success LastModified and, when the RGW_ATTR_ETAG attr is non-empty, ETag are added before the CopyObjectResult section is closed and flushed. */ +void RGWCopyObj_ObjStore_S3::send_response() +{ + if (!sent_header) + send_partial_response(0); + + if (ret == 0) { + dump_time(s, "LastModified", &mtime); + map::iterator iter = attrs.find(RGW_ATTR_ETAG); + if (iter != attrs.end()) { + bufferlist& bl = iter->second; + if (bl.length()) { + char *etag = bl.c_str(); + s->formatter->dump_string("ETag", etag); + } + } + s->formatter->close_section(); + rgw_flush_formatter_and_reset(s, s->formatter); + } +} +/* Send the GET-ACL response: headers followed by the contents of acls written to the client as application/xml. */ +void RGWGetACLs_ObjStore_S3::send_response() +{ + if (ret) + set_req_state_err(s, ret); + dump_errno(s); + end_header(s, this, "application/xml"); + dump_start(s); + s->cio->write(acls.c_str(), acls.size()); +} +/* Build an ACL policy from the request state and write it as XML to ss. For bucket requests (empty object name) a canned ACL whose name contains "bucket" is cleared first. Returns 0 or the negative error from create_s3_policy(). */ +int RGWPutACLs_ObjStore_S3::get_policy_from_state(RGWRados *store, struct req_state *s, stringstream& ss) +{ + RGWAccessControlPolicy_S3 s3policy(s->cct); + + // bucket-* canned acls do not apply to bucket + if (s->object_str.empty()) { + if (s->canned_acl.find("bucket") != string::npos) + s->canned_acl.clear(); + } + + int r = create_s3_policy(s, store, s3policy); + if (r < 0) + return r; + + s3policy.to_xml(ss); + + return 0; +} +/* Send the PUT-ACL response: status and headers only. */ +void RGWPutACLs_ObjStore_S3::send_response() +{ + if (ret) + set_req_state_err(s, ret); + dump_errno(s); + end_header(s, this, "application/xml"); + dump_start(s); +} +/* Send the GET-CORS response. -ENOENT is reported as ERR_NOT_FOUND; on success the bucket's CORS configuration is serialized to XML and written to the client. */ +void RGWGetCORS_ObjStore_S3::send_response() +{ + if (ret) { + if (ret == -ENOENT) + set_req_state_err(s, ERR_NOT_FOUND); + else + set_req_state_err(s, ret); + } + dump_errno(s); + end_header(s, NULL, "application/xml"); + dump_start(s); + if (!ret) { + string cors; + RGWCORSConfiguration_S3 *s3cors = static_cast(&bucket_cors); + stringstream ss; + + s3cors->to_xml(ss); + cors = ss.str(); + s->cio->write(cors.c_str(), cors.size()); + } +} +/* Read the request body (its size taken from s->length), parse it as XML and encode the CORSConfiguration element it contains into cors_bl. Returns 0 on success, -ENOMEM when the body buffer cannot be allocated, -EINVAL when parsing fails or the element is missing, or the error from the read. The body buffer is freed on every path. */ +int RGWPutCORS_ObjStore_S3::get_params() +{ + int r; + char *data = NULL; + int len = 0; + size_t cl = 0; + RGWCORSXMLParser_S3 parser(s->cct); + 
RGWCORSConfiguration_S3 *cors_config; +/* s->length is the request's length string; a missing or zero length means there is no body to read */ + if (s->length) + cl = atoll(s->length); + if (cl) { + data = (char *)malloc(cl + 1); + if (!data) { + r = -ENOMEM; + goto done_err; + } + int read_len; + r = s->cio->read(data, cl, &read_len); + len = read_len; + if (r < 0) + goto done_err; + data[len] = '\0'; + } else { + len = 0; + } + + if (!parser.init()) { + r = -EINVAL; + goto done_err; + } + + if (!parser.parse(data, len, 1)) { + r = -EINVAL; + goto done_err; + } + cors_config = static_cast(parser.find_first("CORSConfiguration")); + if (!cors_config) { + r = -EINVAL; + goto done_err; + } + + if (s->cct->_conf->subsys.should_gather(ceph_subsys_rgw, 15)) { + ldout(s->cct, 15) << "CORSConfiguration"; + cors_config->to_xml(*_dout); + *_dout << dendl; + } + + cors_config->encode(cors_bl); + + free(data); + return 0; +done_err: + free(data); + return r; +} +/* Send the PUT-CORS response: status and headers only. */ +void RGWPutCORS_ObjStore_S3::send_response() +{ + if (ret) + set_req_state_err(s, ret); + dump_errno(s); + end_header(s, NULL, "application/xml"); + dump_start(s); +} +/* Send the DELETE-CORS response. Both success and -ENOENT are reported as STATUS_NO_CONTENT. */ +void RGWDeleteCORS_ObjStore_S3::send_response() +{ + int r = ret; + if (!r || r == -ENOENT) + r = STATUS_NO_CONTENT; + + set_req_state_err(s, r); + dump_errno(s); + end_header(s, NULL); +} +/* Send the CORS preflight (OPTIONS) response. -ENOENT is turned into -EACCES; on any error only the status is sent, otherwise the access-control headers built from get_response_params() are dumped. */ +void RGWOptionsCORS_ObjStore_S3::send_response() +{ + string hdrs, exp_hdrs; + uint32_t max_age = CORS_MAX_AGE_INVALID; + /*EACCES means, there is no CORS registered yet for the bucket + *ENOENT means, there is no match of the Origin in the list of CORSRule + */ + if (ret == -ENOENT) + ret = -EACCES; + if (ret < 0) { + set_req_state_err(s, ret); + dump_errno(s); + end_header(s, NULL); + return; + } + get_response_params(hdrs, exp_hdrs, &max_age); + + dump_errno(s); + dump_access_control(s, origin, req_meth, hdrs.c_str(), exp_hdrs.c_str(), max_age); + end_header(s, NULL); +} +/* Build the ACL policy for the new multipart upload from the request via create_s3_policy() and store it in policy. Returns 0 or the negative error, which is also left in ret. */ +int RGWInitMultipart_ObjStore_S3::get_params() +{ + RGWAccessControlPolicy_S3 s3policy(s->cct); + ret = create_s3_policy(s, store, s3policy); + if (ret < 0) + return ret; + + policy = 
s3policy; + + return 0; +} +/* Send the initiate-multipart-upload response: on success an InitiateMultipartUploadResult document with Bucket, Key and UploadId. */ +void RGWInitMultipart_ObjStore_S3::send_response() +{ + if (ret) + set_req_state_err(s, ret); + dump_errno(s); + end_header(s, this, "application/xml"); + if (ret == 0) { + dump_start(s); + s->formatter->open_object_section_in_ns("InitiateMultipartUploadResult", + "http://s3.amazonaws.com/doc/2006-03-01/"); + s->formatter->dump_string("Bucket", s->bucket_name_str); + s->formatter->dump_string("Key", s->object); + s->formatter->dump_string("UploadId", upload_id); + s->formatter->close_section(); + rgw_flush_formatter_and_reset(s, s->formatter); + } +} +/* Send the complete-multipart-upload response: on success a CompleteMultipartUploadResult document with Bucket, Key and ETag, plus a Location of the form bucket.dns-name when rgw_dns_name is configured. */ +void RGWCompleteMultipart_ObjStore_S3::send_response() +{ + if (ret) + set_req_state_err(s, ret); + dump_errno(s); + end_header(s, this, "application/xml"); + if (ret == 0) { + dump_start(s); + s->formatter->open_object_section_in_ns("CompleteMultipartUploadResult", + "http://s3.amazonaws.com/doc/2006-03-01/"); + if (g_conf->rgw_dns_name.length()) + s->formatter->dump_format("Location", "%s.%s", s->bucket_name_str.c_str(), g_conf->rgw_dns_name.c_str()); + s->formatter->dump_string("Bucket", s->bucket_name_str); + s->formatter->dump_string("Key", s->object); + s->formatter->dump_string("ETag", etag); + s->formatter->close_section(); + rgw_flush_formatter_and_reset(s, s->formatter); + } +} +/* Send the abort-multipart-upload response; success is reported as STATUS_NO_CONTENT. */ +void RGWAbortMultipart_ObjStore_S3::send_response() +{ + int r = ret; + if (!r) + r = STATUS_NO_CONTENT; + + set_req_state_err(s, r); + dump_errno(s); + end_header(s, this); +} +/* Send the list-parts response: on success a ListMultipartUploadResult document describing the upload, its owner, and each part after marker with its modification time, number, ETag and size. */ +void RGWListMultipart_ObjStore_S3::send_response() +{ + if (ret) + set_req_state_err(s, ret); + dump_errno(s); + end_header(s, this, "application/xml"); + + if (ret == 0) { + dump_start(s); + s->formatter->open_object_section_in_ns("ListMultipartUploadResult", + "http://s3.amazonaws.com/doc/2006-03-01/"); + map::iterator iter, test_iter; + int i, cur_max = 0; +/* walk at most max_parts entries past the marker to find the highest part number of this page; test_iter ends up at parts.end() when nothing is left over */ + iter = parts.upper_bound(marker); + for (i = 0, test_iter = iter; test_iter != parts.end() && i < max_parts; ++test_iter, ++i) { + cur_max = test_iter->first; + } 
+ s->formatter->dump_string("Bucket", s->bucket_name_str); + s->formatter->dump_string("Key", s->object); + s->formatter->dump_string("UploadId", upload_id); + s->formatter->dump_string("StorageClass", "STANDARD"); + s->formatter->dump_int("PartNumberMarker", marker); + /* the marker is exclusive (listing starts at parts.upper_bound(marker)), so the next marker must be the last part number of this page; cur_max + 1 would make a follow-up request skip that part */ + s->formatter->dump_int("NextPartNumberMarker", cur_max); + s->formatter->dump_int("MaxParts", max_parts); + s->formatter->dump_string("IsTruncated", (test_iter == parts.end() ? "false" : "true")); + + ACLOwner& owner = policy.get_owner(); + dump_owner(s, owner.get_id(), owner.get_display_name()); +/* one Part section per remaining entry, with LastModified formatted as a UTC timestamp */ + for (; iter != parts.end(); ++iter) { + RGWUploadPartInfo& info = iter->second; + + time_t sec = info.modified.sec(); + struct tm tmp; + gmtime_r(&sec, &tmp); + char buf[TIME_BUF_SIZE]; + + s->formatter->open_object_section("Part"); + + if (strftime(buf, sizeof(buf), "%Y-%m-%dT%T.000Z", &tmp) > 0) { + s->formatter->dump_string("LastModified", buf); + } + + s->formatter->dump_unsigned("PartNumber", info.num); + s->formatter->dump_string("ETag", info.etag); + s->formatter->dump_unsigned("Size", info.size); + s->formatter->close_section(); + } + s->formatter->close_section(); + rgw_flush_formatter_and_reset(s, s->formatter); + } +} +/* Send the list-multipart-uploads response for a bucket: on success a ListMultipartUploadsResult document with the request's prefix, markers, delimiter and limits (each emitted only when non-empty), one Upload section per pending upload, and the common prefixes. */ +void RGWListBucketMultiparts_ObjStore_S3::send_response() +{ + if (ret < 0) + set_req_state_err(s, ret); + dump_errno(s); + + end_header(s, this, "application/xml"); + dump_start(s); + if (ret < 0) + return; + + s->formatter->open_object_section("ListMultipartUploadsResult"); + s->formatter->dump_string("Bucket", s->bucket_name_str); + if (!prefix.empty()) + s->formatter->dump_string("ListMultipartUploadsResult.Prefix", prefix); + string& key_marker = marker.get_key(); + if (!key_marker.empty()) + s->formatter->dump_string("KeyMarker", key_marker); + string& upload_id_marker = marker.get_upload_id(); + if (!upload_id_marker.empty()) + s->formatter->dump_string("UploadIdMarker", upload_id_marker); + string next_key = next_marker.mp.get_key(); + if (!next_key.empty()) + 
s->formatter->dump_string("NextKeyMarker", next_key); + string next_upload_id = next_marker.mp.get_upload_id(); + if (!next_upload_id.empty()) + s->formatter->dump_string("NextUploadIdMarker", next_upload_id); + s->formatter->dump_int("MaxUploads", max_uploads); + if (!delimiter.empty()) + s->formatter->dump_string("Delimiter", delimiter); + s->formatter->dump_string("IsTruncated", (is_truncated ? "true" : "false")); +/* one Upload section per pending upload; the requesting user is dumped both as Initiator and as owner */ + if (ret >= 0) { + vector::iterator iter; + for (iter = uploads.begin(); iter != uploads.end(); ++iter) { + RGWMPObj& mp = iter->mp; + s->formatter->open_array_section("Upload"); + s->formatter->dump_string("Key", mp.get_key()); + s->formatter->dump_string("UploadId", mp.get_upload_id()); + dump_owner(s, s->user.user_id, s->user.display_name, "Initiator"); + dump_owner(s, s->user.user_id, s->user.display_name); + s->formatter->dump_string("StorageClass", "STANDARD"); + time_t mtime = iter->obj.mtime.sec(); + dump_time(s, "Initiated", &mtime); + s->formatter->close_section(); + } + if (common_prefixes.size() > 0) { + s->formatter->open_array_section("CommonPrefixes"); + map::iterator pref_iter; + for (pref_iter = common_prefixes.begin(); pref_iter != common_prefixes.end(); ++pref_iter) { + s->formatter->dump_string("CommonPrefixes.Prefix", pref_iter->first); + } + s->formatter->close_section(); + } + } + s->formatter->close_section(); + rgw_flush_formatter_and_reset(s, s->formatter); +} +/* Dump the response status once; later calls do nothing because status_dumped is set. */ +void RGWDeleteMultiObj_ObjStore_S3::send_status() +{ + if (!status_dumped) { + if (ret < 0) + set_req_state_err(s, ret); + dump_errno(s); + status_dumped = true; + } +} +/* Start the multi-object delete response: make sure the status has been sent, finish the headers and open the DeleteResult section. */ +void RGWDeleteMultiObj_ObjStore_S3::begin_response() +{ + + if (!status_dumped) { + send_status(); + } + + dump_start(s); + end_header(s, this, "application/xml"); + s->formatter->open_object_section_in_ns("DeleteResult", + "http://s3.amazonaws.com/doc/2006-03-01/"); + + rgw_flush_formatter(s, s->formatter); +} +/* Report the outcome for one key: result.first is the key and result.second its status. A status of 0 yields a Deleted section unless quiet is set; a negative status yields an Error section carrying the mapped HTTP code and S3 error code. Entries with an empty key are ignored. */ +void RGWDeleteMultiObj_ObjStore_S3::send_partial_response(pair& result) +{ + if 
(!result.first.empty()) { + if (result.second == 0 && !quiet) { + s->formatter->open_object_section("Deleted"); + s->formatter->dump_string("Key", result.first); + s->formatter->close_section(); + } else if (result.second < 0) { + struct rgw_http_errors r; + int err_no; + + s->formatter->open_object_section("Error"); + + err_no = -(result.second); + rgw_get_errno_s3(&r, err_no); + + s->formatter->dump_string("Key", result.first); + s->formatter->dump_int("Code", r.http_ret); + s->formatter->dump_string("Message", r.s3_code); + s->formatter->close_section(); + } + + rgw_flush_formatter(s, s->formatter); + } +} +/* Close the DeleteResult section and flush the response. */ +void RGWDeleteMultiObj_ObjStore_S3::end_response() +{ + + s->formatter->close_section(); + rgw_flush_formatter_and_reset(s, s->formatter); +} +/* GET on the service root lists the buckets. */ +RGWOp *RGWHandler_ObjStore_Service_S3::op_get() +{ + return new RGWListBuckets_ObjStore_S3; +} +/* HEAD on the service root is served by the same list-buckets op as GET. */ +RGWOp *RGWHandler_ObjStore_Service_S3::op_head() +{ + return new RGWListBuckets_ObjStore_S3; +} +/* Pick the plain bucket read op: listing the bucket when get_data is set, a bucket stat otherwise. */ +RGWOp *RGWHandler_ObjStore_Bucket_S3::get_obj_op(bool get_data) +{ + if (get_data) + return new RGWListBucket_ObjStore_S3; + else + return new RGWStatBucket_ObjStore_S3; +} +/* Select the op for GET on a bucket. Checked in order: the logging sub-resource, ACL, CORS, the uploads argument (list multipart uploads); otherwise a bucket listing. */ +RGWOp *RGWHandler_ObjStore_Bucket_S3::op_get() +{ + if (s->info.args.sub_resource_exists("logging")) + return new RGWGetBucketLogging_ObjStore_S3; + if (is_acl_op()) { + return new RGWGetACLs_ObjStore_S3; + } else if (is_cors_op()) { + return new RGWGetCORS_ObjStore_S3; + } else if (s->info.args.exists("uploads")) { + return new RGWListBucketMultiparts_ObjStore_S3; + } + return get_obj_op(true); +} +/* Select the op for HEAD on a bucket: ACL, list multipart uploads, or a bucket stat. */ +RGWOp *RGWHandler_ObjStore_Bucket_S3::op_head() +{ + if (is_acl_op()) { + return new RGWGetACLs_ObjStore_S3; + } else if (s->info.args.exists("uploads")) { + return new RGWListBucketMultiparts_ObjStore_S3; + } + return get_obj_op(false); +} +/* Select the op for PUT on a bucket: NULL for the logging sub-resource, otherwise set ACL, set CORS, or create the bucket. */ +RGWOp *RGWHandler_ObjStore_Bucket_S3::op_put() +{ + if (s->info.args.sub_resource_exists("logging")) + return NULL; + if (is_acl_op()) { + return new RGWPutACLs_ObjStore_S3; + } else if (is_cors_op()) { + return 
new RGWPutCORS_ObjStore_S3; + } + return new RGWCreateBucket_ObjStore_S3; +} +/* Select the op for DELETE on a bucket: delete the CORS configuration for a CORS request, otherwise delete the bucket. */ +RGWOp *RGWHandler_ObjStore_Bucket_S3::op_delete() +{ + if (is_cors_op()) { + return new RGWDeleteCORS_ObjStore_S3; + } + return new RGWDeleteBucket_ObjStore_S3; +} +/* Select the op for POST on a bucket: multi-object delete when the request params are exactly "delete", otherwise a browser-style POST upload. */ +RGWOp *RGWHandler_ObjStore_Bucket_S3::op_post() +{ + if ( s->info.request_params == "delete" ) { + return new RGWDeleteMultiObj_ObjStore_S3; + } + + return new RGWPostObj_ObjStore_S3; +} +/* OPTIONS on a bucket is a CORS preflight. */ +RGWOp *RGWHandler_ObjStore_Bucket_S3::op_options() +{ + return new RGWOptionsCORS_ObjStore_S3; +} +/* Pick the object read op: the ACL op for an ACL request, otherwise a get-object op with its get_data flag set as requested. */ +RGWOp *RGWHandler_ObjStore_Obj_S3::get_obj_op(bool get_data) +{ + if (is_acl_op()) { + return new RGWGetACLs_ObjStore_S3; + } + RGWGetObj_ObjStore_S3 *get_obj_op = new RGWGetObj_ObjStore_S3; + get_obj_op->set_get_data(get_data); + return get_obj_op; +} +/* Select the op for GET on an object: ACL, list parts (uploadId argument), or fetch the object. */ +RGWOp *RGWHandler_ObjStore_Obj_S3::op_get() +{ + if (is_acl_op()) { + return new RGWGetACLs_ObjStore_S3; + } else if (s->info.args.exists("uploadId")) { + return new RGWListMultipart_ObjStore_S3; + } + return get_obj_op(true); +} +/* Select the op for HEAD on an object: same choices as GET, but the object op is created with get_data false. */ +RGWOp *RGWHandler_ObjStore_Obj_S3::op_head() +{ + if (is_acl_op()) { + return new RGWGetACLs_ObjStore_S3; + } else if (s->info.args.exists("uploadId")) { + return new RGWListMultipart_ObjStore_S3; + } + return get_obj_op(false); +} +/* Select the op for PUT on an object: set ACL, or a copy when s->copy_source is set, or a plain upload. */ +RGWOp *RGWHandler_ObjStore_Obj_S3::op_put() +{ + if (is_acl_op()) { + return new RGWPutACLs_ObjStore_S3; + } + if (!s->copy_source) + return new RGWPutObj_ObjStore_S3; + else + return new RGWCopyObj_ObjStore_S3; +} +/* Select the op for DELETE on an object: abort the multipart upload when an uploadId is given, otherwise delete the object. */ +RGWOp *RGWHandler_ObjStore_Obj_S3::op_delete() +{ + string upload_id = s->info.args.get("uploadId"); + + if (upload_id.empty()) + return new RGWDeleteObj_ObjStore_S3; + else + return new RGWAbortMultipart_ObjStore_S3; +} +/* Select the op for POST on an object: complete a multipart upload (uploadId), start one (uploads), or NULL when neither argument is present. */ +RGWOp *RGWHandler_ObjStore_Obj_S3::op_post() +{ + if (s->info.args.exists("uploadId")) + return new RGWCompleteMultipart_ObjStore_S3; + + if (s->info.args.exists("uploads")) + return new RGWInitMultipart_ObjStore_S3; + + return NULL; +} +/* OPTIONS on an object is a CORS preflight. */ +RGWOp *RGWHandler_ObjStore_Obj_S3::op_options() +{ + 
return new RGWOptionsCORS_ObjStore_S3; +} +/* Parse the request arguments, allocate the response formatter, and split the relative URI into bucket and object names. When the bucket name is already set, the whole path after the leading '/' is taken as the object name. Returns 0 or the negative error from allocate_formatter(). */ +int RGWHandler_ObjStore_S3::init_from_header(struct req_state *s, int default_formatter, bool configurable_format) +{ + string req; + string first; + + const char *req_name = s->relative_uri.c_str(); + const char *p; + + if (*req_name == '?') { + p = req_name; + } else { + p = s->info.request_params.c_str(); + } + + s->info.args.set(p); + s->info.args.parse(); + + /* must be called after the args parsing */ + int ret = allocate_formatter(s, default_formatter, configurable_format); + if (ret < 0) + return ret; + + if (*req_name != '/') + return 0; + + req_name++; + + if (!*req_name) + return 0; + + req = req_name; + int pos = req.find('/'); + if (pos >= 0) { + first = req.substr(0, pos); + } else { + first = req; + } +/* no bucket known yet: the first path component is the bucket and the remainder, if any, the object */ + if (s->bucket_name_str.empty()) { + s->bucket_name_str = first; + + if (pos >= 0) { + string encoded_obj_str = req.substr(pos+1); + s->object_str = encoded_obj_str; + + if (s->object_str.size() > 0) { + s->object = strdup(s->object_str.c_str()); + } + } + } else { + s->object_str = req_name; + s->object = strdup(s->object_str.c_str()); + } + return 0; +} +/* True when bucket consists only of digits and exactly three periods, with at least one digit before each period. NOTE(review): a trailing period ("1.2.3.") also passes, since nothing requires digits after the last one. */ +static bool looks_like_ip_address(const char *bucket) +{ + int num_periods = 0; + bool expect_period = false; + for (const char *b = bucket; *b; ++b) { + if (*b == '.') { + if (!expect_period) + return false; + ++num_periods; + if (num_periods > 3) + return false; + expect_period = false; + } + else if (isdigit(*b)) { + expect_period = true; + } + else { + return false; + } + } + return (num_periods == 3); +} +/* Validate an S3 bucket name on top of the base-class check. An empty name is accepted. The first character must be a letter or digit (relaxed_names additionally allows '_', '.' and '-'), every character must be a letter, digit, '.', '-' or '_', and the name must not look like an IP address. Returns 0 or -ERR_INVALID_BUCKET_NAME (or the base-class error). */ +int RGWHandler_ObjStore_S3::validate_bucket_name(const string& bucket, bool relaxed_names) +{ + int ret = RGWHandler_ObjStore::validate_bucket_name(bucket); + if (ret < 0) + return ret; + + if (bucket.size() == 0) + return 0; + + // bucket names must start with a number or letter; relaxed mode also accepts '_', '.' or '-' + if (!(isalpha(bucket[0]) || isdigit(bucket[0]))) { + if (!relaxed_names) + return -ERR_INVALID_BUCKET_NAME; + else if (!(bucket[0] == '_' || bucket[0] 
== '.' || bucket[0] == '-')) + return -ERR_INVALID_BUCKET_NAME; + } + + for (const char *s = bucket.c_str(); *s; ++s) { + char c = *s; + if (isdigit(c) || (c == '.')) + continue; + if (isalpha(c)) + continue; + if ((c == '-') || (c == '_')) + continue; + // Invalid character + return -ERR_INVALID_BUCKET_NAME; + } + + if (looks_like_ip_address(bucket.c_str())) + return -ERR_INVALID_BUCKET_NAME; + + return 0; +} + +int RGWHandler_ObjStore_S3::init(RGWRados *store, struct req_state *s, RGWClientIO *cio) +{ + dout(10) << "s->object=" << (s->object ? s->object : "") << " s->bucket=" << (!s->bucket_name_str.empty() ? s->bucket_name_str : "") << dendl; + + bool relaxed_names = s->cct->_conf->rgw_relaxed_s3_bucket_names; + int ret = validate_bucket_name(s->bucket_name_str, relaxed_names); + if (ret) + return ret; + ret = validate_object_name(s->object_str); + if (ret) + return ret; + + const char *cacl = s->info.env->get("HTTP_X_AMZ_ACL"); + if (cacl) + s->canned_acl = cacl; + + s->has_acl_header = s->info.env->exists_prefix("HTTP_X_AMZ_GRANT"); + + s->copy_source = s->info.env->get("HTTP_X_AMZ_COPY_SOURCE"); + if (s->copy_source) { + ret = RGWCopyObj::parse_copy_location(s->copy_source, s->src_bucket_name, s->src_object); + if (!ret) { + ldout(s->cct, 0) << "failed to parse copy location" << dendl; + return -EINVAL; + } + } + + s->dialect = "s3"; + + return RGWHandler_ObjStore::init(store, s, cio); +} + + +/* + * Try to validate S3 auth against keystone s3token interface + */ +int RGW_Auth_S3_Keystone_ValidateToken::validate_s3token(const string& auth_id, const string& auth_token, const string& auth_sign) { + /* prepare keystone url */ + string keystone_url = cct->_conf->rgw_keystone_url; + if (keystone_url[keystone_url.size() - 1] != '/') + keystone_url.append("/"); + keystone_url.append("v2.0/s3tokens"); + + /* set required headers for keystone request */ + append_header("X-Auth-Token", cct->_conf->rgw_keystone_admin_token); + append_header("Content-Type", 
"application/json"); + + /* encode token */ + bufferlist token_buff; + bufferlist token_encoded; + token_buff.append(auth_token); + token_buff.encode_base64(token_encoded); + token_encoded.append((char)0); + + /* create json credentials request body */ + JSONFormatter credentials(false); + credentials.open_object_section(""); + credentials.open_object_section("credentials"); + credentials.dump_string("access", auth_id); + credentials.dump_string("token", token_encoded.c_str()); + credentials.dump_string("signature", auth_sign); + credentials.close_section(); + credentials.close_section(); + + std::stringstream os; + credentials.flush(os); + set_tx_buffer(os.str()); + + /* send request */ + int ret = process("POST", keystone_url.c_str()); + if (ret < 0) { + dout(2) << "s3 keystone: token validation ERROR: " << rx_buffer.c_str() << dendl; + return -EPERM; + } + + /* now parse response */ + if (response.parse(cct, rx_buffer) < 0) { + dout(2) << "s3 keystone: token parsing failed" << dendl; + return -EPERM; + } + + /* check if we have a valid role */ + bool found = false; + list::iterator iter; + for (iter = roles_list.begin(); iter != roles_list.end(); ++iter) { + if ((found=response.user.has_role(*iter))==true) + break; + } + + if (!found) { + ldout(cct, 5) << "s3 keystone: user does not hold a matching role; required roles: " << cct->_conf->rgw_keystone_accepted_roles << dendl; + return -EPERM; + } + + /* everything seems fine, continue with this user */ + ldout(cct, 5) << "s3 keystone: validated token: " << response.token.tenant.name << ":" << response.user.name << " expires: " << response.token.expires << dendl; + return 0; +} + +/* + * verify that a signed request comes from the keyholder + * by checking the signature against our locally-computed version + */ +int RGW_Auth_S3::authorize(RGWRados *store, struct req_state *s) +{ + bool qsr = false; + string auth_id; + string auth_sign; + + time_t now; + time(&now); + + /* neither keystone and rados enabled; warn 
and exit! */ + if (!store->ctx()->_conf->rgw_s3_auth_use_rados + && !store->ctx()->_conf->rgw_s3_auth_use_keystone) { + dout(0) << "WARNING: no authorization backend enabled! Users will never authenticate." << dendl; + return -EPERM; + } + + if (!s->http_auth || !(*s->http_auth)) { + auth_id = s->info.args.get("AWSAccessKeyId"); + if (auth_id.size()) { + auth_sign = s->info.args.get("Signature"); + + string date = s->info.args.get("Expires"); + time_t exp = atoll(date.c_str()); + if (now >= exp) + return -EPERM; + + qsr = true; + } else { + /* anonymous access */ + rgw_get_anon_user(s->user); + s->perm_mask = RGW_PERM_FULL_CONTROL; + return 0; + } + } else { + if (strncmp(s->http_auth, "AWS ", 4)) + return -EINVAL; + string auth_str(s->http_auth + 4); + int pos = auth_str.find(':'); + if (pos < 0) + return -EINVAL; + + auth_id = auth_str.substr(0, pos); + auth_sign = auth_str.substr(pos + 1); + } + + /* try keystone auth first */ + int keystone_result = -EINVAL; + if (store->ctx()->_conf->rgw_s3_auth_use_keystone + && !store->ctx()->_conf->rgw_keystone_url.empty()) { + dout(20) << "s3 keystone: trying keystone auth" << dendl; + + RGW_Auth_S3_Keystone_ValidateToken keystone_validator(store->ctx()); + string token; + + if (!rgw_create_s3_canonical_header(s->info, &s->header_time, token, qsr)) { + dout(10) << "failed to create auth header\n" << token << dendl; + } else { + keystone_result = keystone_validator.validate_s3token(auth_id, token, auth_sign); + if (keystone_result == 0) { + s->user.user_id = keystone_validator.response.token.tenant.id; + s->user.display_name = keystone_validator.response.token.tenant.name; // wow. 
+ + /* try to store user if it not already exists */ + if (rgw_get_user_info_by_uid(store, keystone_validator.response.token.tenant.id, s->user) < 0) { + int ret = rgw_store_user_info(store, s->user, NULL, NULL, 0, true); + if (ret < 0) + dout(10) << "NOTICE: failed to store new user's info: ret=" << ret << dendl; + } + + s->perm_mask = RGW_PERM_FULL_CONTROL; + } + } + } + + /* keystone failed (or not enabled); check if we want to use rados backend */ + if (!store->ctx()->_conf->rgw_s3_auth_use_rados + && keystone_result < 0) + return keystone_result; + + /* now try rados backend, but only if keystone did not succeed */ + if (keystone_result < 0) { + /* get the user info */ + if (rgw_get_user_info_by_access_key(store, auth_id, s->user) < 0) { + dout(5) << "error reading user info, uid=" << auth_id << " can't authenticate" << dendl; + return -EPERM; + } + + /* now verify signature */ + + string auth_hdr; + if (!rgw_create_s3_canonical_header(s->info, &s->header_time, auth_hdr, qsr)) { + dout(10) << "failed to create auth header\n" << auth_hdr << dendl; + return -EPERM; + } + dout(10) << "auth_hdr:\n" << auth_hdr << dendl; + + time_t req_sec = s->header_time.sec(); + if ((req_sec < now - RGW_AUTH_GRACE_MINS * 60 || + req_sec > now + RGW_AUTH_GRACE_MINS * 60) && !qsr) { + dout(10) << "req_sec=" << req_sec << " now=" << now << "; now - RGW_AUTH_GRACE_MINS=" << now - RGW_AUTH_GRACE_MINS * 60 << "; now + RGW_AUTH_GRACE_MINS=" << now + RGW_AUTH_GRACE_MINS * 60 << dendl; + dout(0) << "NOTICE: request time skew too big now=" << utime_t(now, 0) << " req_time=" << s->header_time << dendl; + return -ERR_REQUEST_TIME_SKEWED; + } + + map::iterator iter = s->user.access_keys.find(auth_id); + if (iter == s->user.access_keys.end()) { + dout(0) << "ERROR: access key not encoded in user info" << dendl; + return -EPERM; + } + RGWAccessKey& k = iter->second; + + if (!k.subuser.empty()) { + map::iterator uiter = s->user.subusers.find(k.subuser); + if (uiter == s->user.subusers.end()) { 
+ dout(0) << "NOTICE: could not find subuser: " << k.subuser << dendl; + return -EPERM; + } + RGWSubUser& subuser = uiter->second; + s->perm_mask = subuser.perm_mask; + } else + s->perm_mask = RGW_PERM_FULL_CONTROL; + + string digest; + int ret = rgw_get_s3_header_digest(auth_hdr, k.key, digest); + if (ret < 0) { + return -EPERM; + } + + dout(15) << "calculated digest=" << digest << dendl; + dout(15) << "auth_sign=" << auth_sign << dendl; + dout(15) << "compare=" << auth_sign.compare(digest) << dendl; + + if (auth_sign != digest) + return -EPERM; + + if (s->user.system) { + s->system_request = true; + dout(20) << "system request" << dendl; + s->info.args.set_system(); + string effective_uid = s->info.args.get(RGW_SYS_PARAM_PREFIX "uid"); + RGWUserInfo effective_user; + if (!effective_uid.empty()) { + ret = rgw_get_user_info_by_uid(store, effective_uid, effective_user); + if (ret < 0) { + ldout(s->cct, 0) << "User lookup failed!" << dendl; + return -ENOENT; + } + s->user = effective_user; + } + } + + } /* if keystone_result < 0 */ + + // populate the owner info + s->owner.set_id(s->user.user_id); + s->owner.set_name(s->user.display_name); + + + return 0; +} + +int RGWHandler_Auth_S3::init(RGWRados *store, struct req_state *state, RGWClientIO *cio) +{ + int ret = RGWHandler_ObjStore_S3::init_from_header(state, RGW_FORMAT_JSON, true); + if (ret < 0) + return ret; + + return RGWHandler_ObjStore::init(store, state, cio); +} + +RGWHandler *RGWRESTMgr_S3::get_handler(struct req_state *s) +{ + int ret = RGWHandler_ObjStore_S3::init_from_header(s, RGW_FORMAT_XML, false); + if (ret < 0) + return NULL; + + if (s->bucket_name_str.empty()) + return new RGWHandler_ObjStore_Service_S3; + + if (!s->object) + return new RGWHandler_ObjStore_Bucket_S3; + + return new RGWHandler_ObjStore_Obj_S3; +} diff --git a/src/rgw/rgw_rest_swift.cc.orig b/src/rgw/rgw_rest_swift.cc.orig new file mode 100644 index 0000000000000..6c6d7b55f3e85 --- /dev/null +++ b/src/rgw/rgw_rest_swift.cc.orig @@ 
-0,0 +1,3090 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include +#include +#include +#include + +#include "include/ceph_assert.h" +#include "ceph_ver.h" + +#include "common/Formatter.h" +#include "common/utf8.h" +#include "common/ceph_json.h" + +#include "rgw_rest_swift.h" +#include "rgw_acl_swift.h" +#include "rgw_cors_swift.h" +#include "rgw_formats.h" +#include "rgw_client_io.h" + +#include "rgw_auth.h" +#include "rgw_swift_auth.h" + +#include "rgw_request.h" +#include "rgw_process.h" + +#include "rgw_zone.h" + +#include "services/svc_zone.h" + +#include +#include +#include + +#include + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_rgw + +int RGWListBuckets_ObjStore_SWIFT::get_params() +{ + prefix = s->info.args.get("prefix"); + marker = s->info.args.get("marker"); + end_marker = s->info.args.get("end_marker"); + wants_reversed = s->info.args.exists("reverse"); + + if (wants_reversed) { + std::swap(marker, end_marker); + } + + std::string limit_str = s->info.args.get("limit"); + if (!limit_str.empty()) { + std::string err; + long l = strict_strtol(limit_str.c_str(), 10, &err); + if (!err.empty()) { + return -EINVAL; + } + + if (l > (long)limit_max || l < 0) { + return -ERR_PRECONDITION_FAILED; + } + + limit = (uint64_t)l; + } + + if (s->cct->_conf->rgw_swift_need_stats) { + bool stats, exists; + int r = s->info.args.get_bool("stats", &stats, &exists); + + if (r < 0) { + return r; + } + + if (exists) { + need_stats = stats; + } + } else { + need_stats = false; + } + + return 0; +} + +static void dump_account_metadata(struct req_state * const s, + const RGWUsageStats& global_stats, + const std::map &policies_stats, + /* const */map& attrs, + const RGWQuotaInfo& quota, + const RGWAccessControlPolicy_SWIFTAcct &policy) +{ + /* Adding X-Timestamp to keep align with Swift API */ + dump_header(s, "X-Timestamp", ceph_clock_now()); + + dump_header(s, "X-Account-Container-Count", 
global_stats.buckets_count); + dump_header(s, "X-Account-Object-Count", global_stats.objects_count); + dump_header(s, "X-Account-Bytes-Used", global_stats.bytes_used); + dump_header(s, "X-Account-Bytes-Used-Actual", global_stats.bytes_used_rounded); + + for (const auto& kv : policies_stats) { + const auto& policy_name = camelcase_dash_http_attr(kv.first); + const auto& policy_stats = kv.second; + + dump_header_infixed(s, "X-Account-Storage-Policy-", policy_name, + "-Container-Count", policy_stats.buckets_count); + dump_header_infixed(s, "X-Account-Storage-Policy-", policy_name, + "-Object-Count", policy_stats.objects_count); + dump_header_infixed(s, "X-Account-Storage-Policy-", policy_name, + "-Bytes-Used", policy_stats.bytes_used); + dump_header_infixed(s, "X-Account-Storage-Policy-", policy_name, + "-Bytes-Used-Actual", policy_stats.bytes_used_rounded); + } + + /* Dump TempURL-related stuff */ + if (s->perm_mask == RGW_PERM_FULL_CONTROL) { + auto iter = s->user->temp_url_keys.find(0); + if (iter != std::end(s->user->temp_url_keys) && ! iter->second.empty()) { + dump_header(s, "X-Account-Meta-Temp-Url-Key", iter->second); + } + + iter = s->user->temp_url_keys.find(1); + if (iter != std::end(s->user->temp_url_keys) && ! iter->second.empty()) { + dump_header(s, "X-Account-Meta-Temp-Url-Key-2", iter->second); + } + } + + /* Dump quota headers. */ + if (quota.enabled) { + if (quota.max_size >= 0) { + dump_header(s, "X-Account-Meta-Quota-Bytes", quota.max_size); + } + + /* Limit on the number of objects in a given account is a RadosGW's + * extension. Swift's account quota WSGI filter doesn't support it. */ + if (quota.max_objects >= 0) { + dump_header(s, "X-Account-Meta-Quota-Count", quota.max_objects); + } + } + + /* Dump user-defined metadata items and generic attrs. 
*/ + const size_t PREFIX_LEN = sizeof(RGW_ATTR_META_PREFIX) - 1; + map::iterator iter; + for (iter = attrs.lower_bound(RGW_ATTR_PREFIX); iter != attrs.end(); ++iter) { + const char *name = iter->first.c_str(); + map::const_iterator geniter = rgw_to_http_attrs.find(name); + + if (geniter != rgw_to_http_attrs.end()) { + dump_header(s, geniter->second, iter->second); + } else if (strncmp(name, RGW_ATTR_META_PREFIX, PREFIX_LEN) == 0) { + dump_header_prefixed(s, "X-Account-Meta-", + camelcase_dash_http_attr(name + PREFIX_LEN), + iter->second); + } + } + + /* Dump account ACLs */ + auto account_acls = policy.to_str(); + if (account_acls) { + dump_header(s, "X-Account-Access-Control", std::move(*account_acls)); + } +} + +void RGWListBuckets_ObjStore_SWIFT::send_response_begin(bool has_buckets) +{ + if (op_ret) { + set_req_state_err(s, op_ret); + } else if (!has_buckets && s->format == RGW_FORMAT_PLAIN) { + op_ret = STATUS_NO_CONTENT; + set_req_state_err(s, op_ret); + } + + if (! s->cct->_conf->rgw_swift_enforce_content_length) { + /* Adding account stats in the header to keep align with Swift API */ + dump_account_metadata(s, + global_stats, + policies_stats, + attrs, + user_quota, + static_cast(*s->user_acl)); + dump_errno(s); + dump_header(s, "Accept-Ranges", "bytes"); + end_header(s, NULL, NULL, NO_CONTENT_LENGTH, true); + } + + if (! op_ret) { + dump_start(s); + s->formatter->open_array_section_with_attrs("account", + FormatterAttrs("name", s->user->display_name.c_str(), NULL)); + + sent_data = true; + } +} + +void RGWListBuckets_ObjStore_SWIFT::handle_listing_chunk(RGWUserBuckets&& buckets) +{ + if (wants_reversed) { + /* Just store in the reversal buffer. Its content will be handled later, + * in send_response_end(). */ + reverse_buffer.emplace(std::begin(reverse_buffer), std::move(buckets)); + } else { + return send_response_data(buckets); + } +} + +void RGWListBuckets_ObjStore_SWIFT::send_response_data(RGWUserBuckets& buckets) +{ + if (! 
sent_data) { + return; + } + + /* Take care of the prefix parameter of Swift API. There is no business + * in applying the filter earlier as we really need to go through all + * entries regardless of it (the headers like X-Account-Container-Count + * aren't affected by specifying prefix). */ + const std::map& m = buckets.get_buckets(); + for (auto iter = m.lower_bound(prefix); + iter != m.end() && boost::algorithm::starts_with(iter->first, prefix); + ++iter) { + dump_bucket_entry(iter->second); + } +} + +void RGWListBuckets_ObjStore_SWIFT::dump_bucket_entry(const RGWBucketEnt& obj) +{ + s->formatter->open_object_section("container"); + s->formatter->dump_string("name", obj.bucket.name); + + if (need_stats) { + s->formatter->dump_int("count", obj.count); + s->formatter->dump_int("bytes", obj.size); + } + + s->formatter->close_section(); + + if (! s->cct->_conf->rgw_swift_enforce_content_length) { + rgw_flush_formatter(s, s->formatter); + } +} + +void RGWListBuckets_ObjStore_SWIFT::send_response_data_reversed(RGWUserBuckets& buckets) +{ + if (! sent_data) { + return; + } + + /* Take care of the prefix parameter of Swift API. There is no business + * in applying the filter earlier as we really need to go through all + * entries regardless of it (the headers like X-Account-Container-Count + * aren't affected by specifying prefix). 
*/ + std::map& m = buckets.get_buckets(); + + auto iter = m.rbegin(); + for (/* initialized above */; + iter != m.rend() && !boost::algorithm::starts_with(iter->first, prefix); + ++iter) { + /* NOP */; + } + + for (/* iter carried */; + iter != m.rend() && boost::algorithm::starts_with(iter->first, prefix); + ++iter) { + dump_bucket_entry(iter->second); + } +} + +void RGWListBuckets_ObjStore_SWIFT::send_response_end() +{ + if (wants_reversed) { + for (auto& buckets : reverse_buffer) { + send_response_data_reversed(buckets); + } + } + + if (sent_data) { + s->formatter->close_section(); + } + + if (s->cct->_conf->rgw_swift_enforce_content_length) { + /* Adding account stats in the header to keep align with Swift API */ + dump_account_metadata(s, + global_stats, + policies_stats, + attrs, + user_quota, + static_cast(*s->user_acl)); + dump_errno(s); + end_header(s, nullptr, nullptr, s->formatter->get_len(), true); + } + + if (sent_data || s->cct->_conf->rgw_swift_enforce_content_length) { + rgw_flush_formatter_and_reset(s, s->formatter); + } +} + +int RGWListBucket_ObjStore_SWIFT::get_params() +{ + prefix = s->info.args.get("prefix"); + marker = s->info.args.get("marker"); + end_marker = s->info.args.get("end_marker"); + max_keys = s->info.args.get("limit"); + + // non-standard + s->info.args.get_bool("allow_unordered", &allow_unordered, false); + + delimiter = s->info.args.get("delimiter"); + + op_ret = parse_max_keys(); + if (op_ret < 0) { + return op_ret; + } + // S3 behavior is to silently cap the max-keys. + // Swift behavior is to abort. 
+ if (max > default_max) + return -ERR_PRECONDITION_FAILED; + + string path_args; + if (s->info.args.exists("path")) { // should handle empty path + path_args = s->info.args.get("path"); + if (!delimiter.empty() || !prefix.empty()) { + return -EINVAL; + } + prefix = path_args; + delimiter="/"; + + path = prefix; + if (path.size() && path[path.size() - 1] != '/') + path.append("/"); + + int len = prefix.size(); + int delim_size = delimiter.size(); + + if (len >= delim_size) { + if (prefix.substr(len - delim_size).compare(delimiter) != 0) + prefix.append(delimiter); + } + } + + return 0; +} + +static void dump_container_metadata(struct req_state *, + const RGWBucketEnt&, + const RGWQuotaInfo&, + const RGWBucketWebsiteConf&); + +void RGWListBucket_ObjStore_SWIFT::send_response() +{ + vector::iterator iter = objs.begin(); + map::iterator pref_iter = common_prefixes.begin(); + + dump_start(s); + dump_container_metadata(s, bucket, bucket_quota, + s->bucket_info.website_conf); + + s->formatter->open_array_section_with_attrs("container", + FormatterAttrs("name", + s->bucket.name.c_str(), + NULL)); + + while (iter != objs.end() || pref_iter != common_prefixes.end()) { + bool do_pref = false; + bool do_objs = false; + rgw_obj_key key; + if (iter != objs.end()) { + key = iter->key; + } + if (pref_iter == common_prefixes.end()) + do_objs = true; + else if (iter == objs.end()) + do_pref = true; + else if (!key.empty() && key.name.compare(pref_iter->first) == 0) { + do_objs = true; + ++pref_iter; + } else if (!key.empty() && key.name.compare(pref_iter->first) <= 0) + do_objs = true; + else + do_pref = true; + + if (do_objs && (allow_unordered || marker.empty() || marker < key)) { + if (key.name.compare(path) == 0) + goto next; + + s->formatter->open_object_section("object"); + s->formatter->dump_string("name", key.name); + s->formatter->dump_string("hash", iter->meta.etag); + s->formatter->dump_int("bytes", iter->meta.accounted_size); + if (!iter->meta.user_data.empty()) + 
s->formatter->dump_string("user_custom_data", iter->meta.user_data); + string single_content_type = iter->meta.content_type; + if (iter->meta.content_type.size()) { + // content type might hold multiple values, just dump the last one + ssize_t pos = iter->meta.content_type.rfind(','); + if (pos > 0) { + ++pos; + while (single_content_type[pos] == ' ') + ++pos; + single_content_type = single_content_type.substr(pos); + } + s->formatter->dump_string("content_type", single_content_type); + } + dump_time(s, "last_modified", &iter->meta.mtime); + s->formatter->close_section(); + } + + if (do_pref && (marker.empty() || pref_iter->first.compare(marker.name) > 0)) { + const string& name = pref_iter->first; + if (name.compare(delimiter) == 0) + goto next; + + s->formatter->open_object_section_with_attrs("subdir", FormatterAttrs("name", name.c_str(), NULL)); + + /* swift is a bit inconsistent here */ + switch (s->format) { + case RGW_FORMAT_XML: + s->formatter->dump_string("name", name); + break; + default: + s->formatter->dump_string("subdir", name); + } + s->formatter->close_section(); + } +next: + if (do_objs) + ++iter; + else + ++pref_iter; + } + + s->formatter->close_section(); + + int64_t content_len = 0; + if (! 
op_ret) { + content_len = s->formatter->get_len(); + if (content_len == 0) { + op_ret = STATUS_NO_CONTENT; + } + } else if (op_ret > 0) { + op_ret = 0; + } + + set_req_state_err(s, op_ret); + dump_errno(s); + end_header(s, this, NULL, content_len); + if (op_ret < 0) { + return; + } + + rgw_flush_formatter_and_reset(s, s->formatter); +} // RGWListBucket_ObjStore_SWIFT::send_response + +static void dump_container_metadata(struct req_state *s, + const RGWBucketEnt& bucket, + const RGWQuotaInfo& quota, + const RGWBucketWebsiteConf& ws_conf) +{ + /* Adding X-Timestamp to keep align with Swift API */ + dump_header(s, "X-Timestamp", utime_t(s->bucket_info.creation_time)); + + dump_header(s, "X-Container-Object-Count", bucket.count); + dump_header(s, "X-Container-Bytes-Used", bucket.size); + dump_header(s, "X-Container-Bytes-Used-Actual", bucket.size_rounded); + + if (s->object.empty()) { + auto swift_policy = \ + static_cast(s->bucket_acl.get()); + std::string read_acl, write_acl; + swift_policy->to_str(read_acl, write_acl); + + if (read_acl.size()) { + dump_header(s, "X-Container-Read", read_acl); + } + if (write_acl.size()) { + dump_header(s, "X-Container-Write", write_acl); + } + if (!s->bucket_info.placement_rule.name.empty()) { + dump_header(s, "X-Storage-Policy", s->bucket_info.placement_rule.name); + } + dump_header(s, "X-Storage-Class", s->bucket_info.placement_rule.get_storage_class()); + + /* Dump user-defined metadata items and generic attrs. 
*/ + const size_t PREFIX_LEN = sizeof(RGW_ATTR_META_PREFIX) - 1; + map::iterator iter; + for (iter = s->bucket_attrs.lower_bound(RGW_ATTR_PREFIX); + iter != s->bucket_attrs.end(); + ++iter) { + const char *name = iter->first.c_str(); + map::const_iterator geniter = rgw_to_http_attrs.find(name); + + if (geniter != rgw_to_http_attrs.end()) { + dump_header(s, geniter->second, iter->second); + } else if (strncmp(name, RGW_ATTR_META_PREFIX, PREFIX_LEN) == 0) { + dump_header_prefixed(s, "X-Container-Meta-", + camelcase_dash_http_attr(name + PREFIX_LEN), + iter->second); + } + } + } + + /* Dump container versioning info. */ + if (! s->bucket_info.swift_ver_location.empty()) { + dump_header(s, "X-Versions-Location", + url_encode(s->bucket_info.swift_ver_location)); + } + + /* Dump quota headers. */ + if (quota.enabled) { + if (quota.max_size >= 0) { + dump_header(s, "X-Container-Meta-Quota-Bytes", quota.max_size); + } + + if (quota.max_objects >= 0) { + dump_header(s, "X-Container-Meta-Quota-Count", quota.max_objects); + } + } + + /* Dump Static Website headers. */ + if (! ws_conf.index_doc_suffix.empty()) { + dump_header(s, "X-Container-Meta-Web-Index", ws_conf.index_doc_suffix); + } + + if (! ws_conf.error_doc.empty()) { + dump_header(s, "X-Container-Meta-Web-Error", ws_conf.error_doc); + } + + if (! ws_conf.subdir_marker.empty()) { + dump_header(s, "X-Container-Meta-Web-Directory-Type", + ws_conf.subdir_marker); + } + + if (! ws_conf.listing_css_doc.empty()) { + dump_header(s, "X-Container-Meta-Web-Listings-CSS", + ws_conf.listing_css_doc); + } + + if (ws_conf.listing_enabled) { + dump_header(s, "X-Container-Meta-Web-Listings", "true"); + } + + /* Dump bucket's modification time. Compliance with the Swift API really + * needs that. 
*/ + dump_last_modified(s, s->bucket_mtime); +} + +void RGWStatAccount_ObjStore_SWIFT::execute() +{ + RGWStatAccount_ObjStore::execute(); + op_ret = rgw_get_user_attrs_by_uid(store, s->user->user_id, attrs); +} + +void RGWStatAccount_ObjStore_SWIFT::send_response() +{ + if (op_ret >= 0) { + op_ret = STATUS_NO_CONTENT; + dump_account_metadata(s, + global_stats, + policies_stats, + attrs, + user_quota, + static_cast(*s->user_acl)); + } + + set_req_state_err(s, op_ret); + dump_errno(s); + + end_header(s, NULL, NULL, 0, true); + + dump_start(s); +} + +void RGWStatBucket_ObjStore_SWIFT::send_response() +{ + if (op_ret >= 0) { + op_ret = STATUS_NO_CONTENT; + dump_container_metadata(s, bucket, bucket_quota, + s->bucket_info.website_conf); + } + + set_req_state_err(s, op_ret); + dump_errno(s); + + end_header(s, this, NULL, 0, true); + dump_start(s); +} + +static int get_swift_container_settings(req_state * const s, + RGWRados * const store, + RGWAccessControlPolicy * const policy, + bool * const has_policy, + uint32_t * rw_mask, + RGWCORSConfiguration * const cors_config, + bool * const has_cors) +{ + const char * const read_list = s->info.env->get("HTTP_X_CONTAINER_READ"); + const char * const write_list = s->info.env->get("HTTP_X_CONTAINER_WRITE"); + + *has_policy = false; + + if (read_list || write_list) { + RGWAccessControlPolicy_SWIFT swift_policy(s->cct); + const auto r = swift_policy.create(store, + s->user->user_id, + s->user->display_name, + read_list, + write_list, + *rw_mask); + if (r < 0) { + return r; + } + + *policy = swift_policy; + *has_policy = true; + } + + *has_cors = false; + + /*Check and update CORS configuration*/ + const char *allow_origins = s->info.env->get("HTTP_X_CONTAINER_META_ACCESS_CONTROL_ALLOW_ORIGIN"); + const char *allow_headers = s->info.env->get("HTTP_X_CONTAINER_META_ACCESS_CONTROL_ALLOW_HEADERS"); + const char *expose_headers = s->info.env->get("HTTP_X_CONTAINER_META_ACCESS_CONTROL_EXPOSE_HEADERS"); + const char *max_age = 
s->info.env->get("HTTP_X_CONTAINER_META_ACCESS_CONTROL_MAX_AGE"); + if (allow_origins) { + RGWCORSConfiguration_SWIFT *swift_cors = new RGWCORSConfiguration_SWIFT; + int r = swift_cors->create_update(allow_origins, allow_headers, expose_headers, max_age); + if (r < 0) { + dout(0) << "Error creating/updating the cors configuration" << dendl; + delete swift_cors; + return r; + } + *has_cors = true; + *cors_config = *swift_cors; + cors_config->dump(); + delete swift_cors; + } + + return 0; +} + +#define ACCT_REMOVE_ATTR_PREFIX "HTTP_X_REMOVE_ACCOUNT_META_" +#define ACCT_PUT_ATTR_PREFIX "HTTP_X_ACCOUNT_META_" +#define CONT_REMOVE_ATTR_PREFIX "HTTP_X_REMOVE_CONTAINER_META_" +#define CONT_PUT_ATTR_PREFIX "HTTP_X_CONTAINER_META_" + +static void get_rmattrs_from_headers(const req_state * const s, + const char * const put_prefix, + const char * const del_prefix, + set& rmattr_names) +{ + const size_t put_prefix_len = strlen(put_prefix); + const size_t del_prefix_len = strlen(del_prefix); + + for (const auto& kv : s->info.env->get_map()) { + size_t prefix_len = 0; + const char * const p = kv.first.c_str(); + + if (strncasecmp(p, del_prefix, del_prefix_len) == 0) { + /* Explicitly requested removal. */ + prefix_len = del_prefix_len; + } else if ((strncasecmp(p, put_prefix, put_prefix_len) == 0) + && kv.second.empty()) { + /* Removal requested by putting an empty value. */ + prefix_len = put_prefix_len; + } + + if (prefix_len > 0) { + string name(RGW_ATTR_META_PREFIX); + name.append(lowercase_dash_http_attr(p + prefix_len)); + rmattr_names.insert(name); + } + } +} + +static int get_swift_versioning_settings( + req_state * const s, + boost::optional& swift_ver_location) +{ + /* Removing the Swift's versions location has lower priority than setting + * a new one. That's the reason why we're handling it first. 
*/ + const std::string vlocdel = + s->info.env->get("HTTP_X_REMOVE_VERSIONS_LOCATION", ""); + if (vlocdel.size()) { + swift_ver_location = boost::in_place(std::string()); + } + + if (s->info.env->exists("HTTP_X_VERSIONS_LOCATION")) { + /* If the Swift's versioning is globally disabled but someone wants to + * enable it for a given container, new version of Swift will generate + * the precondition failed error. */ + if (! s->cct->_conf->rgw_swift_versioning_enabled) { + return -ERR_PRECONDITION_FAILED; + } + + swift_ver_location = s->info.env->get("HTTP_X_VERSIONS_LOCATION", ""); + } + + return 0; +} + +int RGWCreateBucket_ObjStore_SWIFT::get_params() +{ + bool has_policy; + uint32_t policy_rw_mask = 0; + + int r = get_swift_container_settings(s, store, &policy, &has_policy, + &policy_rw_mask, &cors_config, &has_cors); + if (r < 0) { + return r; + } + + if (!has_policy) { + policy.create_default(s->user->user_id, s->user->display_name); + } + + location_constraint = store->svc.zone->get_zonegroup().api_name; + get_rmattrs_from_headers(s, CONT_PUT_ATTR_PREFIX, + CONT_REMOVE_ATTR_PREFIX, rmattr_names); + placement_rule.init(s->info.env->get("HTTP_X_STORAGE_POLICY", ""), s->info.storage_class); + + return get_swift_versioning_settings(s, swift_ver_location); +} + +static inline int handle_metadata_errors(req_state* const s, const int op_ret) +{ + if (op_ret == -EFBIG) { + /* Handle the custom error message of exceeding maximum custom attribute + * (stored as xattr) size. 
*/ + const auto error_message = boost::str( + boost::format("Metadata value longer than %lld") + % s->cct->_conf.get_val("rgw_max_attr_size")); + set_req_state_err(s, EINVAL, error_message); + return -EINVAL; + } else if (op_ret == -E2BIG) { + const auto error_message = boost::str( + boost::format("Too many metadata items; max %lld") + % s->cct->_conf.get_val("rgw_max_attrs_num_in_req")); + set_req_state_err(s, EINVAL, error_message); + return -EINVAL; + } + + return op_ret; +} + +void RGWCreateBucket_ObjStore_SWIFT::send_response() +{ + const auto meta_ret = handle_metadata_errors(s, op_ret); + if (meta_ret != op_ret) { + op_ret = meta_ret; + } else { + if (!op_ret) { + op_ret = STATUS_CREATED; + } else if (op_ret == -ERR_BUCKET_EXISTS) { + op_ret = STATUS_ACCEPTED; + } + set_req_state_err(s, op_ret); + } + + dump_errno(s); + /* Propose ending HTTP header with 0 Content-Length header. */ + end_header(s, NULL, NULL, 0); + rgw_flush_formatter_and_reset(s, s->formatter); +} + +void RGWDeleteBucket_ObjStore_SWIFT::send_response() +{ + int r = op_ret; + if (!r) + r = STATUS_NO_CONTENT; + + set_req_state_err(s, r); + dump_errno(s); + end_header(s, this, NULL, 0); + rgw_flush_formatter_and_reset(s, s->formatter); +} + +static int get_delete_at_param(req_state *s, boost::optional &delete_at) +{ + /* Handle Swift object expiration. */ + real_time delat_proposal; + string x_delete = s->info.env->get("HTTP_X_DELETE_AFTER", ""); + + if (x_delete.empty()) { + x_delete = s->info.env->get("HTTP_X_DELETE_AT", ""); + } else { + /* X-Delete-After HTTP is present. It means we need add its value + * to the current time. 
*/ + delat_proposal = real_clock::now(); + } + + if (x_delete.empty()) { + delete_at = boost::none; + if (s->info.env->exists("HTTP_X_REMOVE_DELETE_AT")) { + delete_at = boost::in_place(real_time()); + } + return 0; + } + string err; + long ts = strict_strtoll(x_delete.c_str(), 10, &err); + + if (!err.empty()) { + return -EINVAL; + } + + delat_proposal += make_timespan(ts); + if (delat_proposal < real_clock::now()) { + return -EINVAL; + } + + delete_at = delat_proposal; + + return 0; +} + +int RGWPutObj_ObjStore_SWIFT::verify_permission() +{ + op_ret = RGWPutObj_ObjStore::verify_permission(); + + /* We have to differentiate error codes depending on whether user is + * anonymous (401 Unauthorized) or he doesn't have necessary permissions + * (403 Forbidden). */ + if (s->auth.identity->is_anonymous() && op_ret == -EACCES) { + return -EPERM; + } else { + return op_ret; + } +} + +int RGWPutObj_ObjStore_SWIFT::update_slo_segment_size(rgw_slo_entry& entry) { + + int r = 0; + const string& path = entry.path; + + /* If the path starts with slashes, strip them all. 
*/ + const size_t pos_init = path.find_first_not_of('/'); + + if (pos_init == string::npos) { + return -EINVAL; + } + + const size_t pos_sep = path.find('/', pos_init); + if (pos_sep == string::npos) { + return -EINVAL; + } + + string bucket_name = path.substr(pos_init, pos_sep - pos_init); + string obj_name = path.substr(pos_sep + 1); + + rgw_bucket bucket; + + if (bucket_name.compare(s->bucket.name) != 0) { + RGWBucketInfo bucket_info; + map bucket_attrs; + auto obj_ctx = store->svc.sysobj->init_obj_ctx(); + r = store->get_bucket_info(obj_ctx, s->user->user_id.tenant, + bucket_name, bucket_info, nullptr, + &bucket_attrs); + if (r < 0) { + ldpp_dout(this, 0) << "could not get bucket info for bucket=" + << bucket_name << dendl; + return r; + } + bucket = bucket_info.bucket; + } else { + bucket = s->bucket; + } + + /* fetch the stored size of the seg (or error if not valid) */ + rgw_obj_key slo_key(obj_name); + rgw_obj slo_seg(bucket, slo_key); + + /* no prefetch */ + RGWObjectCtx obj_ctx(store); + obj_ctx.set_atomic(slo_seg); + + RGWRados::Object op_target(store, s->bucket_info, obj_ctx, slo_seg); + RGWRados::Object::Read read_op(&op_target); + + bool compressed; + RGWCompressionInfo cs_info; + map attrs; + uint64_t size_bytes{0}; + + read_op.params.attrs = &attrs; + read_op.params.obj_size = &size_bytes; + + r = read_op.prepare(); + if (r < 0) { + return r; + } + + r = rgw_compression_info_from_attrset(attrs, compressed, cs_info); + if (r < 0) { + return -EIO; + } + + if (compressed) { + size_bytes = cs_info.orig_size; + } + + /* "When the PUT operation sees the multipart-manifest=put query + * parameter, it reads the request body and verifies that each + * segment object exists and that the sizes and ETags match. If + * there is a mismatch, the PUT operation fails." 
+ */ + if (entry.size_bytes && + (entry.size_bytes != size_bytes)) { + return -EINVAL; + } + + entry.size_bytes = size_bytes; + + return 0; +} /* RGWPutObj_ObjStore_SWIFT::update_slo_segment_sizes */ + +int RGWPutObj_ObjStore_SWIFT::get_params() +{ + if (s->has_bad_meta) { + return -EINVAL; + } + + if (!s->length) { + const char *encoding = s->info.env->get("HTTP_TRANSFER_ENCODING"); + if (!encoding || strcmp(encoding, "chunked") != 0) { + ldout(s->cct, 20) << "neither length nor chunked encoding" << dendl; + return -ERR_LENGTH_REQUIRED; + } + + chunked_upload = true; + } + + supplied_etag = s->info.env->get("HTTP_ETAG"); + + if (!s->generic_attrs.count(RGW_ATTR_CONTENT_TYPE)) { + ldout(s->cct, 5) << "content type wasn't provided, trying to guess" << dendl; + const char *suffix = strrchr(s->object.name.c_str(), '.'); + if (suffix) { + suffix++; + if (*suffix) { + string suffix_str(suffix); + const char *mime = rgw_find_mime_by_ext(suffix_str); + if (mime) { + s->generic_attrs[RGW_ATTR_CONTENT_TYPE] = mime; + } + } + } + } + + policy.create_default(s->user->user_id, s->user->display_name); + + int r = get_delete_at_param(s, delete_at); + if (r < 0) { + ldout(s->cct, 5) << "ERROR: failed to get Delete-At param" << dendl; + return r; + } + + if (!s->cct->_conf->rgw_swift_custom_header.empty()) { + string custom_header = s->cct->_conf->rgw_swift_custom_header; + if (s->info.env->exists(custom_header.c_str())) { + user_data = s->info.env->get(custom_header.c_str()); + } + } + + dlo_manifest = s->info.env->get("HTTP_X_OBJECT_MANIFEST"); + bool exists; + string multipart_manifest = s->info.args.get("multipart-manifest", &exists); + if (exists) { + if (multipart_manifest != "put") { + ldout(s->cct, 5) << "invalid multipart-manifest http param: " << multipart_manifest << dendl; + return -EINVAL; + } + +#define MAX_SLO_ENTRY_SIZE (1024 + 128) // 1024 - max obj name, 128 - enough extra for other info + uint64_t max_len = s->cct->_conf->rgw_max_slo_entries * 
MAX_SLO_ENTRY_SIZE; + + slo_info = new RGWSLOInfo; + + int r = 0; + std::tie(r, slo_info->raw_data) = rgw_rest_get_json_input_keep_data(s->cct, s, slo_info->entries, max_len); + if (r < 0) { + ldout(s->cct, 5) << "failed to read input for slo r=" << r << dendl; + return r; + } + + if ((int64_t)slo_info->entries.size() > s->cct->_conf->rgw_max_slo_entries) { + ldout(s->cct, 5) << "too many entries in slo request: " << slo_info->entries.size() << dendl; + return -EINVAL; + } + + MD5 etag_sum; + uint64_t total_size = 0; + for (auto& entry : slo_info->entries) { + etag_sum.Update((const unsigned char *)entry.etag.c_str(), + entry.etag.length()); + + /* if size_bytes == 0, it should be replaced with the + * real segment size (which could be 0); this follows from the + * fact that Swift requires all segments to exist, but permits + * the size_bytes element to be omitted from the SLO manifest, see + * https://docs.openstack.org/swift/latest/api/large_objects.html + */ + r = update_slo_segment_size(entry); + if (r < 0) { + return r; + } + + total_size += entry.size_bytes; + + ldout(s->cct, 20) << "slo_part: " << entry.path + << " size=" << entry.size_bytes + << " etag=" << entry.etag + << dendl; + } + complete_etag(etag_sum, &lo_etag); + slo_info->total_size = total_size; + + ofs = slo_info->raw_data.length(); + } + + return RGWPutObj_ObjStore::get_params(); +} + +void RGWPutObj_ObjStore_SWIFT::send_response() +{ + const auto meta_ret = handle_metadata_errors(s, op_ret); + if (meta_ret) { + op_ret = meta_ret; + } else { + if (!op_ret) { + op_ret = STATUS_CREATED; + } + set_req_state_err(s, op_ret); + } + + if (! lo_etag.empty()) { + /* Static Large Object of Swift API has two etags represented by + * following members: + * - etag - for the manifest itself (it will be stored in xattrs), + * - lo_etag - for the content composited from SLO's segments. + * The value is calculated basing on segments' etags. + * In response for PUT request we have to expose the second one. 
+ * The first one may be obtained by GET with "multipart-manifest=get" + * in query string on a given SLO. */ + dump_etag(s, lo_etag, true /* quoted */); + } else { + dump_etag(s, etag); + } + + dump_last_modified(s, mtime); + set_req_state_err(s, op_ret); + dump_errno(s); + end_header(s, this); + rgw_flush_formatter_and_reset(s, s->formatter); +} + +static int get_swift_account_settings(req_state * const s, + RGWRados * const store, + RGWAccessControlPolicy_SWIFTAcct * const policy, + bool * const has_policy) +{ + *has_policy = false; + + const char * const acl_attr = s->info.env->get("HTTP_X_ACCOUNT_ACCESS_CONTROL"); + if (acl_attr) { + RGWAccessControlPolicy_SWIFTAcct swift_acct_policy(s->cct); + const bool r = swift_acct_policy.create(store, + s->user->user_id, + s->user->display_name, + string(acl_attr)); + if (r != true) { + return -EINVAL; + } + + *policy = swift_acct_policy; + *has_policy = true; + } + + return 0; +} + +int RGWPutMetadataAccount_ObjStore_SWIFT::get_params() +{ + if (s->has_bad_meta) { + return -EINVAL; + } + + int ret = get_swift_account_settings(s, + store, + // FIXME: we need to carry unique_ptr in generic class + // and allocate appropriate ACL class in the ctor + static_cast(&policy), + &has_policy); + if (ret < 0) { + return ret; + } + + get_rmattrs_from_headers(s, ACCT_PUT_ATTR_PREFIX, ACCT_REMOVE_ATTR_PREFIX, + rmattr_names); + return 0; +} + +void RGWPutMetadataAccount_ObjStore_SWIFT::send_response() +{ + const auto meta_ret = handle_metadata_errors(s, op_ret); + if (meta_ret != op_ret) { + op_ret = meta_ret; + } else { + if (!op_ret) { + op_ret = STATUS_NO_CONTENT; + } + set_req_state_err(s, op_ret); + } + + dump_errno(s); + end_header(s, this); + rgw_flush_formatter_and_reset(s, s->formatter); +} + +int RGWPutMetadataBucket_ObjStore_SWIFT::get_params() +{ + if (s->has_bad_meta) { + return -EINVAL; + } + + int r = get_swift_container_settings(s, store, &policy, &has_policy, + &policy_rw_mask, &cors_config, &has_cors); + if (r < 0) 
{ + return r; + } + + get_rmattrs_from_headers(s, CONT_PUT_ATTR_PREFIX, CONT_REMOVE_ATTR_PREFIX, + rmattr_names); + placement_rule.init(s->info.env->get("HTTP_X_STORAGE_POLICY", ""), s->info.storage_class); + + return get_swift_versioning_settings(s, swift_ver_location); +} + +void RGWPutMetadataBucket_ObjStore_SWIFT::send_response() +{ + const auto meta_ret = handle_metadata_errors(s, op_ret); + if (meta_ret != op_ret) { + op_ret = meta_ret; + } else { + if (!op_ret && (op_ret != -EINVAL)) { + op_ret = STATUS_NO_CONTENT; + } + set_req_state_err(s, op_ret); + } + + dump_errno(s); + end_header(s, this); + rgw_flush_formatter_and_reset(s, s->formatter); +} + +int RGWPutMetadataObject_ObjStore_SWIFT::get_params() +{ + if (s->has_bad_meta) { + return -EINVAL; + } + + /* Handle Swift object expiration. */ + int r = get_delete_at_param(s, delete_at); + if (r < 0) { + ldout(s->cct, 5) << "ERROR: failed to get Delete-At param" << dendl; + return r; + } + + dlo_manifest = s->info.env->get("HTTP_X_OBJECT_MANIFEST"); + + return 0; +} + +void RGWPutMetadataObject_ObjStore_SWIFT::send_response() +{ + const auto meta_ret = handle_metadata_errors(s, op_ret); + if (meta_ret != op_ret) { + op_ret = meta_ret; + } else { + if (!op_ret) { + op_ret = STATUS_ACCEPTED; + } + set_req_state_err(s, op_ret); + } + + if (!s->is_err()) { + dump_content_length(s, 0); + } + + dump_errno(s); + end_header(s, this); + rgw_flush_formatter_and_reset(s, s->formatter); +} + +static void bulkdelete_respond(const unsigned num_deleted, + const unsigned int num_unfound, + const std::list& failures, + const int prot_flags, /* in */ + ceph::Formatter& formatter) /* out */ +{ + formatter.open_object_section("delete"); + + string resp_status; + string resp_body; + + if (!failures.empty()) { + int reason = ERR_INVALID_REQUEST; + for (const auto fail_desc : failures) { + if (-ENOENT != fail_desc.err && -EACCES != fail_desc.err) { + reason = fail_desc.err; + } + } + rgw_err err; + set_req_state_err(err, reason, 
prot_flags); + dump_errno(err, resp_status); + } else if (0 == num_deleted && 0 == num_unfound) { + /* 400 Bad Request */ + dump_errno(400, resp_status); + resp_body = "Invalid bulk delete."; + } else { + /* 200 OK */ + dump_errno(200, resp_status); + } + + encode_json("Number Deleted", num_deleted, &formatter); + encode_json("Number Not Found", num_unfound, &formatter); + encode_json("Response Body", resp_body, &formatter); + encode_json("Response Status", resp_status, &formatter); + + formatter.open_array_section("Errors"); + for (const auto fail_desc : failures) { + formatter.open_array_section("object"); + + stringstream ss_name; + ss_name << fail_desc.path; + encode_json("Name", ss_name.str(), &formatter); + + rgw_err err; + set_req_state_err(err, fail_desc.err, prot_flags); + string status; + dump_errno(err, status); + encode_json("Status", status, &formatter); + formatter.close_section(); + } + formatter.close_section(); + + formatter.close_section(); +} + +int RGWDeleteObj_ObjStore_SWIFT::verify_permission() +{ + op_ret = RGWDeleteObj_ObjStore::verify_permission(); + + /* We have to differentiate error codes depending on whether user is + * anonymous (401 Unauthorized) or he doesn't have necessary permissions + * (403 Forbidden). 
*/ + if (s->auth.identity->is_anonymous() && op_ret == -EACCES) { + return -EPERM; + } else { + return op_ret; + } +} + +int RGWDeleteObj_ObjStore_SWIFT::get_params() +{ + const string& mm = s->info.args.get("multipart-manifest"); + multipart_delete = (mm.compare("delete") == 0); + + return RGWDeleteObj_ObjStore::get_params(); +} + +void RGWDeleteObj_ObjStore_SWIFT::send_response() +{ + int r = op_ret; + + if (multipart_delete) { + r = 0; + } else if(!r) { + r = STATUS_NO_CONTENT; + } + + set_req_state_err(s, r); + dump_errno(s); + + if (multipart_delete) { + end_header(s, this /* RGWOp */, nullptr /* contype */, + CHUNKED_TRANSFER_ENCODING); + + if (deleter) { + bulkdelete_respond(deleter->get_num_deleted(), + deleter->get_num_unfound(), + deleter->get_failures(), + s->prot_flags, + *s->formatter); + } else if (-ENOENT == op_ret) { + bulkdelete_respond(0, 1, {}, s->prot_flags, *s->formatter); + } else { + RGWBulkDelete::acct_path_t path; + path.bucket_name = s->bucket_name; + path.obj_key = s->object; + + RGWBulkDelete::fail_desc_t fail_desc; + fail_desc.err = op_ret; + fail_desc.path = path; + + bulkdelete_respond(0, 0, { fail_desc }, s->prot_flags, *s->formatter); + } + } else { + end_header(s, this); + } + + rgw_flush_formatter_and_reset(s, s->formatter); + +} + +static void get_contype_from_attrs(map& attrs, + string& content_type) +{ + map::iterator iter = attrs.find(RGW_ATTR_CONTENT_TYPE); + if (iter != attrs.end()) { + content_type = rgw_bl_str(iter->second); + } +} + +static void dump_object_metadata(struct req_state * const s, + const map& attrs) +{ + map response_attrs; + + for (auto kv : attrs) { + const char * name = kv.first.c_str(); + const auto aiter = rgw_to_http_attrs.find(name); + + if (aiter != std::end(rgw_to_http_attrs)) { + response_attrs[aiter->second] = rgw_bl_str(kv.second); + } else if (strcmp(name, RGW_ATTR_SLO_UINDICATOR) == 0) { + // this attr has an extra length prefix from encode() in prior versions + dump_header(s, 
"X-Object-Meta-Static-Large-Object", "True"); + } else if (strncmp(name, RGW_ATTR_META_PREFIX, + sizeof(RGW_ATTR_META_PREFIX)-1) == 0) { + name += sizeof(RGW_ATTR_META_PREFIX) - 1; + dump_header_prefixed(s, "X-Object-Meta-", + camelcase_dash_http_attr(name), kv.second); + } + } + + /* Handle override and fallback for Content-Disposition HTTP header. + * At the moment this will be used only by TempURL of the Swift API. */ + const auto cditer = rgw_to_http_attrs.find(RGW_ATTR_CONTENT_DISP); + if (cditer != std::end(rgw_to_http_attrs)) { + const auto& name = cditer->second; + + if (!s->content_disp.override.empty()) { + response_attrs[name] = s->content_disp.override; + } else if (!s->content_disp.fallback.empty() + && response_attrs.find(name) == std::end(response_attrs)) { + response_attrs[name] = s->content_disp.fallback; + } + } + + for (const auto kv : response_attrs) { + dump_header(s, kv.first, kv.second); + } + + const auto iter = attrs.find(RGW_ATTR_DELETE_AT); + if (iter != std::end(attrs)) { + utime_t delete_at; + try { + decode(delete_at, iter->second); + if (!delete_at.is_zero()) { + dump_header(s, "X-Delete-At", delete_at.sec()); + } + } catch (buffer::error& err) { + ldout(s->cct, 0) << "ERROR: cannot decode object's " RGW_ATTR_DELETE_AT + " attr, ignoring" + << dendl; + } + } +} + +int RGWCopyObj_ObjStore_SWIFT::init_dest_policy() +{ + dest_policy.create_default(s->user->user_id, s->user->display_name); + + return 0; +} + +int RGWCopyObj_ObjStore_SWIFT::get_params() +{ + if_mod = s->info.env->get("HTTP_IF_MODIFIED_SINCE"); + if_unmod = s->info.env->get("HTTP_IF_UNMODIFIED_SINCE"); + if_match = s->info.env->get("HTTP_COPY_IF_MATCH"); + if_nomatch = s->info.env->get("HTTP_COPY_IF_NONE_MATCH"); + + src_tenant_name = s->src_tenant_name; + src_bucket_name = s->src_bucket_name; + src_object = s->src_object; + dest_tenant_name = s->bucket_tenant; + dest_bucket_name = s->bucket_name; + dest_object = s->object.name; + + const char * const fresh_meta = 
s->info.env->get("HTTP_X_FRESH_METADATA"); + if (fresh_meta && strcasecmp(fresh_meta, "TRUE") == 0) { + attrs_mod = RGWRados::ATTRSMOD_REPLACE; + } else { + attrs_mod = RGWRados::ATTRSMOD_MERGE; + } + + int r = get_delete_at_param(s, delete_at); + if (r < 0) { + ldout(s->cct, 5) << "ERROR: failed to get Delete-At param" << dendl; + return r; + } + + return 0; +} + +void RGWCopyObj_ObjStore_SWIFT::send_partial_response(off_t ofs) +{ + if (! sent_header) { + if (! op_ret) + op_ret = STATUS_CREATED; + set_req_state_err(s, op_ret); + dump_errno(s); + end_header(s, this); + + /* Send progress information. Note that this diverge from the original swift + * spec. We do this in order to keep connection alive. + */ + if (op_ret == 0) { + s->formatter->open_array_section("progress"); + } + sent_header = true; + } else { + s->formatter->dump_int("ofs", (uint64_t)ofs); + } + rgw_flush_formatter(s, s->formatter); +} + +void RGWCopyObj_ObjStore_SWIFT::dump_copy_info() +{ + /* Dump X-Copied-From. */ + dump_header(s, "X-Copied-From", url_encode(src_bucket.name) + + "/" + url_encode(src_object.name)); + + /* Dump X-Copied-From-Account. */ + /* XXX tenant */ + dump_header(s, "X-Copied-From-Account", url_encode(s->user->user_id.id)); + + /* Dump X-Copied-From-Last-Modified. */ + dump_time_header(s, "X-Copied-From-Last-Modified", src_mtime); +} + +void RGWCopyObj_ObjStore_SWIFT::send_response() +{ + if (! sent_header) { + string content_type; + if (! op_ret) + op_ret = STATUS_CREATED; + set_req_state_err(s, op_ret); + dump_errno(s); + dump_etag(s, etag); + dump_last_modified(s, mtime); + dump_copy_info(); + get_contype_from_attrs(attrs, content_type); + dump_object_metadata(s, attrs); + end_header(s, this, !content_type.empty() ? 
content_type.c_str() + : "binary/octet-stream"); + } else { + s->formatter->close_section(); + rgw_flush_formatter(s, s->formatter); + } +} + +int RGWGetObj_ObjStore_SWIFT::verify_permission() +{ + op_ret = RGWGetObj_ObjStore::verify_permission(); + + /* We have to differentiate error codes depending on whether user is + * anonymous (401 Unauthorized) or he doesn't have necessary permissions + * (403 Forbidden). */ + if (s->auth.identity->is_anonymous() && op_ret == -EACCES) { + return -EPERM; + } else { + return op_ret; + } +} + +int RGWGetObj_ObjStore_SWIFT::get_params() +{ + const string& mm = s->info.args.get("multipart-manifest"); + skip_manifest = (mm.compare("get") == 0); + + return RGWGetObj_ObjStore::get_params(); +} + +int RGWGetObj_ObjStore_SWIFT::send_response_data_error() +{ + std::string error_content; + op_ret = error_handler(op_ret, &error_content); + if (! op_ret) { + /* The error handler has taken care of the error. */ + return 0; + } + + bufferlist error_bl; + error_bl.append(error_content); + return send_response_data(error_bl, 0, error_bl.length()); +} + +int RGWGetObj_ObjStore_SWIFT::send_response_data(bufferlist& bl, + const off_t bl_ofs, + const off_t bl_len) +{ + string content_type; + + if (sent_header) { + goto send_data; + } + + if (custom_http_ret) { + set_req_state_err(s, 0); + dump_errno(s, custom_http_ret); + } else { + set_req_state_err(s, (partial_content && !op_ret) ? STATUS_PARTIAL_CONTENT + : op_ret); + dump_errno(s); + + if (s->is_err()) { + end_header(s, NULL); + return 0; + } + } + + if (range_str) { + dump_range(s, ofs, end, s->obj_size); + } + + if (s->is_err()) { + end_header(s, NULL); + return 0; + } + + dump_content_length(s, total_len); + dump_last_modified(s, lastmod); + dump_header(s, "X-Timestamp", utime_t(lastmod)); + if (is_slo) { + dump_header(s, "X-Static-Large-Object", "True"); + } + + if (! op_ret) { + if (! 
lo_etag.empty()) { + dump_etag(s, lo_etag, true /* quoted */); + } else { + auto iter = attrs.find(RGW_ATTR_ETAG); + if (iter != attrs.end()) { + dump_etag(s, iter->second.to_str()); + } + } + + get_contype_from_attrs(attrs, content_type); + dump_object_metadata(s, attrs); + } + + end_header(s, this, !content_type.empty() ? content_type.c_str() + : "binary/octet-stream"); + + sent_header = true; + +send_data: + if (get_data && !op_ret) { + const auto r = dump_body(s, bl.c_str() + bl_ofs, bl_len); + if (r < 0) { + return r; + } + } + rgw_flush_formatter_and_reset(s, s->formatter); + + return 0; +} + +void RGWOptionsCORS_ObjStore_SWIFT::send_response() +{ + string hdrs, exp_hdrs; + uint32_t max_age = CORS_MAX_AGE_INVALID; + /*EACCES means, there is no CORS registered yet for the bucket + *ENOENT means, there is no match of the Origin in the list of CORSRule + */ + if (op_ret == -ENOENT) + op_ret = -EACCES; + if (op_ret < 0) { + set_req_state_err(s, op_ret); + dump_errno(s); + end_header(s, NULL); + return; + } + get_response_params(hdrs, exp_hdrs, &max_age); + dump_errno(s); + dump_access_control(s, origin, req_meth, hdrs.c_str(), exp_hdrs.c_str(), + max_age); + end_header(s, NULL); +} + +int RGWBulkDelete_ObjStore_SWIFT::get_data( + list& items, bool * const is_truncated) +{ + constexpr size_t MAX_LINE_SIZE = 2048; + + RGWClientIOStreamBuf ciosb(static_cast(*(s->cio)), + size_t(s->cct->_conf->rgw_max_chunk_size)); + istream cioin(&ciosb); + + char buf[MAX_LINE_SIZE]; + while (cioin.getline(buf, sizeof(buf))) { + string path_str(buf); + + ldout(s->cct, 20) << "extracted Bulk Delete entry: " << path_str << dendl; + + RGWBulkDelete::acct_path_t path; + + /* We need to skip all slashes at the beginning in order to preserve + * compliance with Swift. */ + const size_t start_pos = path_str.find_first_not_of('/'); + + if (string::npos != start_pos) { + /* Seperator is the first slash after the leading ones. 
*/ + const size_t sep_pos = path_str.find('/', start_pos); + + if (string::npos != sep_pos) { + path.bucket_name = url_decode(path_str.substr(start_pos, + sep_pos - start_pos)); + path.obj_key = url_decode(path_str.substr(sep_pos + 1)); + } else { + /* It's guaranteed here that bucket name is at least one character + * long and is different than slash. */ + path.bucket_name = url_decode(path_str.substr(start_pos)); + } + + items.push_back(path); + } + + if (items.size() == MAX_CHUNK_ENTRIES) { + *is_truncated = true; + return 0; + } + } + + *is_truncated = false; + return 0; +} + +void RGWBulkDelete_ObjStore_SWIFT::send_response() +{ + set_req_state_err(s, op_ret); + dump_errno(s); + end_header(s, this /* RGWOp */, nullptr /* contype */, + CHUNKED_TRANSFER_ENCODING); + + bulkdelete_respond(deleter->get_num_deleted(), + deleter->get_num_unfound(), + deleter->get_failures(), + s->prot_flags, + *s->formatter); + rgw_flush_formatter_and_reset(s, s->formatter); +} + + +std::unique_ptr +RGWBulkUploadOp_ObjStore_SWIFT::create_stream() +{ + class SwiftStreamGetter : public StreamGetter { + const size_t conlen; + size_t curpos; + req_state* const s; + + public: + SwiftStreamGetter(req_state* const s, const size_t conlen) + : conlen(conlen), + curpos(0), + s(s) { + } + + ssize_t get_at_most(size_t want, ceph::bufferlist& dst) override { + /* maximum requested by a caller */ + /* data provided by client */ + /* RadosGW's limit. 
*/ + const size_t max_chunk_size = \ + static_cast(s->cct->_conf->rgw_max_chunk_size); + const size_t max_to_read = std::min({ want, conlen - curpos, max_chunk_size }); + + ldout(s->cct, 20) << "bulk_upload: get_at_most max_to_read=" + << max_to_read + << ", dst.c_str()=" << reinterpret_cast(dst.c_str()) << dendl; + + bufferptr bp(max_to_read); + const auto read_len = recv_body(s, bp.c_str(), max_to_read); + dst.append(bp, 0, read_len); + //const auto read_len = recv_body(s, dst.c_str(), max_to_read); + if (read_len < 0) { + return read_len; + } + + curpos += read_len; + return curpos > s->cct->_conf->rgw_max_put_size ? -ERR_TOO_LARGE + : read_len; + } + + ssize_t get_exactly(size_t want, ceph::bufferlist& dst) override { + ldout(s->cct, 20) << "bulk_upload: get_exactly want=" << want << dendl; + + /* FIXME: do this in a loop. */ + const auto ret = get_at_most(want, dst); + ldout(s->cct, 20) << "bulk_upload: get_exactly ret=" << ret << dendl; + if (ret < 0) { + return ret; + } else if (static_cast(ret) != want) { + return -EINVAL; + } else { + return want; + } + } + }; + + if (! s->length) { + op_ret = -EINVAL; + return nullptr; + } else { + ldout(s->cct, 20) << "bulk upload: create_stream for length=" + << s->length << dendl; + + const size_t conlen = atoll(s->length); + return std::unique_ptr(new SwiftStreamGetter(s, conlen)); + } +} + +void RGWBulkUploadOp_ObjStore_SWIFT::send_response() +{ + set_req_state_err(s, op_ret); + dump_errno(s); + end_header(s, this /* RGWOp */, nullptr /* contype */, + CHUNKED_TRANSFER_ENCODING); + rgw_flush_formatter_and_reset(s, s->formatter); + + s->formatter->open_object_section("delete"); + + std::string resp_status; + std::string resp_body; + + if (! failures.empty()) { + rgw_err err; + + const auto last_err = { failures.back().err }; + if (boost::algorithm::contains(last_err, terminal_errors)) { + /* The terminal errors are affecting the status of the whole upload. 
*/ + set_req_state_err(err, failures.back().err, s->prot_flags); + } else { + set_req_state_err(err, ERR_INVALID_REQUEST, s->prot_flags); + } + + dump_errno(err, resp_status); + } else if (0 == num_created && failures.empty()) { + /* Nothing created, nothing failed. This means the archive contained no + * entity we could understand (regular file or directory). We need to + * send 400 Bad Request to an HTTP client in the internal status field. */ + dump_errno(400, resp_status); + resp_body = "Invalid Tar File: No Valid Files"; + } else { + /* 200 OK */ + dump_errno(201, resp_status); + } + + encode_json("Number Files Created", num_created, s->formatter); + encode_json("Response Body", resp_body, s->formatter); + encode_json("Response Status", resp_status, s->formatter); + + s->formatter->open_array_section("Errors"); + for (const auto& fail_desc : failures) { + s->formatter->open_array_section("object"); + + encode_json("Name", fail_desc.path, s->formatter); + + rgw_err err; + set_req_state_err(err, fail_desc.err, s->prot_flags); + std::string status; + dump_errno(err, status); + encode_json("Status", status, s->formatter); + + s->formatter->close_section(); + } + s->formatter->close_section(); + + s->formatter->close_section(); + rgw_flush_formatter_and_reset(s, s->formatter); +} + + +void RGWGetCrossDomainPolicy_ObjStore_SWIFT::send_response() +{ + set_req_state_err(s, op_ret); + dump_errno(s); + end_header(s, this, "application/xml"); + + std::stringstream ss; + + ss << R"()" << "\n" + << R"()" << "\n" + << R"()" << "\n" + << g_conf()->rgw_cross_domain_policy << "\n" + << R"()"; + + dump_body(s, ss.str()); +} + +void RGWGetHealthCheck_ObjStore_SWIFT::send_response() +{ + set_req_state_err(s, op_ret); + dump_errno(s); + end_header(s, this, "application/xml"); + + if (op_ret) { + static constexpr char DISABLED[] = "DISABLED BY FILE"; + dump_body(s, DISABLED, strlen(DISABLED)); + } +} + +const vector> RGWInfo_ObjStore_SWIFT::swift_info = +{ + {"bulk_delete", {false, 
nullptr}}, + {"container_quotas", {false, nullptr}}, + {"swift", {false, RGWInfo_ObjStore_SWIFT::list_swift_data}}, + {"tempurl", { false, RGWInfo_ObjStore_SWIFT::list_tempurl_data}}, + {"slo", {false, RGWInfo_ObjStore_SWIFT::list_slo_data}}, + {"account_quotas", {false, nullptr}}, + {"staticweb", {false, nullptr}}, + {"tempauth", {false, RGWInfo_ObjStore_SWIFT::list_tempauth_data}}, +}; + +void RGWInfo_ObjStore_SWIFT::execute() +{ + bool is_admin_info_enabled = false; + + const string& swiftinfo_sig = s->info.args.get("swiftinfo_sig"); + const string& swiftinfo_expires = s->info.args.get("swiftinfo_expires"); + + if (!swiftinfo_sig.empty() && + !swiftinfo_expires.empty() && + !is_expired(swiftinfo_expires, s->cct)) { + is_admin_info_enabled = true; + } + + s->formatter->open_object_section("info"); + + for (const auto& pair : swift_info) { + if(!is_admin_info_enabled && pair.second.is_admin_info) + continue; + + if (!pair.second.list_data) { + s->formatter->open_object_section((pair.first).c_str()); + s->formatter->close_section(); + } + else { + pair.second.list_data(*(s->formatter), s->cct->_conf, *store); + } + } + + s->formatter->close_section(); +} + +void RGWInfo_ObjStore_SWIFT::send_response() +{ + if (op_ret < 0) { + op_ret = STATUS_NO_CONTENT; + } + set_req_state_err(s, op_ret); + dump_errno(s); + end_header(s, this); + rgw_flush_formatter_and_reset(s, s->formatter); +} + +void RGWInfo_ObjStore_SWIFT::list_swift_data(Formatter& formatter, + const ConfigProxy& config, + RGWRados& store) +{ + formatter.open_object_section("swift"); + formatter.dump_int("max_file_size", config->rgw_max_put_size); + formatter.dump_int("container_listing_limit", RGW_LIST_BUCKETS_LIMIT_MAX); + + string ceph_version(CEPH_GIT_NICE_VER); + formatter.dump_string("version", ceph_version); + + const size_t max_attr_name_len = \ + g_conf().get_val("rgw_max_attr_name_len"); + if (max_attr_name_len) { + const size_t meta_name_limit = \ + max_attr_name_len - strlen(RGW_ATTR_PREFIX 
RGW_AMZ_META_PREFIX); + formatter.dump_int("max_meta_name_length", meta_name_limit); + } + + const size_t meta_value_limit = g_conf().get_val("rgw_max_attr_size"); + if (meta_value_limit) { + formatter.dump_int("max_meta_value_length", meta_value_limit); + } + + const size_t meta_num_limit = \ + g_conf().get_val("rgw_max_attrs_num_in_req"); + if (meta_num_limit) { + formatter.dump_int("max_meta_count", meta_num_limit); + } + + formatter.open_array_section("policies"); + const RGWZoneGroup& zonegroup = store.svc.zone->get_zonegroup(); + + for (const auto& placement_targets : zonegroup.placement_targets) { + formatter.open_object_section("policy"); + if (placement_targets.second.name.compare(zonegroup.default_placement.name) == 0) + formatter.dump_bool("default", true); + formatter.dump_string("name", placement_targets.second.name.c_str()); + formatter.close_section(); + } + formatter.close_section(); + + formatter.dump_int("max_object_name_size", RGWHandler_REST::MAX_OBJ_NAME_LEN); + formatter.dump_bool("strict_cors_mode", true); + formatter.dump_int("max_container_name_length", RGWHandler_REST::MAX_BUCKET_NAME_LEN); + formatter.close_section(); +} + +void RGWInfo_ObjStore_SWIFT::list_tempauth_data(Formatter& formatter, + const ConfigProxy& config, + RGWRados& store) +{ + formatter.open_object_section("tempauth"); + formatter.dump_bool("account_acls", true); + formatter.close_section(); +} +void RGWInfo_ObjStore_SWIFT::list_tempurl_data(Formatter& formatter, + const ConfigProxy& config, + RGWRados& store) +{ + formatter.open_object_section("tempurl"); + formatter.open_array_section("methods"); + formatter.dump_string("methodname", "GET"); + formatter.dump_string("methodname", "HEAD"); + formatter.dump_string("methodname", "PUT"); + formatter.dump_string("methodname", "POST"); + formatter.dump_string("methodname", "DELETE"); + formatter.close_section(); + formatter.close_section(); +} + +void RGWInfo_ObjStore_SWIFT::list_slo_data(Formatter& formatter, + const 
ConfigProxy& config, + RGWRados& store) +{ + formatter.open_object_section("slo"); + formatter.dump_int("max_manifest_segments", config->rgw_max_slo_entries); + formatter.close_section(); +} + +bool RGWInfo_ObjStore_SWIFT::is_expired(const std::string& expires, CephContext* cct) +{ + string err; + const utime_t now = ceph_clock_now(); + const uint64_t expiration = (uint64_t)strict_strtoll(expires.c_str(), + 10, &err); + if (!err.empty()) { + ldout(cct, 5) << "failed to parse siginfo_expires: " << err << dendl; + return true; + } + + if (expiration <= (uint64_t)now.sec()) { + ldout(cct, 5) << "siginfo expired: " << expiration << " <= " << now.sec() << dendl; + return true; + } + + return false; +} + + +void RGWFormPost::init(RGWRados* const store, + req_state* const s, + RGWHandler* const dialect_handler) +{ + prefix = std::move(s->object.name); + s->object = rgw_obj_key(); + + return RGWPostObj_ObjStore::init(store, s, dialect_handler); +} + +std::size_t RGWFormPost::get_max_file_size() /*const*/ +{ + std::string max_str = get_part_str(ctrl_parts, "max_file_size", "0"); + + std::string err; + const std::size_t max_file_size = + static_cast(strict_strtoll(max_str.c_str(), 10, &err)); + + if (! err.empty()) { + ldout(s->cct, 5) << "failed to parse FormPost's max_file_size: " << err + << dendl; + return 0; + } + + return max_file_size; +} + +bool RGWFormPost::is_non_expired() +{ + std::string expires = get_part_str(ctrl_parts, "expires", "0"); + + std::string err; + const uint64_t expires_timestamp = + static_cast(strict_strtoll(expires.c_str(), 10, &err)); + + if (! 
err.empty()) { + dout(5) << "failed to parse FormPost's expires: " << err << dendl; + return false; + } + + const utime_t now = ceph_clock_now(); + if (expires_timestamp <= static_cast(now.sec())) { + dout(5) << "FormPost form expired: " + << expires_timestamp << " <= " << now.sec() << dendl; + return false; + } + + return true; +} + +bool RGWFormPost::is_integral() +{ + const std::string form_signature = get_part_str(ctrl_parts, "signature"); + + try { + get_owner_info(s, *s->user); + s->auth.identity = rgw::auth::transform_old_authinfo(s); + } catch (...) { + ldout(s->cct, 5) << "cannot get user_info of account's owner" << dendl; + return false; + } + + for (const auto& kv : s->user->temp_url_keys) { + const int temp_url_key_num = kv.first; + const string& temp_url_key = kv.second; + + if (temp_url_key.empty()) { + continue; + } + + SignatureHelper sig_helper; + sig_helper.calc(temp_url_key, + s->info.request_uri, + get_part_str(ctrl_parts, "redirect"), + get_part_str(ctrl_parts, "max_file_size", "0"), + get_part_str(ctrl_parts, "max_file_count", "0"), + get_part_str(ctrl_parts, "expires", "0")); + + const auto local_sig = sig_helper.get_signature(); + + ldout(s->cct, 20) << "FormPost signature [" << temp_url_key_num << "]" + << " (calculated): " << local_sig << dendl; + + if (sig_helper.is_equal_to(form_signature)) { + return true; + } else { + ldout(s->cct, 5) << "FormPost's signature mismatch: " + << local_sig << " != " << form_signature << dendl; + } + } + + return false; +} + +void RGWFormPost::get_owner_info(const req_state* const s, + RGWUserInfo& owner_info) const +{ + /* We cannot use req_state::bucket_name because it isn't available + * now. It will be initialized in RGWHandler_REST_SWIFT::postauth_init(). */ + const string& bucket_name = s->init_state.url_bucket; + + /* TempURL in Formpost only requires that bucket name is specified. 
*/ + if (bucket_name.empty()) { + throw -EPERM; + } + + string bucket_tenant; + if (!s->account_name.empty()) { + RGWUserInfo uinfo; + bool found = false; + + const rgw_user uid(s->account_name); + if (uid.tenant.empty()) { + const rgw_user tenanted_uid(uid.id, uid.id); + + if (rgw_get_user_info_by_uid(store, tenanted_uid, uinfo) >= 0) { + /* Succeeded. */ + bucket_tenant = uinfo.user_id.tenant; + found = true; + } + } + + if (!found && rgw_get_user_info_by_uid(store, uid, uinfo) < 0) { + throw -EPERM; + } else { + bucket_tenant = uinfo.user_id.tenant; + } + } + + /* Need to get user info of bucket owner. */ + RGWBucketInfo bucket_info; + int ret = store->get_bucket_info(*s->sysobj_ctx, + bucket_tenant, bucket_name, + bucket_info, nullptr); + if (ret < 0) { + throw ret; + } + + ldout(s->cct, 20) << "temp url user (bucket owner): " << bucket_info.owner + << dendl; + + if (rgw_get_user_info_by_uid(store, bucket_info.owner, owner_info) < 0) { + throw -EPERM; + } +} + +int RGWFormPost::get_params() +{ + /* The parentt class extracts boundary info from the Content-Type. */ + int ret = RGWPostObj_ObjStore::get_params(); + if (ret < 0) { + return ret; + } + + policy.create_default(s->user->user_id, s->user->display_name); + + /* Let's start parsing the HTTP body by parsing each form part step- + * by-step till encountering the first part with file data. 
*/ + do { + struct post_form_part part; + ret = read_form_part_header(&part, stream_done); + if (ret < 0) { + return ret; + } + + if (s->cct->_conf->subsys.should_gather()) { + ldout(s->cct, 20) << "read part header -- part.name=" + << part.name << dendl; + + for (const auto& pair : part.fields) { + ldout(s->cct, 20) << "field.name=" << pair.first << dendl; + ldout(s->cct, 20) << "field.val=" << pair.second.val << dendl; + ldout(s->cct, 20) << "field.params:" << dendl; + + for (const auto& param_pair : pair.second.params) { + ldout(s->cct, 20) << " " << param_pair.first + << " -> " << param_pair.second << dendl; + } + } + } + + if (stream_done) { + /* Unexpected here. */ + err_msg = "Malformed request"; + return -EINVAL; + } + + const auto field_iter = part.fields.find("Content-Disposition"); + if (std::end(part.fields) != field_iter && + std::end(field_iter->second.params) != field_iter->second.params.find("filename")) { + /* First data part ahead. */ + current_data_part = std::move(part); + + /* Stop the iteration. We can assume that all control parts have been + * already parsed. The rest of HTTP body should contain data parts + * only. They will be picked up by ::get_data(). */ + break; + } else { + /* Control part ahead. Receive, parse and store for later usage. */ + bool boundary; + ret = read_data(part.data, s->cct->_conf->rgw_max_chunk_size, + boundary, stream_done); + if (ret < 0) { + return ret; + } else if (! boundary) { + err_msg = "Couldn't find boundary"; + return -EINVAL; + } + + ctrl_parts[part.name] = std::move(part); + } + } while (! stream_done); + + min_len = 0; + max_len = get_max_file_size(); + + if (! current_data_part) { + err_msg = "FormPost: no files to process"; + return -EINVAL; + } + + if (! is_non_expired()) { + err_msg = "FormPost: Form Expired"; + return -EPERM; + } + + if (! 
is_integral()) { + err_msg = "FormPost: Invalid Signature"; + return -EPERM; + } + + return 0; +} + +std::string RGWFormPost::get_current_filename() const +{ + try { + const auto& field = current_data_part->fields.at("Content-Disposition"); + const auto iter = field.params.find("filename"); + + if (std::end(field.params) != iter) { + return prefix + iter->second; + } + } catch (std::out_of_range&) { + /* NOP */; + } + + return prefix; +} + +std::string RGWFormPost::get_current_content_type() const +{ + try { + const auto& field = current_data_part->fields.at("Content-Type"); + return field.val; + } catch (std::out_of_range&) { + /* NOP */; + } + + return std::string(); +} + +bool RGWFormPost::is_next_file_to_upload() +{ + if (! stream_done) { + /* We have at least one additional part in the body. */ + struct post_form_part part; + int r = read_form_part_header(&part, stream_done); + if (r < 0) { + return false; + } + + const auto field_iter = part.fields.find("Content-Disposition"); + if (std::end(part.fields) != field_iter) { + const auto& params = field_iter->second.params; + const auto& filename_iter = params.find("filename"); + + if (std::end(params) != filename_iter && ! filename_iter->second.empty()) { + current_data_part = std::move(part); + return true; + } + } + } + + return false; +} + +int RGWFormPost::get_data(ceph::bufferlist& bl, bool& again) +{ + bool boundary; + + int r = read_data(bl, s->cct->_conf->rgw_max_chunk_size, + boundary, stream_done); + if (r < 0) { + return r; + } + + /* Tell RGWPostObj::execute() that it has some data to put. */ + again = !boundary; + + return bl.length(); +} + +void RGWFormPost::send_response() +{ + std::string redirect = get_part_str(ctrl_parts, "redirect"); + if (! redirect.empty()) { + op_ret = STATUS_REDIRECT; + } + + set_req_state_err(s, op_ret); + s->err.err_code = err_msg; + dump_errno(s); + if (! 
redirect.empty()) { + dump_redirect(s, redirect); + } + end_header(s, this); +} + +bool RGWFormPost::is_formpost_req(req_state* const s) +{ + std::string content_type; + std::map params; + + parse_boundary_params(s->info.env->get("CONTENT_TYPE", ""), + content_type, params); + + return boost::algorithm::iequals(content_type, "multipart/form-data") && + params.count("boundary") > 0; +} + + +RGWOp *RGWHandler_REST_Service_SWIFT::op_get() +{ + return new RGWListBuckets_ObjStore_SWIFT; +} + +RGWOp *RGWHandler_REST_Service_SWIFT::op_head() +{ + return new RGWStatAccount_ObjStore_SWIFT; +} + +RGWOp *RGWHandler_REST_Service_SWIFT::op_put() +{ + if (s->info.args.exists("extract-archive")) { + return new RGWBulkUploadOp_ObjStore_SWIFT; + } + return nullptr; +} + +RGWOp *RGWHandler_REST_Service_SWIFT::op_post() +{ + if (s->info.args.exists("bulk-delete")) { + return new RGWBulkDelete_ObjStore_SWIFT; + } + return new RGWPutMetadataAccount_ObjStore_SWIFT; +} + +RGWOp *RGWHandler_REST_Service_SWIFT::op_delete() +{ + if (s->info.args.exists("bulk-delete")) { + return new RGWBulkDelete_ObjStore_SWIFT; + } + return NULL; +} + +int RGWSwiftWebsiteHandler::serve_errordoc(const int http_ret, + const std::string error_doc) +{ + /* Try to throw it all away. */ + s->formatter->reset(); + + class RGWGetErrorPage : public RGWGetObj_ObjStore_SWIFT { + public: + RGWGetErrorPage(RGWRados* const store, + RGWHandler_REST* const handler, + req_state* const s, + const int http_ret) { + /* Calling a virtual from the base class is safe as the subobject should + * be properly initialized and we haven't overridden the init method. */ + init(store, s, handler); + set_get_data(true); + set_custom_http_response(http_ret); + } + + int error_handler(const int err_no, + std::string* const error_content) override { + /* Enforce that any error generated while getting the error page will + * not be send to a client. 
This allows us to recover from the double + * fault situation by sending the original message. */ + return 0; + } + } get_errpage_op(store, handler, s, http_ret); + + s->object = std::to_string(http_ret) + error_doc; + + RGWOp* newop = &get_errpage_op; + RGWRequest req(0); + return rgw_process_authenticated(handler, newop, &req, s, true); +} + +int RGWSwiftWebsiteHandler::error_handler(const int err_no, + std::string* const error_content) +{ + const auto& ws_conf = s->bucket_info.website_conf; + + if (can_be_website_req() && ! ws_conf.error_doc.empty()) { + set_req_state_err(s, err_no); + return serve_errordoc(s->err.http_ret, ws_conf.error_doc); + } + + /* Let's go to the default, no-op handler. */ + return err_no; +} + +bool RGWSwiftWebsiteHandler::is_web_mode() const +{ + const boost::string_ref webmode = s->info.env->get("HTTP_X_WEB_MODE", ""); + return boost::algorithm::iequals(webmode, "true"); +} + +bool RGWSwiftWebsiteHandler::can_be_website_req() const +{ + /* Static website works only with the GET or HEAD method. Nothing more. */ + static const std::set ws_methods = { "GET", "HEAD" }; + if (ws_methods.count(s->info.method) == 0) { + return false; + } + + /* We also need to handle early failures from the auth system. In such cases + * req_state::auth.identity may be empty. Let's treat that the same way as + * the anonymous access. */ + if (! s->auth.identity) { + return true; + } + + /* Swift serves websites only for anonymous requests unless client explicitly + * requested this behaviour by supplying X-Web-Mode HTTP header set to true. 
*/ + if (s->auth.identity->is_anonymous() || is_web_mode()) { + return true; + } + + return false; +} + +RGWOp* RGWSwiftWebsiteHandler::get_ws_redirect_op() +{ + class RGWMovedPermanently: public RGWOp { + const std::string location; + public: + explicit RGWMovedPermanently(const std::string& location) + : location(location) { + } + + int verify_permission() override { + return 0; + } + + void execute() override { + op_ret = -ERR_PERMANENT_REDIRECT; + return; + } + + void send_response() override { + set_req_state_err(s, op_ret); + dump_errno(s); + dump_content_length(s, 0); + dump_redirect(s, location); + end_header(s, this); + } + + const char* name() const override { + return "RGWMovedPermanently"; + } + }; + + return new RGWMovedPermanently(s->info.request_uri + '/'); +} + +RGWOp* RGWSwiftWebsiteHandler::get_ws_index_op() +{ + /* Retarget to get obj on requested index file. */ + if (! s->object.empty()) { + s->object = s->object.name + + s->bucket_info.website_conf.get_index_doc(); + } else { + s->object = s->bucket_info.website_conf.get_index_doc(); + } + + auto getop = new RGWGetObj_ObjStore_SWIFT; + getop->set_get_data(boost::algorithm::equals("GET", s->info.method)); + + return getop; +} + +RGWOp* RGWSwiftWebsiteHandler::get_ws_listing_op() +{ + class RGWWebsiteListing : public RGWListBucket_ObjStore_SWIFT { + const std::string prefix_override; + + int get_params() override { + prefix = prefix_override; + max = default_max; + delimiter = "/"; + return 0; + } + + void send_response() override { + /* Generate the header now. */ + set_req_state_err(s, op_ret); + dump_errno(s); + dump_container_metadata(s, bucket, bucket_quota, + s->bucket_info.website_conf); + end_header(s, this, "text/html"); + if (op_ret < 0) { + return; + } + + /* Now it's the time to start generating HTML bucket listing. + * All the crazy stuff with crafting tags will be delegated to + * RGWSwiftWebsiteListingFormatter. 
*/ + std::stringstream ss; + RGWSwiftWebsiteListingFormatter htmler(ss, prefix); + + const auto& ws_conf = s->bucket_info.website_conf; + htmler.generate_header(s->decoded_uri, + ws_conf.listing_css_doc); + + for (const auto& pair : common_prefixes) { + std::string subdir_name = pair.first; + if (! subdir_name.empty()) { + /* To be compliant with Swift we need to remove the trailing + * slash. */ + subdir_name.pop_back(); + } + + htmler.dump_subdir(subdir_name); + } + + for (const rgw_bucket_dir_entry& obj : objs) { + if (! common_prefixes.count(obj.key.name + '/')) { + htmler.dump_object(obj); + } + } + + htmler.generate_footer(); + dump_body(s, ss.str()); + } + public: + /* Taking prefix_override by value to leverage std::string r-value ref + * ctor and thus avoid extra memory copying/increasing ref counter. */ + explicit RGWWebsiteListing(std::string prefix_override) + : prefix_override(std::move(prefix_override)) { + } + }; + + std::string prefix = std::move(s->object.name); + s->object = rgw_obj_key(); + + return new RGWWebsiteListing(std::move(prefix)); +} + +bool RGWSwiftWebsiteHandler::is_web_dir() const +{ + std::string subdir_name = url_decode(s->object.name); + + /* Remove character from the subdir name if it is "/". */ + if (subdir_name.empty()) { + return false; + } else if (subdir_name.back() == '/') { + subdir_name.pop_back(); + } + + rgw_obj obj(s->bucket, std::move(subdir_name)); + + /* First, get attrset of the object we'll try to retrieve. */ + RGWObjectCtx& obj_ctx = *static_cast(s->obj_ctx); + obj_ctx.set_atomic(obj); + obj_ctx.set_prefetch_data(obj); + + RGWObjState* state = nullptr; + if (store->get_obj_state(&obj_ctx, s->bucket_info, obj, &state, false) < 0) { + return false; + } + + /* A nonexistent object cannot be a considered as a marker representing + * the emulation of catalog in FS hierarchy. */ + if (! state->exists) { + return false; + } + + /* Decode the content type. 
*/ + std::string content_type; + get_contype_from_attrs(state->attrset, content_type); + + const auto& ws_conf = s->bucket_info.website_conf; + const std::string subdir_marker = ws_conf.subdir_marker.empty() + ? "application/directory" + : ws_conf.subdir_marker; + return subdir_marker == content_type && state->size <= 1; +} + +bool RGWSwiftWebsiteHandler::is_index_present(const std::string& index) +{ + rgw_obj obj(s->bucket, index); + + RGWObjectCtx& obj_ctx = *static_cast(s->obj_ctx); + obj_ctx.set_atomic(obj); + obj_ctx.set_prefetch_data(obj); + + RGWObjState* state = nullptr; + if (store->get_obj_state(&obj_ctx, s->bucket_info, obj, &state, false) < 0) { + return false; + } + + /* A nonexistent object cannot be a considered as a viable index. We will + * try to list the bucket or - if this is impossible - return an error. */ + return state->exists; +} + +int RGWSwiftWebsiteHandler::retarget_bucket(RGWOp* op, RGWOp** new_op) +{ + ldout(s->cct, 10) << "Starting retarget" << dendl; + RGWOp* op_override = nullptr; + + /* In Swift static web content is served if the request is anonymous or + * has X-Web-Mode HTTP header specified to true. */ + if (can_be_website_req()) { + const auto& ws_conf = s->bucket_info.website_conf; + const auto& index = s->bucket_info.website_conf.get_index_doc(); + + if (s->decoded_uri.back() != '/') { + op_override = get_ws_redirect_op(); + } else if (! index.empty() && is_index_present(index)) { + op_override = get_ws_index_op(); + } else if (ws_conf.listing_enabled) { + op_override = get_ws_listing_op(); + } + } + + if (op_override) { + handler->put_op(op); + op_override->init(store, s, handler); + + *new_op = op_override; + } else { + *new_op = op; + } + + /* Return 404 Not Found is the request has web mode enforced but we static web + * wasn't able to serve it accordingly. */ + return ! op_override && is_web_mode() ? 
-ENOENT : 0; +} + +int RGWSwiftWebsiteHandler::retarget_object(RGWOp* op, RGWOp** new_op) +{ + ldout(s->cct, 10) << "Starting object retarget" << dendl; + RGWOp* op_override = nullptr; + + /* In Swift static web content is served if the request is anonymous or + * has X-Web-Mode HTTP header specified to true. */ + if (can_be_website_req() && is_web_dir()) { + const auto& ws_conf = s->bucket_info.website_conf; + const auto& index = s->bucket_info.website_conf.get_index_doc(); + + if (s->decoded_uri.back() != '/') { + op_override = get_ws_redirect_op(); + } else if (! index.empty() && is_index_present(index)) { + op_override = get_ws_index_op(); + } else if (ws_conf.listing_enabled) { + op_override = get_ws_listing_op(); + } + } else { + /* A regular request or the specified object isn't a subdirectory marker. + * We don't need any re-targeting. Error handling (like sending a custom + * error page) will be performed by error_handler of the actual RGWOp. */ + return 0; + } + + if (op_override) { + handler->put_op(op); + op_override->init(store, s, handler); + + *new_op = op_override; + } else { + *new_op = op; + } + + /* Return 404 Not Found if we aren't able to re-target for subdir marker. */ + return ! op_override ? 
-ENOENT : 0; +} + + +RGWOp *RGWHandler_REST_Bucket_SWIFT::get_obj_op(bool get_data) +{ + if (is_acl_op()) { + return new RGWGetACLs_ObjStore_SWIFT; + } + + if (get_data) + return new RGWListBucket_ObjStore_SWIFT; + else + return new RGWStatBucket_ObjStore_SWIFT; +} + +RGWOp *RGWHandler_REST_Bucket_SWIFT::op_get() +{ + return get_obj_op(true); +} + +RGWOp *RGWHandler_REST_Bucket_SWIFT::op_head() +{ + return get_obj_op(false); +} + +RGWOp *RGWHandler_REST_Bucket_SWIFT::op_put() +{ + if (is_acl_op()) { + return new RGWPutACLs_ObjStore_SWIFT; + } + if(s->info.args.exists("extract-archive")) { + return new RGWBulkUploadOp_ObjStore_SWIFT; + } + return new RGWCreateBucket_ObjStore_SWIFT; +} + +RGWOp *RGWHandler_REST_Bucket_SWIFT::op_delete() +{ + return new RGWDeleteBucket_ObjStore_SWIFT; +} + +RGWOp *RGWHandler_REST_Bucket_SWIFT::op_post() +{ + if (RGWFormPost::is_formpost_req(s)) { + return new RGWFormPost; + } else { + return new RGWPutMetadataBucket_ObjStore_SWIFT; + } +} + +RGWOp *RGWHandler_REST_Bucket_SWIFT::op_options() +{ + return new RGWOptionsCORS_ObjStore_SWIFT; +} + + +RGWOp *RGWHandler_REST_Obj_SWIFT::get_obj_op(bool get_data) +{ + if (is_acl_op()) { + return new RGWGetACLs_ObjStore_SWIFT; + } + + RGWGetObj_ObjStore_SWIFT *get_obj_op = new RGWGetObj_ObjStore_SWIFT; + get_obj_op->set_get_data(get_data); + return get_obj_op; +} + +RGWOp *RGWHandler_REST_Obj_SWIFT::op_get() +{ + return get_obj_op(true); +} + +RGWOp *RGWHandler_REST_Obj_SWIFT::op_head() +{ + return get_obj_op(false); +} + +RGWOp *RGWHandler_REST_Obj_SWIFT::op_put() +{ + if (is_acl_op()) { + return new RGWPutACLs_ObjStore_SWIFT; + } + if(s->info.args.exists("extract-archive")) { + return new RGWBulkUploadOp_ObjStore_SWIFT; + } + if (s->init_state.src_bucket.empty()) + return new RGWPutObj_ObjStore_SWIFT; + else + return new RGWCopyObj_ObjStore_SWIFT; +} + +RGWOp *RGWHandler_REST_Obj_SWIFT::op_delete() +{ + return new RGWDeleteObj_ObjStore_SWIFT; +} + +RGWOp *RGWHandler_REST_Obj_SWIFT::op_post() 
+{ + if (RGWFormPost::is_formpost_req(s)) { + return new RGWFormPost; + } else { + return new RGWPutMetadataObject_ObjStore_SWIFT; + } +} + +RGWOp *RGWHandler_REST_Obj_SWIFT::op_copy() +{ + return new RGWCopyObj_ObjStore_SWIFT; +} + +RGWOp *RGWHandler_REST_Obj_SWIFT::op_options() +{ + return new RGWOptionsCORS_ObjStore_SWIFT; +} + + +int RGWHandler_REST_SWIFT::authorize(const DoutPrefixProvider *dpp) +{ + return rgw::auth::Strategy::apply(dpp, auth_strategy, s); +} + +int RGWHandler_REST_SWIFT::postauth_init() +{ + struct req_init_state* t = &s->init_state; + + /* XXX Stub this until Swift Auth sets account into URL. */ + s->bucket_tenant = s->user->user_id.tenant; + s->bucket_name = t->url_bucket; + + dout(10) << "s->object=" << + (!s->object.empty() ? s->object : rgw_obj_key("")) + << " s->bucket=" + << rgw_make_bucket_entry_name(s->bucket_tenant, s->bucket_name) + << dendl; + + int ret; + ret = rgw_validate_tenant_name(s->bucket_tenant); + if (ret) + return ret; + ret = validate_bucket_name(s->bucket_name); + if (ret) + return ret; + ret = validate_object_name(s->object.name); + if (ret) + return ret; + + if (!t->src_bucket.empty()) { + /* + * We don't allow cross-tenant copy at present. It requires account + * names in the URL for Swift. + */ + s->src_tenant_name = s->user->user_id.tenant; + s->src_bucket_name = t->src_bucket; + + ret = validate_bucket_name(s->src_bucket_name); + if (ret < 0) { + return ret; + } + ret = validate_object_name(s->src_object.name); + if (ret < 0) { + return ret; + } + } + + return 0; +} + +int RGWHandler_REST_SWIFT::validate_bucket_name(const string& bucket) +{ + const size_t len = bucket.size(); + + if (len > MAX_BUCKET_NAME_LEN) { + /* Bucket Name too long. Generate custom error message and bind it + * to an R-value reference. 
*/ + const auto msg = boost::str( + boost::format("Container name length of %lld longer than %lld") + % len % int(MAX_BUCKET_NAME_LEN)); + set_req_state_err(s, ERR_INVALID_BUCKET_NAME, msg); + return -ERR_INVALID_BUCKET_NAME; + } + + const auto ret = RGWHandler_REST::validate_bucket_name(bucket); + if (ret < 0) { + return ret; + } + + if (len == 0) + return 0; + + if (bucket[0] == '.') + return -ERR_INVALID_BUCKET_NAME; + + if (check_utf8(bucket.c_str(), len)) + return -ERR_INVALID_UTF8; + + const char *s = bucket.c_str(); + + for (size_t i = 0; i < len; ++i, ++s) { + if (*(unsigned char *)s == 0xff) + return -ERR_INVALID_BUCKET_NAME; + if (*(unsigned char *)s == '/') + return -ERR_INVALID_BUCKET_NAME; + } + + return 0; +} + +static void next_tok(string& str, string& tok, char delim) +{ + if (str.size() == 0) { + tok = ""; + return; + } + tok = str; + int pos = str.find(delim); + if (pos > 0) { + tok = str.substr(0, pos); + str = str.substr(pos + 1); + } else { + str = ""; + } +} + +int RGWHandler_REST_SWIFT::init_from_header(struct req_state* const s, + const std::string& frontend_prefix) +{ + string req; + string first; + + s->prot_flags |= RGW_REST_SWIFT; + + char reqbuf[frontend_prefix.length() + s->decoded_uri.length() + 1]; + sprintf(reqbuf, "%s%s", frontend_prefix.c_str(), s->decoded_uri.c_str()); + const char *req_name = reqbuf; + + const char *p; + + if (*req_name == '?') { + p = req_name; + } else { + p = s->info.request_params.c_str(); + } + + s->info.args.set(p); + s->info.args.parse(); + + /* Skip the leading slash of URL hierarchy. */ + if (req_name[0] != '/') { + return 0; + } else { + req_name++; + } + + if ('\0' == req_name[0]) { + return g_conf()->rgw_swift_url_prefix == "/" ? 
-ERR_BAD_URL : 0; + } + + req = req_name; + + size_t pos = req.find('/'); + if (std::string::npos != pos && g_conf()->rgw_swift_url_prefix != "/") { + bool cut_url = g_conf()->rgw_swift_url_prefix.length(); + first = req.substr(0, pos); + + if (first.compare(g_conf()->rgw_swift_url_prefix) == 0) { + if (cut_url) { + /* Rewind to the "v1/..." part. */ + next_tok(req, first, '/'); + } + } + } else if (req.compare(g_conf()->rgw_swift_url_prefix) == 0) { + s->formatter = new RGWFormatter_Plain; + return -ERR_BAD_URL; + } else { + first = req; + } + + std::string tenant_path; + if (! g_conf()->rgw_swift_tenant_name.empty()) { + tenant_path = "/AUTH_"; + tenant_path.append(g_conf()->rgw_swift_tenant_name); + } + + /* verify that the request_uri conforms with what's expected */ + char buf[g_conf()->rgw_swift_url_prefix.length() + 16 + tenant_path.length()]; + int blen; + if (g_conf()->rgw_swift_url_prefix == "/") { + blen = sprintf(buf, "/v1%s", tenant_path.c_str()); + } else { + blen = sprintf(buf, "/%s/v1%s", + g_conf()->rgw_swift_url_prefix.c_str(), tenant_path.c_str()); + } + + if (strncmp(reqbuf, buf, blen) != 0) { + return -ENOENT; + } + + int ret = allocate_formatter(s, RGW_FORMAT_PLAIN, true); + if (ret < 0) + return ret; + + string ver; + + next_tok(req, ver, '/'); + + if (!tenant_path.empty() || g_conf()->rgw_swift_account_in_url) { + string account_name; + next_tok(req, account_name, '/'); + + /* Erase all pre-defined prefixes like "AUTH_" or "KEY_". */ + const vector skipped_prefixes = { "AUTH_", "KEY_" }; + + for (const auto pfx : skipped_prefixes) { + const size_t comp_len = min(account_name.length(), pfx.length()); + if (account_name.compare(0, comp_len, pfx) == 0) { + /* Prefix is present. Drop it. 
*/ + account_name = account_name.substr(comp_len); + break; + } + } + + if (account_name.empty()) { + return -ERR_PRECONDITION_FAILED; + } else { + s->account_name = account_name; + } + } + + next_tok(req, first, '/'); + + dout(10) << "ver=" << ver << " first=" << first << " req=" << req << dendl; + if (first.size() == 0) + return 0; + + s->info.effective_uri = "/" + first; + + // Save bucket to tide us over until token is parsed. + s->init_state.url_bucket = first; + + if (req.size()) { + s->object = + rgw_obj_key(req, s->info.env->get("HTTP_X_OBJECT_VERSION_ID", "")); /* rgw swift extension */ + s->info.effective_uri.append("/" + s->object.name); + } + + return 0; +} + +int RGWHandler_REST_SWIFT::init(RGWRados* store, struct req_state* s, + rgw::io::BasicClient *cio) +{ + struct req_init_state *t = &s->init_state; + + s->dialect = "swift"; + + std::string copy_source = s->info.env->get("HTTP_X_COPY_FROM", ""); + if (! copy_source.empty()) { + bool result = RGWCopyObj::parse_copy_location(copy_source, t->src_bucket, + s->src_object); + if (!result) + return -ERR_BAD_URL; + } + + if (s->op == OP_COPY) { + std::string req_dest = s->info.env->get("HTTP_DESTINATION", ""); + if (req_dest.empty()) + return -ERR_BAD_URL; + + std::string dest_bucket_name; + rgw_obj_key dest_obj_key; + bool result = + RGWCopyObj::parse_copy_location(req_dest, dest_bucket_name, + dest_obj_key); + if (!result) + return -ERR_BAD_URL; + + std::string dest_object = dest_obj_key.name; + + /* convert COPY operation into PUT */ + t->src_bucket = t->url_bucket; + s->src_object = s->object; + t->url_bucket = dest_bucket_name; + s->object = rgw_obj_key(dest_object); + s->op = OP_PUT; + } + + s->info.storage_class = s->info.env->get("HTTP_X_OBJECT_STORAGE_CLASS", ""); + + return RGWHandler_REST::init(store, s, cio); +} + +RGWHandler_REST* +RGWRESTMgr_SWIFT::get_handler(struct req_state* const s, + const rgw::auth::StrategyRegistry& auth_registry, + const std::string& frontend_prefix) +{ + int ret = 
RGWHandler_REST_SWIFT::init_from_header(s, frontend_prefix); + if (ret < 0) { + ldout(s->cct, 10) << "init_from_header returned err=" << ret << dendl; + return nullptr; + } + + const auto& auth_strategy = auth_registry.get_swift(); + + if (s->init_state.url_bucket.empty()) { + return new RGWHandler_REST_Service_SWIFT(auth_strategy); + } + + if (s->object.empty()) { + return new RGWHandler_REST_Bucket_SWIFT(auth_strategy); + } + + return new RGWHandler_REST_Obj_SWIFT(auth_strategy); +} + +RGWHandler_REST* RGWRESTMgr_SWIFT_Info::get_handler( + struct req_state* const s, + const rgw::auth::StrategyRegistry& auth_registry, + const std::string& frontend_prefix) +{ + s->prot_flags |= RGW_REST_SWIFT; + const auto& auth_strategy = auth_registry.get_swift(); + return new RGWHandler_REST_SWIFT_Info(auth_strategy); +} diff --git a/src/test/librados/cmd.cc b/src/test/librados/cmd.cc index f47cc9fc7d271..42415e8a1d4db 100644 --- a/src/test/librados/cmd.cc +++ b/src/test/librados/cmd.cc @@ -47,6 +47,41 @@ TEST(LibRadosCmd, MonDescribe) { rados_buffer_free(buf); rados_buffer_free(st); + cmd[0] = (char *)""; + ASSERT_EQ(-EINVAL, rados_mon_command(cluster, (const char **)cmd, 1, "{}", 2, &buf, &buflen, &st, &stlen)); + rados_buffer_free(buf); + rados_buffer_free(st); + + cmd[0] = (char *)"{}"; + ASSERT_EQ(-EINVAL, rados_mon_command(cluster, (const char **)cmd, 1, "", 0, &buf, &buflen, &st, &stlen)); + rados_buffer_free(buf); + rados_buffer_free(st); + + cmd[0] = (char *)"{\"abc\":\"something\"}"; + ASSERT_EQ(-EINVAL, rados_mon_command(cluster, (const char **)cmd, 1, "", 0, &buf, &buflen, &st, &stlen)); + rados_buffer_free(buf); + rados_buffer_free(st); + + cmd[0] = (char *)"{\"prefix\":\"\"}"; + ASSERT_EQ(-EINVAL, rados_mon_command(cluster, (const char **)cmd, 1, "", 0, &buf, &buflen, &st, &stlen)); + rados_buffer_free(buf); + rados_buffer_free(st); + + cmd[0] = (char *)"{\"prefix\":\" \"}"; + ASSERT_EQ(-EINVAL, rados_mon_command(cluster, (const char **)cmd, 1, "", 0, &buf, 
&buflen, &st, &stlen)); + rados_buffer_free(buf); + rados_buffer_free(st); + + cmd[0] = (char *)"{\"prefix\":\";;;,,,;;,,\"}"; + ASSERT_EQ(-EINVAL, rados_mon_command(cluster, (const char **)cmd, 1, "", 0, &buf, &buflen, &st, &stlen)); + rados_buffer_free(buf); + rados_buffer_free(st); + + cmd[0] = (char *)"{\"prefix\":\"extra command\"}"; + ASSERT_EQ(-EINVAL, rados_mon_command(cluster, (const char **)cmd, 1, "", 0, &buf, &buflen, &st, &stlen)); + rados_buffer_free(buf); + rados_buffer_free(st); + cmd[0] = (char *)"{\"prefix\":\"mon_status\"}"; ASSERT_EQ(0, rados_mon_command(cluster, (const char **)cmd, 1, "", 0, &buf, &buflen, &st, &stlen)); ASSERT_LT(0u, buflen); diff --git a/src/test/librados/cmd.cc.orig b/src/test/librados/cmd.cc.orig new file mode 100644 index 0000000000000..f47cc9fc7d271 --- /dev/null +++ b/src/test/librados/cmd.cc.orig @@ -0,0 +1,204 @@ +#include "mds/mdstypes.h" +#include "include/buffer.h" +#include "include/rbd_types.h" +#include "include/rados/librados.h" +#include "include/rados/librados.hpp" +#include "include/stringify.h" +#include "test/librados/test.h" + +#include "common/Cond.h" + +#include "gtest/gtest.h" +#include +#include +#include +#include + +using namespace librados; +using ceph::buffer; +using std::map; +using std::ostringstream; +using std::string; + +TEST(LibRadosCmd, MonDescribe) { + rados_t cluster; + std::string pool_name = get_temp_pool_name(); + ASSERT_EQ("", create_one_pool(pool_name, &cluster)); + + char *buf, *st; + size_t buflen, stlen; + char *cmd[2]; + + cmd[1] = NULL; + + cmd[0] = (char *)"{\"prefix\":\"get_command_descriptions\"}"; + ASSERT_EQ(0, rados_mon_command(cluster, (const char **)cmd, 1, "", 0, &buf, &buflen, &st, &stlen)); + ASSERT_LT(0u, buflen); + rados_buffer_free(buf); + rados_buffer_free(st); + + cmd[0] = (char *)"get_command_descriptions"; + ASSERT_EQ(-EINVAL, rados_mon_command(cluster, (const char **)cmd, 1, "", 0, &buf, &buflen, &st, &stlen)); + rados_buffer_free(buf); + rados_buffer_free(st); 
+ + cmd[0] = (char *)"asdfqwer"; + ASSERT_EQ(-EINVAL, rados_mon_command(cluster, (const char **)cmd, 1, "{}", 2, &buf, &buflen, &st, &stlen)); + rados_buffer_free(buf); + rados_buffer_free(st); + + cmd[0] = (char *)"{\"prefix\":\"mon_status\"}"; + ASSERT_EQ(0, rados_mon_command(cluster, (const char **)cmd, 1, "", 0, &buf, &buflen, &st, &stlen)); + ASSERT_LT(0u, buflen); + //ASSERT_LT(0u, stlen); + rados_buffer_free(buf); + rados_buffer_free(st); + + ASSERT_EQ(0, destroy_one_pool(pool_name, &cluster)); +} + +TEST(LibRadosCmd, OSDCmd) { + rados_t cluster; + std::string pool_name = get_temp_pool_name(); + ASSERT_EQ("", create_one_pool(pool_name, &cluster)); + + int r; + char *buf, *st; + size_t buflen, stlen; + char *cmd[2]; + cmd[1] = NULL; + + // note: tolerate NXIO here in case the cluster is thrashing out underneath us. + cmd[0] = (char *)"asdfasdf"; + r = rados_osd_command(cluster, 0, (const char **)cmd, 1, "", 0, &buf, &buflen, &st, &stlen); + ASSERT_TRUE(r == -22 || r == -ENXIO); + cmd[0] = (char *)"version"; + r = rados_osd_command(cluster, 0, (const char **)cmd, 1, "", 0, &buf, &buflen, &st, &stlen); + ASSERT_TRUE(r == -22 || r == -ENXIO); + cmd[0] = (char *)"{\"prefix\":\"version\"}"; + r = rados_osd_command(cluster, 0, (const char **)cmd, 1, "", 0, &buf, &buflen, &st, &stlen); + ASSERT_TRUE((r == 0 && buflen > 0) || (r == -ENXIO && buflen == 0)); + rados_buffer_free(buf); + rados_buffer_free(st); + + ASSERT_EQ(0, destroy_one_pool(pool_name, &cluster)); +} + +TEST(LibRadosCmd, PGCmd) { + rados_t cluster; + std::string pool_name = get_temp_pool_name(); + ASSERT_EQ("", create_one_pool(pool_name, &cluster)); + + char *buf, *st; + size_t buflen, stlen; + char *cmd[2]; + cmd[1] = NULL; + + int64_t poolid = rados_pool_lookup(cluster, pool_name.c_str()); + ASSERT_LT(0, poolid); + + string pgid = stringify(poolid) + ".0"; + + cmd[0] = (char *)"asdfasdf"; + // note: tolerate NXIO here in case the cluster is thrashing out underneath us. 
+ int r = rados_pg_command(cluster, pgid.c_str(), (const char **)cmd, 1, "", 0, &buf, &buflen, &st, &stlen); + ASSERT_TRUE(r == -22 || r == -ENXIO); + + // make sure the pg exists on the osd before we query it + rados_ioctx_t io; + rados_ioctx_create(cluster, pool_name.c_str(), &io); + for (int i=0; i<100; i++) { + string oid = "obj" + stringify(i); + ASSERT_EQ(-ENOENT, rados_stat(io, oid.c_str(), NULL, NULL)); + } + rados_ioctx_destroy(io); + + string qstr = "{\"prefix\":\"pg\", \"cmd\":\"query\", \"pgid\":\"" + pgid + "\"}"; + cmd[0] = (char *)qstr.c_str(); + // note: tolerate ENOENT/ENXIO here if hte osd is thrashing out underneath us + r = rados_pg_command(cluster, pgid.c_str(), (const char **)cmd, 1, "", 0, &buf, &buflen, &st, &stlen); + ASSERT_TRUE(r == 0 || r == -ENOENT || r == -ENXIO); + + ASSERT_LT(0u, buflen); + rados_buffer_free(buf); + rados_buffer_free(st); + + ASSERT_EQ(0, destroy_one_pool(pool_name, &cluster)); +} + +struct Log { + list log; + Cond cond; + Mutex lock; + + Log() : lock("l::lock") {} + + bool contains(string str) { + Mutex::Locker l(lock); + for (list::iterator p = log.begin(); p != log.end(); ++p) { + if (p->find(str) != std::string::npos) + return true; + } + return false; + } +}; + +void log_cb(void *arg, + const char *line, + const char *who, uint64_t stampsec, uint64_t stamp_nsec, + uint64_t seq, const char *level, + const char *msg) { + Log *l = static_cast(arg); + Mutex::Locker locker(l->lock); + l->log.push_back(line); + l->cond.Signal(); + cout << "got: " << line << std::endl; +} + +TEST(LibRadosCmd, WatchLog) { + rados_t cluster; + std::string pool_name = get_temp_pool_name(); + ASSERT_EQ("", create_one_pool(pool_name, &cluster)); + + char *buf, *st; + char *cmd[2]; + cmd[1] = NULL; + size_t buflen, stlen; + Log l; + + ASSERT_EQ(0, rados_monitor_log(cluster, "info", log_cb, &l)); + cmd[0] = (char *)"{\"prefix\":\"log\", \"logtext\":[\"onexx\"]}"; + ASSERT_EQ(0, rados_mon_command(cluster, (const char **)cmd, 1, "", 0, &buf, 
&buflen, &st, &stlen)); + for (int i=0; !l.contains("onexx"); i++) { + ASSERT_TRUE(i<100); + sleep(1); + } + ASSERT_TRUE(l.contains("onexx")); + + /* + changing the subscribe level is currently broken. + + cmd[0] = (char *)"{\"prefix\":\"log\", \"logtext\":[\"twoxx\"]}"; + ASSERT_EQ(0, rados_monitor_log(cluster, "err", log_cb, &l)); + ASSERT_EQ(0, rados_mon_command(cluster, (const char **)cmd, 1, "", 0, &buf, &buflen, &st, &stlen)); + sleep(2); + ASSERT_FALSE(l.contains("twoxx")); + */ + + ASSERT_EQ(0, rados_monitor_log(cluster, "info", log_cb, &l)); + cmd[0] = (char *)"{\"prefix\":\"log\", \"logtext\":[\"threexx\"]}"; + ASSERT_EQ(0, rados_mon_command(cluster, (const char **)cmd, 1, "", 0, &buf, &buflen, &st, &stlen)); + for (int i=0; !l.contains("threexx"); i++) { + ASSERT_TRUE(i<100); + sleep(1); + } + + ASSERT_EQ(0, rados_monitor_log(cluster, "info", NULL, NULL)); + cmd[0] = (char *)"{\"prefix\":\"log\", \"logtext\":[\"fourxx\"]}"; + ASSERT_EQ(0, rados_mon_command(cluster, (const char **)cmd, 1, "", 0, &buf, &buflen, &st, &stlen)); + sleep(2); + ASSERT_FALSE(l.contains("fourxx")); + + + ASSERT_EQ(0, destroy_one_pool(pool_name, &cluster)); +}