Skip to content

Commit

Permalink
PISTON-973: big acdc_queue_manager refactor + queue manager diag tool (
Browse files Browse the repository at this point in the history
…#6727)

- rename some queue manager funs and call/cast messages for clarity
- refactor agent ID list and count funs
  - replace ss_size
- big refactor of update_strategy_with_agent
- diagnostics launcher funs
- diagnostics receiver attachment to queue manager
- ?DIAG macro and payload transformation
- create acdc_queue_manager_diag_sup
- update unit tests
  • Loading branch information
danielfinke authored and jamesaimonetti committed Jul 16, 2021
1 parent df0cbf7 commit bcd6865
Show file tree
Hide file tree
Showing 11 changed files with 864 additions and 226 deletions.
40 changes: 40 additions & 0 deletions applications/acdc/doc/troubleshooting.md
Original file line number Diff line number Diff line change
@@ -1 +1,41 @@
### Troubleshooting

#### Queue Manager Diagnostics

The `acdc_queue_manager_diag` and `acdc_queue_manager_diag_sup` modules provide a real-time play-by-play of strategy state changes for a queue manager. This tool enables more reliable diagnosis of issues related to unexpected strategy states (e.g. agents in incorrect ringing/busy/available/unavailable states).

To start diagnostics for a queue, open a remote shell to the Kazoo node using `kazoo-applications connect`. In the remote shell, run:

```
acdc_maintenance:start_queue_diagnostics(AccountId, QueueId).
```

replacing `AccountId` and `QueueId` with the binary representations of those values. For example: `acdc_maintenance:start_queue_diagnostics(<<"3d24e4ba001a096df9d925e1c2dda09b">>, <<"854ebe1f6871cff2e292cfcab6bfbff6">>).`

You will see output like:

```
| Timestamp | Message |
|-------------|----------------------------------------------------------------|
This diagnostic process is expensive. Remember to stop diagnostics when you are done!
To stop: `acdc_maintenance:stop_queue_diagnostics(list_to_pid("<0.15215.0>")).`
ok
```

As mentioned in the output, you can run `acdc_maintenance:stop_queue_diagnostics` to stop the diagnostics stream at any point. The PID in the output will match the PID for your particular diagnostics stream so you don't have to guess!

A stream of events like the following will be displayed as agents change state, calls enter and leave the queue, and calls are distributed to agents. The kinds of events emitted will vary depending on the strategy selected for the observed queue. In the output, `SS` is short for the queue's strategy state.

```
|-------------|----------------------------------------------------------------|
| 63790338474 | got next winner from [<<"37bc41c35f1738781cb63c57709b73d5">>] |
|-------------|----------------------------------------------------------------|
| 63790338474 | agent 37bc41c35f1738781cb63c57709b73d5 updated in SS |
| | |
| | ringing agents: [<<"37bc41c35f1738781cb63c57709b73d5">>] |
| | |
| | busy agents: [] |
| | |
| | agent queue: [] |
|-------------|----------------------------------------------------------------|
```
33 changes: 32 additions & 1 deletion applications/acdc/src/acdc_maintenance.erl
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,9 @@
,agent_resume/2
,agent_queue_login/3
,agent_queue_logout/3

,start_queue_diagnostics/2
,stop_queue_diagnostics/1
]).

-include("acdc.hrl").
Expand Down Expand Up @@ -111,7 +114,7 @@ log_current_agent(QueueSup) ->
QueueM = acdc_queue_sup:manager(QueueSup),
{_AccountId, QueueId} = acdc_queue_manager:config(QueueM),
io:format(" ~35s | ~s~n", [QueueId
,kz_binary:join(acdc_queue_manager:current_agents(QueueM))
,kz_binary:join(acdc_queue_manager:agents(QueueM))
]).

-spec current_calls(kz_term:ne_binary()) -> 'ok'.
Expand Down Expand Up @@ -522,3 +525,31 @@ agent_queue_logout(AcctId, AgentId, QueueId) ->
]),
_ = kz_amqp_worker:cast(Update, fun kapi_acdc_agent:publish_logout_queue/1),
lager:info("published logout update for agent").

%%------------------------------------------------------------------------------
%% @doc Launch a diagnostics stream for the given queue. Once started, changes
%% to the queue's strategy state are printed to the Erlang console. On success
%% a reminder of how to stop the stream is printed; on failure the error is
%% printed instead.
%% @end
%%------------------------------------------------------------------------------
-spec start_queue_diagnostics(kz_term:text(), kz_term:text()) -> 'ok'.
start_queue_diagnostics(AccountId, QueueId) ->
    AccountBin = kz_term:to_binary(AccountId),
    QueueBin = kz_term:to_binary(QueueId),
    StartResult = acdc_queue_manager_diag_sup:start_diagnostics(AccountBin, QueueBin),
    case StartResult of
        {'error', Reason} ->
            io:format("Failed to start queue diagnostics: ~p~n", [Reason]);
        {'ok', DiagPid} ->
            %% Adjacent string literals are joined at compile time into a
            %% single format string.
            StopHint = "This diagnostic process is expensive. Remember to stop diagnostics when you are done!~n"
                "To stop: `~p:~p(list_to_pid(\"~s\")).`~n",
            io:format(StopHint, [?MODULE, 'stop_queue_diagnostics', pid_to_list(DiagPid)])
    end.

%%------------------------------------------------------------------------------
%% @doc Stop a diagnostics stream that was started with
%% `start_queue_diagnostics'. The stream may be identified either by its pid
%% or by the textual form of that pid, which is converted with `list_to_pid'
%% before stopping.
%% @end
%%------------------------------------------------------------------------------
-spec stop_queue_diagnostics(pid() | kz_term:text()) -> any().
stop_queue_diagnostics(DiagnosticsPid) when not is_pid(DiagnosticsPid) ->
    %% Textual pid: convert to a real pid first, then take the pid clause.
    stop_queue_diagnostics(list_to_pid(kz_term:to_list(DiagnosticsPid)));
stop_queue_diagnostics(DiagnosticsPid) ->
    acdc_queue_manager_diag:stop(DiagnosticsPid).
4 changes: 2 additions & 2 deletions applications/acdc/src/acdc_queue_fsm.erl
Original file line number Diff line number Diff line change
Expand Up @@ -313,7 +313,7 @@ connect_req('cast', {'member_call_cancel', JObj}, State) ->
connect_req('cast', {'agent_resp', Resp}, #state{connect_resps=CRs
,manager_proc=MgrSrv
}=State) ->
Agents = acdc_queue_manager:current_agents(MgrSrv),
Agents = acdc_queue_manager:agents(MgrSrv),
Resps = [Resp | CRs],
State1 = State#state{connect_resps=Resps},
case have_agents_responded(Resps, Agents) of
Expand Down Expand Up @@ -757,7 +757,7 @@ maybe_abort_connect_req(OnContinue, CallbackArgs, #state{listener_proc=ListenerS
,queue_id=QueueId
,member_call=Call
}=State) ->
case acdc_queue_manager:are_agents_available(MgrSrv) of
case acdc_queue_manager:has_agents(MgrSrv) of
'true' -> apply(OnContinue, CallbackArgs ++ [State]);
'false' ->
lager:debug("all agents have left the queue, failing call"),
Expand Down
Loading

0 comments on commit bcd6865

Please sign in to comment.