Skip to content
This repository has been archived by the owner on Jun 6, 2024. It is now read-only.

Commit

Permalink
fix deadlock in job-exporter by disabling GCCollector (#2115)
Browse files Browse the repository at this point in the history
  • Loading branch information
xudifsd authored Feb 1, 2019
1 parent 2d915eb commit 746ccb5
Show file tree
Hide file tree
Showing 7 changed files with 103 additions and 27 deletions.
4 changes: 2 additions & 2 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,11 +12,11 @@ matrix:
- python -m unittest discover test/

- language: python
python: 2.7
python: 3.6
before_install:
- cd src/watchdog/test
install:
- pip install paramiko pyyaml requests prometheus_client
- pip install paramiko pyyaml requests prometheus_client twisted
script:
- python -m unittest discover .

Expand Down
2 changes: 1 addition & 1 deletion src/job-exporter/build/job-exporter.dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ RUN curl -SL https://download.docker.com/linux/static/stable/x86_64/docker-17.06
mkdir -p /job_exporter && \
rm -rf /var/lib/apt/lists/*

RUN pip3 install prometheus_client
RUN pip3 install prometheus_client twisted

COPY --from=0 infilter/infilter /usr/bin
COPY src/*.py /job_exporter/
9 changes: 5 additions & 4 deletions src/job-exporter/deploy/job-exporter.yaml.template
Original file line number Diff line number Diff line change
Expand Up @@ -29,17 +29,18 @@ spec:
app: job-exporter
annotations:
prometheus.io/scrape: "true"
prometheus.io/path: "/"
prometheus.io/path: "/metrics"
prometheus.io/port: "{{ cluster_cfg["job-exporter"]["port"] }}"
name: job-exporter
spec:
containers:
- image: {{ cluster_cfg["cluster"]["docker-registry"]["prefix"] }}job-exporter:{{ cluster_cfg["cluster"]["docker-registry"]["tag"] }}
imagePullPolicy: Always
readinessProbe:
tcpSocket: # an HTTP GET would make job-exporter abandon old metrics, so we probe over TCP instead of HTTP
livenessProbe: # in case job-exporter hangs
httpGet:
path: '/healthz'
port: {{ cluster_cfg["job-exporter"]["port"] }}
initialDelaySeconds: 3
initialDelaySeconds: 30
periodSeconds: 30
timeoutSeconds: 10
command:
Expand Down
47 changes: 42 additions & 5 deletions src/job-exporter/src/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,18 @@
import os
import json
import threading
import signal
import faulthandler
import gc

from wsgiref.simple_server import make_server
from prometheus_client import make_wsgi_app, Gauge
import prometheus_client
from prometheus_client import Gauge
from prometheus_client.core import REGISTRY
from prometheus_client.twisted import MetricsResource

from twisted.web.server import Site
from twisted.web.resource import Resource
from twisted.internet import reactor

import collector

Expand Down Expand Up @@ -85,7 +93,33 @@ def get_gpu_count(path):
return 0


def register_stack_trace_dump():
    """Install a faulthandler hook that dumps thread stacks on SIGTRAP.

    After this call, sending SIGTRAP to the process makes ``faulthandler``
    write the traceback of all threads. No ``file`` argument is passed, so
    faulthandler's default output is used. ``chain=False`` means a previously
    installed SIGTRAP handler is not called afterwards.
    """
    dump_options = {"all_threads": True, "chain": False}
    faulthandler.register(signal.SIGTRAP, **dump_options)


# https://github.com/prometheus/client_python/issues/322#issuecomment-428189291
def burninate_gc_collector():
    """Disable prometheus_client's built-in GC collector.

    Two things are undone:

    1. Every hook in ``gc.callbacks`` whose qualified name starts with
       ``"GCCollector."`` is removed, so the garbage collector no longer
       calls into it.
    2. Every collector registered in the default registry under a metric
       name starting with ``"python_gc_"`` is unregistered.

    The function is safe to call more than once; collectors that are already
    gone are skipped.
    """
    for callback in gc.callbacks[:]:  # iterate over a copy: we mutate the list
        # Not every callable carries __qualname__ (functools.partial objects
        # and callable instances do not), so never assume it is present.
        qualname = getattr(callback, "__qualname__", "")
        if qualname.startswith("GCCollector."):
            gc.callbacks.remove(callback)

    registry = prometheus_client.REGISTRY
    # NOTE(review): _names_to_collectors is a private attribute of the
    # registry; re-check it when upgrading prometheus_client.
    # The loop variable is deliberately not called `collector`, which would
    # shadow the `collector` module imported by this file.
    for name, gc_collector in list(registry._names_to_collectors.items()):
        if name.startswith("python_gc_"):
            try:
                registry.unregister(gc_collector)
            except KeyError:  # probably gone already
                pass


class HealthResource(Resource):
    """Web resource that answers every GET with a fixed HTML "Ok" page.

    ``main`` mounts it under the ``healthz`` path.
    """

    def render_GET(self, request):
        """Mark the response as UTF-8 HTML and return the encoded body."""
        page = "<html>Ok</html>"
        request.setHeader("Content-Type", "text/html; charset=utf-8")
        return page.encode("utf-8")


def main(args):
register_stack_trace_dump()
burninate_gc_collector()
config_environ()
try_remove_old_prom_file(args.log + "/gpu_exporter.prom")
try_remove_old_prom_file(args.log + "/job_exporter.prom")
Expand Down Expand Up @@ -118,9 +152,12 @@ def main(args):

REGISTRY.register(CustomCollector(refs))

app = make_wsgi_app(REGISTRY)
httpd = make_server("", int(args.port), app)
httpd.serve_forever()
root = Resource()
root.putChild(b"metrics", MetricsResource())
root.putChild(b"healthz", HealthResource())
factory = Site(root)
reactor.listenTCP(int(args.port), factory)
reactor.run()


if __name__ == "__main__":
Expand Down
4 changes: 2 additions & 2 deletions src/watchdog/build/watchdog.dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,8 @@
# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

FROM python:2.7
FROM python:3.7

RUN pip install PyYAML requests paramiko prometheus_client
RUN pip3 install PyYAML requests paramiko prometheus_client twisted

COPY src/watchdog.py /
5 changes: 3 additions & 2 deletions src/watchdog/deploy/watchdog.yaml.template
Original file line number Diff line number Diff line change
Expand Up @@ -31,15 +31,16 @@ spec:
app: watchdog
annotations:
prometheus.io/scrape: "true"
prometheus.io/path: "/"
prometheus.io/path: "/metrics"
prometheus.io/port: "9101"
spec:
containers:
- name: watchdog
image: {{ cluster_cfg["cluster"]["docker-registry"]["prefix"] }}watchdog:{{cluster_cfg["cluster"]["docker-registry"]["tag"]}}
imagePullPolicy: Always
readinessProbe:
tcpSocket: # an HTTP GET would make job-exporter abandon old metrics, so we probe over TCP instead of HTTP
httpGet:
path: '/healthz'
port: 9101
initialDelaySeconds: 3
periodSeconds: 30
Expand Down
59 changes: 48 additions & 11 deletions src/watchdog/src/watchdog.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,20 +17,28 @@
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

import argparse
import urlparse
import urllib.parse
import os
import json
import sys
import requests
import logging
import time
import threading
import signal
import faulthandler
import gc

import paramiko
import yaml
from wsgiref.simple_server import make_server
from prometheus_client import make_wsgi_app, Counter, Summary, Histogram
import prometheus_client
from prometheus_client import Counter, Summary, Histogram
from prometheus_client.core import GaugeMetricFamily, CounterMetricFamily, Summary, REGISTRY
from prometheus_client.twisted import MetricsResource

from twisted.web.server import Site
from twisted.web.resource import Resource
from twisted.internet import reactor

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -124,7 +132,7 @@ def parse_pod_item(pai_pod_gauge, pai_container_gauge, pod):

pod_name = pod["metadata"]["name"]
labels = pod["metadata"].get("labels")
if labels is None or "app" not in labels.keys():
if labels is None or "app" not in list(labels.keys()):
logger.warning("unkown pod %s", pod["metadata"]["name"])
return None

Expand Down Expand Up @@ -182,7 +190,7 @@ def parse_pod_item(pai_pod_gauge, pai_container_gauge, pod):
logger.error("unexpected state %s in container %s",
json.dumps(state), container_name)
else:
container_state = state.keys()[0].lower()
container_state = list(state.keys())[0].lower()

pai_container_gauge.add_metric([service_name, pod_name, container_name,
container_state, host_ip, str(ready).lower()], 1)
Expand All @@ -197,7 +205,7 @@ def _map_fn(item):
None,
pai_pod_gauge, pai_container_gauge, item)

map(_map_fn, podsJsonObject["items"])
list(map(_map_fn, podsJsonObject["items"]))


def collect_healthz(gauge, histogram, service_name, scheme, address, port, url, ca_path, headers):
Expand Down Expand Up @@ -260,7 +268,7 @@ def _map_fn(item):
None,
pai_node_gauge, item)

map(_map_fn, nodesJsonObject["items"])
list(map(_map_fn, nodesJsonObject["items"]))


def load_machine_list(configFilePath):
Expand All @@ -282,13 +290,37 @@ def try_remove_old_prom_file(path):
except Exception as e:
log.warning("can not remove old prom file %s", path)

def register_stack_trace_dump():
    """Install a faulthandler hook that dumps thread stacks on SIGTRAP.

    After this call, sending SIGTRAP to the process makes ``faulthandler``
    write the traceback of all threads. No ``file`` argument is passed, so
    faulthandler's default output is used. ``chain=False`` means a previously
    installed SIGTRAP handler is not called afterwards.
    """
    dump_options = {"all_threads": True, "chain": False}
    faulthandler.register(signal.SIGTRAP, **dump_options)

# https://github.com/prometheus/client_python/issues/322#issuecomment-428189291
def burninate_gc_collector():
    """Disable prometheus_client's built-in GC collector.

    Two things are undone:

    1. Every hook in ``gc.callbacks`` whose qualified name starts with
       ``"GCCollector."`` is removed, so the garbage collector no longer
       calls into it.
    2. Every collector registered in the default registry under a metric
       name starting with ``"python_gc_"`` is unregistered.

    The function is safe to call more than once; collectors that are already
    gone are skipped.
    """
    for callback in gc.callbacks[:]:  # iterate over a copy: we mutate the list
        # Not every callable carries __qualname__ (functools.partial objects
        # and callable instances do not), so never assume it is present.
        qualname = getattr(callback, "__qualname__", "")
        if qualname.startswith("GCCollector."):
            gc.callbacks.remove(callback)

    registry = prometheus_client.REGISTRY
    # NOTE(review): _names_to_collectors is a private attribute of the
    # registry; re-check it when upgrading prometheus_client.
    for name, gc_collector in list(registry._names_to_collectors.items()):
        if name.startswith("python_gc_"):
            try:
                registry.unregister(gc_collector)
            except KeyError:  # probably gone already
                pass

class HealthResource(Resource):
    """Web resource that answers every GET with a fixed HTML "Ok" page.

    ``main`` mounts it under the ``healthz`` path.
    """

    def render_GET(self, request):
        """Mark the response as UTF-8 HTML and return the encoded body."""
        page = "<html>Ok</html>"
        request.setHeader("Content-Type", "text/html; charset=utf-8")
        return page.encode("utf-8")


def main(args):
register_stack_trace_dump()
burninate_gc_collector()
logDir = args.log

try_remove_old_prom_file(logDir + "/watchdog.prom")

address = args.k8s_api
parse_result = urlparse.urlparse(address)
parse_result = urllib.parse.urlparse(address)
api_server_scheme = parse_result.scheme
api_server_ip = parse_result.hostname
api_server_port = parse_result.port or 80
Expand All @@ -313,9 +345,14 @@ def main(args):

REGISTRY.register(CustomCollector(atomic_ref))

app = make_wsgi_app(REGISTRY)
httpd = make_server("", int(args.port), app)
t = threading.Thread(target=httpd.serve_forever)
root = Resource()
root.putChild(b"metrics", MetricsResource())
root.putChild(b"healthz", HealthResource())

factory = Site(root)
reactor.listenTCP(int(args.port), factory)

t = threading.Thread(target=reactor.run, name="twisted")
t.daemon = True
t.start()

Expand Down

0 comments on commit 746ccb5

Please sign in to comment.