前面我们简单的聊了一下ganglia的架构, 以及它和其他监控软件如nagios, zabbix对比的优缺点.
gmond只整合了一些常用的监控metric, 如果需要监控不在gmond范围内的metric, 那么ganglia提供了一个接口, 可以用于扩展.
例如你可以用C/C++写扩展模块, 也可以用python写扩展模块, 如果你不想写扩展模块, 也可以使用gmetric直接向send channel发送自定义的metric数据.
本文将拿一个开源的ganglia module for postgresql为例, 结合PostgreSQL常用的监控指标来实现ganglia采样postgresql监控指标数据.
这个模块的代码请参见 :
https://github.com/ganglia/gmond_python_modules
https://github.com/mentalblock/ganglia_postgresql/tree/master/python_modules
https://github.com/elecnix/postgres_gmetric
postgresql python module的详解, 参考
http://blog.163.com/digoal@126/blog/static/1638770402014823102256805/
这个模块提到的监控项有 :
# Create your queries here. Keys whose names match those defined in the default
# set are overridden. Any additional key-value pairs (i.e. query) will not be
# added to the Ganglia metric definition but can be useful for data purposes.
metric_defs = {
"pg_backends_waiting": {
"description": "Number of postgres backends that are waiting",
"units": "backends",
"query": "SELECT count(*) AS backends_waiting FROM " + \
"pg_stat_activity WHERE waiting = 't';"
},
"pg_database_size": {
"description": "Total size of all databases in bytes",
"value_type": "double",
"format": "%.0f",
"units": "bytes",
"query": "SELECT sum(pg_database_size(d.oid)) AS " + \
"size_database FROM pg_database d ORDER BY 1 DESC;"
},
"pg_idx_blks_read": {
"description": "Total index blocks read",
"slope": "positive",
"units": "blocks",
"query": "SELECT sum(idx_blks_read) AS idx_blks_read " + \
"FROM pg_statio_all_indexes;"
},
"pg_idx_blks_hit": {
"description": "Total index blocks hit",
"slope": "positive",
"units": "blocks",
"query": "SELECT sum(idx_blks_hit) AS idx_blks_hit " + \
"FROM pg_statio_all_indexes;"
},
"pg_locks": {
"description": "Number of locks held",
"units": "locks",
"query": "SELECT count(*) FROM pg_locks;"
},
"pg_query_time_idle_in_txn": {
"description": 'Age of longest _idle in transaction_ transaction',
"units": "seconds",
"query": "SELECT COALESCE(max(COALESCE(ROUND(EXTRACT(epoch " + \
"FROM now()-query_start)),0)),0) AS " + \
"query_time_idle_in_txn FROM pg_stat_activity " + \
"WHERE current_query = '% in transaction';"
},
"pg_max_idle_txn_time": {
"description": "Age of longest idle transaction",
"units": "seconds",
"query": "SELECT COALESCE(max(COALESCE(ROUND(EXTRACT(epoch " + \
"FROM now()-query_start)),0)),0) as query_time_max FROM " + \
"pg_stat_activity WHERE current_query <> '<IDLE>';"
},
"pg_txn_time_max": {
"description": "Age of longest transaction",
"units": "seconds",
"query": "SELECT max(COALESCE(ROUND(EXTRACT(epoch " + \
"FROM now()-xact_start)),0)) as txn_time_max " + \
"FROM pg_stat_activity WHERE xact_start IS NOT NULL;"
},
"pg_connections": {
"description": "Number of connections",
"units": "connctions",
"query": "SELECT sum(numbackends) FROM pg_stat_database;"
},
"pg_wal_files": {
"description": "number of wal files in pg_xlog directory",
"units": "# wal files",
"query": "SELECT count(*) AS wal_files FROM " + \
"pg_ls_dir('pg_xlog') WHERE pg_ls_dir ~ E'^[0-9A-F]{24}$';"
},
"pg_xact_commit": {
"description": "Transactions committed",
"slope": "positive",
"units": "transactions",
"query": "SELECT sum(xact_commit) as xact_commit FROM " + \
"pg_stat_database;",
},
"pg_xact_rollback": {
"description": "Transactions rolled back",
"slope": "positive",
"units": "transactions",
"query": "SELECT sum(xact_rollback) as xact_rollback FROM " + \
"pg_stat_database;",
},
"pg_blks_read": {
"description": "Blocks read",
"slope": "positive",
"units": "blocks",
"query": "SELECT sum(blks_read) as blks_read FROM " + \
"pg_stat_database;",
},
"pg_blks_hit": {
"description": "Blocks hit",
"slope": "positive",
"units": "blocks",
"query": "SELECT sum(blks_hit) as blks_hit FROM " + \
"pg_stat_database;",
},
"pg_tup_returned": {
"description": "Tuples returned",
"slope": "positive",
"units": "tuples",
"query": "SELECT sum(tup_returned) as tup_returned FROM " + \
"pg_stat_database;",
},
"pg_tup_fetched": {
"description": "Tuples fetched",
"slope" : "positive",
"units": "tuples",
"query": "SELECT sum(tup_fetched) as tup_fetched FROM " + \
"pg_stat_database;",
},
"pg_tup_inserted": {
"description": "Tuples inserted",
"slope": "positive",
"units": "tuples",
"query": "SELECT sum(tup_inserted) as tup_inserted FROM " + \
"pg_stat_database;",
},
"pg_tup_updated": {
"description": "Tuples updated",
"slope": "positive",
"units": "tuples",
"query": "SELECT sum(tup_updated) as tup_updated FROM " + \
"pg_stat_database;",
},
"pg_tup_deleted": {
"description": "Tuples deleted",
"slope": "positive",
"units": "tuples",
"query": "SELECT sum(tup_deleted) as tup_deleted FROM " + \
"pg_stat_database;",
},
"pg_heap_blks_read": {
"description": "Heap blocks read",
"slope": "positive",
"units": "blocks",
"query": "SELECT sum(heap_blks_read) as heap_blks_read FROM " + \
"pg_statio_all_tables;",
},
"pg_heap_blks_hit": {
"description": "Heap blocks hit",
"slope": "positive",
"units": "blocks",
"query": "SELECT sum(heap_blks_hit) as heap_blks_hit FROM " + \
"pg_statio_all_tables;",
},
"pg_idx_blks_read_tbl": {
"description": "Index blocks read",
"slope": "positive",
"units": "blocks",
"query": "SELECT sum(idx_blks_read) as idx_blks_read_tbl FROM " + \
"pg_statio_all_tables;",
},
"pg_idx_blks_hit_tbl": {
"description": "Index blocks hit",
"slope": "positive",
"units": "blocks",
"query": "SELECT sum(idx_blks_hit) as idx_blks_hit_tbl FROM " + \
"pg_statio_all_tables;",
},
"pg_toast_blks_read": {
"description": "Toast blocks read",
"slope": "positive",
"units": "blocks",
"query": "SELECT sum(toast_blks_read) as toast_blks_read FROM " + \
"pg_statio_all_tables;",
},
"pg_toast_blks_hit": {
"description": "Toast blocks hit",
"slope": "positive",
"units": "blocks",
"query": "SELECT sum(toast_blks_hit) as toast_blks_hit FROM " + \
"pg_statio_all_tables;",
},
"pg_tidx_blks_read": {
"description": "Toast index blocks read",
"slope": "positive",
"units": "blocks",
"query": "SELECT sum(tidx_blks_read) as tidx_blks_read FROM " + \
"pg_statio_all_tables;",
},
"pg_tidx_blks_hit": {
"description": "Toast index blocks hit",
"slope": "positive",
"units": "blocks",
"query": "SELECT sum(tidx_blks_hit) as tidx_blks_hit FROM " + \
"pg_statio_all_tables;",
},
}
除此以外, 还可以丰富一下监控项. 如下 :
postgresql常见监控项 :
http://blog.163.com/digoal@126/blog/static/163877040201412763135184/
1. 是否打开归档, 布尔逻辑值.
2. 是否打开autovacuum, 布尔逻辑值.
3. 数据库年龄, 数字, 大于2, 低于20亿(大约).
4. 连接数, 低于最大连接数
5. 提交和回滚的比例(问题修复后,手工清除统计信息pg_stat_reset(), 连接到对应的库执行).
6. standby延迟, 字节数, 延迟高说明网络或STANDBY的IO有问题.
7. 锁等待, 锁等待时间, 长则说明业务或在SQL的处理效率上有问题.
8. 长事务/空闲事务, 长事务一般可能是业务设计的问题.
9. prepared事务, 长时间的PREPARED事务可能是业务层出了问题, 例如未及时提交, 或业务以及挂了, prepared事务变成了僵尸事务.
10. 序列剩余量(每个库查询), 序列如果没有开启cycle, 那么需要注意是否耗尽的问题.
11. 未使用的索引(每个库查询), 如果索引不常使用, 那么说明索引不如不建.
以上监控指标都可以量化, 所以使用ganglia来监控是可以的.
常见巡检项 :
http://blog.163.com/digoal@126/blog/static/1638770402014252816497/
[参考]
1. https://github.com/mentalblock/ganglia_postgresql
2. http://blog.163.com/digoal@126/blog/static/1638770402014833639454/
3. http://blog.163.com/digoal@126/blog/static/1638770402014252816497/
4. http://blog.163.com/digoal@126/blog/static/163877040201412763135184/
5. http://bucardo.org/wiki/Check_postgres