clickhouse - GitHub 事件数据集 clickhouse
GitHub 事件数据集
--点击数排序
SELECT action, count() FROM github_events WHERE event_type = 'WatchEvent' GROUP BY action
┌─action──┬───count()─┐
│ started │ 232118474 │
└─────────┴───────────┘
---ClickHouse/ClickHouse仓库点击数排序
SELECT count() FROM github_events WHERE event_type = 'WatchEvent' AND repo_name IN ('ClickHouse/ClickHouse', 'yandex/ClickHouse') GROUP BY action
┌─count()─┐
│ 14359 │
└─────────┘
---所有仓库点击数排序
SET output_format_pretty_row_numbers = 1
SELECT repo_name, count() AS stars FROM github_events WHERE event_type = 'WatchEvent' GROUP BY repo_name ORDER BY stars DESC LIMIT 50
---只有1600万个储存库至少有一颗星。只有140万个10+星的知识库....
SELECT
exp10(floor(log10(c))) AS stars,
uniq(k)
FROM
(
SELECT
repo_name AS k,
count() AS c
FROM github_events
WHERE event_type = 'WatchEvent'
GROUP BY k
)
GROUP BY stars
ORDER BY stars ASC
┌──stars─┬──uniq(k)─┐
│ 1 │ 14946505 │
│ 10 │ 1196622 │
│ 100 │ 213026 │
---存储库的总数
SELECT uniq(repo_name) FROM github_events
┌─uniq(repo_name)─┐
│ 164059648 │
---2020年各各仓库的星数
SELECT repo_name, count() AS stars
FROM github_events
WHERE event_type = 'WatchEvent' AND toYear(created_at) = '2020'
GROUP BY repo_name ORDER BY stars DESC LIMIT 50
---每年的各各仓库的星数(所有年份,最多十条,降序)
SELECT
year,
lower(repo_name) AS repo,
count()
FROM github_events
WHERE (event_type = 'WatchEvent') AND (year >= 2015)
GROUP BY
repo,
toYear(created_at) AS year
ORDER BY
year ASC,
count() DESC
LIMIT 10 BY year
---multi-line graph - 多线图
SELECT
repo AS name,
groupArrayInsertAt(toUInt32(c), toUInt64(dateDiff('month', toDate('2015-01-01'), month))) AS data
FROM
(
SELECT
lower(repo_name) AS repo,
toStartOfMonth(created_at) AS month,
count() AS c
FROM github_events
WHERE (event_type = 'WatchEvent') AND (toYear(created_at) >= 2015) AND (repo IN
(
SELECT lower(repo_name) AS repo
FROM github_events
WHERE (event_type = 'WatchEvent') AND (toYear(created_at) >= 2015)
GROUP BY repo
ORDER BY count() DESC
LIMIT 10
))
GROUP BY
repo,
month
)
GROUP BY repo
ORDER BY repo ASC
---随着时间的推移,恒星的总数是如何变化的
SELECT toYear(created_at) AS year, count() AS stars, bar(stars, 0, 50000000, 10) AS bar FROM github_events
WHERE event_type = 'WatchEvent'
GROUP BY year
ORDER BY year
┌─year─┬────stars─┬─bar────────┐
│ 2011 │ 1831742 │ ▎ │
│ 2012 │ 4048676 │ ▋ │
│ 2013 │ 7432800 │ █▍ │
│ 2014 │ 11952935 │ ██▍ │
│ 2015 │ 18994833 │ ███▋ │
│ 2016 │ 26166310 │ █████▏ │
---作者的星星的人有多少
SELECT actor_login, count() AS stars FROM github_events WHERE event_type = 'WatchEvent' GROUP BY actor_login ORDER BY stars DESC LIMIT 50
┌─actor_login─────┬──stars─┐
1. │ 4148 │ 232492 │
2. │ salifm │ 202534 │
3. │ x0rzkov │ 73531 │
4. │ fly51fly │ 57756 │
---alexey-milovidov的星星多少
SELECT actor_login, count() AS stars FROM github_events WHERE event_type = 'WatchEvent' AND actor_login = 'alexey-milovidov' GROUP BY actor_login ORDER BY stars DESC LIMIT 50
┌─actor_login──────┬─stars─┐
│ alexey-milovidov │ 203 │
---alexey-milovidov最喜欢的按星星总数
SELECT
repo_name,
count() AS stars
FROM github_events
WHERE (event_type = 'WatchEvent') AND (repo_name IN
(
SELECT repo_name
FROM github_events
WHERE (event_type = 'WatchEvent') AND (actor_login = 'alexey-milovidov')
))
GROUP BY repo_name
ORDER BY stars DESC
LIMIT 50
---相似性列表
SELECT
repo_name,
count() AS stars
FROM github_events
WHERE (event_type = 'WatchEvent') AND (actor_login IN
(
SELECT actor_login
FROM github_events
WHERE (event_type = 'WatchEvent') AND (repo_name IN ('ClickHouse/ClickHouse', 'yandex/ClickHouse'))
)) AND (repo_name NOT IN ('ClickHouse/ClickHouse', 'yandex/ClickHouse'))
GROUP BY repo_name
ORDER BY stars DESC
LIMIT 50
---
SELECT
repo_name,
uniq(actor_login) AS total_stars,
uniqIf(actor_login, actor_login IN
(
SELECT actor_login
FROM github_events
WHERE (event_type = 'WatchEvent') AND (repo_name IN ('ClickHouse/ClickHouse', 'yandex/ClickHouse'))
)) AS clickhouse_stars,
round(clickhouse_stars / total_stars, 2) AS ratio
FROM github_events
WHERE (event_type = 'WatchEvent') AND (repo_name NOT IN ('ClickHouse/ClickHouse', 'yandex/ClickHouse'))
GROUP BY repo_name
HAVING total_stars >= 100
ORDER BY ratio DESC
LIMIT 50
┌─repo_name─────────────────────────────┬─total_stars─┬─clickhouse_stars─┬─ratio─┐
│ yandex/clickhouse-jdbc │ 340 │ 240 │ 0.71 │
│ yandex/clickhouse-presentations │ 244 │ 170 │ 0.7 │
│ f1yegor/clickhouse_exporter │ 180 │ 125 │ 0.69 │
--通过星数找朋友
WITH repo_name IN
(
SELECT repo_name
FROM github_events
WHERE (event_type = 'WatchEvent') AND (actor_login IN ('alexey-milovidov'))
) AS is_my_repo
SELECT
actor_login,
sum(is_my_repo) AS stars_my,
sum(NOT is_my_repo) AS stars_other,
round(stars_my / (203 + stars_other), 3) AS ratio
FROM github_events
WHERE event_type = 'WatchEvent'
GROUP BY actor_login
ORDER BY ratio DESC
LIMIT 50
┌─actor_login───────┬─stars_my─┬─stars_other─┬─ratio─┐
1. │ alexey-milovidov │ 203 │ 0 │ 1 │
2. │ leicsss │ 14 │ 14 │ 0.065 │
3. │ exitNA │ 31 │ 293 │ 0.062 │
4. │ filimonov │ 19 │ 152 │ 0.054 │
--贡献的相似性, 贡献给ClickHouse的作者也贡献给了什么库
SELECT
repo_name,
count() AS prs,
uniq(actor_login) AS authors
FROM github_events
WHERE (event_type = 'PullRequestEvent') AND (action = 'opened') AND (actor_login IN
(
SELECT actor_login
FROM github_events
WHERE (event_type = 'PullRequestEvent') AND (action = 'opened') AND (repo_name IN ('yandex/ClickHouse', 'ClickHouse/ClickHouse'))
)) AND (repo_name NOT ILIKE '%ClickHouse%')
GROUP BY repo_name
ORDER BY authors DESC
LIMIT 50
┌─repo_name─────────────────────────┬──prs─┬─authors─┐
│ Homebrew/homebrew-core │ 33 │ 15 │
│ pocoproject/poco │ 28 │ 14 │
│ saltstack/salt │ 38 │ 13 │
│ apache/flink │ 54 │ 12 |
-在ClickHouse提交问题的作者也在哪些存储库中提交了问题?
SELECT
repo_name,
count() AS prs,
uniq(actor_login) AS authors
FROM github_events
WHERE (event_type = 'IssuesEvent') AND (action = 'opened') AND (actor_login IN
(
SELECT actor_login
FROM github_events
WHERE (event_type = 'IssuesEvent') AND (action = 'opened') AND (repo_name IN ('yandex/ClickHouse', 'ClickHouse/ClickHouse'))
)) AND (repo_name NOT ILIKE '%ClickHouse%')
GROUP BY repo_name
ORDER BY authors DESC
LIMIT 50
┌─repo_name─────────────────────────┬─prs─┬─authors─┐
│ grafana/grafana │ 224 │ 68 │
│ golang/go │ 186 │ 54 │
│ apache/incubator-superset │ 124 │ 36 │
│ kubernetes/kubernetes │ 98 │ 34 │
│ elastic/elasticsearch │ 66 │ 30 │
---一天内拥有最多星星的仓库
SELECT
repo_name,
toDate(created_at) AS day,
count() AS stars
FROM github_events
WHERE event_type = 'WatchEvent'
GROUP BY
repo_name,
day
ORDER BY count() DESC
LIMIT 1 BY repo_name
LIMIT 50
┌─repo_name──────────────────────────────────────┬────────day─┬─stars─┐
1. │ 996icu/996.ICU │ 2019-03-28 │ 76056 │
2. │ M4cs/BabySploit │ 2019-09-08 │ 46985 │
3. │ x64dbg/x64dbg │ 2018-01-06 │ 26459 │
---仅在一秒钟内获得最多星星的存储库进行了类似的查询:
SELECT repo_name, created_at, count() AS stars
FROM github_events
WHERE event_type = 'WatchEvent'
GROUP BY repo_name, created_at
ORDER BY count() DESC LIMIT 50
---年增长率最高的存储库
WITH toYear(created_at) AS year
SELECT
repo_name,
sum(year = 2020) AS stars2020,
sum(year = 2019) AS stars2019,
round(stars2020 / stars2019, 3) AS yoy,
min(created_at) AS first_seen
FROM github_events
WHERE event_type = 'WatchEvent'
GROUP BY repo_name
HAVING (min(created_at) <= '2019-01-01 00:00:00') AND (stars2019 >= 1000)
ORDER BY yoy DESC
LIMIT 50
┌─repo_name──────────────────────────────────┬─stars2020─┬─stars2019─┬───yoy─┬──────────first_seen─┐
1. │ puppeteer/puppeteer │ 11310 │ 1255 │ 9.012 │ 2014-03-14 12:01:39 │
2. │ ohmyzsh/ohmyzsh │ 21705 │ 2529 │ 8.582 │ 2017-03-22 16:44:20 │
---停滞最严重的仓库
WITH toYear(created_at) AS year
SELECT
repo_name,
sum(year = 2020) AS stars2020,
sum(year = 2019) AS stars2019,
round(stars2020 / stars2019, 3) AS yoy,
min(created_at) AS first_seen
FROM github_events
WHERE event_type = 'WatchEvent'
GROUP BY repo_name
HAVING (min(created_at) <= '2019-01-01 00:00:00') AND (max(created_at) >= '2020-06-01 00:00:00') AND (stars2019 >= 1000)
ORDER BY yoy ASC
LIMIT 50
┌─repo_name─────────────────────────────────────┬─stars2020─┬─stars2019─┬───yoy─┬──────────first_seen─┐
1. │ DoubleLabyrinth/navicat-keygen │ 1 │ 6459 │ 0 │ 2017-12-08 02:04:48 │
2. │ arpitjindal97/technology_books │ 1 │ 4001 │ 0 │ 2017-08-16 19:08:59 │
3. │ b3log/pipe │ 2 │ 2380 │ 0.001 │ 2017-12-20 05:08:15 |
---随着时间的推移增长最稳定的存储库
SELECT
repo_name, max(stars) AS daily_stars, sum(stars) AS total_stars, total_stars / daily_stars AS rate
FROM
(
SELECT repo_name, toDate(created_at) AS day, count() AS stars
FROM github_events WHERE event_type = 'WatchEvent'
GROUP BY repo_name, day
)
GROUP BY repo_name ORDER BY rate DESC LIMIT 50
┌─repo_name───────────────────────────────┬─daily_stars─┬─total_stars─┬───rate─┐
1. │ mongodb/mongo │ 24 │ 21412 │ 892.17 │
2. │ plataformatec/devise │ 24 │ 21212 │ 883.83 │
3. │ senchalabs/connect │ 11 │ 9636 │ 876 │
---一周内最适合点赞
SELECT toDayOfWeek(created_at) AS day, count() AS stars, bar(stars, 0, 50000000, 10) AS bar
FROM github_events
WHERE event_type = 'WatchEvent' GROUP BY day ORDER BY day
┌─day─┬────stars─┬─bar──────┐
│ 1 │ 36491986 │ ███████▎ │
│ 2 │ 38094378 │ ███████▌ │
│ 3 │ 37570733 │ ███████▌ │
│ 4 │ 37208005 │ ███████▍ │
│ 5 │ 34924484 │ ██████▊ │
│ 6 │ 23726322 │ ████▋ │
│ 7 │ 24102566 │ ████▋ │
---GitHub上的用户总数
SELECT uniq(actor_login) FROM github_events
┌─uniq(actor_login)─┐
│ 34138551 │
└───────────────────┘
---活跃用户星星统计
SELECT
repo_name,
count()
FROM github_events
WHERE (event_type = 'WatchEvent') AND (actor_login IN
(
SELECT actor_login
FROM github_events
WHERE (event_type = 'PullRequestEvent') AND (action = 'opened')
))
GROUP BY repo_name
ORDER BY count() DESC
LIMIT 50
┌─repo_name──────────────────────────────┬─count()─┐
1. │ facebook/react │ 121976 │
2. │ vuejs/vue │ 109518 │
---最大拉请求量的存储库
SELECT repo_name, count(), uniq(actor_login) FROM github_events WHERE event_type = 'PullRequestEvent' AND action = 'opened' GROUP BY repo_name ORDER BY count() DESC LIMIT 50
┌─repo_name───────────────────────────────┬─count()─┬─uniq(actor_login)─┐
1. │ google-test/signcla-probe-repo │ 351806 │ 4 │
2. │ everypolitician/everypolitician-data │ 158134 │ 18 │
3. │ brianchandotcom/liferay-portal │ 93222 │ 338 │
-add the number of stars
... WITH (event_type = 'IssuesEvent') AND (action = 'opened') AS issue_created
SELECT
repo_name,
sum(issue_created) AS c,
uniqIf(actor_login, issue_created) AS u,
sum(event_type = 'WatchEvent') AS stars
FROM github_events
WHERE event_type IN ('IssuesEvent', 'WatchEvent')
GROUP BY repo_name
ORDER BY c DESC
LIMIT 50
┌─repo_name───────────────────────────────┬──────c─┬─────u─┬──stars─┐
1. │ koorellasuresh/UKRegionTest │ 379379 │ 4 │ 1 │
2. │ pddemo/demo │ 216215 │ 1 │ 1 │
-问题,stars >= 1000
... WITH (event_type = 'IssuesEvent') AND (action = 'opened') AS issue_created
SELECT
repo_name,
sum(issue_created) AS c,
uniqIf(actor_login, issue_created) AS u,
sum(event_type = 'WatchEvent') AS stars
FROM github_events
WHERE event_type IN ('IssuesEvent', 'WatchEvent')
GROUP BY repo_name
HAVING stars >= 1000
ORDER BY c DESC
LIMIT 50
-问题,数量
WITH (event_type = 'IssuesEvent') AND (action = 'opened') AS issue_created
SELECT
repo_name,
sum(issue_created) AS c,
uniqIf(actor_login, issue_created) AS u,
sum(event_type = 'WatchEvent') AS stars
FROM github_events
WHERE event_type IN ('IssuesEvent', 'WatchEvent')
GROUP BY repo_name
ORDER BY u DESC
LIMIT 50
┌─repo_name───────────────────────────────────────────────────┬──────c─┬─────u─┬──stars─┐
1. │ Microsoft/vscode │ 65122 │ 27038 │ 82043 │
2. │ MicrosoftDocs/azure-docs │ 44131 │ 21036 │ 4888 │
3. │ microsoft/vscode │ 34798 │ 19479 │ 38395 │
---拥有最多推送访问权限的人的存储库
SELECT repo_name, uniqIf(actor_login, event_type = 'PushEvent') AS u, sum(event_type = 'WatchEvent') AS stars FROM github_events WHERE event_type IN ('PushEvent', 'WatchEvent') AND repo_name != '/' GROUP BY repo_name ORDER BY u DESC LIMIT 50
┌─repo_name───────────────────────────────────────────────────┬────u─┬─stars─┐
1. │ githubschool/open-enrollment-classes-introduction-to-github │ 7869 │ 921 │
2. │ githubschool/on-demand-github-pages │ 1100 │ 87 │
3. │ llvm/llvm-project │ 826 │ 7228 |
---对主分支推送最多的人数,星数
SELECT
repo_name,
uniqIf(actor_login, (event_type = 'PushEvent') AND match(ref, '/(main|master)$')) AS u,
sum(event_type = 'WatchEvent') AS stars
FROM github_events
WHERE (event_type IN ('PushEvent', 'WatchEvent')) AND (repo_name != '/')
GROUP BY repo_name
ORDER BY u DESC
LIMIT 50
┌─repo_name───────────────────────────────────────────────────┬────u─┬─stars─┐
1. │ githubschool/open-enrollment-classes-introduction-to-github │ 5603 │ 921 │
2. │ llvm/llvm-project │ 824 │ 7228 │
---检查一下星星和分支的比例
SELECT
repo_name,
sum(event_type = 'ForkEvent') AS forks,
sum(event_type = 'WatchEvent') AS stars,
round(stars / forks, 3) AS ratio
FROM github_events
WHERE event_type IN ('ForkEvent', 'WatchEvent')
GROUP BY repo_name
ORDER BY forks DESC
LIMIT 50
┌─repo_name───────────────────────────────┬──forks─┬──stars─┬──ratio─┐
1. │ jtleek/datasharing │ 262926 │ 6364 │ 0.024 │
2. │ octocat/Spoon-Knife │ 198031 │ 4601 │ 0.023 │
---多星少分支
SELECT
repo_name,
sum(event_type = 'ForkEvent') AS forks,
sum(event_type = 'WatchEvent') AS stars,
round(stars / forks, 2) AS ratio
FROM github_events
WHERE event_type IN ('ForkEvent', 'WatchEvent')
GROUP BY repo_name
HAVING (stars > 100) AND (forks > 100)
ORDER BY ratio DESC
LIMIT 50
┌─repo_name─────────────────────────────┬─forks─┬─stars─┬──ratio─┐
1. │ M4cs/BabySploit │ 147 │ 71572 │ 486.88 │
2. │ tipsy/github-profile-summary │ 330 │ 22397 │ 67.87 │
3. │ doctrine/inflector │ 155 │ 10236 │ 66.04 │
-星叉之间的整体比例
SELECT sum(event_type = 'ForkEvent') AS forks, sum(event_type = 'WatchEvent') AS stars, round(stars / forks, 2) AS ratio FROM github_events WHERE event_type IN ('ForkEvent', 'WatchEvent')
┌────forks─┬─────stars─┬─ratio─┐
│ 84709181 │ 232118474 │ 2.74 │
-流行,他的值很高
SELECT
sum(stars) AS stars,
sum(forks) AS forks,
round(stars / forks, 2) AS ratio
FROM
(
SELECT
sum(event_type = 'ForkEvent') AS forks,
sum(event_type = 'WatchEvent') AS stars
FROM github_events
WHERE event_type IN ('ForkEvent', 'WatchEvent')
GROUP BY repo_name
HAVING stars > 100
)
┌─────stars─┬────forks─┬─ratio─┐
│ 171567035 │ 44944118 │ 3.82 │
---问题评论与问题之间的比例:
SELECT
repo_name,
count() AS comments,
uniq(number) AS issues,
round(comments / issues, 2) AS ratio
FROM github_events
WHERE event_type = 'IssueCommentEvent'
GROUP BY repo_name
ORDER BY count() DESC
LIMIT 50
┌─repo_name────────────────────────────┬─comments─┬─issues─┬─ratio─┐
1. │ kubernetes/kubernetes │ 1450081 │ 85379 │ 16.98 │
2. │ apache/spark │ 790480 │ 26868 │ 29.42 │
3. │ rust-lang/rust │ 502960 │ 58464 │ 8.6 │
- 数量倒序
ORDER BY count() DESC
-人数大于10
HAVING authors >= 10
-MySQL中having的用法
-mysql中,当我们用到聚合函数,如sum,count后,又需要筛选条件时,having就派上用场了,
-因为WHERE是在聚合前筛选记录的,having和group by是组合着用的
---每个顶级存储库的顶级注释问题
SELECT
concat('https://github.com/', repo_name, '/issues/', toString(number)) AS URL,
max(comments),
argMax(authors, comments) AS authors,
argMax(number, comments) AS number,
sum(stars) AS stars
FROM
(
SELECT *
FROM
(
SELECT
repo_name,
number,
count() AS comments,
uniq(actor_login) AS authors
FROM github_events
WHERE (event_type = 'IssueCommentEvent') AND (action = 'created') AND (number > 10)
GROUP BY
repo_name,
number
HAVING authors >= 10
) AS t1
INNER JOIN
(
SELECT
repo_name,
count() AS stars
FROM github_events
WHERE event_type = 'WatchEvent'
GROUP BY repo_name
HAVING stars > 10000
) AS t2 USING (repo_name)
)
GROUP BY repo_name
ORDER BY stars DESC
LIMIT 50
┌─URL───────────────────────────────────────────────────────────┬─max(comments)─┬─authors─┬─number─┬─────stars─┐
1. │ https://github.com/tensorflow/tensorflow/issues/22 │ 632 │ 156 │ 22 │ 221964318 │
2. │ https://github.com/kubernetes/kubernetes/issues/46254 │ 1345 │ 11 │ 46254 │ 209981996 │
---最热门”的提交
SELECT
concat('https://github.com/', repo_name, '/commit/', commit_id) AS URL,
count() AS comments,
uniq(actor_login) AS authors
FROM github_events
WHERE (event_type = 'CommitCommentEvent') AND notEmpty(commit_id)
GROUP BY
repo_name,
commit_id
HAVING authors >= 10
ORDER BY count() DESC
LIMIT 50
┌─URL─────────────────────────────────────────────────────────────────────────────────────────────────────────────┬─comments─┬─authors─┐
1. │ https://github.com/nixxquality/WebMConverter/commit/c1ac0baac06fa7175677a4a1bf65860a84708d67 │ 467 │ 156 │
2. │ https://github.com/torvalds/linux/commit/8a104f8b5867c682d994ffa7a74093c54469c11f │ 410 │ 181 │
3. │ https://github.com/SEI-ATL/select-best-player/commit/fdc6e0e073fa30acf2041ca0b1c9fecd662d88c8 │ 408 │ 31 │
---过滤掉机器人?让我们为存储库的数量添加一个阈值。
-我们也只统计那些创造了至少两期,并且至少给了两颗星的。
-并且还输出每个用户的收藏库。并且也只统计前10k个存储库
SELECT
actor_login,
sum(event_type = 'PushEvent') AS c,
uniqIf(repo_name, event_type = 'PushEvent') AS repos,
sum(event_type = 'IssuesEvent') AS issues,
sum(event_type = 'WatchEvent') AS stars,
anyHeavy(repo_name)
FROM github_events
WHERE (event_type IN ('PushEvent', 'IssuesEvent', 'WatchEvent')) AND (repo_name IN
(
SELECT repo_name
FROM github_events
WHERE event_type = 'WatchEvent'
GROUP BY repo_name
ORDER BY count() DESC
LIMIT 10000
))
GROUP BY actor_login
HAVING (repos < 10000) AND (issues > 1) AND (stars > 1)
ORDER BY c DESC
LIMIT 50
┌─actor_login──────┬─────c─┬─repos─┬─issues─┬─stars─┬─anyHeavy(repo_name)──────────────────────────┐
1. │ bgamari │ 64829 │ 1 │ 99 │ 11 │ syl20bnr/spacemacs │
2. │ ornicar │ 50108 │ 3 │ 4341 │ 94 │ ornicar/lila │
---按星级数组织
SELECT
lower(substring(repo_name, 1, position(repo_name, '/'))) AS org,
count() AS stars
FROM github_events
WHERE event_type = 'WatchEvent'
GROUP BY org
ORDER BY stars DESC
LIMIT 50
┌─org──────────────────┬───stars─┐
1. │ google/ │ 1414877 │
2. │ microsoft/ │ 1361303 │
3. │ facebook/ │ 1123380 │
---按存储库数量划分的组织
SELECT
lower(substring(repo_name, 1, position(repo_name, '/'))) AS org,
uniq(repo_name) AS repos
FROM
(
SELECT repo_name
FROM github_events
WHERE event_type = 'WatchEvent'
GROUP BY repo_name
HAVING count() >= 10
)
GROUP BY org
ORDER BY repos DESC
LIMIT 50
┌─org──────────────────┬─repos─┐
1. │ microsoft/ │ 3359 │
2. │ google/ │ 1599 │
---社区组织大小
SELECT
lower(substring(repo_name, 1, position(repo_name, '/'))) AS org,
uniq(actor_login) AS authors,
uniqIf(actor_login, event_type = 'PullRequestEvent') AS pr_authors,
uniqIf(actor_login, event_type = 'IssuesEvent') AS issue_authors,
uniqIf(actor_login, event_type = 'IssueCommentEvent') AS comment_authors,
uniqIf(actor_login, event_type = 'PullRequestReviewCommentEvent') AS review_authors,
uniqIf(actor_login, event_type = 'PushEvent') AS push_authors
FROM github_events
WHERE event_type IN ('PullRequestEvent', 'IssuesEvent', 'IssueCommentEvent', 'PullRequestReviewCommentEvent', 'PushEvent')
GROUP BY org
ORDER BY authors DESC
LIMIT 50
┌─org─────────────────────┬─authors─┬─pr_authors─┬─issue_authors─┬─comment_authors─┬─review_authors─┬─push_authors─┐
│ microsoft/ │ 241626 │ 31468 │ 143654 │ 183013 │ 10727 │ 7020 │
│ facebook/ │ 114166 │ 19094 │ 49710 │ 93241 │ 4966 │ 1168 │
│ google/ │ 99459 │ 29227 │ 49833 │ 69777 │ 6777 │ 3074 │
---按修改代码的数量存储库
SELECT
repo_name,
count() AS prs,
uniq(actor_login) AS authors,
sum(additions) AS adds,
sum(deletions) AS dels
FROM github_events
WHERE (event_type = 'PullRequestEvent') AND (action = 'opened') AND (additions < 10000) AND (deletions < 10000)
GROUP BY repo_name
HAVING (adds / dels) < 10
ORDER BY adds + dels DESC
LIMIT 50
┌─repo_name──────────────────────────────────┬────prs─┬─authors─┬─────adds─┬─────dels─┐
│ everypolitician/everypolitician-data │ 150531 │ 18 │ 66782324 │ 71203492 │
│ brianchandotcom/liferay-portal │ 91962 │ 337 │ 29304605 │ 13799025 │
│ googleapis/google-api-java-client-services │ 4689 │ 17 │ 12184021 │ 9874444 │
---按推送次数存储库
SELECT
repo_name,
count() AS prs,
uniq(actor_login) AS authors,
sum(additions) AS adds,
sum(deletions) AS dels
FROM github_events
WHERE (event_type = 'PullRequestEvent') AND (action = 'opened') AND (additions < 10000) AND (deletions < 10000)
GROUP BY repo_name
HAVING (adds / dels) < 10
ORDER BY adds + dels DESC
LIMIT 50
┌─repo_name──────────────────────────────────┬────prs─┬─authors─┬─────adds─┬─────dels─┐
│ everypolitician/everypolitician-data │ 150531 │ 18 │ 66782324 │ 71203492 │
│ brianchandotcom/liferay-portal │ 91962 │ 337 │ 29304605 │ 13799025 │
│ googleapis/google-api-java-client-services │ 4689 │ 17 │ 12184021 │ 9874444 │
---拥有最多代码评审的作者
SELECT
actor_login,
count(),
uniq(repo_name) AS repos,
uniq(repo_name, number) AS prs,
replaceRegexpAll(substringUTF8(anyHeavy(body), 1, 100), '[\r\n]', ' ') AS comment
FROM github_events
WHERE (event_type = 'PullRequestReviewCommentEvent') AND (action = 'created')
GROUP BY actor_login
ORDER BY count() DESC
LIMIT 50
┌─actor_login──────────────┬─count()─┬─repos─┬───prs─┬─comment─────────────────────────────────────────────────────────────────────────────────────────
│ houndci-bot │ 991954 │ 5877 │ 78600 │ Trailing Whitespace Violation: Lines should not have trailing whitespace. (trailing_whitespace) │
│ houndci │ 342114 │ 3102 │ 28679 │ Prefer double-quoted strings unless you need single quotes to avoid extra backslashes for escaping.
---问题和拉动式请求最受欢迎的标签
SELECT
arrayJoin(labels) AS label,
count() AS c
FROM github_events
WHERE (event_type IN ('IssuesEvent', 'PullRequestEvent', 'IssueCommentEvent')) AND (action IN ('created', 'opened', 'labeled'))
GROUP BY label
ORDER BY c DESC
LIMIT 50
┌─label─────────────────────────┬────────c─┐
1. │ bug │ 11193148 │
2. │ enhancement │ 10885289 │
3. │ dependencies │ 7752896 │
-bugs and features多样性:
WHERE (event_type IN ('IssuesEvent', 'PullRequestEvent', 'IssueCommentEvent'))
AND (action IN ('created', 'opened', 'labeled')) AND ((label ILIKE '%bug%') OR (label ILIKE '%feature%'))
-
WITH arrayJoin(labels) AS label
SELECT
sum(label ILIKE '%bug%') AS bugs,
sum(label ILIKE '%feature%') AS features,
bugs / features AS ratio
FROM github_events
WHERE (event_type IN ('IssuesEvent', 'PullRequestEvent', 'IssueCommentEvent')) AND (action IN ('created', 'opened', 'labeled')) AND ((label ILIKE '%bug%') OR (label ILIKE '%feature%'))
┌─────bugs─┬─features─┬──────────────ratio─┐
│ 17430607 │ 4828695 │ 3.6097966427782247 │
└──────────┴──────────┴────────────────────┘
---最长的存储库名称
SELECT count(), repo_name FROM github_events
WHERE event_type = 'WatchEvent' GROUP BY repo_name
ORDER BY length(repo_name) DESC LIMIT 50
---最短的存储库名称
SELECT repo_name, count() FROM github_events WHERE event_type = 'WatchEvent' AND repo_name LIKE '%_/_%' GROUP BY repo_name ORDER BY length(repo_name) ASC LIMIT 50
┌─repo_name─┬─count()─┐
│ 2/t │ 1 │
│ s/s │ 1 │
---ClickHouse相关的库
SELECT repo_name, count() FROM github_events
WHERE body ILIKE '%ClickHouse%' GROUP BY repo_name ORDER BY count() DESC LIMIT 50
┌─repo_name──────────────────────────┬─count()─┐
│ ClickHouse/ClickHouse │ 12661 │
│ yandex/ClickHouse │ 7412 │
│ traceon/ClickHouse │ 2339 │
-ClickHouse相关的库,按星号
SELECT
repo_name,
sum(event_type = 'WatchEvent') AS num_stars,
sum(body ILIKE '%ClickHouse%') AS num_comments
FROM github_events
WHERE (body ILIKE '%ClickHouse%') OR (event_type = 'WatchEvent')
GROUP BY repo_name
HAVING num_comments > 0
ORDER BY num_stars DESC
LIMIT 50
┌─repo_name───────────────────┬─num_stars─┬─num_comments─┐
1. │ 996icu/996.ICU │ 354850 │ 1 │
2. │ golang/go │ 92407 │ 6 │
---GitHub上最受欢迎的评论
SELECT body, count() FROM github_events
WHERE notEmpty(body) AND length(body) < 100 GROUP BY body ORDER BY count() DESC LIMIT
---ORDER BY rand
SELECT repo_name FROM github_events WHERE event_type = 'WatchEvent' ORDER BY rand() LIMIT 50
###
###
create a table
###
CREATE TABLE github_events
(
file_time DateTime,
event_type Enum('CommitCommentEvent' = 1, 'CreateEvent' = 2, 'DeleteEvent' = 3, 'ForkEvent' = 4,
'GollumEvent' = 5, 'IssueCommentEvent' = 6, 'IssuesEvent' = 7, 'MemberEvent' = 8,
'PublicEvent' = 9, 'PullRequestEvent' = 10, 'PullRequestReviewCommentEvent' = 11,
'PushEvent' = 12, 'ReleaseEvent' = 13, 'SponsorshipEvent' = 14, 'WatchEvent' = 15,
'GistEvent' = 16, 'FollowEvent' = 17, 'DownloadEvent' = 18, 'PullRequestReviewEvent' = 19,
'ForkApplyEvent' = 20, 'Event' = 21, 'TeamAddEvent' = 22),
actor_login LowCardinality(String),
repo_name LowCardinality(String),
created_at DateTime,
updated_at DateTime,
action Enum('none' = 0, 'created' = 1, 'added' = 2, 'edited' = 3, 'deleted' = 4, 'opened' = 5, 'closed' = 6, 'reopened' = 7, 'assigned' = 8, 'unassigned' = 9,
'labeled' = 10, 'unlabeled' = 11, 'review_requested' = 12, 'review_request_removed' = 13, 'synchronize' = 14, 'started' = 15, 'published' = 16, 'update' = 17, 'create' = 18, 'fork' = 19, 'merged' = 20),
comment_id UInt64,
body String,
path String,
position Int32,
line Int32,
ref LowCardinality(String),
ref_type Enum('none' = 0, 'branch' = 1, 'tag' = 2, 'repository' = 3, 'unknown' = 4),
creator_user_login LowCardinality(String),
number UInt32,
title String,
labels Array(LowCardinality(String)),
state Enum('none' = 0, 'open' = 1, 'closed' = 2),
locked UInt8,
assignee LowCardinality(String),
assignees Array(LowCardinality(String)),
comments UInt32,
author_association Enum('NONE' = 0, 'CONTRIBUTOR' = 1, 'OWNER' = 2, 'COLLABORATOR' = 3, 'MEMBER' = 4, 'MANNEQUIN' = 5),
closed_at DateTime,
merged_at DateTime,
merge_commit_sha String,
requested_reviewers Array(LowCardinality(String)),
requested_teams Array(LowCardinality(String)),
head_ref LowCardinality(String),
head_sha String,
base_ref LowCardinality(String),
base_sha String,
merged UInt8,
mergeable UInt8,
rebaseable UInt8,
mergeable_state Enum('unknown' = 0, 'dirty' = 1, 'clean' = 2, 'unstable' = 3, 'draft' = 4),
merged_by LowCardinality(String),
review_comments UInt32,
maintainer_can_modify UInt8,
commits UInt32,
additions UInt32,
deletions UInt32,
changed_files UInt32,
diff_hunk String,
original_position UInt32,
commit_id String,
original_commit_id String,
push_size UInt32,
push_distinct_size UInt32,
member_login LowCardinality(String),
release_tag_name String,
release_name String,
review_state Enum('none' = 0, 'approved' = 1, 'changes_requested' = 2, 'commented' = 3, 'dismissed' = 4, 'pending' = 5)
)
ENGINE = MergeTree
ORDER BY (event_type, repo_name, created_at)
###插入数据
pixz -d < github_events.native.xz | clickhouse-client --query "INSERT INTO github_events FORMAT Native"
-转换格式jsonlines
xz -d < github_events.native.xz | clickhouse-local --input-format Native --output-format JSONEachRow --query "SELECT * FROM table" --structure "event_type Enum('CommitCommentEvent' = 1, 'CreateEvent' = 2, 'DeleteEvent' = 3, 'ForkEvent' = 4, 'GollumEvent' = 5, 'IssueCommentEvent' = 6, 'IssuesEvent' = 7, 'MemberEvent' = 8, 'PublicEvent' = 9, 'PullRequestEvent' = 10, 'PullRequestReviewCommentEvent' = 11, 'PushEvent' = 12, 'ReleaseEvent' = 13, 'SponsorshipEvent' = 14, 'WatchEvent' = 15, 'GistEvent' = 16, 'FollowEvent' = 17, 'DownloadEvent' = 18, 'PullRequestReviewEvent' = 19, 'ForkApplyEvent' = 20), actor_login LowCardinality(String), repo_name LowCardinality(String), created_at DateTime, updated_at DateTime, action Enum('none' = 0, 'created' = 1, 'added' = 2, 'edited' = 3, 'deleted' = 4, 'opened' = 5, 'closed' = 6, 'reopened' = 7, 'assigned' = 8, 'unassigned' = 9, 'labeled' = 10, 'unlabeled' = 11, 'review_requested' = 12, 'review_request_removed' = 13, 'synchronize' = 14, 'started' = 15, 'published' = 16, 'update' = 17, 'create' = 18, 'fork' = 19), comment_id UInt64, body String, path String, position Int32, line UInt32, ref LowCardinality(String), ref_type Enum('none' = 0, 'branch' = 1, 'tag' = 2, 'repository' = 3), creator_user_login LowCardinality(String), number UInt32, title String, labels Array(LowCardinality(String)), state Enum('none' = 0, 'open' = 1, 'closed' = 2), locked UInt8, assignee LowCardinality(String), assignees Array(LowCardinality(String)), comments UInt32, author_association Enum('NONE' = 0, 'CONTRIBUTOR' = 1, 'OWNER' = 2, 'COLLABORATOR' = 3, 'MEMBER' = 4, 'MANNEQUIN' = 5), closed_at DateTime, merged_at DateTime, merge_commit_sha String, requested_reviewers Array(LowCardinality(String)), requested_teams Array(LowCardinality(String)), head_ref LowCardinality(String), head_sha String, base_ref LowCardinality(String), base_sha String, merged UInt8, mergeable UInt8, rebaseable UInt8, mergeable_state Enum('unknown' = 0, 'dirty' = 1, 'clean' = 2, 'unstable' = 3, 'draft' = 4), merged_by LowCardinality(String), review_comments UInt32, maintainer_can_modify UInt8, commits UInt32, additions UInt32, deletions UInt32, changed_files UInt32, diff_hunk String, original_position UInt32, commit_id String, original_commit_id String, push_size UInt32, push_distinct_size UInt32, member_login LowCardinality(String), release_tag_name String, release_name String, review_state Enum('none' = 0, 'approved' = 1, 'changes_requested' = 2, 'commented' = 3, 'dismissed' = 4, 'pending' = 5)"
-read data from a URL
CREATE TABLE github_events_url
ENGINE = URL('https://clickhouse-public-datasets.s3.amazonaws.com/github_events_v2.native.xz');
-Create the destination table and insert data:
CREATE TABLE github_events ENGINE = MergeTree ORDER BY (event_type, repo_name, created_at) AS SELECT * FROM github_events_url;
---无需预处理的数据分析方法是使用clickhouse-local工具。
-它类似于用ClickHouse SQL引擎替代awk、sed和grep
clickhouse-local --query "
SELECT count() FROM file('*.json.gz', LineAsString, 'data String')
WHERE JSONExtractString(data, 'actor', 'login') = 'alexey-milovidov'"
###
CREATE TABLE github_events
###
CREATE TABLE github_events
(
file_time DateTime,
event_type Enum('CommitCommentEvent' = 1, 'CreateEvent' = 2, 'DeleteEvent' = 3, 'ForkEvent' = 4,
'GollumEvent' = 5, 'IssueCommentEvent' = 6, 'IssuesEvent' = 7, 'MemberEvent' = 8,
'PublicEvent' = 9, 'PullRequestEvent' = 10, 'PullRequestReviewCommentEvent' = 11,
'PushEvent' = 12, 'ReleaseEvent' = 13, 'SponsorshipEvent' = 14, 'WatchEvent' = 15,
'GistEvent' = 16, 'FollowEvent' = 17, 'DownloadEvent' = 18, 'PullRequestReviewEvent' = 19,
'ForkApplyEvent' = 20, 'Event' = 21, 'TeamAddEvent' = 22),
actor_login LowCardinality(String),
repo_name LowCardinality(String),
created_at DateTime,
updated_at DateTime,
action Enum('none' = 0, 'created' = 1, 'added' = 2, 'edited' = 3, 'deleted' = 4, 'opened' = 5, 'closed' = 6, 'reopened' = 7, 'assigned' = 8, 'unassigned' = 9,
'labeled' = 10, 'unlabeled' = 11, 'review_requested' = 12, 'review_request_removed' = 13, 'synchronize' = 14, 'started' = 15, 'published' = 16, 'update' = 17, 'create' = 18, 'fork' = 19, 'merged' = 20),
comment_id UInt64,
body String,
path String,
position Int32,
line Int32,
ref LowCardinality(String),
ref_type Enum('none' = 0, 'branch' = 1, 'tag' = 2, 'repository' = 3, 'unknown' = 4),
creator_user_login LowCardinality(String),
number UInt32,
title String,
labels Array(LowCardinality(String)),
state Enum('none' = 0, 'open' = 1, 'closed' = 2),
locked UInt8,
assignee LowCardinality(String),
assignees Array(LowCardinality(String)),
comments UInt32,
author_association Enum('NONE' = 0, 'CONTRIBUTOR' = 1, 'OWNER' = 2, 'COLLABORATOR' = 3, 'MEMBER' = 4, 'MANNEQUIN' = 5),
closed_at DateTime,
merged_at DateTime,
merge_commit_sha String,
requested_reviewers Array(LowCardinality(String)),
requested_teams Array(LowCardinality(String)),
head_ref LowCardinality(String),
head_sha String,
base_ref LowCardinality(String),
base_sha String,
merged UInt8,
mergeable UInt8,
rebaseable UInt8,
mergeable_state Enum('unknown' = 0, 'dirty' = 1, 'clean' = 2, 'unstable' = 3, 'draft' = 4),
merged_by LowCardinality(String),
review_comments UInt32,
maintainer_can_modify UInt8,
commits UInt32,
additions UInt32,
deletions UInt32,
changed_files UInt32,
diff_hunk String,
original_position UInt32,
commit_id String,
original_commit_id String,
push_size UInt32,
push_distinct_size UInt32,
member_login LowCardinality(String),
release_tag_name String,
release_name String,
review_state Enum('none' = 0, 'approved' = 1, 'changes_requested' = 2, 'commented' = 3, 'dismissed' = 4, 'pending' = 5)
) ENGINE = MergeTree ORDER BY (event_type, repo_name, created_at);
###
何将数据从JSON转换成平面表
###
find . -name '*.json.gz' | xargs -P$(nproc) -I{} bash -c "
gzip -cd {} | jq -c '
[
(\"{}\" | scan(\"[0-9]+-[0-9]+-[0-9]+-[0-9]+\")),
.type,
.actor.login? // .actor_attributes.login? // (.actor | strings) // null,
.repo.name? // (.repository.owner? + \"/\" + .repository.name?) // null,
.created_at,
.payload.updated_at? // .payload.comment?.updated_at? // .payload.issue?.updated_at? // .payload.pull_request?.updated_at? // null,
.payload.action,
.payload.comment.id,
.payload.review.body // .payload.comment.body // .payload.issue.body? // .payload.pull_request.body? // .payload.release.body? // null,
.payload.comment?.path? // null,
.payload.comment?.position? // null,
.payload.comment?.line? // null,
.payload.ref? // null,
.payload.ref_type? // null,
.payload.comment.user?.login? // .payload.issue.user?.login? // .payload.pull_request.user?.login? // null,
.payload.issue.number? // .payload.pull_request.number? // .payload.number? // null,
.payload.issue.title? // .payload.pull_request.title? // null,
[.payload.issue.labels?[]?.name // .payload.pull_request.labels?[]?.name],
.payload.issue.state? // .payload.pull_request.state? // null,
.payload.issue.locked? // .payload.pull_request.locked? // null,
.payload.issue.assignee?.login? // .payload.pull_request.assignee?.login? // null,
[.payload.issue.assignees?[]?.login? // .payload.pull_request.assignees?[]?.login?],
.payload.issue.comments? // .payload.pull_request.comments? // null,
.payload.review.author_association // .payload.issue.author_association? // .payload.pull_request.author_association? // null,
.payload.issue.closed_at? // .payload.pull_request.closed_at? // null,
.payload.pull_request.merged_at? // null,
.payload.pull_request.merge_commit_sha? // null,
[.payload.pull_request.requested_reviewers?[]?.login],
[.payload.pull_request.requested_teams?[]?.name],
.payload.pull_request.head?.ref? // null,
.payload.pull_request.head?.sha? // null,
.payload.pull_request.base?.ref? // null,
.payload.pull_request.base?.sha? // null,
.payload.pull_request.merged? // null,
.payload.pull_request.mergeable? // null,
.payload.pull_request.rebaseable? // null,
.payload.pull_request.mergeable_state? // null,
.payload.pull_request.merged_by?.login? // null,
.payload.pull_request.review_comments? // null,
.payload.pull_request.maintainer_can_modify? // null,
.payload.pull_request.commits? // null,
.payload.pull_request.additions? // null,
.payload.pull_request.deletions? // null,
.payload.pull_request.changed_files? // null,
.payload.comment.diff_hunk? // null,
.payload.comment.original_position? // null,
.payload.comment.commit_id? // null,
.payload.comment.original_commit_id? // null,
.payload.size? // null,
.payload.distinct_size? // null,
.payload.member.login? // .payload.member? // null,
.payload.release?.tag_name? // null,
.payload.release?.name? // null,
.payload.review?.state? // null
]' | clickhouse-client --input_format_null_as_default 1 --date_time_input_format best_effort --query 'INSERT INTO github_events FORMAT JSONCompactEachRow' || echo 'File {} has issues'
###
如何持续更新数据库
###
-cron
# Assuming raw data is located in './gharchive' directory.
mkdir gharchive_new
cd gharchive_new
ls -1 ../gharchive | clickhouse-local --structure 'file String' --query "WITH (SELECT max(parseDateTimeBestEffort(extract(file, '^(.+)\.json\.gz$'), 'UTC')) FROM table) AS last SELECT toString(toDate(last + INTERVAL arrayJoin(range(0, 24)) - 12 HOUR AS t)) || '-' || toString(toHour(t)) || '.json.gz' WHERE t < now()" | xargs -I{} bash -c "[ -f ../gharchive/{} ] || wget --continue 'https://data.gharchive.org/{}'"
find . -name '*.json.gz' | xargs -P$(nproc) -I{} bash -c "
gzip -cd {} | jq -c '
[
(\"{}\" | scan(\"[0-9]+-[0-9]+-[0-9]+-[0-9]+\")),
.type,
.actor.login? // .actor_attributes.login? // (.actor | strings) // null,
.repo.name? // (.repository.owner? + \"/\" + .repository.name?) // null,
.created_at,
.payload.updated_at? // .payload.comment?.updated_at? // .payload.issue?.updated_at? // .payload.pull_request?.updated_at? // null,
.payload.action,
.payload.comment.id,
.payload.review.body // .payload.comment.body // .payload.issue.body? // .payload.pull_request.body? // .payload.release.body? // null,
.payload.comment?.path? // null,
.payload.comment?.position? // null,
.payload.comment?.line? // null,
.payload.ref? // null,
.payload.ref_type? // null,
.payload.comment.user?.login? // .payload.issue.user?.login? // .payload.pull_request.user?.login? // null,
.payload.issue.number? // .payload.pull_request.number? // .payload.number? // null,
.payload.issue.title? // .payload.pull_request.title? // null,
[.payload.issue.labels?[]?.name // .payload.pull_request.labels?[]?.name],
.payload.issue.state? // .payload.pull_request.state? // null,
.payload.issue.locked? // .payload.pull_request.locked? // null,
.payload.issue.assignee?.login? // .payload.pull_request.assignee?.login? // null,
[.payload.issue.assignees?[]?.login? // .payload.pull_request.assignees?[]?.login?],
.payload.issue.comments? // .payload.pull_request.comments? // null,
.payload.review.author_association // .payload.issue.author_association? // .payload.pull_request.author_association? // null,
.payload.issue.closed_at? // .payload.pull_request.closed_at? // null,
.payload.pull_request.merged_at? // null,
.payload.pull_request.merge_commit_sha? // null,
[.payload.pull_request.requested_reviewers?[]?.login],
[.payload.pull_request.requested_teams?[]?.name],
.payload.pull_request.head?.ref? // null,
.payload.pull_request.head?.sha? // null,
.payload.pull_request.base?.ref? // null,
.payload.pull_request.base?.sha? // null,
.payload.pull_request.merged? // null,
.payload.pull_request.mergeable? // null,
.payload.pull_request.rebaseable? // null,
.payload.pull_request.mergeable_state? // null,
.payload.pull_request.merged_by?.login? // null,
.payload.pull_request.review_comments? // null,
.payload.pull_request.maintainer_can_modify? // null,
.payload.pull_request.commits? // null,
.payload.pull_request.additions? // null,
.payload.pull_request.deletions? // null,
.payload.pull_request.changed_files? // null,
.payload.comment.diff_hunk? // null,
.payload.comment.original_position? // null,
.payload.comment.commit_id? // null,
.payload.comment.original_commit_id? // null,
.payload.size? // null,
.payload.distinct_size? // null,
.payload.member.login? // .payload.member? // null,
.payload.release?.tag_name? // null,
.payload.release?.name? // null,
.payload.review?.state? // null
]' | clickhouse-client --input_format_null_as_default 1 --date_time_input_format best_effort --query 'INSERT INTO github_events FORMAT JSONCompactEachRow' || echo 'File {} has issues'
" && mv *.json.gz ../gharchive
-How to create .tsv.xz and .native.xz dumps
-xz -z 要压缩的文件,-k参数来保留文件,如果要设置压缩率加入参数 -0 到 -9调节压缩率。如果不设置,默认压缩等级是6
-xz -d 要解压的文件,-k参数来保留文件.
clickhouse-client --progress --max_threads 1 --query "SELECT * FROM github_events FORMAT TSV" | pixz > github_events.xz
clickhouse-client --progress --max_threads 1 --query "SELECT * FROM github_events FORMAT Native" | pixz > github_events.native.xz
追梦赤子心:domsn.com
签名:这个人很懒,什么也没有留下!
收藏的用户(0)
X
正在加载信息~
最新回复 (0)