MongoDB for MySQL People


30 Jul 2020 mongodb

I am a MySQL guy. I can query MySQL, BigQuery or SQL-like databases for my day to day business. Last week, I faced a challenge where I needed to extract some information from a production MongoDB server that involves aggregation. I had no idea how MongoDB works or how to run CRUD operation on such database. So obviously, I spent my whole weekend in learning MongoDB. In this post I am going to share the MongoDB queries I learnt, and the equivalent MySQL queries as well.

I used MongoDB v3.6, for both daemon and cli, so some functions may not work with your current setup

Populate Data and Load into MongoDB

I used the following piece of code to scrap my Github profile information into a single JSON file.

import requests
import json

username = 'mdminhazulhaque'
repos = []
page = 1

while True:
    params = (
        ('page', page),
        ('type', 'owner'),
        ('per_page', 100)
    )
    response = requests.get('https://api.github.com/users/{username}/repos'\
        .format(username=username), params=params)
    for single in response.json():
        repos.append({
            "name": single.get('name', None),
            "description": single.get('description', None),
            "created_at": single.get('created_at', None),
            "language": single.get('language', None),
            "stargazers_count": single.get('stargazers_count', None)
        })
    if len(response.json()) < 100:
        break
    else:
        page += 1
with open("github.json", "w") as fp:
    json.dump(repos, fp, indent=4)

Then I loaded the JSON data into MongoDB using mongoimport.

mongoimport --db local --collection repos --drop --jsonArray --file github.json 

One record from the repo collection looks like the following.

$ mongo 127.0.0.1/local
> db.repos.findOne()
{
        "_id" : ObjectId("5f1cfca24662cbc419b65e29"),
        "name" : "a-pdf-reader",
        "description" : "A poppler based pdf reader written in Qt4",
        "created_at" : "2012-10-07T20:44:48Z",
        "language" : "C++",
        "stargazers_count": null
}

Let’s start playing with MongoDB.

SELECT

select * from repos

> db.repos.find({})

{ "_id" : ObjectId("5f1cfca24662cbc419b65e2a"), "name" : "apt-sync", "created_at" : "2017-11-19T04:57:23Z", "language" : "Python", "stargazers_count" : 0 }
{ "_id" : ObjectId("5f1cfca24662cbc419b65e2b"), "name" : "aws-cli-cheatsheet", "created_at" : "2020-02-27T16:38:21Z", "language" : null, "stargazers_count" : 52 }
{ "_id" : ObjectId("5f1cfca24662cbc419b65e2c"), "name" : "banglalion-wimax-mac", "created_at" : "2015-02-25T19:15:48Z", "language" : "HTML", "stargazers_count" : 0 }
{ "_id" : ObjectId("5f1cfca24662cbc419b65e2d"), "name" : "annoying-bn-text", "created_at" : "2019-06-05T17:25:05Z", "language" : "HTML", "stargazers_count" : 1 }
{ "_id" : ObjectId("5f1cfca24662cbc419b65e2e"), "name" : "banglalionwimaxapi", "created_at" : "2018-10-16T13:03:53Z", "language" : "Python", "stargazers_count" : 0 }
{ "_id" : ObjectId("5f1cfca24662cbc419b65e2f"), "name" : "bcm-wimax-dkms", "created_at" : "2017-05-20T18:22:07Z", "language" : "C", "stargazers_count" : 0 }
{ "_id" : ObjectId("5f1cfca24662cbc419b65e30"), "name" : "bd-mrp-api", "created_at" : "2018-12-31T20:26:48Z", "language" : "Python", "stargazers_count" : 1 }

select name, description from repos

> db.repos.find({}, {_id:0, name:1, description:1})

{ "name" : "a-pdf-reader", "description" : "A poppler based pdf reader written in Qt4" }
{ "name" : "apt-sync", "description" : ":gift: Sync APT installed package list across different machines" }
{ "name" : "aws-cli-cheatsheet", "description" : ":cloud: AWS CLI + JQ = Make life easier" }
{ "name" : "banglalion-wimax-mac", "description" : ":satellite: Banglalion WiMAX on Mac OS X" }
{ "name" : "annoying-bn-text", "description" : "Piss of your friends with awkwardly written Bengali text" }

select name, description from repos limit 2

> db.repos.aggregate([{$project:{_id:0, name:1, description:1}}, {$limit:5}])

{ "name" : "a-pdf-reader", "description" : "A poppler based pdf reader written in Qt4" }
{ "name" : "apt-sync", "description" : ":gift: Sync APT installed package list across different machines" }

select name, description from repos order by name

> db.repos.find({}, {_id:0, name:1, description:1}).sort({name:1})

{ "name" : "Android-Javascript-Bridge", "description" : "Android and Javascript two way communication" }
{ "name" : "Android-JavascriptInterface-Sample", "description" : "Call Java (Android) methods from Webview using Javascript" }
{ "name" : "Bilai-PnP-Gui", "description" : ":cat2: GUI for Bilai PnP Modems" }
{ "name" : "CPPad", "description" : "Write and run C++ codes, instantly!" }
{ "name" : "CSE10Routine", "description" : "Tabbed routine app" }

select name, stargazers_count from repos order by stargazers_count desc

> db.repos.find({}, {_id:0, name:1, stargazers_count:1}).sort({stargazers_count:-1})

{ "name" : "aws-cli-cheatsheet", "stargazers_count" : 52 }
{ "name" : "probhat-osx", "stargazers_count" : 23 }
{ "name" : "PyQt-BPNN", "stargazers_count" : 22 }
{ "name" : "foreach", "stargazers_count" : 14 }
{ "name" : "html-table-to-json", "stargazers_count" : 12 }

select name, stargazers_count from repos where stargazers_count > 20 order by stargazers_count

> db.repos.find({ stargazers_count: { $gt: 20} }, {_id:0, name:1, stargazers_count:1}).sort({stargazers_count:-1})

{ "name" : "aws-cli-cheatsheet", "stargazers_count" : 52 }
{ "name" : "probhat-osx", "stargazers_count" : 23 }
{ "name" : "PyQt-BPNN", "stargazers_count" : 22 }

select name, stargazers_count from repos where language = “Python”

> db.repos.find({ language: "Python" }, {_id:0, name:1, stargazers_count:1})

{ "name" : "apt-sync", "stargazers_count" : 0 }
{ "name" : "banglalionwimaxapi", "stargazers_count" : 0 }
{ "name" : "bd-mrp-api", "stargazers_count" : 1 }
{ "name" : "flightaware-cli", "stargazers_count" : 0 }
{ "name" : "html-table-to-json", "stargazers_count" : 12 }

select name, description from repos where description like “%cli%”

> db.repos.find({ description: /cli/i }, {_id:0, name:1, description:1})

{ "name" : "aws-cli-cheatsheet", "description" : ":cloud: AWS CLI + JQ = Make life easier" }
{ "name" : "ClipMonitor", "description" : "A tiny clip monitor app built with Qt5" }
{ "name" : "foreach", "description" : ":repeat: Read lines from file/stdin and execute them as CLI argument" }
{ "name" : "kakitangan-cli", "description" : ":briefcase: CLI for Kakitangan app, the online HR Software for Malaysian businesses" }
{ "name" : "librs232", "description" : ":paperclip: RS232 Library in C" }

select language, count(*) as count, description from repos group by language order by count

> db.repos.aggregate([{ $group: { _id: "$language", count: { $sum: 1 } } },{ $sort: { count: -1 } }])

{ "_id" : "C++", "count" : 61 }
{ "_id" : "Python", "count" : 25 }
{ "_id" : "HTML", "count" : 12 }
{ "_id" : "Java", "count" : 12 }
{ "_id" : "Shell", "count" : 10 }

select language, sum(stargazers_count) as stars, description from repos group by language order by stars

> db.repos.aggregate([{ $group: { _id: "$language", stars: { $sum: "$stargazers_count" } } },{ $sort: { stars: -1 } }])

{ "_id" : "C++", "stars" : 72 }
{ "_id" : null, "stars" : 52 }
{ "_id" : "Python", "stars" : 47 }
{ "_id" : "Shell", "stars" : 43 }
{ "_id" : "HTML", "stars" : 19 }

select name, substr(created_at, 0, 4) as created_at_year from repos order by created_at_year desc

> db.repos.aggregate([{$addFields:{"created_at_year":{$substr:["$created_at", 0, 4]}}}, {$project: {_id:0, name:1, created_at_year:1}}, {$sort: {created_at_year:-1}}])

{ "name" : "aws-cli-cheatsheet", "created_at_year" : "2020" }
{ "name" : "gitfolio", "created_at_year" : "2020" }
{ "name" : "messenger-bot", "created_at_year" : "2020" }
{ "name" : "urctl", "created_at_year" : "2020" }
{ "name" : "annoying-bn-text", "created_at_year" : "2019" }
{ "name" : "esp32-mqtt-ir-remote", "created_at_year" : "2019" }

select substr(created_at, 0, 4)) as year, count(*) as count from repos group by year order by count desc

> db.repos.aggregate([{$addFields:{"created_at_year":{$substr:["$created_at", 0, 4]}}}, { $group: { _id: "$created_at_year", count: { $sum: 1 } } },{ $sort: { count: -1 } }])

{ "_id" : "2015", "count" : 41 }
{ "_id" : "2016", "count" : 35 }
{ "_id" : "2012", "count" : 18 }
{ "_id" : "2014", "count" : 14 }
{ "_id" : "2013", "count" : 12 }
{ "_id" : "2019", "count" : 10 }

select name, unix_timestamp(str_to_date(created_at, ‘%Y-%M-%dT%h:%m:%sz’)) from repos

> db.repos.aggregate([{$addFields: {date: { $dateFromString: { dateString: "$created_at" }}}}, {$project: {_id:0, created_at_usec: { $subtract: [ "$date", new Date("1970-01-01") ] }, name:1}}])

{ "name" : "a-pdf-reader", "created_at_usec" : NumberLong("1349642688000") }
{ "name" : "apt-sync", "created_at_usec" : NumberLong("1511067443000") }
{ "name" : "aws-cli-cheatsheet", "created_at_usec" : NumberLong("1582821501000") }
{ "name" : "banglalion-wimax-mac", "created_at_usec" : NumberLong("1424891748000") }
{ "name" : "annoying-bn-text", "created_at_usec" : NumberLong("1559755505000") }

DELETE

delete from repose where language = “C++”

> db.repos.remove({language:"C++"})

ALTER

alter table repos drop column created_at

> db.repos.update({},{$unset:{created_at:""}})

alter table repos insert column foo int

> db.repos.update({}, {$set: {foo:0}}, {multi:1})

That’s all for today. Feel free to share if any of the queries can be improved. Thanks for reading.