<!DOCTYPE html> <html lang="en"><head> <meta http-equiv="content-type" content="text/html; charset=UTF-8"><script type="text/javascript" async="" src="Creating%20containers%20-%20Part%201_files/ga.js"></script><script src="Creating%20containers%20-%20Part%201_files/analytics.js" type="text/javascript"></script> <script type="text/javascript">window.addEventListener('DOMContentLoaded',function(){var v=archive_analytics.values;v.service='wb';v.server_name='wwwb-app102.us.archive.org';v.server_ms=1019;archive_analytics.send_pageview({});});</script><script type="text/javascript" src="Creating%20containers%20-%20Part%201_files/playback.js" charset="utf-8"></script> <script type="text/javascript" src="Creating%20containers%20-%20Part%201_files/wombat.js" charset="utf-8"></script> <script type="text/javascript"> if (window._WBWombatInit) { wbinfo = {} wbinfo.url = "http://crosbymichael.com:80/creating-containers-part-1.html"; wbinfo.timestamp = "20191223021405"; wbinfo.request_ts = "20191223021405"; wbinfo.prefix = "http://web.archive.org/web/"; wbinfo.mod = "if_"; wbinfo.is_framed = false; wbinfo.is_live = false; wbinfo.coll = "web"; wbinfo.proxy_magic = ""; wbinfo.static_prefix = "/_static/"; wbinfo.enable_auto_fetch = true; wbinfo.auto_fetch_worker_prefix = "http://web.archive.org/web/"; wbinfo.wombat_ts = "20191223021405"; wbinfo.wombat_sec = "1577067245"; wbinfo.wombat_scheme = "https"; wbinfo.wombat_host = "crosbymichael.com:80"; wbinfo.ignore_prefixes = ["/__wb/", "/_static/", "/web/", "http://analytics.archive.org/", "https://analytics.archive.org/", "//analytics.archive.org/", "http://archive.org/", "https://archive.org/", "//archive.org/", "http://faq.web.archive.org/", "http://web.archive.org/", "https://web.archive.org/" ]; wbinfo.wombat_opts = {}; window._WBWombatInit(wbinfo); } __wm.init("http://web.archive.org/web"); </script> <link rel="stylesheet" type="text/css" href="Creating%20containers%20-%20Part%201_files/banner-styles.css"> <link rel="stylesheet" 
type="text/css" href="Creating%20containers%20-%20Part%201_files/iconochive.css"> <!-- End Wayback Rewrite JS Include --> <meta charset="utf-8"> <title>Creating containers - Part 1</title> <meta name="viewport" content="width=device-width, initial-scale=1.0"> <meta name="description" content=""> <meta name="author" content=""> <link rel="stylesheet" href="Creating%20containers%20-%20Part%201_files/main.css" type="text/css"> <link href="Creating%20containers%20-%20Part%201_files/css.css" rel="stylesheet" type="text/css"> <link id="elemento-theme" href="Creating%20containers%20-%20Part%201_files/bootstrap.css" rel="stylesheet"> <link href="http://web.archive.org/web/20191223021405/http://crosbymichael.com/feeds/all.atom.xml" type="application/atom+xml" rel="alternate" title="Michael Crosby ATOM Feed"> <script src="Creating%20containers%20-%20Part%201_files/jquery_002.js"></script> <script> $(document).ready(function() { $('#walter').on('click', function(e) { e.stopPropagation(); window.location = 'http://web.archive.org/web/20191223021405/http://docker.io'; }); $('#scene').parallax(); }); </script> <style type="text/css"> body { padding-top: 20px; padding-bottom: 40px; } /* Custom container */ .container-narrow { margin: 0 auto; max-width: 700px; } .container-narrow > hr { margin: 30px 0; } /* Supporting marketing content */ .marketing { margin: 60px 0; } .marketing p + h4 { margin-top: 28px; } #scene { position:absolute; top: 0; right: 0; } </style> <script type="text/javascript"> var _gaq = _gaq || []; _gaq.push(['_setAccount', 'UA-21167181-1']); _gaq.push(['_trackPageview']); (function() { var ga = document.createElement('script'); ga.type = 'text/javascript'; ga.async = true; ga.src = ('https:' == document.location.protocol ? 
'http://web.archive.org/web/20191223021405/https://ssl' : 'http://web.archive.org/web/20191223021405/http://www') + '.google-analytics.com/ga.js'; var s = document.getElementsByTagName('script')[0]; s.parentNode.insertBefore(ga, s); })(); </script> <link href="Creating%20containers%20-%20Part%201_files/bootstrap-responsive.css" rel="stylesheet"> <!-- HTML5 shim, for IE6-8 support of HTML5 elements --> <!--[if lt IE 9]> <script src="http://html5shim.googlecode.com/svn/trunk/html5.js"></script> <![endif]--> <!--[if IE]> <script src="http://html5shiv.googlecode.com/svn/trunk/html5.js"></script> <script src="http://code.onion.com/fartscroll.js"></script> <script type="text/javascript"> fartscroll(50); </script> <![endif]--> <script type="text/javascript" async="" src="Creating%20containers%20-%20Part%201_files/embed.html"></script></head> <body><!-- BEGIN WAYBACK TOOLBAR INSERT --> <style type="text/css"> body { margin-top:0 !important; padding-top:0 !important; /*min-width:800px !important;*/ } </style> <div id="wm-ipp-base" style="display: block; direction: ltr;" lang="en"> </div><div id="donato" style="position:relative;width:100%;"> <div id="donato-base"> <iframe id="donato-if" src="Creating%20containers%20-%20Part%201_files/donate.html" scrolling="no" style="width:100%; height:100%" frameborder="0"> </iframe> </div> </div><script type="text/javascript"> __wm.bt(625,27,25,2,"web","http://crosbymichael.com/creating-containers-part-1.html","20191223021405",1996,"/_static/",["/_static/css/banner-styles.css?v=HyR5oymJ","/_static/css/iconochive.css?v=qtvMKcIJ"]); </script> <!-- END WAYBACK TOOLBAR INSERT --> <div class="container-narrow"> <div class="masthead"> <ul class="nav nav-pills pull-right"> <li><a href="http://web.archive.org/web/20191223021405/http://crosbymichael.com/feeds/all.atom.xml" rel="alternate">atom feed</a></li> <li><a href="http://web.archive.org/web/20191223021405/http://github.com/crosbymichael">github</a></li> </ul> <h3 class="muted"><a 
href="http://web.archive.org/web/20191223021405/http://crosbymichael.com/index.html">Michael Crosby</a></h3> </div> <ul id="scene" style="transform: translate3d(0px, 0px, 0px); transform-style: preserve-3d; backface-visibility: hidden;"> <li class="layer" data-depth="1.0" style="position: relative; display: block; height: 100%; width: 100%; left: 0px; top: 0px; transform: translate3d(-4.55115%, 2.10234%, 0px); transform-style: preserve-3d; backface-visibility: hidden;"> <img src="Creating%20containers%20-%20Part%201_files/docker-logo.png" id="walter"> </li> </ul> <hr> <ul class="nav nav-pills pull-left"> <li><a href="http://web.archive.org/web/20191223021405/http://crosbymichael.com/category/dev.html">dev</a></li> <li class="active"><a href="http://web.archive.org/web/20191223021405/http://crosbymichael.com/category/docker.html">docker</a></li> <li><a href="http://web.archive.org/web/20191223021405/http://crosbymichael.com/category/go.html">go</a></li> <li><a href="http://web.archive.org/web/20191223021405/http://crosbymichael.com/category/productivity.html">productivity</a></li> <hr> </ul> <div class="row-fluid marketing"> <div class="span12"> <section id="content" class="body"> <article> <header> <h1 class="entry-title"><a href="http://web.archive.org/web/20191223021405/http://crosbymichael.com/creating-containers-part-1.html" rel="bookmark" title="Permalink to Creating containers - Part 1">Creating containers - Part 1</a></h1> </header> <div class="entry-content"> <footer class="post-info"> <abbr class="published" title="2014-11-16T00:00:00+00:00"> Sun 16 November 2014 </abbr> <address class="vcard author"> By <a class="url fn" href="http://web.archive.org/web/20191223021405/http://crosbymichael.com/author/Michael%20Crosby.html">Michael Crosby</a> </address> <p>In <a href="http://web.archive.org/web/20191223021405/http://crosbymichael.com/category/docker.html">docker</a>. 
</p> <p></p></footer><!-- /.post-info --><!-- /.post-info --> <p>This is part one of a series of blog posts detailing how docker creates containers. We will dig deep into the various pieces that are stitched together to see what it takes to make <code>docker run ...</code> awesome.</p> <h2>First, what is a container?</h2> <p>I think the various pieces of technology that go into creating a container are fairly commonplace. You should have seen a few cool looking charts in various presentations about Docker where you get a quick "Docker uses namespaces, cgroups, chroot, etc." to create containers. But why does it take all these pieces to create a container?<br> Why is it not a simple syscall and it's all done for me? The fact is that containers don't exist, they are made up. There is no such thing as a "linux container" in the kernel. A container is a userland concept.</p> <h2>Namespaces</h2> <p>In part one I'll talk about how to create Linux namespaces in the context of how they are used within docker. In later posts we will look into how namespaces are combined with other features like cgroups and an isolated filesystem to create something useful.</p> <p>First off we need a high level explanation of what a namespace does and why it's useful. Basically, a namespace is a scoped view of your underlying Linux system. There are a few different types of namespaces implemented inside the kernel. As we dig into each of the different namespaces below you can follow along by running <code>docker run -it --privileged --net host crosbymichael/make-containers</code>. This has a few preloaded files and configuration to get you started. Even though we will be creating namespaces inside a container that docker runs for us, don't let that trip you up. I opted for this approach as providing a container preloaded with all the dependencies that you need to run the examples is why we are doing this in the first place. 
To make things a little easier, I'm using the <code>--net host</code> flag so that we are able to see your host's network interfaces within our demo container. This will be useful in the network examples. We also need to provide the <code>--privileged</code> flag so that we have the correct permissions to create new namespaces within our container.</p> <p>If you are interested in what the Dockerfile looks like then here it is:</p> <div class="highlight"><pre>FROM debian:jessie RUN apt-get update <span class="o">&amp;&amp;</span> apt-get install -y <span class="se">\</span> gcc <span class="se">\</span> vim <span class="se">\</span> emacs COPY containers/ /containers/ WORKDIR /containers CMD <span class="o">[</span><span class="s2">"bash"</span><span class="o">]</span> </pre></div> <p>I'll be doing the examples in C, as it's sometimes easier to explain the lower level details better than the abstractions that Go provides. So let's start...</p> <h3>NET Namespace</h3> <p>The network namespace provides your own view of the network stack of your system. This can include your very own <code>localhost</code>. 
Make sure you are in the <code>crosbymichael/make-containers</code> and run the command <code>ip a</code> to view all the network interfaces of your host machine.</p> <div class="highlight"><pre>> ip a root@development:/containers# ip a 1: lo: <LOOPBACK,UP,LOWER_UP> mtu <span class="m">65536</span> qdisc noqueue state UNKNOWN group default link/loopback 00:00:00:00:00:00 brd 00:00:00:00:00:00 inet 127.0.0.1/8 scope host lo valid_lft forever preferred_lft forever inet6 ::1/128 scope host valid_lft forever preferred_lft forever 2: eth0: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu <span class="m">1500</span> qdisc pfifo_fast state UP group default qlen 1000 link/ether 08:00:27:19:ca:f2 brd ff:ff:ff:ff:ff:ff inet 10.0.2.15/24 brd 10.0.2.255 scope global eth0 valid_lft forever preferred_lft forever inet6 fe80::a00:27ff:fe19:caf2/64 scope link valid_lft forever preferred_lft forever 3: eth1: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu <span class="m">1500</span> qdisc pfifo_fast state UP group default qlen 1000 link/ether 08:00:27:20:84:47 brd ff:ff:ff:ff:ff:ff inet 192.168.56.103/24 brd 192.168.56.255 scope global eth1 valid_lft forever preferred_lft forever inet6 fe80::a00:27ff:fe20:8447/64 scope link valid_lft forever preferred_lft forever 4: docker0: <NO-CARRIER,BROADCAST,MULTICAST,UP> mtu <span class="m">1500</span> qdisc noqueue state DOWN group default link/ether 56:84:7a:fe:97:99 brd ff:ff:ff:ff:ff:ff inet 172.17.42.1/16 scope global docker0 valid_lft forever preferred_lft forever inet6 fe80::5484:7aff:fefe:9799/64 scope link valid_lft forever preferred_lft forever </pre></div> <p>Ok cool, so this is all the network interfaces currently on <strong>my</strong> host system. Yours may look a little different but you get the idea. Now let's write some code to create a new network namespace. For this we will write a skeleton of a small C binary that uses the <code>clone</code> syscall. 
We will start by using clone to run binaries that are already installed inside our demo container. The file <code>skeleton.c</code> should be in the working directory of the demo container. We will use this file as the basis of all our examples. Here is the code incase you don't feel like running the container right now.</p> <div class="highlight"><pre><span class="cp">#define _GNU_SOURCE</span> <span class="cp">#include <stdio.h></span> <span class="cp">#include <stdlib.h></span> <span class="cp">#include <sched.h></span> <span class="cp">#include <sys/wait.h></span> <span class="cp">#include <errno.h></span> <span class="cp">#define STACKSIZE (1024*1024)</span> <span class="k">static</span> <span class="kt">char</span> <span class="n">child_stack</span><span class="p">[</span><span class="n">STACKSIZE</span><span class="p">];</span> <span class="k">struct</span> <span class="n">clone_args</span> <span class="p">{</span> <span class="kt">char</span> <span class="o">**</span><span class="n">argv</span><span class="p">;</span> <span class="p">};</span> <span class="c1">// child_exec is the func that will be executed as the result of clone</span> <span class="k">static</span> <span class="kt">int</span> <span class="nf">child_exec</span><span class="p">(</span><span class="kt">void</span> <span class="o">*</span><span class="n">stuff</span><span class="p">)</span> <span class="p">{</span> <span class="k">struct</span> <span class="n">clone_args</span> <span class="o">*</span><span class="n">args</span> <span class="o">=</span> <span class="p">(</span><span class="k">struct</span> <span class="n">clone_args</span> <span class="o">*</span><span class="p">)</span><span class="n">stuff</span><span class="p">;</span> <span class="k">if</span> <span class="p">(</span><span class="n">execvp</span><span class="p">(</span><span class="n">args</span><span class="o">-></span><span class="n">argv</span><span class="p">[</span><span class="mi">0</span><span class="p">],</span> 
<span class="n">args</span><span class="o">-></span><span class="n">argv</span><span class="p">)</span> <span class="o">!=</span> <span class="mi">0</span><span class="p">)</span> <span class="p">{</span> <span class="n">fprintf</span><span class="p">(</span><span class="n">stderr</span><span class="p">,</span> <span class="s">"failed to execvp argments %s</span><span class="se">\n</span><span class="s">"</span><span class="p">,</span> <span class="n">strerror</span><span class="p">(</span><span class="n">errno</span><span class="p">));</span> <span class="n">exit</span><span class="p">(</span><span class="o">-</span><span class="mi">1</span><span class="p">);</span> <span class="p">}</span> <span class="c1">// we should never reach here!</span> <span class="n">exit</span><span class="p">(</span><span class="n">EXIT_FAILURE</span><span class="p">);</span> <span class="p">}</span> <span class="kt">int</span> <span class="nf">main</span><span class="p">(</span><span class="kt">int</span> <span class="n">argc</span><span class="p">,</span> <span class="kt">char</span> <span class="o">**</span><span class="n">argv</span><span class="p">)</span> <span class="p">{</span> <span class="k">struct</span> <span class="n">clone_args</span> <span class="n">args</span><span class="p">;</span> <span class="n">args</span><span class="p">.</span><span class="n">argv</span> <span class="o">=</span> <span class="o">&</span><span class="n">argv</span><span class="p">[</span><span class="mi">1</span><span class="p">];</span> <span class="kt">int</span> <span class="n">clone_flags</span> <span class="o">=</span> <span class="n">SIGCHLD</span><span class="p">;</span> <span class="c1">// the result of this call is that our child_exec will be run in another</span> <span class="c1">// process returning it's pid</span> <span class="kt">pid_t</span> <span class="n">pid</span> <span class="o">=</span> <span class="n">clone</span><span class="p">(</span><span class="n">child_exec</span><span 
class="p">,</span> <span class="n">child_stack</span> <span class="o">+</span> <span class="n">STACKSIZE</span><span class="p">,</span> <span class="n">clone_flags</span><span class="p">,</span> <span class="o">&</span><span class="n">args</span><span class="p">);</span> <span class="k">if</span> <span class="p">(</span><span class="n">pid</span> <span class="o"><</span> <span class="mi">0</span><span class="p">)</span> <span class="p">{</span> <span class="n">fprintf</span><span class="p">(</span><span class="n">stderr</span><span class="p">,</span> <span class="s">"clone failed WTF!!!! %s</span><span class="se">\n</span><span class="s">"</span><span class="p">,</span> <span class="n">strerror</span><span class="p">(</span><span class="n">errno</span><span class="p">));</span> <span class="n">exit</span><span class="p">(</span><span class="n">EXIT_FAILURE</span><span class="p">);</span> <span class="p">}</span> <span class="c1">// lets wait on our child process here before we, the parent, exits</span> <span class="k">if</span> <span class="p">(</span><span class="n">waitpid</span><span class="p">(</span><span class="n">pid</span><span class="p">,</span> <span class="nb">NULL</span><span class="p">,</span> <span class="mi">0</span><span class="p">)</span> <span class="o">==</span> <span class="o">-</span><span class="mi">1</span><span class="p">)</span> <span class="p">{</span> <span class="n">fprintf</span><span class="p">(</span><span class="n">stderr</span><span class="p">,</span> <span class="s">"failed to wait pid %d</span><span class="se">\n</span><span class="s">"</span><span class="p">,</span> <span class="n">pid</span><span class="p">);</span> <span class="n">exit</span><span class="p">(</span><span class="n">EXIT_FAILURE</span><span class="p">);</span> <span class="p">}</span> <span class="n">exit</span><span class="p">(</span><span class="n">EXIT_SUCCESS</span><span class="p">);</span> <span class="p">}</span> </pre></div> <p>This is a small C binary 
that will allow you to run processes like <code>./a.out ip a</code>. It uses the arguments that you pass on the cli as the arguments to whatever process you want to use. Don't worry about the specific implementation too much as it's the changes we will be making that are the interesting aspects. Remember, this will execute the binary and arguments of whatever program you want, this means if you want to run one of these demos below and have it spawn a shell session so that you can poke around in your new namespace then go ahead. It is a great way to explore and inspect these different namespaces at your own pace. So to get started let's make a copy of this file to start working with the network namespace.</p> <div class="highlight"><pre>> cp skeleton.c network.c </pre></div> <p>Ok, within this file there is a very special var called <code>clone_flags</code>. This is where most of our changes will happen throughout this post. Namespaces are primarily controlled via the clone flags. The clone flag for the network namespace is <code>CLONE_NEWNET</code>. We need to change the line in the file <code>int clone_flags = SIGCHLD;</code> to <code>int clone_flags = CLONE_NEWNET | SIGCHLD;</code> so that the call to <code>clone</code> creates a new network namespace for our process. Make this change in <code>network.c</code> then compile and run.</p> <div class="highlight"><pre>> gcc -o net network.c > ./net ip a 1: lo: <LOOPBACK> mtu <span class="m">65536</span> qdisc noop state DOWN group default link/loopback 00:00:00:00:00:00 brd 00:00:00:00:00:00 </pre></div> <p>The result of this run now looks very different from the first time we ran the <code>ip a</code> command. We only see a <code>loopback</code> interface in the output. This is because our process that was created only has a view of its network namespace and not of the host. And that's it. That is how you create a new network namespace. </p> <p>Right now this is pretty useless as you don't have any usable interfaces. 
Docker uses this new network namespace to set up a <code>veth</code> interface so that your container has its own IP address allocated on a bridge, usually <code>docker0</code>. We won't go down the path of how to set up interfaces in namespaces at this time. We can save that for another post.</p> <p>So now that we know how a network namespace is created, let's look at the mount namespace.</p> <h3>MNT Namespace</h3> <p>The mount namespace gives you a scoped view of the mounts on your system. It's often confused with jailing a process inside a <code>chroot</code> or similar. This is not true! The mount namespace does not equal a filesystem jail. So the next time you hear someone say that a container uses the mount namespace to "jail" the process inside its own root filesystem you can call bullshit because they don't know what they are talking about. Do it, it's fun :)</p> <p>Let's start by making a copy of <code>skeleton.c</code> again for our mount related changes. We can do a quick build and run to see what our current mount points look like with the <code>mount</code> command.</p> <div class="highlight"><pre>> cp skeleton.c mount.c > gcc -o mount mount.c > ./mount mount proc on /proc <span class="nb">type </span>proc <span class="o">(</span>rw,nosuid,nodev,noexec,relatime<span class="o">)</span> tmpfs on /dev <span class="nb">type </span>tmpfs <span class="o">(</span>rw,nosuid,mode<span class="o">=</span>755<span class="o">)</span> shm on /dev/shm <span class="nb">type </span>tmpfs <span class="o">(</span>rw,nosuid,nodev,noexec,relatime,size<span class="o">=</span>65536k<span class="o">)</span> mqueue on /dev/mqueue <span class="nb">type </span>mqueue <span class="o">(</span>rw,nosuid,nodev,noexec,relatime<span class="o">)</span> devpts on /dev/pts <span class="nb">type </span>devpts <span class="o">(</span>rw,nosuid,noexec,relatime,gid<span class="o">=</span>5,mode<span class="o">=</span>620,ptmxmode<span class="o">=</span>666<span class="o">)</span> sysfs on 
/sys <span class="nb">type </span>sysfs <span class="o">(</span>rw,nosuid,nodev,noexec,relatime<span class="o">)</span> /dev/disk/by-uuid/d3aa2880-c290-4586-9da6-2f526e381f41 on /etc/resolv.conf <span class="nb">type </span>ext4 <span class="o">(</span>rw,relatime,errors<span class="o">=</span>remount-ro,data<span class="o">=</span>ordered<span class="o">)</span> /dev/disk/by-uuid/d3aa2880-c290-4586-9da6-2f526e381f41 on /etc/hostname <span class="nb">type </span>ext4 <span class="o">(</span>rw,relatime,errors<span class="o">=</span>remount-ro,data<span class="o">=</span>ordered<span class="o">)</span> /dev/disk/by-uuid/d3aa2880-c290-4586-9da6-2f526e381f41 on /etc/hosts <span class="nb">type </span>ext4 <span class="o">(</span>rw,relatime,errors<span class="o">=</span>remount-ro,data<span class="o">=</span>ordered<span class="o">)</span> devpts on /dev/console <span class="nb">type </span>devpts <span class="o">(</span>rw,nosuid,noexec,relatime,gid<span class="o">=</span>5,mode<span class="o">=</span>620,ptmxmode<span class="o">=</span>000<span class="o">)</span> </pre></div> <p>This is what the mount points look like from within my demo container; yours may look different. In order to create a new mount namespace we use the flag <code>CLONE_NEWNS</code>. You may notice something may look weird here. Why is this flag not <code>CLONE_NEWMOUNT</code> or <code>CLONE_NEWMNT</code>? This is because the mount namespace was the first Linux namespace introduced and the name was just an oversight. If you write code you will understand that as you start building a feature or an application, you don't often have a full picture of the end result. Anyway, let's add <code>CLONE_NEWNS</code> to our <code>clone_flags</code> variable. 
It should look something like <code>int clone_flags = CLONE_NEWNS | SIGCHLD;</code>.</p> <p>Let's go ahead and build <code>mount.c</code> again and run the same command.</p> <div class="highlight"><pre>> gcc -o mount mount.c > ./mount mount </pre></div> <p>Nothing changed. Whaaat??? This is because the process that we run inside the new mount namespace still has a view on <code>/proc</code> of the underlying system. The result is that the new process sort of <em>inherits</em> a view on the underlying mounts. There are a few ways that we can prevent this, like using <code>pivot_root</code>, but we will leave that for an additional post on filesystem jails and how <code>chroot</code> / <code>pivot_root</code> interact with the container's mount namespace.</p> <p>However, one way that we can try out our new mount namespace is to, well, mount something. Let's create a new <code>tmpfs</code> mount in <code>/mytmp</code> for this demo. We will do this mount in C and continue to run our same <code>mount</code> command as the args to our mount binary. In order to do a mount <strong>inside</strong> of our mount namespace we need to add the code to the <code>child_exec</code> function, before the call to <code>execvp</code>. The code within the <code>child_exec</code> function is run inside the newly created process, i.e., inside our new namespace. 
The code in <code>child_exec</code> should look like this:</p> <div class="highlight"><pre><span class="c1">// child_exec is the func that will be executed as the result of clone</span> <span class="k">static</span> <span class="kt">int</span> <span class="nf">child_exec</span><span class="p">(</span><span class="kt">void</span> <span class="o">*</span><span class="n">stuff</span><span class="p">)</span> <span class="p">{</span> <span class="k">struct</span> <span class="n">clone_args</span> <span class="o">*</span><span class="n">args</span> <span class="o">=</span> <span class="p">(</span><span class="k">struct</span> <span class="n">clone_args</span> <span class="o">*</span><span class="p">)</span><span class="n">stuff</span><span class="p">;</span> <span class="k">if</span> <span class="p">(</span><span class="n">mount</span><span class="p">(</span><span class="s">"none"</span><span class="p">,</span> <span class="s">"/mytmp"</span><span class="p">,</span> <span class="s">"tmpfs"</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span> <span class="s">""</span><span class="p">)</span> <span class="o">!=</span> <span class="mi">0</span><span class="p">)</span> <span class="p">{</span> <span class="n">fprintf</span><span class="p">(</span><span class="n">stderr</span><span class="p">,</span> <span class="s">"failed to mount tmpfs %s</span><span class="se">\n</span><span class="s">"</span><span class="p">,</span> <span class="n">strerror</span><span class="p">(</span><span class="n">errno</span><span class="p">));</span> <span class="n">exit</span><span class="p">(</span><span class="o">-</span><span class="mi">1</span><span class="p">);</span> <span class="p">}</span> <span class="k">if</span> <span class="p">(</span><span class="n">execvp</span><span class="p">(</span><span class="n">args</span><span class="o">-></span><span class="n">argv</span><span class="p">[</span><span class="mi">0</span><span class="p">],</span> <span 
class="n">args</span><span class="o">-></span><span class="n">argv</span><span class="p">)</span> <span class="o">!=</span> <span class="mi">0</span><span class="p">)</span> <span class="p">{</span> <span class="n">fprintf</span><span class="p">(</span><span class="n">stderr</span><span class="p">,</span> <span class="s">"failed to execvp argments %s</span><span class="se">\n</span><span class="s">"</span><span class="p">,</span> <span class="n">strerror</span><span class="p">(</span><span class="n">errno</span><span class="p">));</span> <span class="n">exit</span><span class="p">(</span><span class="o">-</span><span class="mi">1</span><span class="p">);</span> <span class="p">}</span> <span class="c1">// we should never reach here!</span> <span class="n">exit</span><span class="p">(</span><span class="n">EXIT_FAILURE</span><span class="p">);</span> <span class="p">}</span> </pre></div> <p>We need to first create a directory <code>/mytmp</code> before we compile and run with the changes above.</p> <div class="highlight"><pre>> mkdir /mytmp > gcc -o mount mount.c > ./mount mount <span class="c"># cutting out the common output...</span> none on /mytmp <span class="nb">type </span>tmpfs <span class="o">(</span>rw,relatime<span class="o">)</span> </pre></div> <p>I cut out the common output above from the first time I ran <code>mount</code>.<br> The result is that you should see a new mount point for our <code>tmpfs</code> mount. Nice! Go ahead and run <code>mount</code> in the current shell just for comparison.<br> Notice how the <code>tmpfs</code> mount is not displayed? That is because we created the mount inside our own mount namespace, not in the parent's namespace. </p> <p>Remember how I said that the mount namespace does not equal a filesystem jail? Go ahead and run our <code>./mount</code> binary with the <code>ls</code> command. Everything is there. 
Now you have proof!</p> <h3>UTS Namespace</h3> <p>The next namespace is the UTS namespace that is responsible for system identification. This includes the <code>hostname</code> and <code>domainname</code>. It allows a container to have its own hostname, independent of the host system and of other containers. Let's start by making a copy of <code>skeleton.c</code> and running the <code>hostname</code> command with it.</p> <div class="highlight"><pre>> cp skeleton.c uts.c > gcc -o uts uts.c > ./uts hostname development </pre></div> <p>This should display your system's hostname (<code>development</code> in my case). Like earlier, let's add the clone flag for the UTS namespace to the <code>clone_flags</code> variable. The flag should be <code>CLONE_NEWUTS</code>. If you compile and run then you should see the exact same output. This is totally fine. The values in the UTS namespace are inherited from the <em>parent</em>. However, within this new namespace, we can change the hostname without it affecting the <em>parent</em> or other containers that have a separate UTS namespace.</p> <p>Let's modify the hostname in the <code>child_exec</code> function. To do that, you will need to add the <code>#include &lt;unistd.h&gt;</code> header to gain access to the <code>sethostname</code> function, as well as the <code>#include &lt;string.h&gt;</code> header to use <code>strlen</code> needed by <code>sethostname</code>. 
The new body of the <code>child_exec</code> function should look like the following:</p> <div class="highlight"><pre><span class="c1">// child_exec is the func that will be executed as the result of clone</span> <span class="k">static</span> <span class="kt">int</span> <span class="nf">child_exec</span><span class="p">(</span><span class="kt">void</span> <span class="o">*</span><span class="n">stuff</span><span class="p">)</span> <span class="p">{</span> <span class="k">struct</span> <span class="n">clone_args</span> <span class="o">*</span><span class="n">args</span> <span class="o">=</span> <span class="p">(</span><span class="k">struct</span> <span class="n">clone_args</span> <span class="o">*</span><span class="p">)</span><span class="n">stuff</span><span class="p">;</span> <span class="k">const</span> <span class="kt">char</span> <span class="o">*</span> <span class="n">new_hostname</span> <span class="o">=</span> <span class="s">"myhostname"</span><span class="p">;</span> <span class="k">if</span> <span class="p">(</span><span class="n">sethostname</span><span class="p">(</span><span class="n">new_hostname</span><span class="p">,</span> <span class="n">strlen</span><span class="p">(</span><span class="n">new_hostname</span><span class="p">))</span> <span class="o">!=</span> <span class="mi">0</span><span class="p">)</span> <span class="p">{</span> <span class="n">fprintf</span><span class="p">(</span><span class="n">stderr</span><span class="p">,</span> <span class="s">"failed to execvp argments %s</span><span class="se">\n</span><span class="s">"</span><span class="p">,</span> <span class="n">strerror</span><span class="p">(</span><span class="n">errno</span><span class="p">));</span> <span class="n">exit</span><span class="p">(</span><span class="o">-</span><span class="mi">1</span><span class="p">);</span> <span class="p">}</span> <span class="k">if</span> <span class="p">(</span><span class="n">execvp</span><span class="p">(</span><span 
class="n">args</span><span class="o">-></span><span class="n">argv</span><span class="p">[</span><span class="mi">0</span><span class="p">],</span> <span class="n">args</span><span class="o">-></span><span class="n">argv</span><span class="p">)</span> <span class="o">!=</span> <span class="mi">0</span><span class="p">)</span> <span class="p">{</span> <span class="n">fprintf</span><span class="p">(</span><span class="n">stderr</span><span class="p">,</span> <span class="s">"failed to execvp argments %s</span><span class="se">\n</span><span class="s">"</span><span class="p">,</span> <span class="n">strerror</span><span class="p">(</span><span class="n">errno</span><span class="p">));</span> <span class="n">exit</span><span class="p">(</span><span class="o">-</span><span class="mi">1</span><span class="p">);</span> <span class="p">}</span> <span class="c1">// we should never reach here!</span> <span class="n">exit</span><span class="p">(</span><span class="n">EXIT_FAILURE</span><span class="p">);</span> <span class="p">}</span> </pre></div> <p>Ensure that <code>clone_flags</code> within your main look like <code>int clone_flags = CLONE_NEWUTS | SIGCHLD;</code> then compile and run the binary with the same args. You should now see the value that we set returned from the <code>hostname</code> command. To verify that this change did not affect our current shell go ahead and run <code>hostname</code> and make sure that you have your original value back.</p> <div class="highlight"><pre>> gcc -o uts uts.c > ./uts hostname myhostname > hostname development </pre></div> <p>Awesome! We are doing good.</p> <h3>IPC Namespace</h3> <p>The IPC namespace is used for isolating interprocess communication, things like SysV message queues. 
Let's make a copy of <code>skeleton.c</code> for this namespace.</p> <div class="highlight"><pre>> cp skeleton.c ipc.c </pre></div> <p>The way we are going to test the IPC namespace is by creating a message queue on the host, and ensuring that we cannot see it when we spawn a new process inside its own IPC namespace. Let's first create a message queue in our current shell then compile and run our copy of the skeleton code to view the queue.</p> <div class="highlight"><pre>> ipcmk -Q Message queue id: 65536 > gcc -o ipc ipc.c > ./ipc ipcs -q ------ Message Queues -------- key msqid owner perms used-bytes message 0xfe7f09d1 <span class="m">65536</span> root <span class="m">644</span> <span class="m">0</span> 0 </pre></div> <p>Without a new IPC namespace you can see the same message queue that was created. Now let's add the <code>CLONE_NEWIPC</code> flag to our <code>clone_flags</code> var to create a new IPC namespace for our process. The <code>clone_flags</code> var should look like <code>int clone_flags = CLONE_NEWIPC | SIGCHLD;</code>. Recompile and run the same command again:</p> <div class="highlight"><pre>> gcc -o ipc ipc.c > ./ipc ipcs -q ------ Message Queues -------- key msqid owner perms used-bytes message </pre></div> <p>Done! The child process is now in a new IPC namespace and has a completely separate view and access to message queues.</p> <h3>PID Namespace</h3> <p>This one is fun. The PID namespace is a way to carve up the PIDs that one process can view and interact with. When we create a new PID namespace the first process will get to be the loved PID 1. If this process exits the kernel kills everyone else within the namespace. Let's start by making a copy of <code>skeleton.c</code> for our changes.</p> <div class="highlight"><pre>> cp skeleton.c pid.c </pre></div> <p>To create a new PID namespace, we will have to set the <code>clone_flags</code> with <code>CLONE_NEWPID</code>. 
The variable should look like <code>int clone_flags = CLONE_NEWPID | SIGCHLD;</code>. Let's test by running <code>ps aux</code> in our shell and then compile and run our <code>pid.c</code> binary with the same arguments.</p> <div class="highlight"><pre>> ps aux USER PID %CPU %MEM VSZ RSS TTY STAT START TIME COMMAND root <span class="m">1</span> 0.0 0.1 <span class="m">20332</span> <span class="m">3388</span> ? Ss 21:50 0:00 bash root <span class="m">147</span> 0.0 0.1 <span class="m">17492</span> <span class="m">2088</span> ? R+ 22:49 0:00 ps aux > gcc -o pid pid.c > ./pid ps aux USER PID %CPU %MEM VSZ RSS TTY STAT START TIME COMMAND root <span class="m">1</span> 0.0 0.1 <span class="m">20332</span> <span class="m">3388</span> ? Ss 21:50 0:00 bash root <span class="m">153</span> 0.0 0.0 <span class="m">5092</span> <span class="m">728</span> ? S+ 22:50 0:00 ./pid ps aux root <span class="m">154</span> 0.0 0.1 <span class="m">17492</span> <span class="m">2064</span> ? R+ 22:50 0:00 ps aux </pre></div> <p>WTF??? We expected <code>ps aux</code> to be PID 1 or at least not see any other pids from the parent. Why is that? <strong>proc</strong>. The process that we spawned still has a view of <code>/proc</code> from the parent, i.e. <code>/proc</code> mounted on the host system. So how do we fix this? How do we ensure that our new process can only view pids within its namespace? We can start by remounting <code>/proc</code>.<br> Because we will be dealing with mounts, we can take the opportunity to take what we learned from the MNT namespace and combine it with our PID namespace so that we don't mess with the <code>/proc</code> of our host system.</p> <p>We can start by including the clone flag for the mount namespace alongside the clone flag for pid. It should look something like <code>int clone_flags = CLONE_NEWPID | CLONE_NEWNS | SIGCHLD;</code>. We need to edit the <code>child_exec</code> function and remount proc. 
This will be a simple <code>umount</code> and <code>mount</code> syscall for the proc filesystem. Because we are creating a new mount namespace we know that this will not mess up our host system. The result should look like this:</p> <div class="highlight"><pre>// child_exec is the func that will be executed as the result of clone static int child_exec<span class="o">(</span>void *stuff<span class="o">)</span> <span class="o">{</span> struct clone_args *args <span class="o">=</span> <span class="o">(</span>struct clone_args *<span class="o">)</span>stuff<span class="p">;</span> <span class="k">if</span> <span class="o">(</span>umount<span class="o">(</span><span class="s2">"/proc"</span>, 0<span class="o">)</span> !<span class="o">=</span> 0<span class="o">)</span> <span class="o">{</span> fprintf<span class="o">(</span>stderr, <span class="s2">"failed unmount /proc %s\n"</span>, strerror<span class="o">(</span>errno<span class="o">))</span><span class="p">;</span> <span class="nb">exit</span><span class="o">(</span>-1<span class="o">)</span><span class="p">;</span> <span class="o">}</span> <span class="k">if</span> <span class="o">(</span>mount<span class="o">(</span><span class="s2">"proc"</span>, <span class="s2">"/proc"</span>, <span class="s2">"proc"</span>, 0, <span class="s2">""</span><span class="o">)</span> !<span class="o">=</span> 0<span class="o">)</span> <span class="o">{</span> fprintf<span class="o">(</span>stderr, <span class="s2">"failed mount /proc %s\n"</span>, strerror<span class="o">(</span>errno<span class="o">))</span><span class="p">;</span> <span class="nb">exit</span><span class="o">(</span>-1<span class="o">)</span><span class="p">;</span> <span class="o">}</span> <span class="k">if</span> <span class="o">(</span>execvp<span class="o">(</span>args->argv<span class="o">[</span>0<span class="o">]</span>, args->argv<span class="o">)</span> !<span class="o">=</span> 0<span class="o">)</span> <span class="o">{</span> fprintf<span 
class="o">(</span>stderr, <span class="s2">"failed to execvp argments %s\n"</span>, strerror<span class="o">(</span>errno<span class="o">))</span><span class="p">;</span> <span class="nb">exit</span><span class="o">(</span>-1<span class="o">)</span><span class="p">;</span> <span class="o">}</span> // we should never reach here! <span class="nb">exit</span><span class="o">(</span>EXIT_FAILURE<span class="o">)</span><span class="p">;</span> <span class="o">}</span> </pre></div> <p>Build and run this again to see what happens.</p> <div class="highlight"><pre>> gcc -o pid pid.c > ./pid ps aux USER PID %CPU %MEM VSZ RSS TTY STAT START TIME COMMAND root <span class="m">1</span> 0.0 0.0 <span class="m">9076</span> <span class="m">784</span> ? R+ 23:05 0:00 ps aux </pre></div> <p>Perfect! Our new PID namespace is now fully operational with the help of the mount namespace!</p> <h3>USER Namespace</h3> <p>The last namespace is the user namespace. This namespace is the new kid on the block and allows you to have users within this namespace that are not the same as the users outside of the namespace. This is accomplished via GID and UID mappings. </p> <p>This one has a simple demo application without specifying a mapping, even though it's a totally useless demo. If we add the flag <code>CLONE_NEWUSER</code> to our <code>clone_flags</code> then run something like <code>id</code> or <code>ls -la</code> you will notice that we get <code>nobody</code> within the user namespace. This is because the current user is undefined right now.</p> <div class="highlight"><pre>> cp skeleton.c user.c <span class="c"># add the clone flag</span> > gcc -o user user.c > ./user ls -la total 84 drwxr-xr-x <span class="m">1</span> nobody nogroup <span class="m">4096</span> Nov <span class="m">16</span> 23:10 . drwxr-xr-x <span class="m">1</span> nobody nogroup <span class="m">4096</span> Nov <span class="m">16</span> 22:17 .. 
-rwxr-xr-x <span class="m">1</span> nobody nogroup <span class="m">8336</span> Nov <span class="m">16</span> 22:15 mount -rw-r--r-- <span class="m">1</span> nobody nogroup <span class="m">1577</span> Nov <span class="m">16</span> 22:15 mount.c -rwxr-xr-x <span class="m">1</span> nobody nogroup <span class="m">8064</span> Nov <span class="m">16</span> 21:52 net -rw-r--r-- <span class="m">1</span> nobody nogroup <span class="m">1441</span> Nov <span class="m">16</span> 21:52 network.c -rwxr-xr-x <span class="m">1</span> nobody nogroup <span class="m">8544</span> Nov <span class="m">16</span> 23:05 pid -rw-r--r-- <span class="m">1</span> nobody nogroup <span class="m">1772</span> Nov <span class="m">16</span> 23:02 pid.c -rw-r--r-- <span class="m">1</span> nobody nogroup <span class="m">1426</span> Nov <span class="m">16</span> 21:59 skeleton.c -rwxr-xr-x <span class="m">1</span> nobody nogroup <span class="m">8056</span> Nov <span class="m">16</span> 23:10 user -rw-r--r-- <span class="m">1</span> nobody nogroup <span class="m">1442</span> Nov <span class="m">16</span> 23:10 user.c -rwxr-xr-x <span class="m">1</span> nobody nogroup <span class="m">8408</span> Nov <span class="m">16</span> 22:40 uts -rw-r--r-- <span class="m">1</span> nobody nogroup <span class="m">1694</span> Nov <span class="m">16</span> 22:36 uts.c </pre></div> <p>This is a very simple example of the user namespace but you can go much deeper with it. We will save this for another post but the idea and hopes for the user namespace is that this will allow us to run as "root" within the container but not as "root" on the host system. Don't forget you can always change <code>ls -la</code> to <code>bash</code> and have a shell inside the namespace to poke around and learn more.</p> <h3>In the end...</h3> <p>So to recap we went over the mount, network, user, PID, UTS, and IPC Linux namespaces. 
The majority of the code that we changed was not much, just adding a flag most of the time.<br> The "hard work" is mostly managing the interactions between the various kernel subsystems in order to meet our requirements. Like most of the descriptions before this, namespaces are just one of the tools that we use to make a container. I hope the PID example is a glimpse of how we use multiple namespaces together in order to isolate and begin the creation of a container.</p> <p>In future posts we will go into detail on how we jail the container's processes inside a root filesystem, aka a docker image, as well as using cgroups and Linux capabilities. By the end we should be able to pull all these things together to create a container.</p> <p>Also a thanks to tibor and everyone that helps review my brain dump of a first draft ;)</p> </div><!-- /.entry-content --> </article> <div id="disqus_thread"></div> <script type="text/javascript"> /* * * CONFIGURATION VARIABLES: EDIT BEFORE PASTING INTO YOUR WEBPAGE * * */ var disqus_shortname = 'crosbymichael'; // required: replace example with your forum shortname /* * * DON'T EDIT BELOW THIS LINE * * */ (function() { var dsq = document.createElement('script'); dsq.type = 'text/javascript'; dsq.async = true; dsq.src = '//' + disqus_shortname + '.disqus.com/embed.js'; (document.getElementsByTagName('head')[0] || document.getElementsByTagName('body')[0]).appendChild(dsq); })(); </script> <noscript>Please enable JavaScript to view the <a href="http://web.archive.org/web/20191223021405/http://disqus.com/?ref_noscript">comments powered by Disqus.</a></noscript> <a href="http://web.archive.org/web/20191223021405/http://disqus.com/" class="dsq-brlink">comments powered by <span class="logo-disqus">Disqus</span></a> </section> </div> </div> <hr> <div class="footer"> <p>© 2014 Michael Crosby</p> </div> </div> <script src="Creating%20containers%20-%20Part%201_files/bootstrap.js"></script> <script 
src="Creating%20containers%20-%20Part%201_files/jquery.js"></script> </body></html> <!-- FILE ARCHIVED ON 02:14:05 Dec 23, 2019 AND RETRIEVED FROM THE INTERNET ARCHIVE ON 21:09:12 Jun 24, 2020. JAVASCRIPT APPENDED BY WAYBACK MACHINE, COPYRIGHT INTERNET ARCHIVE. ALL OTHER CONTENT MAY ALSO BE PROTECTED BY COPYRIGHT (17 U.S.C. SECTION 108(a)(3)). --> <!-- playback timings (ms): PetaboxLoader3.resolve: 260.51 (4) exclusion.robots.policy: 0.342 load_resource: 714.677 esindex: 0.007 exclusion.robots: 0.353 captures_list: 233.362 LoadShardBlock: 205.234 (3) CDXLines.iter: 14.759 (3) PetaboxLoader3.datanode: 585.283 (4) RedisCDXSource: 7.714 -->