Source file
src/syscall/exec_linux.go
1
2
3
4
5
6
7 package syscall
8
9 import (
10 "internal/itoa"
11 "runtime"
12 "unsafe"
13 )
14
15
16
// Flag bits for the clone, clone3 and unshare system calls. Every flag is a
// single bit; the values are the same on all architectures and match the
// CLONE_* definitions of the Linux kernel (see clone(2) for full semantics).
const (
	CLONE_VM             = 1 << 8  // share the address space with the parent
	CLONE_FS             = 1 << 9  // share filesystem information (root, cwd, umask)
	CLONE_FILES          = 1 << 10 // share the file descriptor table
	CLONE_SIGHAND        = 1 << 11 // share the table of signal handlers
	CLONE_PIDFD          = 1 << 12 // return a pidfd for the child
	CLONE_PTRACE         = 1 << 13 // let tracing continue on the child
	CLONE_VFORK          = 1 << 14 // suspend the parent until the child execs or exits
	CLONE_PARENT         = 1 << 15 // child gets the same parent as the caller
	CLONE_THREAD         = 1 << 16 // place the child in the caller's thread group
	CLONE_NEWNS          = 1 << 17 // new mount namespace
	CLONE_SYSVSEM        = 1 << 18 // share System V semaphore undo values
	CLONE_SETTLS         = 1 << 19 // set up new thread-local storage for the child
	CLONE_PARENT_SETTID  = 1 << 20 // store the child TID in the parent's memory
	CLONE_CHILD_CLEARTID = 1 << 21 // clear the child TID in the child's memory on exit
	CLONE_DETACHED       = 1 << 22 // historical flag, ignored by the kernel
	CLONE_UNTRACED       = 1 << 23 // tracer cannot force CLONE_PTRACE on the child
	CLONE_CHILD_SETTID   = 1 << 24 // store the child TID in the child's memory
	CLONE_NEWCGROUP      = 1 << 25 // new cgroup namespace
	CLONE_NEWUTS         = 1 << 26 // new UTS namespace
	CLONE_NEWIPC         = 1 << 27 // new IPC namespace
	CLONE_NEWUSER        = 1 << 28 // new user namespace
	CLONE_NEWPID         = 1 << 29 // new PID namespace
	CLONE_NEWNET         = 1 << 30 // new network namespace
	CLONE_IO             = 1 << 31 // share the I/O context

	// Flags that do not fit in 32 bits; they can only be passed through the
	// 64-bit flags field of the clone3 argument struct.
	CLONE_CLEAR_SIGHAND = 1 << 32 // reset all signal handlers to their defaults
	CLONE_INTO_CGROUP   = 1 << 33 // place the child into the cgroup given by a descriptor

	// This bit lies in the low byte of the flags word, which the plain
	// clone call uses for the exit signal, so it is usable with unshare
	// and clone3 only.
	CLONE_NEWTIME = 1 << 7 // new time namespace
)
53
54 // SysProcIDMap is one ID-mapping entry. formatIDMappings renders it as the
55 // line "ContainerID HostID Size" that is written to a uid_map or gid_map file.
56 type SysProcIDMap struct {
57 ContainerID int // First field of the rendered mapping line.
58 HostID int // Second field of the rendered mapping line.
59 Size int // Third field of the rendered mapping line.
60 }
61
62 type SysProcAttr struct {
63 Chroot string // Chroot directory. NOTE(review): not read by the code visible here; presumably the caller converts it into the chroot argument — confirm.
64 Credential *Credential // If non-nil, the child sets its supplementary groups, gid and uid from it.
65 // Ptrace makes the child call ptrace(PTRACE_TRACEME) right before exec.
66
67
68 Ptrace bool
69 Setsid bool // Create a new session (setsid) in the child.
70 // Setpgid makes the child call setpgid(0, Pgid); a Pgid of 0 is passed
71 // through unchanged.
72 Setpgid bool
73 // Setctty makes Ctty the child's controlling terminal (TIOCSCTTY).
74 // The ioctl is issued after the descriptors from ProcAttr.Files have been
75 // installed, so Ctty is interpreted as a descriptor number in the child.
76
77 Setctty bool
78 Noctty bool // Issue TIOCNOTTY on fd 0 in the child.
79 Ctty int // Terminal descriptor used by Setctty and Foreground.
80 // Foreground makes the child's process group the foreground group of
81 // the terminal Ctty (TIOCSPGRP). It also triggers the setpgid call.
82 // This ioctl is issued before the descriptors are rearranged, so in this
83 // case Ctty is interpreted as a descriptor number in the parent.
84
85 Foreground bool
86 Pgid int // Process group ID used by Setpgid and Foreground.
87 // Pdeathsig, if non-zero, is installed in the child with
88 // prctl(PR_SET_PDEATHSIG).
89
90
91 Pdeathsig Signal
92 Cloneflags uintptr // Extra flags for the clone/clone3 call.
93 Unshareflags uintptr // Flags for the unshare call made in the child; 0 means no unshare.
94 UidMappings []SysProcIDMap // If non-nil, entries written to the child's uid_map.
95 GidMappings []SysProcIDMap // If non-nil, entries written to the child's gid_map.
96 // GidMappingsEnableSetgroups selects what is written to the child's
97 // setgroups file before gid_map: "allow" if true, "deny" if false.
98 // It is only consulted when GidMappings is non-nil.
99
100 GidMappingsEnableSetgroups bool
101 AmbientCaps []uintptr // Capability numbers raised as ambient capabilities in the child.
102 UseCgroupFD bool // If true, clone3 is used with CLONE_INTO_CGROUP and CgroupFD.
103 CgroupFD int // cgroup descriptor passed to clone3 when UseCgroupFD is set.
104 }
105
106 var ( // NUL-terminated byte strings passed by address to the mount call in forkAndExecInChild1.
107 none = [...]byte{'n', 'o', 'n', 'e', 0} // "none": first (source) argument of that mount call.
108 slash = [...]byte{'/', 0} // "/": second (target) argument of that mount call.
109 )
110
111 // Fork hooks declared without bodies; their implementations are not in this file.
112 func runtime_BeforeFork() // Called by forkAndExecInChild1 immediately before the clone call.
113 func runtime_AfterFork() // Called by forkAndExecInChild in the parent when forkAndExecInChild1 reports locked.
114 func runtime_AfterForkInChild() // Called in the child after the session/process-group setup.
115
116
117
118
119
120
121
122
123
124
125
126
127 func forkAndExecInChild(argv0 *byte, argv, envv []*byte, chroot, dir *byte, attr *ProcAttr, sys *SysProcAttr, pipe int) (pid int, err Errno) {
128
129
130 upid, err, mapPipe, locked := forkAndExecInChild1(argv0, argv, envv, chroot, dir, attr, sys, pipe)
131 if locked {
132 runtime_AfterFork()
133 }
134 if err != 0 {
135 return 0, err
136 }
137
138
139 pid = int(upid)
140
141 if sys.UidMappings != nil || sys.GidMappings != nil {
142 Close(mapPipe[0])
143 var err2 Errno
144
145
146 if sys.Unshareflags&CLONE_NEWUSER == 0 {
147 if err := writeUidGidMappings(pid, sys); err != nil {
148 err2 = err.(Errno)
149 }
150 }
151 RawSyscall(SYS_WRITE, uintptr(mapPipe[1]), uintptr(unsafe.Pointer(&err2)), unsafe.Sizeof(err2))
152 Close(mapPipe[1])
153 }
154
155 return pid, 0
156 }
157
158 const _LINUX_CAPABILITY_VERSION_3 = 0x20080522 // Stored in capHeader.version before the capget/capset calls.
159
160 type capHeader struct { // First argument of the capget/capset system calls.
161 version uint32 // Set to _LINUX_CAPABILITY_VERSION_3 by forkAndExecInChild1.
162 pid int32 // Left at zero by this file.
163 }
164
165 type capData struct { // One 32-bit word of each capability set, as read/written by capget/capset.
166 effective uint32 // Not modified by this file.
167 permitted uint32 // Ambient capabilities are OR-ed in here before capset.
168 inheritable uint32 // Ambient capabilities are OR-ed in here before capset.
169 }
170 type caps struct { // Header plus data words, kept together so both can be passed to capget/capset.
171 hdr capHeader
172 data [2]capData // Two 32-bit words per set; indexed with capToIndex, so capabilities 0-63.
173 }
174
175
// capToIndex returns the index of the 32-bit capability word that holds
// capability number cap.
func capToIndex(cap uintptr) uintptr {
	const bitsPerWord = 32
	return cap / bitsPerWord
}
177
178
// capToMask returns the single-bit mask of capability number cap within
// its 32-bit capability word.
func capToMask(cap uintptr) uint32 {
	const bitsPerWord = 32
	return uint32(1) << (cap % bitsPerWord)
}
180
181 // cloneArgs is the argument block whose address and size are passed to the clone3 system call.
182 type cloneArgs struct {
183 flags uint64 // CLONE_* flag bits; 64 bits wide, so it can hold CLONE_INTO_CGROUP.
184 pidFD uint64 // Not set by this file.
185 childTID uint64 // Not set by this file.
186 parentTID uint64 // Not set by this file.
187 exitSignal uint64 // Set to SIGCHLD by forkAndExecInChild1.
188 stack uint64 // Not set by this file.
189 stackSize uint64 // Not set by this file.
190 tls uint64 // Not set by this file.
191 setTID uint64 // Not set by this file.
192 setTIDSize uint64 // Not set by this file.
193 cgroup uint64 // Set to SysProcAttr.CgroupFD when UseCgroupFD is true.
194 }
195
196 // forkAndExecInChild1 does the pre-fork setup, issues the clone/clone3
197 // system call and then, in the child only, configures the process as
198 // requested by attr and sys and calls execve.
199 //
200 // In the parent it returns right after the clone call with the child's pid
201 // (or err1 if pipe creation or clone failed), the mapping pipe mapPipe
202 // (zero unless uid/gid mappings were requested), and locked == true once
203 // runtime_BeforeFork has been called. The child never returns: on any
204 // failure it writes err1 to pipe and exits with status 253.
205 // NOTE(review): this listing shows no comments; any //go: directives that
206 // belong on this function are not visible here — check the real file.
207 func forkAndExecInChild1(argv0 *byte, argv, envv []*byte, chroot, dir *byte, attr *ProcAttr, sys *SysProcAttr, pipe int) (pid uintptr, err1 Errno, mapPipe [2]int, locked bool) {
208 // prctl option and sub-option used below to raise ambient capabilities.
209 const (
210 PR_CAP_AMBIENT = 0x2f
211 PR_CAP_AMBIENT_RAISE = 0x2
212 )
213
214 // Every local used by the child is declared up front in this one frame.
215 // Unless a new user namespace is requested, the clone below uses
216 // CLONE_VFORK|CLONE_VM, i.e. the child runs in the parent's address
217 // space; the child therefore does all of its work in this frame and
218 // never returns, while the parent returns right after the clone call.
219
220
221
222
223
224
225
226
227
228 var (
229 err2 Errno
230 nextfd int
231 i int
232 caps caps
233 fd1, flags uintptr
234 puid, psetgroups, pgid []byte
235 uidmap, setgroups, gidmap []byte
236 clone3 *cloneArgs
237 pgrp int32
238 dirfd int
239 cred *Credential
240 ngroups, groups uintptr
241 c uintptr
242 )
243
244 rlim, rlimOK := origRlimitNofile.Load().(Rlimit) // Loaded before the fork; applied with rawSetrlimit just before exec.
245 // Prepare, before forking, the NUL-terminated paths and the contents the child may write itself.
246 if sys.UidMappings != nil {
247 puid = []byte("/proc/self/uid_map\000")
248 uidmap = formatIDMappings(sys.UidMappings)
249 }
250
251 if sys.GidMappings != nil {
252 psetgroups = []byte("/proc/self/setgroups\000")
253 pgid = []byte("/proc/self/gid_map\000")
254
255 if sys.GidMappingsEnableSetgroups {
256 setgroups = []byte("allow\000")
257 } else {
258 setgroups = []byte("deny\000")
259 }
260 gidmap = formatIDMappings(sys.GidMappings)
261 }
262
263 // Record the parent's pid; the Pdeathsig code in the child compares it with getppid.
264 ppid, _ := rawSyscallNoError(SYS_GETPID, 0, 0, 0)
265
266 // Copy attr.Files into fd and set nextfd to one past the larger of
267 // len(attr.Files) and the highest descriptor in it, so descriptors
268 // moved up during the shuffle below land above all of them.
269 fd := make([]int, len(attr.Files))
270 nextfd = len(attr.Files)
271 for i, ufd := range attr.Files {
272 if nextfd < int(ufd) {
273 nextfd = int(ufd)
274 }
275 fd[i] = int(ufd)
276 }
277 nextfd++
278
279 // Create the pipe on which the child waits for the parent's report
280 // about writing the uid/gid mappings.
281 if sys.UidMappings != nil || sys.GidMappings != nil {
282 if err := forkExecPipe(mapPipe[:]); err != nil {
283 err1 = err.(Errno)
284 return
285 }
286 }
287
288 flags = sys.Cloneflags
289 if sys.Cloneflags&CLONE_NEWUSER == 0 && sys.Unshareflags&CLONE_NEWUSER == 0 {
290 flags |= CLONE_VFORK | CLONE_VM
291 }
292 // Use clone3 when a cgroup descriptor is given or CLONE_NEWTIME is set; otherwise clone3 stays nil.
293 if sys.UseCgroupFD {
294 clone3 = &cloneArgs{
295 flags: uint64(flags) | CLONE_INTO_CGROUP,
296 exitSignal: uint64(SIGCHLD),
297 cgroup: uint64(sys.CgroupFD),
298 }
299 } else if flags&CLONE_NEWTIME != 0 {
300 clone3 = &cloneArgs{
301 flags: uint64(flags),
302 exitSignal: uint64(SIGCHLD),
303 }
304 }
305
306 // About to call clone. From here to exec the child path below uses
307 // only raw syscall helpers and the runtime hook; it makes no allocations.
308 runtime_BeforeFork()
309 locked = true
310 if clone3 != nil {
311 pid, err1 = rawVforkSyscall(_SYS_clone3, uintptr(unsafe.Pointer(clone3)), unsafe.Sizeof(*clone3))
312 } else {
313 flags |= uintptr(SIGCHLD)
314 if runtime.GOARCH == "s390x" {
315 // On s390x the flags are passed as the second argument of the clone call.
316 pid, err1 = rawVforkSyscall(SYS_CLONE, 0, flags)
317 } else {
318 pid, err1 = rawVforkSyscall(SYS_CLONE, flags, 0)
319 }
320 }
321 if err1 != 0 || pid != 0 {
322 // Parent (pid != 0) or failed clone (err1 != 0): return at once.
323 // The caller performs the parent's remaining post-fork work.
324
325
326
327
328 return
329 }
330
331 // Fork succeeded; everything below runs in the child.
332
333 // Enable the "keep capabilities" flag so ambient capabilities can be set later.
334 if len(sys.AmbientCaps) > 0 {
335 _, _, err1 = RawSyscall6(SYS_PRCTL, PR_SET_KEEPCAPS, 1, 0, 0, 0, 0)
336 if err1 != 0 {
337 goto childerror
338 }
339 }
340
341 // Wait for the parent's report on writing the uid/gid mappings.
342 if sys.UidMappings != nil || sys.GidMappings != nil {
343 if _, _, err1 = RawSyscall(SYS_CLOSE, uintptr(mapPipe[1]), 0, 0); err1 != 0 {
344 goto childerror
345 }
346 pid, _, err1 = RawSyscall(SYS_READ, uintptr(mapPipe[0]), uintptr(unsafe.Pointer(&err2)), unsafe.Sizeof(err2))
347 if err1 != 0 {
348 goto childerror
349 }
350 if pid != unsafe.Sizeof(err2) { // pid holds the byte count returned by read
351 err1 = EINVAL
352 goto childerror
353 }
354 if err2 != 0 {
355 err1 = err2
356 goto childerror
357 }
358 }
359
360 // Create a new session.
361 if sys.Setsid {
362 _, _, err1 = RawSyscall(SYS_SETSID, 0, 0, 0)
363 if err1 != 0 {
364 goto childerror
365 }
366 }
367
368 // Set the process group.
369 if sys.Setpgid || sys.Foreground {
370 // setpgid(0, sys.Pgid): applies to the child itself.
371 _, _, err1 = RawSyscall(SYS_SETPGID, 0, uintptr(sys.Pgid), 0)
372 if err1 != 0 {
373 goto childerror
374 }
375 }
376
377 if sys.Foreground {
378 pgrp = int32(sys.Pgid)
379 if pgrp == 0 {
380 pid, _ = rawSyscallNoError(SYS_GETPID, 0, 0, 0)
381
382 pgrp = int32(pid)
383 }
384
385 // Make pgrp the foreground process group of the terminal sys.Ctty.
386 _, _, err1 = RawSyscall(SYS_IOCTL, uintptr(sys.Ctty), uintptr(TIOCSPGRP), uintptr(unsafe.Pointer(&pgrp)))
387 if err1 != 0 {
388 goto childerror
389 }
390 }
391
392 // Child-side runtime hook; note that it runs only after the TIOCSPGRP ioctl above.
393 // NOTE(review): presumably this restores the signal mask, so the order matters — confirm.
394 runtime_AfterForkInChild()
395
396 // Unshare; if a new user namespace was unshared, the child writes its own id mappings.
397 if sys.Unshareflags != 0 {
398 _, _, err1 = RawSyscall(SYS_UNSHARE, sys.Unshareflags, 0, 0)
399 if err1 != 0 {
400 goto childerror
401 }
402
403 if sys.Unshareflags&CLONE_NEWUSER != 0 && sys.GidMappings != nil {
404 dirfd = int(_AT_FDCWD)
405 if fd1, _, err1 = RawSyscall6(SYS_OPENAT, uintptr(dirfd), uintptr(unsafe.Pointer(&psetgroups[0])), uintptr(O_WRONLY), 0, 0, 0); err1 != 0 {
406 goto childerror
407 }
408 pid, _, err1 = RawSyscall(SYS_WRITE, uintptr(fd1), uintptr(unsafe.Pointer(&setgroups[0])), uintptr(len(setgroups)))
409 if err1 != 0 {
410 goto childerror
411 }
412 if _, _, err1 = RawSyscall(SYS_CLOSE, uintptr(fd1), 0, 0); err1 != 0 {
413 goto childerror
414 }
415
416 if fd1, _, err1 = RawSyscall6(SYS_OPENAT, uintptr(dirfd), uintptr(unsafe.Pointer(&pgid[0])), uintptr(O_WRONLY), 0, 0, 0); err1 != 0 {
417 goto childerror
418 }
419 pid, _, err1 = RawSyscall(SYS_WRITE, uintptr(fd1), uintptr(unsafe.Pointer(&gidmap[0])), uintptr(len(gidmap)))
420 if err1 != 0 {
421 goto childerror
422 }
423 if _, _, err1 = RawSyscall(SYS_CLOSE, uintptr(fd1), 0, 0); err1 != 0 {
424 goto childerror
425 }
426 }
427
428 if sys.Unshareflags&CLONE_NEWUSER != 0 && sys.UidMappings != nil {
429 dirfd = int(_AT_FDCWD)
430 if fd1, _, err1 = RawSyscall6(SYS_OPENAT, uintptr(dirfd), uintptr(unsafe.Pointer(&puid[0])), uintptr(O_WRONLY), 0, 0, 0); err1 != 0 {
431 goto childerror
432 }
433 pid, _, err1 = RawSyscall(SYS_WRITE, uintptr(fd1), uintptr(unsafe.Pointer(&uidmap[0])), uintptr(len(uidmap)))
434 if err1 != 0 {
435 goto childerror
436 }
437 if _, _, err1 = RawSyscall(SYS_CLOSE, uintptr(fd1), 0, 0); err1 != 0 {
438 goto childerror
439 }
440 }
441
442 // After unsharing the mount namespace, remount "/" with
443 // MS_REC|MS_PRIVATE (source "none", no filesystem type or data).
444 // NOTE(review): presumably so that mount changes made by the child do
445 // not propagate through shared mounts — confirm.
446
447
448
449 if sys.Unshareflags&CLONE_NEWNS == CLONE_NEWNS {
450 _, _, err1 = RawSyscall6(SYS_MOUNT, uintptr(unsafe.Pointer(&none[0])), uintptr(unsafe.Pointer(&slash[0])), 0, MS_REC|MS_PRIVATE, 0, 0)
451 if err1 != 0 {
452 goto childerror
453 }
454 }
455 }
456
457 // Chroot
458 if chroot != nil {
459 _, _, err1 = RawSyscall(SYS_CHROOT, uintptr(unsafe.Pointer(chroot)), 0, 0)
460 if err1 != 0 {
461 goto childerror
462 }
463 }
464
465 // Set supplementary groups, gid and uid from sys.Credential.
466 if cred = sys.Credential; cred != nil {
467 ngroups = uintptr(len(cred.Groups))
468 groups = uintptr(0)
469 if ngroups > 0 {
470 groups = uintptr(unsafe.Pointer(&cred.Groups[0]))
471 }
472 if !(sys.GidMappings != nil && !sys.GidMappingsEnableSetgroups && ngroups == 0) && !cred.NoSetGroups { // skip setgroups when NoSetGroups is set, or when gid mappings deny setgroups and no groups were given
473 _, _, err1 = RawSyscall(_SYS_setgroups, ngroups, groups, 0)
474 if err1 != 0 {
475 goto childerror
476 }
477 }
478 _, _, err1 = RawSyscall(sys_SETGID, uintptr(cred.Gid), 0, 0)
479 if err1 != 0 {
480 goto childerror
481 }
482 _, _, err1 = RawSyscall(sys_SETUID, uintptr(cred.Uid), 0, 0)
483 if err1 != 0 {
484 goto childerror
485 }
486 }
487
488 if len(sys.AmbientCaps) != 0 {
489 // Read the current capability sets, add each requested capability to the
490 // permitted and inheritable sets, write them back, then raise each as ambient.
491 caps.hdr.version = _LINUX_CAPABILITY_VERSION_3
492
493 if _, _, err1 = RawSyscall(SYS_CAPGET, uintptr(unsafe.Pointer(&caps.hdr)), uintptr(unsafe.Pointer(&caps.data[0])), 0); err1 != 0 {
494 goto childerror
495 }
496
497 for _, c = range sys.AmbientCaps {
498 // capToIndex selects the 32-bit word, capToMask the bit within it.
499 // NOTE(review): a capability number >= 64 would index past caps.data — confirm callers validate.
500 caps.data[capToIndex(c)].permitted |= capToMask(c)
501 caps.data[capToIndex(c)].inheritable |= capToMask(c)
502 }
503
504 if _, _, err1 = RawSyscall(SYS_CAPSET, uintptr(unsafe.Pointer(&caps.hdr)), uintptr(unsafe.Pointer(&caps.data[0])), 0); err1 != 0 {
505 goto childerror
506 }
507
508 for _, c = range sys.AmbientCaps {
509 _, _, err1 = RawSyscall6(SYS_PRCTL, PR_CAP_AMBIENT, uintptr(PR_CAP_AMBIENT_RAISE), c, 0, 0, 0)
510 if err1 != 0 {
511 goto childerror
512 }
513 }
514 }
515
516 // Chdir
517 if dir != nil {
518 _, _, err1 = RawSyscall(SYS_CHDIR, uintptr(unsafe.Pointer(dir)), 0, 0)
519 if err1 != 0 {
520 goto childerror
521 }
522 }
523
524 // Parent death signal
525 if sys.Pdeathsig != 0 {
526 _, _, err1 = RawSyscall6(SYS_PRCTL, PR_SET_PDEATHSIG, uintptr(sys.Pdeathsig), 0, 0, 0, 0)
527 if err1 != 0 {
528 goto childerror
529 }
530
531 // If getppid no longer matches the pid recorded before the fork, the
532 // parent has changed in the meantime (presumably it already died),
533 // so send the signal to ourselves.
534 pid, _ = rawSyscallNoError(SYS_GETPPID, 0, 0, 0)
535 if pid != ppid {
536 pid, _ = rawSyscallNoError(SYS_GETPID, 0, 0, 0)
537 _, _, err1 = RawSyscall(SYS_KILL, pid, uintptr(sys.Pdeathsig), 0)
538 if err1 != 0 {
539 goto childerror
540 }
541 }
542 }
543
544 // Pass 1: move pipe and every fd[i] < i up to nextfd or above (as close-on-exec
545 // duplicates) so that pass 2 cannot overwrite a descriptor it still needs.
546 if pipe < nextfd {
547 _, _, err1 = RawSyscall(SYS_DUP3, uintptr(pipe), uintptr(nextfd), O_CLOEXEC)
548 if err1 != 0 {
549 goto childerror
550 }
551 pipe = nextfd
552 nextfd++
553 }
554 for i = 0; i < len(fd); i++ {
555 if fd[i] >= 0 && fd[i] < i {
556 if nextfd == pipe { // don't overwrite pipe
557 nextfd++
558 }
559 _, _, err1 = RawSyscall(SYS_DUP3, uintptr(fd[i]), uintptr(nextfd), O_CLOEXEC)
560 if err1 != 0 {
561 goto childerror
562 }
563 fd[i] = nextfd
564 nextfd++
565 }
566 }
567
568 // Pass 2: dup fd[i] down onto i.
569 for i = 0; i < len(fd); i++ {
570 if fd[i] == -1 {
571 RawSyscall(SYS_CLOSE, uintptr(i), 0, 0)
572 continue
573 }
574 if fd[i] == i {
575 // Already in place: set the descriptor flags to 0 (F_SETFD),
576 // which clears close-on-exec so the descriptor survives exec.
577 _, _, err1 = RawSyscall(fcntl64Syscall, uintptr(fd[i]), F_SETFD, 0)
578 if err1 != 0 {
579 goto childerror
580 }
581 continue
582 }
583 // dup3 with flags 0: the new descriptor i is not close-on-exec.
584
585 _, _, err1 = RawSyscall(SYS_DUP3, uintptr(fd[i]), uintptr(i), 0)
586 if err1 != 0 {
587 goto childerror
588 }
589 }
590
591 // If fewer than three files were given, close the remaining
592 // descriptors among 0, 1 and 2.
593
594
595 for i = len(fd); i < 3; i++ {
596 RawSyscall(SYS_CLOSE, uintptr(i), 0, 0)
597 }
598
599 // Detach fd 0 from its controlling terminal.
600 if sys.Noctty {
601 _, _, err1 = RawSyscall(SYS_IOCTL, 0, uintptr(TIOCNOTTY), 0)
602 if err1 != 0 {
603 goto childerror
604 }
605 }
606
607 // Make sys.Ctty the controlling terminal.
608 if sys.Setctty {
609 _, _, err1 = RawSyscall(SYS_IOCTL, uintptr(sys.Ctty), uintptr(TIOCSCTTY), 1)
610 if err1 != 0 {
611 goto childerror
612 }
613 }
614
615 // Apply the RLIMIT_NOFILE value loaded before the fork, if one was stored.
616 if rlimOK && rlim.Cur != 0 {
617 rawSetrlimit(RLIMIT_NOFILE, &rlim)
618 }
619
620 // Enable tracing if requested, as the last step before exec.
621
622
623 if sys.Ptrace {
624 _, _, err1 = RawSyscall(SYS_PTRACE, uintptr(PTRACE_TRACEME), 0, 0)
625 if err1 != 0 {
626 goto childerror
627 }
628 }
629
630 // Time to exec.
631 _, _, err1 = RawSyscall(SYS_EXECVE,
632 uintptr(unsafe.Pointer(argv0)),
633 uintptr(unsafe.Pointer(&argv[0])),
634 uintptr(unsafe.Pointer(&envv[0])))
635
636 childerror:
637 // Send the error code to the parent over pipe, then exit with status 253.
638 RawSyscall(SYS_WRITE, uintptr(pipe), uintptr(unsafe.Pointer(&err1)), unsafe.Sizeof(err1))
639 for {
640 RawSyscall(SYS_EXIT, 253, 0, 0)
641 }
642 }
643
644 func formatIDMappings(idMap []SysProcIDMap) []byte {
645 var data []byte
646 for _, im := range idMap {
647 data = append(data, itoa.Itoa(im.ContainerID)+" "+itoa.Itoa(im.HostID)+" "+itoa.Itoa(im.Size)+"\n"...)
648 }
649 return data
650 }
651
652
653 func writeIDMappings(path string, idMap []SysProcIDMap) error {
654 fd, err := Open(path, O_RDWR, 0)
655 if err != nil {
656 return err
657 }
658
659 if _, err := Write(fd, formatIDMappings(idMap)); err != nil {
660 Close(fd)
661 return err
662 }
663
664 if err := Close(fd); err != nil {
665 return err
666 }
667
668 return nil
669 }
670
671
672
673
674
675 func writeSetgroups(pid int, enable bool) error {
676 sgf := "/proc/" + itoa.Itoa(pid) + "/setgroups"
677 fd, err := Open(sgf, O_RDWR, 0)
678 if err != nil {
679 return err
680 }
681
682 var data []byte
683 if enable {
684 data = []byte("allow")
685 } else {
686 data = []byte("deny")
687 }
688
689 if _, err := Write(fd, data); err != nil {
690 Close(fd)
691 return err
692 }
693
694 return Close(fd)
695 }
696
697
698
699 func writeUidGidMappings(pid int, sys *SysProcAttr) error {
700 if sys.UidMappings != nil {
701 uidf := "/proc/" + itoa.Itoa(pid) + "/uid_map"
702 if err := writeIDMappings(uidf, sys.UidMappings); err != nil {
703 return err
704 }
705 }
706
707 if sys.GidMappings != nil {
708
709 if err := writeSetgroups(pid, sys.GidMappingsEnableSetgroups); err != nil && err != ENOENT {
710 return err
711 }
712 gidf := "/proc/" + itoa.Itoa(pid) + "/gid_map"
713 if err := writeIDMappings(gidf, sys.GidMappings); err != nil {
714 return err
715 }
716 }
717
718 return nil
719 }
720
View as plain text