Source file
src/syscall/exec_linux.go
1
2
3
4
5
6
7 package syscall
8
9 import (
10 "internal/itoa"
11 "runtime"
12 "unsafe"
13 )
14
15
16
// Flag bits for the Linux clone/unshare family of system calls.
// SysProcAttr.Cloneflags and SysProcAttr.Unshareflags are built from
// these values; forkAndExecInChild1 also ORs some of them in itself
// (CLONE_VFORK|CLONE_VM when no new user namespace is requested, and
// CLONE_PIDFD when SysProcAttr.PidFD is set).
const (
	CLONE_VM             = 0x00000100
	CLONE_FS             = 0x00000200
	CLONE_FILES          = 0x00000400
	CLONE_SIGHAND        = 0x00000800
	CLONE_PIDFD          = 0x00001000
	CLONE_PTRACE         = 0x00002000
	CLONE_VFORK          = 0x00004000
	CLONE_PARENT         = 0x00008000
	CLONE_THREAD         = 0x00010000
	CLONE_NEWNS          = 0x00020000
	CLONE_SYSVSEM        = 0x00040000
	CLONE_SETTLS         = 0x00080000
	CLONE_PARENT_SETTID  = 0x00100000
	CLONE_CHILD_CLEARTID = 0x00200000
	CLONE_DETACHED       = 0x00400000
	CLONE_UNTRACED       = 0x00800000
	CLONE_CHILD_SETTID   = 0x01000000
	CLONE_NEWCGROUP      = 0x02000000
	CLONE_NEWUTS         = 0x04000000
	CLONE_NEWIPC         = 0x08000000
	CLONE_NEWUSER        = 0x10000000
	CLONE_NEWPID         = 0x20000000
	CLONE_NEWNET         = 0x40000000
	CLONE_IO             = 0x80000000

	// These two values need more than 32 bits. In this file
	// CLONE_INTO_CGROUP is only ever ORed into the 64-bit
	// cloneArgs.flags field that is handed to clone3.
	CLONE_CLEAR_SIGHAND = 0x100000000
	CLONE_INTO_CGROUP   = 0x200000000

	// CLONE_NEWTIME sits in the low byte of the flag word. The plain
	// clone path in forkAndExecInChild1 ORs SIGCHLD into that same word,
	// and forkAndExecInChild1 switches to clone3 whenever this flag is
	// present in the clone flags.
	CLONE_NEWTIME = 0x00000080
)
53
54
55
56
57
58
59
// SysProcIDMap describes one ID mapping entry. formatIDMappings renders
// each entry as the line "ContainerID HostID Size\n", and those lines are
// written to a process's uid_map or gid_map file under /proc.
type SysProcIDMap struct {
	ContainerID int // First field of the rendered mapping line.
	HostID      int // Second field of the rendered mapping line.
	Size        int // Third field of the rendered mapping line.
}
65
// SysProcAttr holds the Linux-specific options that forkAndExecInChild1
// applies in the child between the clone and the execve.
type SysProcAttr struct {
	// Chroot is not read directly in this part of the file.
	// NOTE(review): presumably the caller converts it into the chroot
	// argument of forkAndExecInChild — confirm.
	Chroot string
	// Credential, when non-nil, makes the child call setgroups (unless
	// suppressed), setgid and setuid with its values.
	Credential *Credential
	// Ptrace makes the child issue ptrace(PTRACE_TRACEME) right before
	// the execve.
	Ptrace bool
	// Setsid makes the child call setsid.
	Setsid bool
	// Setpgid makes the child call setpgid(0, Pgid).
	Setpgid bool
	// Setctty makes the child issue the TIOCSCTTY ioctl on descriptor
	// Ctty. This happens after the descriptor shuffling, so Ctty is a
	// descriptor number as seen in the child at that point.
	Setctty bool
	// Noctty makes the child issue the TIOCNOTTY ioctl on descriptor 0.
	Noctty bool
	// Ctty is the descriptor used by the Setctty and Foreground ioctls.
	Ctty int
	// Foreground makes the child issue the TIOCSPGRP ioctl on Ctty with
	// the child's process group. It also triggers the setpgid call, as
	// Setpgid does. This runs before the descriptor shuffling.
	Foreground bool
	// Pgid is the process group passed to setpgid. For Foreground, a
	// value of 0 is replaced by the child's own pid.
	Pgid int
	// Pdeathsig, when non-zero, is installed in the child with
	// prctl(PR_SET_PDEATHSIG).
	Pdeathsig Signal
	// Cloneflags are the initial flags for the clone/clone3 call.
	Cloneflags uintptr
	// Unshareflags, when non-zero, are passed to unshare in the child.
	Unshareflags uintptr
	// UidMappings are written to the child's uid_map file.
	UidMappings []SysProcIDMap
	// GidMappings are written to the child's gid_map file.
	GidMappings []SysProcIDMap
	// GidMappingsEnableSetgroups selects whether "allow" (true) or
	// "deny" (false) is written to the child's setgroups file before
	// the gid mappings are written.
	GidMappingsEnableSetgroups bool
	// AmbientCaps lists capabilities the child raises as ambient
	// capabilities before the execve.
	AmbientCaps []uintptr
	// UseCgroupFD selects clone3 with CLONE_INTO_CGROUP, using CgroupFD
	// as the cgroup descriptor.
	UseCgroupFD bool
	// CgroupFD is the descriptor stored in cloneArgs.cgroup when
	// UseCgroupFD is set.
	CgroupFD int
	// PidFD, when non-nil, requests CLONE_PIDFD. On success the parent
	// stores the resulting descriptor through this pointer.
	PidFD *int
}
113
var (
	// NUL-terminated "none" and "/" byte strings. forkAndExecInChild1
	// passes their addresses as the first two arguments of the mount
	// system call it makes after unsharing the mount namespace.
	none  = [...]byte{'n', 'o', 'n', 'e', 0}
	slash = [...]byte{'/', 0}

	// forceClone3 makes forkAndExecInChild1 use clone3 even when no
	// option requires it. Nothing in this file sets it.
	// NOTE(review): presumably a hook for tests — confirm.
	forceClone3 = false
)
120
121
// Fork hooks used around the clone call in this file:
// runtime_BeforeFork is called immediately before the clone,
// runtime_AfterFork is called by forkAndExecInChild once
// forkAndExecInChild1 reports locked == true, and
// runtime_AfterForkInChild is called on the child path.
// They are declared without bodies; the implementations are supplied
// outside this file (presumably by the runtime package — confirm).
func runtime_BeforeFork()
func runtime_AfterFork()
func runtime_AfterForkInChild()
125
126
127
128
129
130
131
132
133
134
135
136
137 func forkAndExecInChild(argv0 *byte, argv, envv []*byte, chroot, dir *byte, attr *ProcAttr, sys *SysProcAttr, pipe int) (pid int, err Errno) {
138
139
140 upid, pidfd, err, mapPipe, locked := forkAndExecInChild1(argv0, argv, envv, chroot, dir, attr, sys, pipe)
141 if locked {
142 runtime_AfterFork()
143 }
144 if err != 0 {
145 return 0, err
146 }
147
148
149 pid = int(upid)
150 if sys.PidFD != nil {
151 *sys.PidFD = int(pidfd)
152 }
153
154 if sys.UidMappings != nil || sys.GidMappings != nil {
155 Close(mapPipe[0])
156 var err2 Errno
157
158
159 if sys.Unshareflags&CLONE_NEWUSER == 0 {
160 if err := writeUidGidMappings(pid, sys); err != nil {
161 err2 = err.(Errno)
162 }
163 }
164 RawSyscall(SYS_WRITE, uintptr(mapPipe[1]), uintptr(unsafe.Pointer(&err2)), unsafe.Sizeof(err2))
165 Close(mapPipe[1])
166 }
167
168 return pid, 0
169 }
170
// _LINUX_CAPABILITY_VERSION_3 is the value forkAndExecInChild1 stores in
// capHeader.version before calling capget and capset.
const _LINUX_CAPABILITY_VERSION_3 = 0x20080522
172
// capHeader is the header argument passed to the capget and capset
// system calls. This file only sets version; pid stays zero.
type capHeader struct {
	version uint32
	pid     int32
}
177
// capData is one 32-bit slice of the capability sets exchanged with
// capget and capset. forkAndExecInChild1 sets bits in permitted and
// inheritable; effective is left as capget returned it.
type capData struct {
	effective   uint32
	permitted   uint32
	inheritable uint32
}
// caps bundles the header and the two data words used for the capget and
// capset calls. data is indexed by capToIndex, so capabilities 0-31 live
// in data[0] and 32-63 in data[1].
type caps struct {
	hdr  capHeader
	data [2]capData
}
187
188
// capToIndex returns which 32-bit word of caps.data holds capability cap.
func capToIndex(cap uintptr) uintptr {
	const bitsPerWord = 32
	return cap / bitsPerWord
}
190
191
// capToMask returns the single-bit mask for capability cap within its
// 32-bit word of caps.data.
func capToMask(cap uintptr) uint32 {
	const bitsPerWord = 32
	bit := cap % bitsPerWord
	return uint32(1) << bit
}
193
194
// cloneArgs is the argument block whose address and size are passed to
// the clone3 system call by forkAndExecInChild1. Only flags, pidFD,
// exitSignal and cgroup are ever set in this file; the remaining fields
// are left zero.
type cloneArgs struct {
	flags      uint64 // Clone flags, plus CLONE_INTO_CGROUP when a cgroup fd is used.
	pidFD      uint64 // Address of the int32 that receives the pidfd, when requested.
	childTID   uint64
	parentTID  uint64
	exitSignal uint64 // Set to SIGCHLD.
	stack      uint64
	stackSize  uint64
	tls        uint64
	setTID     uint64
	setTIDSize uint64
	cgroup     uint64 // SysProcAttr.CgroupFD, when UseCgroupFD is set.
}
208
209
210
211
212
213
214
215
216
217
218
219
// forkAndExecInChild1 prepares everything the child needs, performs the
// clone (or clone3), and then either returns in the parent or carries on
// as the child until execve.
//
// Results, as seen by the parent:
//   - pid: the child's pid (0 is never returned to the caller on success,
//     because the pid == 0 path is the child and does not return).
//   - pidfd: the descriptor filled in by the kernel when CLONE_PIDFD was
//     requested, otherwise -1.
//   - err1: errno of a failed pipe creation or clone, otherwise 0.
//   - mapPipe: the pipe used to tell the child the outcome of the
//     uid/gid mapping writes; only created when mappings are requested.
//   - locked: true once runtime_BeforeFork has been called, so the caller
//     knows it must call runtime_AfterFork.
//
// In the child, any failing step jumps to childerror, which writes the
// errno to pipe and exits with status 253.
//
// NOTE(review): every local the child uses is declared before the clone
// and the child path only issues raw system calls, which suggests the
// child must not allocate or grow the stack. Presumably the real file
// also carries compiler directives on this function (for example to
// prevent inlining) that are not visible here — confirm before editing.
func forkAndExecInChild1(argv0 *byte, argv, envv []*byte, chroot, dir *byte, attr *ProcAttr, sys *SysProcAttr, pipe int) (pid uintptr, pidfd int32, err1 Errno, mapPipe [2]int, locked bool) {
	// prctl option values used below to raise ambient capabilities.
	const (
		PR_CAP_AMBIENT       = 0x2f
		PR_CAP_AMBIENT_RAISE = 0x2
	)

	// Declare all variables up front: the child path below uses goto, and
	// these must all exist before the clone.
	var (
		err2                      Errno
		nextfd                    int
		i                         int
		caps                      caps
		fd1, flags                uintptr
		puid, psetgroups, pgid    []byte
		uidmap, setgroups, gidmap []byte
		clone3                    *cloneArgs
		pgrp                      int32
		dirfd                     int
		cred                      *Credential
		ngroups, groups           uintptr
		c                         uintptr
	)
	// -1 means "no pidfd" unless the kernel overwrites it.
	pidfd = -1

	// Load the recorded RLIMIT_NOFILE value (may be nil) before the clone;
	// it is applied in the child just before exec.
	rlim := origRlimitNofile.Load()

	// Pre-build the NUL-terminated paths and the file contents the child
	// writes when it has to set up its own uid mappings.
	if sys.UidMappings != nil {
		puid = []byte("/proc/self/uid_map\000")
		uidmap = formatIDMappings(sys.UidMappings)
	}

	// Same for the setgroups and gid_map files.
	if sys.GidMappings != nil {
		psetgroups = []byte("/proc/self/setgroups\000")
		pgid = []byte("/proc/self/gid_map\000")

		if sys.GidMappingsEnableSetgroups {
			setgroups = []byte("allow\000")
		} else {
			setgroups = []byte("deny\000")
		}
		gidmap = formatIDMappings(sys.GidMappings)
	}

	// Record the current pid so the child can later detect, for
	// Pdeathsig, whether its parent has changed.
	ppid, _ := rawSyscallNoError(SYS_GETPID, 0, 0, 0)

	// Copy attr.Files into fd, and set nextfd to one past the largest of
	// len(attr.Files) and every requested descriptor. nextfd is the first
	// descriptor number used for temporary duplicates.
	fd := make([]int, len(attr.Files))
	nextfd = len(attr.Files)
	for i, ufd := range attr.Files {
		if nextfd < int(ufd) {
			nextfd = int(ufd)
		}
		fd[i] = int(ufd)
	}
	nextfd++

	// Create the pipe through which the parent reports the result of the
	// uid/gid mapping writes to the child.
	if sys.UidMappings != nil || sys.GidMappings != nil {
		if err := forkExecPipe(mapPipe[:]); err != nil {
			err1 = err.(Errno)
			return
		}
	}

	// Use CLONE_VFORK|CLONE_VM unless a new user namespace is requested
	// through either Cloneflags or Unshareflags.
	flags = sys.Cloneflags
	if sys.Cloneflags&CLONE_NEWUSER == 0 && sys.Unshareflags&CLONE_NEWUSER == 0 {
		flags |= CLONE_VFORK | CLONE_VM
	}
	if sys.PidFD != nil {
		flags |= CLONE_PIDFD
	}
	// clone3 is used when a cgroup fd is requested, when CLONE_NEWTIME is
	// among the flags, or when forceClone3 is set.
	if sys.UseCgroupFD || flags&CLONE_NEWTIME != 0 || forceClone3 {
		clone3 = &cloneArgs{
			flags:      uint64(flags),
			exitSignal: uint64(SIGCHLD),
		}
		if sys.UseCgroupFD {
			clone3.flags |= CLONE_INTO_CGROUP
			clone3.cgroup = uint64(sys.CgroupFD)
		}
		if sys.PidFD != nil {
			clone3.pidFD = uint64(uintptr(unsafe.Pointer(&pidfd)))
		}
	}

	// Past this point the caller must call runtime_AfterFork, which is
	// what locked == true tells it.
	runtime_BeforeFork()
	locked = true
	if clone3 != nil {
		pid, err1 = rawVforkSyscall(_SYS_clone3, uintptr(unsafe.Pointer(clone3)), unsafe.Sizeof(*clone3), 0)
	} else {
		// For plain clone the exit signal travels in the flag word.
		flags |= uintptr(SIGCHLD)
		if runtime.GOARCH == "s390x" {
			// On s390x the flags go in the second argument slot
			// rather than the first.
			pid, err1 = rawVforkSyscall(SYS_CLONE, 0, flags, uintptr(unsafe.Pointer(&pidfd)))
		} else {
			pid, err1 = rawVforkSyscall(SYS_CLONE, flags, 0, uintptr(unsafe.Pointer(&pidfd)))
		}
	}
	if err1 != 0 || pid != 0 {
		// The clone failed, or we are the parent: return right away and
		// let forkAndExecInChild do the parent-side work.
		return
	}

	// Everything below runs in the child.

	// Ask the kernel to keep capabilities across the later setuid so the
	// ambient capabilities can still be raised.
	if len(sys.AmbientCaps) > 0 {
		_, _, err1 = RawSyscall6(SYS_PRCTL, PR_SET_KEEPCAPS, 1, 0, 0, 0, 0)
		if err1 != 0 {
			goto childerror
		}
	}

	// Wait for the parent to report the outcome of the mapping writes.
	if sys.UidMappings != nil || sys.GidMappings != nil {
		// The child only reads from the pipe; close the write end.
		if _, _, err1 = RawSyscall(SYS_CLOSE, uintptr(mapPipe[1]), 0, 0); err1 != 0 {
			goto childerror
		}
		// pid is reused here as scratch space for the byte count.
		pid, _, err1 = RawSyscall(SYS_READ, uintptr(mapPipe[0]), uintptr(unsafe.Pointer(&err2)), unsafe.Sizeof(err2))
		if err1 != 0 {
			goto childerror
		}
		// A short read means the parent did not send a full Errno.
		if pid != unsafe.Sizeof(err2) {
			err1 = EINVAL
			goto childerror
		}
		// A non-zero value is the parent's error from writing the mappings.
		if err2 != 0 {
			err1 = err2
			goto childerror
		}
	}

	// Start a new session if requested.
	if sys.Setsid {
		_, _, err1 = RawSyscall(SYS_SETSID, 0, 0, 0)
		if err1 != 0 {
			goto childerror
		}
	}

	// Set the process group; Foreground implies this step too.
	if sys.Setpgid || sys.Foreground {
		// setpgid(0, Pgid): applies to the calling process.
		_, _, err1 = RawSyscall(SYS_SETPGID, 0, uintptr(sys.Pgid), 0)
		if err1 != 0 {
			goto childerror
		}
	}

	if sys.Foreground {
		pgrp = int32(sys.Pgid)
		if pgrp == 0 {
			// Pgid 0 stands for the child's own pid.
			pid, _ = rawSyscallNoError(SYS_GETPID, 0, 0, 0)

			pgrp = int32(pid)
		}

		// Make pgrp the foreground process group of the terminal Ctty.
		_, _, err1 = RawSyscall(SYS_IOCTL, uintptr(sys.Ctty), uintptr(TIOCSPGRP), uintptr(unsafe.Pointer(&pgrp)))
		if err1 != 0 {
			goto childerror
		}
	}

	// Child-side runtime hook, implemented outside this file.
	// NOTE(review): its position after the TIOCSPGRP ioctl looks
	// deliberate — confirm before reordering.
	runtime_AfterForkInChild()

	// Unshare namespaces if requested.
	if sys.Unshareflags != 0 {
		_, _, err1 = RawSyscall(SYS_UNSHARE, sys.Unshareflags, 0, 0)
		if err1 != 0 {
			goto childerror
		}

		// After unsharing the user namespace the child writes its own
		// mappings: first the setgroups policy, then gid_map.
		if sys.Unshareflags&CLONE_NEWUSER != 0 && sys.GidMappings != nil {
			dirfd = int(_AT_FDCWD)
			if fd1, _, err1 = RawSyscall6(SYS_OPENAT, uintptr(dirfd), uintptr(unsafe.Pointer(&psetgroups[0])), uintptr(O_WRONLY), 0, 0, 0); err1 != 0 {
				goto childerror
			}
			// pid is again scratch space; the byte count is not checked.
			pid, _, err1 = RawSyscall(SYS_WRITE, fd1, uintptr(unsafe.Pointer(&setgroups[0])), uintptr(len(setgroups)))
			if err1 != 0 {
				goto childerror
			}
			if _, _, err1 = RawSyscall(SYS_CLOSE, fd1, 0, 0); err1 != 0 {
				goto childerror
			}

			// NOTE(review): &gidmap[0] assumes GidMappings has at least
			// one entry when it is non-nil — confirm callers guarantee it.
			if fd1, _, err1 = RawSyscall6(SYS_OPENAT, uintptr(dirfd), uintptr(unsafe.Pointer(&pgid[0])), uintptr(O_WRONLY), 0, 0, 0); err1 != 0 {
				goto childerror
			}
			pid, _, err1 = RawSyscall(SYS_WRITE, fd1, uintptr(unsafe.Pointer(&gidmap[0])), uintptr(len(gidmap)))
			if err1 != 0 {
				goto childerror
			}
			if _, _, err1 = RawSyscall(SYS_CLOSE, fd1, 0, 0); err1 != 0 {
				goto childerror
			}
		}

		// Then uid_map.
		if sys.Unshareflags&CLONE_NEWUSER != 0 && sys.UidMappings != nil {
			dirfd = int(_AT_FDCWD)
			if fd1, _, err1 = RawSyscall6(SYS_OPENAT, uintptr(dirfd), uintptr(unsafe.Pointer(&puid[0])), uintptr(O_WRONLY), 0, 0, 0); err1 != 0 {
				goto childerror
			}
			// NOTE(review): &uidmap[0] assumes UidMappings has at least
			// one entry when it is non-nil — confirm callers guarantee it.
			pid, _, err1 = RawSyscall(SYS_WRITE, fd1, uintptr(unsafe.Pointer(&uidmap[0])), uintptr(len(uidmap)))
			if err1 != 0 {
				goto childerror
			}
			if _, _, err1 = RawSyscall(SYS_CLOSE, fd1, 0, 0); err1 != 0 {
				goto childerror
			}
		}

		// With a new mount namespace, remount "/" with
		// MS_REC|MS_PRIVATE. Presumably this stops mount changes made in
		// the child from propagating back out — confirm.
		if sys.Unshareflags&CLONE_NEWNS == CLONE_NEWNS {
			_, _, err1 = RawSyscall6(SYS_MOUNT, uintptr(unsafe.Pointer(&none[0])), uintptr(unsafe.Pointer(&slash[0])), 0, MS_REC|MS_PRIVATE, 0, 0)
			if err1 != 0 {
				goto childerror
			}
		}
	}

	// Change root directory.
	if chroot != nil {
		_, _, err1 = RawSyscall(SYS_CHROOT, uintptr(unsafe.Pointer(chroot)), 0, 0)
		if err1 != 0 {
			goto childerror
		}
	}

	// Apply user and group credentials.
	if cred = sys.Credential; cred != nil {
		ngroups = uintptr(len(cred.Groups))
		groups = uintptr(0)
		if ngroups > 0 {
			groups = uintptr(unsafe.Pointer(&cred.Groups[0]))
		}
		// Skip setgroups when the credential asks for that, or when gid
		// mappings are in use with setgroups denied and there are no
		// supplementary groups to set.
		if !(sys.GidMappings != nil && !sys.GidMappingsEnableSetgroups && ngroups == 0) && !cred.NoSetGroups {
			_, _, err1 = RawSyscall(_SYS_setgroups, ngroups, groups, 0)
			if err1 != 0 {
				goto childerror
			}
		}
		// Group before user: setgid is issued ahead of setuid.
		_, _, err1 = RawSyscall(sys_SETGID, uintptr(cred.Gid), 0, 0)
		if err1 != 0 {
			goto childerror
		}
		_, _, err1 = RawSyscall(sys_SETUID, uintptr(cred.Uid), 0, 0)
		if err1 != 0 {
			goto childerror
		}
	}

	if len(sys.AmbientCaps) != 0 {
		// Fetch the current capability sets, add the requested
		// capabilities to permitted and inheritable, write the sets
		// back, and only then raise each one as ambient.
		caps.hdr.version = _LINUX_CAPABILITY_VERSION_3

		if _, _, err1 = RawSyscall(SYS_CAPGET, uintptr(unsafe.Pointer(&caps.hdr)), uintptr(unsafe.Pointer(&caps.data[0])), 0); err1 != 0 {
			goto childerror
		}

		for _, c = range sys.AmbientCaps {
			// caps.data has two words, so capToIndex(c) must be 0 or 1.
			// NOTE(review): a capability number of 64 or more would
			// index out of range here — confirm callers never pass one.
			caps.data[capToIndex(c)].permitted |= capToMask(c)
			caps.data[capToIndex(c)].inheritable |= capToMask(c)
		}

		if _, _, err1 = RawSyscall(SYS_CAPSET, uintptr(unsafe.Pointer(&caps.hdr)), uintptr(unsafe.Pointer(&caps.data[0])), 0); err1 != 0 {
			goto childerror
		}

		for _, c = range sys.AmbientCaps {
			_, _, err1 = RawSyscall6(SYS_PRCTL, PR_CAP_AMBIENT, uintptr(PR_CAP_AMBIENT_RAISE), c, 0, 0, 0)
			if err1 != 0 {
				goto childerror
			}
		}
	}

	// Change working directory.
	if dir != nil {
		_, _, err1 = RawSyscall(SYS_CHDIR, uintptr(unsafe.Pointer(dir)), 0, 0)
		if err1 != 0 {
			goto childerror
		}
	}

	// Install the parent-death signal.
	if sys.Pdeathsig != 0 {
		_, _, err1 = RawSyscall6(SYS_PRCTL, PR_SET_PDEATHSIG, uintptr(sys.Pdeathsig), 0, 0, 0, 0)
		if err1 != 0 {
			goto childerror
		}

		// If the current parent is no longer the process that was
		// recorded in ppid before the clone, send the signal to
		// ourselves now.
		pid, _ = rawSyscallNoError(SYS_GETPPID, 0, 0, 0)
		if pid != ppid {
			pid, _ = rawSyscallNoError(SYS_GETPID, 0, 0, 0)
			_, _, err1 = RawSyscall(SYS_KILL, pid, uintptr(sys.Pdeathsig), 0)
			if err1 != 0 {
				goto childerror
			}
		}
	}

	// Pass 1: move every descriptor that would be overwritten in pass 2
	// up to a number at or above nextfd. The error-reporting pipe is moved
	// first if it sits in the range pass 2 will write to.
	if pipe < nextfd {
		_, _, err1 = RawSyscall(SYS_DUP3, uintptr(pipe), uintptr(nextfd), O_CLOEXEC)
		if err1 != 0 {
			goto childerror
		}
		pipe = nextfd
		nextfd++
	}
	for i = 0; i < len(fd); i++ {
		// fd[i] < i means slot fd[i] is overwritten before fd[i] is
		// consumed at slot i, so duplicate it out of the way.
		if fd[i] >= 0 && fd[i] < i {
			if nextfd == pipe {
				// Skip the number the error pipe now occupies.
				nextfd++
			}
			_, _, err1 = RawSyscall(SYS_DUP3, uintptr(fd[i]), uintptr(nextfd), O_CLOEXEC)
			if err1 != 0 {
				goto childerror
			}
			fd[i] = nextfd
			nextfd++
		}
	}

	// Pass 2: place fd[i] at descriptor number i.
	for i = 0; i < len(fd); i++ {
		if fd[i] == -1 {
			// -1 means descriptor i should be closed in the child.
			RawSyscall(SYS_CLOSE, uintptr(i), 0, 0)
			continue
		}
		if fd[i] == i {
			// Already in place: set its descriptor flags to 0, which
			// clears close-on-exec so it survives the exec.
			_, _, err1 = RawSyscall(fcntl64Syscall, uintptr(fd[i]), F_SETFD, 0)
			if err1 != 0 {
				goto childerror
			}
			continue
		}
		// dup3 with flags 0 yields a descriptor without close-on-exec.
		// The temporaries from pass 1 were created with O_CLOEXEC.
		_, _, err1 = RawSyscall(SYS_DUP3, uintptr(fd[i]), uintptr(i), 0)
		if err1 != 0 {
			goto childerror
		}
	}

	// Close descriptors 0-2 that were not covered by attr.Files. Errors
	// are ignored.
	for i = len(fd); i < 3; i++ {
		RawSyscall(SYS_CLOSE, uintptr(i), 0, 0)
	}

	// Detach from the controlling terminal (ioctl on descriptor 0).
	if sys.Noctty {
		_, _, err1 = RawSyscall(SYS_IOCTL, 0, uintptr(TIOCNOTTY), 0)
		if err1 != 0 {
			goto childerror
		}
	}

	// Set the controlling terminal to Ctty.
	if sys.Setctty {
		_, _, err1 = RawSyscall(SYS_IOCTL, uintptr(sys.Ctty), uintptr(TIOCSCTTY), 1)
		if err1 != 0 {
			goto childerror
		}
	}

	// Apply the RLIMIT_NOFILE value loaded before the clone, if one was
	// recorded. The result is ignored.
	if rlim != nil {
		rawSetrlimit(RLIMIT_NOFILE, rlim)
	}

	// Request tracing as the last step before exec, after all the setup
	// system calls above.
	if sys.Ptrace {
		_, _, err1 = RawSyscall(SYS_PTRACE, uintptr(PTRACE_TRACEME), 0, 0)
		if err1 != 0 {
			goto childerror
		}
	}

	// Exec the new program. On success this never returns; on failure
	// execution falls through to childerror with err1 set.
	_, _, err1 = RawSyscall(SYS_EXECVE,
		uintptr(unsafe.Pointer(argv0)),
		uintptr(unsafe.Pointer(&argv[0])),
		uintptr(unsafe.Pointer(&envv[0])))

childerror:
	// Report the errno to the parent through pipe, then exit. The loop
	// guards against the exit system call returning.
	RawSyscall(SYS_WRITE, uintptr(pipe), uintptr(unsafe.Pointer(&err1)), unsafe.Sizeof(err1))
	for {
		RawSyscall(SYS_EXIT, 253, 0, 0)
	}
}
661
662 func formatIDMappings(idMap []SysProcIDMap) []byte {
663 var data []byte
664 for _, im := range idMap {
665 data = append(data, itoa.Itoa(im.ContainerID)+" "+itoa.Itoa(im.HostID)+" "+itoa.Itoa(im.Size)+"\n"...)
666 }
667 return data
668 }
669
670
671 func writeIDMappings(path string, idMap []SysProcIDMap) error {
672 fd, err := Open(path, O_RDWR, 0)
673 if err != nil {
674 return err
675 }
676
677 if _, err := Write(fd, formatIDMappings(idMap)); err != nil {
678 Close(fd)
679 return err
680 }
681
682 if err := Close(fd); err != nil {
683 return err
684 }
685
686 return nil
687 }
688
689
690
691
692
693 func writeSetgroups(pid int, enable bool) error {
694 sgf := "/proc/" + itoa.Itoa(pid) + "/setgroups"
695 fd, err := Open(sgf, O_RDWR, 0)
696 if err != nil {
697 return err
698 }
699
700 var data []byte
701 if enable {
702 data = []byte("allow")
703 } else {
704 data = []byte("deny")
705 }
706
707 if _, err := Write(fd, data); err != nil {
708 Close(fd)
709 return err
710 }
711
712 return Close(fd)
713 }
714
715
716
717 func writeUidGidMappings(pid int, sys *SysProcAttr) error {
718 if sys.UidMappings != nil {
719 uidf := "/proc/" + itoa.Itoa(pid) + "/uid_map"
720 if err := writeIDMappings(uidf, sys.UidMappings); err != nil {
721 return err
722 }
723 }
724
725 if sys.GidMappings != nil {
726
727 if err := writeSetgroups(pid, sys.GidMappingsEnableSetgroups); err != nil && err != ENOENT {
728 return err
729 }
730 gidf := "/proc/" + itoa.Itoa(pid) + "/gid_map"
731 if err := writeIDMappings(gidf, sys.GidMappings); err != nil {
732 return err
733 }
734 }
735
736 return nil
737 }
738
739
740 func forkAndExecFailureCleanup(attr *ProcAttr, sys *SysProcAttr) {
741 if sys.PidFD != nil && *sys.PidFD != -1 {
742 Close(*sys.PidFD)
743 *sys.PidFD = -1
744 }
745 }
746
View as plain text