Source file
src/syscall/exec_linux.go
1
2
3
4
5
6
7 package syscall
8
9 import (
10 errpkg "errors"
11 "internal/strconv"
12 "runtime"
13 "unsafe"
14 )
15
16
17
// Flag bits accepted by the Linux clone, clone3 and unshare system calls.
// Within this file they are combined into SysProcAttr.Cloneflags and
// SysProcAttr.Unshareflags and passed to SYS_CLONE, _SYS_clone3 and
// SYS_UNSHARE. See clone(2) for the meaning of each individual flag.
const (
	CLONE_VM             = 0x00000100
	CLONE_FS             = 0x00000200
	CLONE_FILES          = 0x00000400
	CLONE_SIGHAND        = 0x00000800
	CLONE_PIDFD          = 0x00001000
	CLONE_PTRACE         = 0x00002000
	CLONE_VFORK          = 0x00004000
	CLONE_PARENT         = 0x00008000
	CLONE_THREAD         = 0x00010000
	CLONE_NEWNS          = 0x00020000
	CLONE_SYSVSEM        = 0x00040000
	CLONE_SETTLS         = 0x00080000
	CLONE_PARENT_SETTID  = 0x00100000
	CLONE_CHILD_CLEARTID = 0x00200000
	CLONE_DETACHED       = 0x00400000
	CLONE_UNTRACED       = 0x00800000
	CLONE_CHILD_SETTID   = 0x01000000
	CLONE_NEWCGROUP      = 0x02000000
	CLONE_NEWUTS         = 0x04000000
	CLONE_NEWIPC         = 0x08000000
	CLONE_NEWUSER        = 0x10000000
	CLONE_NEWPID         = 0x20000000
	CLONE_NEWNET         = 0x40000000
	CLONE_IO             = 0x80000000

	// The next two values do not fit in 32 bits. In this file
	// CLONE_INTO_CGROUP is only ever OR'ed into the 64-bit cloneArgs.flags
	// field used by the clone3 path.
	CLONE_CLEAR_SIGHAND = 0x100000000
	CLONE_INTO_CGROUP   = 0x200000000

	// CLONE_NEWTIME lies in the low byte of the flag word, the same byte
	// into which forkAndExecInChild1 ORs SIGCHLD on the plain clone path.
	// forkAndExecInChild1 switches to clone3 whenever this flag is set.
	CLONE_NEWTIME = 0x00000080
)
54
55
56
57
58
59
60
// SysProcIDMap describes one ID mapping entry. formatIDMappings renders each
// entry as the line "ContainerID HostID Size\n", which is then written to a
// process's uid_map or gid_map file under /proc.
type SysProcIDMap struct {
	ContainerID int // First field of the rendered mapping line.
	HostID      int // Second field of the rendered mapping line.
	Size        int // Third field of the rendered mapping line.
}
66
// SysProcAttr holds the Linux-specific options consulted by
// forkAndExecInChild and forkAndExecInChild1 when creating a child process.
type SysProcAttr struct {
	// Chroot is not read directly in this file's fork path; presumably the
	// caller converts it into the chroot argument of forkAndExecInChild —
	// TODO confirm against the caller.
	Chroot string
	// Credential, when non-nil, makes the child call setgroups (unless
	// suppressed, see forkAndExecInChild1), then setgid and setuid.
	Credential *Credential

	// Ptrace makes the child issue PTRACE_TRACEME right before execve.
	Ptrace bool
	// Setsid makes the child call setsid.
	Setsid bool

	// Setpgid makes the child call setpgid(0, Pgid).
	Setpgid bool

	// Setctty makes the child issue ioctl(Ctty, TIOCSCTTY, 1). This happens
	// after the child's descriptors have been rearranged.
	Setctty bool
	// Noctty makes the child issue ioctl(0, TIOCNOTTY).
	Noctty bool
	// Ctty is the descriptor used for the TIOCSPGRP (Foreground) and
	// TIOCSCTTY (Setctty) ioctls.
	Ctty int

	// Foreground makes the child call setpgid(0, Pgid) and then make that
	// process group the foreground group of Ctty via TIOCSPGRP. That ioctl
	// is issued before the child's descriptors are rearranged.
	Foreground bool
	// Pgid is the process group passed to setpgid. When it is 0 and
	// Foreground is set, the child's own pid is used for TIOCSPGRP.
	Pgid int

	// Pdeathsig, when non-zero, is installed in the child with
	// prctl(PR_SET_PDEATHSIG). If the child's parent pid no longer matches
	// the forking process, the child sends itself this signal.
	Pdeathsig Signal
	// Cloneflags is the initial flag word for the clone/clone3 call.
	Cloneflags uintptr
	// Unshareflags, when non-zero, is passed to unshare in the child.
	Unshareflags uintptr
	// UidMappings is written to the child's uid_map: by the parent when
	// Unshareflags lacks CLONE_NEWUSER, otherwise by the child itself after
	// unshare.
	UidMappings []SysProcIDMap
	// GidMappings is written to the child's gid_map under the same rule as
	// UidMappings, after the setgroups file has been written.
	GidMappings []SysProcIDMap

	// GidMappingsEnableSetgroups selects whether "allow" (true) or "deny"
	// (false) is written to the setgroups file before gid_map.
	GidMappingsEnableSetgroups bool
	// AmbientCaps lists capabilities the child adds to its permitted and
	// inheritable sets and then raises as ambient capabilities.
	AmbientCaps []uintptr
	// UseCgroupFD selects the clone3 path with CLONE_INTO_CGROUP set.
	UseCgroupFD bool
	// CgroupFD is stored in cloneArgs.cgroup when UseCgroupFD is set.
	CgroupFD int

	// PidFD, when non-nil, adds CLONE_PIDFD to the clone flags; on success
	// the parent stores the resulting descriptor here. The stored value is
	// -1 if the kernel did not fill one in.
	PidFD *int
}
114
var (
	// NUL-terminated byte strings passed by address to the mount system
	// call in forkAndExecInChild1 (as the source and target arguments).
	none  = [...]byte{'n', 'o', 'n', 'e', 0}
	slash = [...]byte{'/', 0}

	// forceClone3 makes forkAndExecInChild1 take the clone3 path even when
	// neither UseCgroupFD nor CLONE_NEWTIME requires it.
	// NOTE(review): nothing in this file sets it; presumably a test hook — confirm.
	forceClone3 = false
)
121
122
// Fork hooks. These declarations have no body here, so the implementations
// live elsewhere (presumably in package runtime — TODO confirm).
//
// forkAndExecInChild1 calls runtime_BeforeFork immediately before the clone,
// the child calls runtime_AfterForkInChild once its process group and
// foreground setup is done, and forkAndExecInChild calls runtime_AfterFork in
// the parent whenever runtime_BeforeFork was reached.
func runtime_BeforeFork()
func runtime_AfterFork()
func runtime_AfterForkInChild()
126
127
128
129
130
131
132
133
134
135
136
137
138 func forkAndExecInChild(argv0 *byte, argv, envv []*byte, chroot, dir *byte, attr *ProcAttr, sys *SysProcAttr, pipe int) (pid int, err Errno) {
139
140
141 upid, pidfd, err, mapPipe, locked := forkAndExecInChild1(argv0, argv, envv, chroot, dir, attr, sys, pipe)
142 if locked {
143 runtime_AfterFork()
144 }
145 if err != 0 {
146 return 0, err
147 }
148
149
150 pid = int(upid)
151 if sys.PidFD != nil {
152 *sys.PidFD = int(pidfd)
153 }
154
155 if sys.UidMappings != nil || sys.GidMappings != nil {
156 Close(mapPipe[0])
157 var err2 Errno
158
159
160 if sys.Unshareflags&CLONE_NEWUSER == 0 {
161 if err := writeUidGidMappings(pid, sys); err != nil {
162 err2 = err.(Errno)
163 }
164 }
165 RawSyscall(SYS_WRITE, uintptr(mapPipe[1]), uintptr(unsafe.Pointer(&err2)), unsafe.Sizeof(err2))
166 Close(mapPipe[1])
167 }
168
169 return pid, 0
170 }
171
// _LINUX_CAPABILITY_VERSION_3 is the value forkAndExecInChild1 stores in
// capHeader.version before calling capget and capset.
const _LINUX_CAPABILITY_VERSION_3 = 0x20080522
173
// capHeader is the header argument handed to the capget and capset system
// calls (first argument in forkAndExecInChild1).
type capHeader struct {
	version uint32 // Set to _LINUX_CAPABILITY_VERSION_3 by the caller.
	pid     int32  // Left at zero by the code in this file.
}
178
// capData holds one 32-bit word of each capability set, as exchanged with
// the capget and capset system calls.
type capData struct {
	effective   uint32
	permitted   uint32 // Bits are OR'ed in for each requested ambient capability.
	inheritable uint32 // Bits are OR'ed in for each requested ambient capability.
}
// caps bundles the header and data arguments of capget/capset. The data
// array has two elements, indexed by capToIndex, so capability numbers
// 0-63 are addressable.
type caps struct {
	hdr  capHeader
	data [2]capData
}
188
189
// capToIndex returns the index of the 32-bit capability word that holds the
// bit for capability number cap.
func capToIndex(cap uintptr) uintptr {
	const bitsPerWord = 32
	return cap / bitsPerWord
}
191
192
// capToMask returns the single-bit mask selecting capability number cap
// within its 32-bit capability word.
func capToMask(cap uintptr) uint32 {
	bit := uint(cap % 32)
	return uint32(1) << bit
}
194
195
// cloneArgs is the argument block handed to the clone3 system call:
// forkAndExecInChild1 passes its address and its size to _SYS_clone3.
// NOTE(review): the field order presumably mirrors the kernel's
// struct clone_args — confirm against linux/sched.h before reordering.
// Only flags, pidFD, exitSignal and cgroup are populated in this file.
type cloneArgs struct {
	flags      uint64 // Clone flags, including CLONE_INTO_CGROUP when requested.
	pidFD      uint64 // Address of the int32 that receives the pidfd.
	childTID   uint64
	parentTID  uint64
	exitSignal uint64 // Set to SIGCHLD by forkAndExecInChild1.
	stack      uint64
	stackSize  uint64
	tls        uint64
	setTID     uint64
	setTIDSize uint64
	cgroup     uint64 // Cgroup descriptor, from SysProcAttr.CgroupFD.
}
209
210
211
212
213
214
215
216
217
218
219
220
// forkAndExecInChild1 prepares everything the child needs, clones, and then
// runs the child side all the way to execve.
//
// In the parent it returns right after the clone with the child's pid, the
// pidfd reported by the kernel (-1 if none), the clone error if any, the
// synchronization pipe used for ID mappings, and locked == true once
// runtime_BeforeFork has been called. If creating the mapping pipe fails it
// returns early with locked == false.
//
// The child never returns: it either succeeds in execve, or writes err1 to
// pipe and exits with status 253.
//
// NOTE(review): every local is declared before the clone and the child code
// uses only raw system calls; presumably the child must not allocate or call
// into ordinary Go code because it may share the parent's memory (CLONE_VM)
// — confirm before restructuring anything past the clone.
func forkAndExecInChild1(argv0 *byte, argv, envv []*byte, chroot, dir *byte, attr *ProcAttr, sys *SysProcAttr, pipe int) (pid uintptr, pidfd int32, err1 Errno, mapPipe [2]int, locked bool) {
	// prctl option and sub-operation used below to raise ambient capabilities.
	const (
		PR_CAP_AMBIENT       = 0x2f
		PR_CAP_AMBIENT_RAISE = 0x2
	)

	// All variables used by the child are declared here, ahead of the clone.
	var (
		err2                      Errno
		nextfd                    int
		i                         int
		caps                      caps
		fd1, flags, ppid          uintptr
		puid, psetgroups, pgid    []byte
		uidmap, setgroups, gidmap []byte
		clone3                    *cloneArgs
		pgrp                      int32
		dirfd                     int
		cred                      *Credential
		ngroups, groups           uintptr
		c                         uintptr
		rlim                      *Rlimit
		lim                       Rlimit
	)
	// -1 means "no pidfd"; the kernel overwrites it when CLONE_PIDFD works.
	pidfd = -1

	// Saved RLIMIT_NOFILE value (may be nil), restored in the child below.
	rlim = origRlimitNofile.Load()

	// Pre-build the NUL-terminated paths and file contents the child may
	// need for writing its own ID mappings after unshare.
	if sys.UidMappings != nil {
		puid = []byte("/proc/self/uid_map\000")
		uidmap = formatIDMappings(sys.UidMappings)
	}

	if sys.GidMappings != nil {
		psetgroups = []byte("/proc/self/setgroups\000")
		pgid = []byte("/proc/self/gid_map\000")

		if sys.GidMappingsEnableSetgroups {
			setgroups = []byte("allow\000")
		} else {
			setgroups = []byte("deny\000")
		}
		gidmap = formatIDMappings(sys.GidMappings)
	}

	// Remember our own pid so the child can later detect, via getppid,
	// whether its parent has changed.
	if sys.Pdeathsig != 0 {
		ppid, _ = rawSyscallNoError(SYS_GETPID, 0, 0, 0)
	}

	// Copy the requested descriptors and pick nextfd strictly above both
	// len(attr.Files) and every descriptor in it, so that the temporary
	// descriptors created during the shuffle below cannot collide with them.
	fd := make([]int, len(attr.Files))
	nextfd = len(attr.Files)
	for i, ufd := range attr.Files {
		if nextfd < int(ufd) {
			nextfd = int(ufd)
		}
		fd[i] = int(ufd)
	}
	nextfd++

	// Extra pipe over which the parent tells the child that the ID
	// mappings have been written (or that writing them failed).
	if sys.UidMappings != nil || sys.GidMappings != nil {
		if err := forkExecPipe(mapPipe[:]); err != nil {
			err1 = err.(Errno)
			return
		}
	}

	// CLONE_VFORK|CLONE_VM is added unless a new user namespace is
	// requested through either Cloneflags or Unshareflags.
	flags = sys.Cloneflags
	if sys.Cloneflags&CLONE_NEWUSER == 0 && sys.Unshareflags&CLONE_NEWUSER == 0 {
		flags |= CLONE_VFORK | CLONE_VM
	}
	if sys.PidFD != nil {
		flags |= CLONE_PIDFD
	}

	// Decide whether clone3 is needed, and if so build its argument block.
	if sys.UseCgroupFD || flags&CLONE_NEWTIME != 0 || forceClone3 {
		clone3 = &cloneArgs{
			flags:      uint64(flags),
			exitSignal: uint64(SIGCHLD),
		}
		if sys.UseCgroupFD {
			clone3.flags |= CLONE_INTO_CGROUP
			clone3.cgroup = uint64(sys.CgroupFD)
		}
		if sys.PidFD != nil {
			clone3.pidFD = uint64(uintptr(unsafe.Pointer(&pidfd)))
		}
	}

	// Point of no return: notify the runtime and clone. locked tells the
	// caller that runtime_AfterFork must be invoked.
	runtime_BeforeFork()
	locked = true
	if clone3 != nil {
		pid, err1 = rawVforkSyscall(_SYS_clone3, uintptr(unsafe.Pointer(clone3)), unsafe.Sizeof(*clone3), 0)
	} else {
		// Plain clone: the exit signal travels in the flag word. This call
		// sequence has the same shape as the one in doCheckClonePidfd.
		flags |= uintptr(SIGCHLD)
		if runtime.GOARCH == "s390x" {
			// s390x takes the first two clone arguments in the opposite order.
			pid, err1 = rawVforkSyscall(SYS_CLONE, 0, flags, uintptr(unsafe.Pointer(&pidfd)))
		} else {
			pid, err1 = rawVforkSyscall(SYS_CLONE, flags, 0, uintptr(unsafe.Pointer(&pidfd)))
		}
	}
	if err1 != 0 || pid != 0 {
		// Clone failed, or we are the parent: hand control back to
		// forkAndExecInChild.
		return
	}

	// ---- Everything below runs in the child. ----

	// Set PR_SET_KEEPCAPS first when ambient capabilities are requested.
	// NOTE(review): presumably so capabilities survive the setuid below — confirm.
	if len(sys.AmbientCaps) > 0 {
		_, _, err1 = RawSyscall6(SYS_PRCTL, PR_SET_KEEPCAPS, 1, 0, 0, 0, 0)
		if err1 != 0 {
			goto childerror
		}
	}

	// Block until the parent reports on the ID mappings: close our copy of
	// the write end, then read exactly one Errno from the read end.
	if sys.UidMappings != nil || sys.GidMappings != nil {
		if _, _, err1 = RawSyscall(SYS_CLOSE, uintptr(mapPipe[1]), 0, 0); err1 != 0 {
			goto childerror
		}
		c, _, err1 = RawSyscall(SYS_READ, uintptr(mapPipe[0]), uintptr(unsafe.Pointer(&err2)), unsafe.Sizeof(err2))
		if err1 != 0 {
			goto childerror
		}
		// A short read means the parent did not send a complete status.
		if c != unsafe.Sizeof(err2) {
			err1 = EINVAL
			goto childerror
		}
		// The parent failed to write the mappings; propagate its error.
		if err2 != 0 {
			err1 = err2
			goto childerror
		}
	}

	// New session.
	if sys.Setsid {
		_, _, err1 = RawSyscall(SYS_SETSID, 0, 0, 0)
		if err1 != 0 {
			goto childerror
		}
	}

	// Process group.
	if sys.Setpgid || sys.Foreground {
		// setpgid(0, Pgid) applies to the calling process, i.e. the child.
		_, _, err1 = RawSyscall(SYS_SETPGID, 0, uintptr(sys.Pgid), 0)
		if err1 != 0 {
			goto childerror
		}
	}

	if sys.Foreground {
		pgrp = int32(sys.Pgid)
		if pgrp == 0 {
			// Pgid 0 put the child in a group named after its own pid.
			pid, _ = rawSyscallNoError(SYS_GETPID, 0, 0, 0)

			pgrp = int32(pid)
		}

		// Make that process group the foreground group of the terminal
		// referred to by Ctty.
		_, _, err1 = RawSyscall(SYS_IOCTL, uintptr(sys.Ctty), uintptr(TIOCSPGRP), uintptr(unsafe.Pointer(&pgrp)))
		if err1 != 0 {
			goto childerror
		}
	}

	// Runtime hook for the child, deliberately placed after the TIOCSPGRP
	// ioctl above.
	// NOTE(review): presumably restores the signal mask, which must stay
	// blocked until the foreground group is set — confirm in the runtime.
	runtime_AfterForkInChild()

	// Unshare namespaces if requested.
	if sys.Unshareflags != 0 {
		_, _, err1 = RawSyscall(SYS_UNSHARE, sys.Unshareflags, 0, 0)
		if err1 != 0 {
			goto childerror
		}

		// Inside a freshly unshared user namespace the child writes its own
		// setgroups policy and gid_map through /proc/self.
		if sys.Unshareflags&CLONE_NEWUSER != 0 && sys.GidMappings != nil {
			dirfd = int(_AT_FDCWD)
			if fd1, _, err1 = RawSyscall6(SYS_OPENAT, uintptr(dirfd), uintptr(unsafe.Pointer(&psetgroups[0])), uintptr(O_WRONLY), 0, 0, 0); err1 != 0 {
				goto childerror
			}
			_, _, err1 = RawSyscall(SYS_WRITE, fd1, uintptr(unsafe.Pointer(&setgroups[0])), uintptr(len(setgroups)))
			if err1 != 0 {
				goto childerror
			}
			if _, _, err1 = RawSyscall(SYS_CLOSE, fd1, 0, 0); err1 != 0 {
				goto childerror
			}

			if fd1, _, err1 = RawSyscall6(SYS_OPENAT, uintptr(dirfd), uintptr(unsafe.Pointer(&pgid[0])), uintptr(O_WRONLY), 0, 0, 0); err1 != 0 {
				goto childerror
			}
			_, _, err1 = RawSyscall(SYS_WRITE, fd1, uintptr(unsafe.Pointer(&gidmap[0])), uintptr(len(gidmap)))
			if err1 != 0 {
				goto childerror
			}
			if _, _, err1 = RawSyscall(SYS_CLOSE, fd1, 0, 0); err1 != 0 {
				goto childerror
			}
		}

		// Same for uid_map.
		if sys.Unshareflags&CLONE_NEWUSER != 0 && sys.UidMappings != nil {
			dirfd = int(_AT_FDCWD)
			if fd1, _, err1 = RawSyscall6(SYS_OPENAT, uintptr(dirfd), uintptr(unsafe.Pointer(&puid[0])), uintptr(O_WRONLY), 0, 0, 0); err1 != 0 {
				goto childerror
			}
			_, _, err1 = RawSyscall(SYS_WRITE, fd1, uintptr(unsafe.Pointer(&uidmap[0])), uintptr(len(uidmap)))
			if err1 != 0 {
				goto childerror
			}
			if _, _, err1 = RawSyscall(SYS_CLOSE, fd1, 0, 0); err1 != 0 {
				goto childerror
			}
		}

		// After unsharing the mount namespace, remount "/" recursively with
		// MS_PRIVATE.
		// NOTE(review): presumably so that mount changes made by the child
		// do not propagate back to the parent namespace — confirm.
		if sys.Unshareflags&CLONE_NEWNS == CLONE_NEWNS {
			_, _, err1 = RawSyscall6(SYS_MOUNT, uintptr(unsafe.Pointer(&none[0])), uintptr(unsafe.Pointer(&slash[0])), 0, MS_REC|MS_PRIVATE, 0, 0)
			if err1 != 0 {
				goto childerror
			}
		}
	}

	// Change root directory.
	if chroot != nil {
		_, _, err1 = RawSyscall(SYS_CHROOT, uintptr(unsafe.Pointer(chroot)), 0, 0)
		if err1 != 0 {
			goto childerror
		}
	}

	// Supplementary groups, then gid, then uid.
	if cred = sys.Credential; cred != nil {
		ngroups = uintptr(len(cred.Groups))
		groups = uintptr(0)
		if ngroups > 0 {
			groups = uintptr(unsafe.Pointer(&cred.Groups[0]))
		}
		// setgroups is skipped when the credential asks for that
		// (NoSetGroups), or when gid mappings are in use with setgroups
		// denied and there are no groups to set.
		if !(sys.GidMappings != nil && !sys.GidMappingsEnableSetgroups && ngroups == 0) && !cred.NoSetGroups {
			_, _, err1 = RawSyscall(_SYS_setgroups, ngroups, groups, 0)
			if err1 != 0 {
				goto childerror
			}
		}
		_, _, err1 = RawSyscall(sys_SETGID, uintptr(cred.Gid), 0, 0)
		if err1 != 0 {
			goto childerror
		}
		_, _, err1 = RawSyscall(sys_SETUID, uintptr(cred.Uid), 0, 0)
		if err1 != 0 {
			goto childerror
		}
	}

	if len(sys.AmbientCaps) != 0 {
		// Fetch the current capability sets using the version 3 layout
		// (two 32-bit words per set).
		caps.hdr.version = _LINUX_CAPABILITY_VERSION_3

		if _, _, err1 = RawSyscall(SYS_CAPGET, uintptr(unsafe.Pointer(&caps.hdr)), uintptr(unsafe.Pointer(&caps.data[0])), 0); err1 != 0 {
			goto childerror
		}

		for _, c = range sys.AmbientCaps {
			// Each requested capability is added to both the permitted and
			// the inheritable set before it is raised as ambient below.
			caps.data[capToIndex(c)].permitted |= capToMask(c)
			caps.data[capToIndex(c)].inheritable |= capToMask(c)
		}

		if _, _, err1 = RawSyscall(SYS_CAPSET, uintptr(unsafe.Pointer(&caps.hdr)), uintptr(unsafe.Pointer(&caps.data[0])), 0); err1 != 0 {
			goto childerror
		}

		// Raise each capability into the ambient set.
		for _, c = range sys.AmbientCaps {
			_, _, err1 = RawSyscall6(SYS_PRCTL, PR_CAP_AMBIENT, uintptr(PR_CAP_AMBIENT_RAISE), c, 0, 0, 0)
			if err1 != 0 {
				goto childerror
			}
		}
	}

	// Change working directory.
	if dir != nil {
		_, _, err1 = RawSyscall(SYS_CHDIR, uintptr(unsafe.Pointer(dir)), 0, 0)
		if err1 != 0 {
			goto childerror
		}
	}

	// Parent-death signal.
	if sys.Pdeathsig != 0 {
		_, _, err1 = RawSyscall6(SYS_PRCTL, PR_SET_PDEATHSIG, uintptr(sys.Pdeathsig), 0, 0, 0, 0)
		if err1 != 0 {
			goto childerror
		}

		// If getppid no longer reports the pid recorded before the clone,
		// the parent is already gone, so the child delivers the signal to
		// itself.
		pid, _ = rawSyscallNoError(SYS_GETPPID, 0, 0, 0)
		if pid != ppid {
			pid, _ = rawSyscallNoError(SYS_GETPID, 0, 0, 0)
			_, _, err1 = RawSyscall(SYS_KILL, pid, uintptr(sys.Pdeathsig), 0)
			if err1 != 0 {
				goto childerror
			}
		}
	}

	// Descriptor shuffle, pass 1: move the error pipe and every fd[i] < i
	// up to fresh close-on-exec descriptors at nextfd and above, so that
	// pass 2 cannot overwrite a descriptor it still needs.
	if pipe < nextfd {
		_, _, err1 = RawSyscall(SYS_DUP3, uintptr(pipe), uintptr(nextfd), O_CLOEXEC)
		if err1 != 0 {
			goto childerror
		}
		pipe = nextfd
		nextfd++
	}
	for i = 0; i < len(fd); i++ {
		if fd[i] >= 0 && fd[i] < i {
			// Skip the slot occupied by the error pipe.
			if nextfd == pipe {
				nextfd++
			}
			_, _, err1 = RawSyscall(SYS_DUP3, uintptr(fd[i]), uintptr(nextfd), O_CLOEXEC)
			if err1 != 0 {
				goto childerror
			}
			fd[i] = nextfd
			nextfd++
		}
	}

	// Pass 2: place each descriptor at its final number i.
	for i = 0; i < len(fd); i++ {
		// -1 means "no descriptor here": make sure slot i is closed.
		if fd[i] == -1 {
			RawSyscall(SYS_CLOSE, uintptr(i), 0, 0)
			continue
		}
		if fd[i] == i {
			// Already in place; only the descriptor flags need clearing
			// (F_SETFD with 0) so it stays open across exec.
			_, _, err1 = RawSyscall(fcntl64Syscall, uintptr(fd[i]), F_SETFD, 0)
			if err1 != 0 {
				goto childerror
			}
			continue
		}

		// dup3 with flags 0 yields a descriptor without O_CLOEXEC.
		_, _, err1 = RawSyscall(SYS_DUP3, uintptr(fd[i]), uintptr(i), 0)
		if err1 != 0 {
			goto childerror
		}
	}

	// When fewer than three descriptors were supplied, close the rest of
	// 0, 1 and 2 so the child does not inherit them.
	for i = len(fd); i < 3; i++ {
		RawSyscall(SYS_CLOSE, uintptr(i), 0, 0)
	}

	// Give up the controlling terminal, via descriptor 0.
	if sys.Noctty {
		_, _, err1 = RawSyscall(SYS_IOCTL, 0, uintptr(TIOCNOTTY), 0)
		if err1 != 0 {
			goto childerror
		}
	}

	// Make Ctty the controlling terminal. This runs after the shuffle, so
	// Ctty here names a descriptor in the child's final numbering.
	if sys.Setctty {
		_, _, err1 = RawSyscall(SYS_IOCTL, uintptr(sys.Ctty), uintptr(TIOCSCTTY), 1)
		if err1 != 0 {
			goto childerror
		}
	}

	// Restore the saved RLIMIT_NOFILE, but only if reading the current
	// limit failed or the current limit is exactly (Max-1, Max) relative to
	// the saved value.
	// NOTE(review): the (Max-1, Max) pattern presumably identifies the
	// limit this package itself installed at startup, so a limit changed by
	// someone else is left alone — confirm against the rlimit setup code.
	// Failure of the restoring call is deliberately ignored.
	if rlim != nil {
		_, _, err1 = RawSyscall6(SYS_PRLIMIT64, 0, RLIMIT_NOFILE, 0, uintptr(unsafe.Pointer(&lim)), 0, 0)
		if err1 != 0 || (lim.Cur == rlim.Max-1 && lim.Max == rlim.Max) {
			RawSyscall6(SYS_PRLIMIT64, 0, RLIMIT_NOFILE, uintptr(unsafe.Pointer(rlim)), 0, 0, 0)
		}
	}

	// Request tracing as the last step before exec, so none of the setup
	// above runs under the tracer.
	if sys.Ptrace {
		_, _, err1 = RawSyscall(SYS_PTRACE, uintptr(PTRACE_TRACEME), 0, 0)
		if err1 != 0 {
			goto childerror
		}
	}

	// Replace the process image. On success this call does not return.
	_, _, err1 = RawSyscall(SYS_EXECVE,
		uintptr(unsafe.Pointer(argv0)),
		uintptr(unsafe.Pointer(&argv[0])),
		uintptr(unsafe.Pointer(&envv[0])))

childerror:
	// Report the failing errno to the parent over the error pipe, then exit.
	RawSyscall(SYS_WRITE, uintptr(pipe), uintptr(unsafe.Pointer(&err1)), unsafe.Sizeof(err1))
	// Loop in case the exit system call were ever to return.
	for {
		RawSyscall(SYS_EXIT, 253, 0, 0)
	}
}
680
681 func formatIDMappings(idMap []SysProcIDMap) []byte {
682 var data []byte
683 for _, im := range idMap {
684 data = append(data, strconv.Itoa(im.ContainerID)+" "+strconv.Itoa(im.HostID)+" "+strconv.Itoa(im.Size)+"\n"...)
685 }
686 return data
687 }
688
689
690 func writeIDMappings(path string, idMap []SysProcIDMap) error {
691 fd, err := Open(path, O_RDWR, 0)
692 if err != nil {
693 return err
694 }
695
696 if _, err := Write(fd, formatIDMappings(idMap)); err != nil {
697 Close(fd)
698 return err
699 }
700
701 if err := Close(fd); err != nil {
702 return err
703 }
704
705 return nil
706 }
707
708
709
710
711
712 func writeSetgroups(pid int, enable bool) error {
713 sgf := "/proc/" + strconv.Itoa(pid) + "/setgroups"
714 fd, err := Open(sgf, O_RDWR, 0)
715 if err != nil {
716 return err
717 }
718
719 var data []byte
720 if enable {
721 data = []byte("allow")
722 } else {
723 data = []byte("deny")
724 }
725
726 if _, err := Write(fd, data); err != nil {
727 Close(fd)
728 return err
729 }
730
731 return Close(fd)
732 }
733
734
735
736 func writeUidGidMappings(pid int, sys *SysProcAttr) error {
737 if sys.UidMappings != nil {
738 uidf := "/proc/" + strconv.Itoa(pid) + "/uid_map"
739 if err := writeIDMappings(uidf, sys.UidMappings); err != nil {
740 return err
741 }
742 }
743
744 if sys.GidMappings != nil {
745
746 if err := writeSetgroups(pid, sys.GidMappingsEnableSetgroups); err != nil && err != ENOENT {
747 return err
748 }
749 gidf := "/proc/" + strconv.Itoa(pid) + "/gid_map"
750 if err := writeIDMappings(gidf, sys.GidMappings); err != nil {
751 return err
752 }
753 }
754
755 return nil
756 }
757
758
759 func forkAndExecFailureCleanup(attr *ProcAttr, sys *SysProcAttr) {
760 if sys.PidFD != nil && *sys.PidFD != -1 {
761 Close(*sys.PidFD)
762 *sys.PidFD = -1
763 }
764 }
765
766
767
768
769
770 func os_checkClonePidfd() error {
771 pidfd := int32(-1)
772 pid, errno := doCheckClonePidfd(&pidfd)
773 if errno != 0 {
774 return errno
775 }
776
777 if pidfd == -1 {
778
779
780
781 var err error
782 for {
783 var status WaitStatus
784
785
786
787
788 flags := uint(WCLONE)
789 _, err = Wait4(int(pid), &status, int(flags), nil)
790 if err != EINTR {
791 break
792 }
793 }
794 if err != nil {
795 return err
796 }
797
798 return errpkg.New("clone(CLONE_PIDFD) failed to return pidfd")
799 }
800
801
802
803 defer Close(int(pidfd))
804
805
806
807
808
809
810 const is64bit = ^uint(0) >> 63
811 type sigInfo struct {
812 Signo int32
813 _ struct {
814 Errno int32
815 Code int32
816 }
817 _ [is64bit]int32
818
819
820
821 Pid int32
822 Uid uint32
823 Status int32
824
825
826 _ [128 - (6+is64bit)*4]byte
827 }
828
829 for {
830 const _P_PIDFD = 3
831 var info sigInfo
832 _, _, errno = Syscall6(SYS_WAITID, _P_PIDFD, uintptr(pidfd), uintptr(unsafe.Pointer(&info)), WEXITED|WCLONE, 0, 0)
833 if errno != EINTR {
834 break
835 }
836 }
837 if errno != 0 {
838 return errno
839 }
840
841 return nil
842 }
843
844
845
846
847
848
849
850
851
// doCheckClonePidfd issues a clone with CLONE_VFORK|CLONE_VM|CLONE_PIDFD,
// asking the kernel to store the new pidfd in *pidfd.
//
// In the parent it returns the child's pid, or a non-zero errno if the clone
// failed. The child does nothing except call exit_group(0).
//
// NOTE(review): because of CLONE_VM the child presumably shares the parent's
// memory, so the child branch must stay limited to raw system calls —
// confirm before adding anything to it. The call sequence has the same shape
// as the plain clone path in forkAndExecInChild1.
func doCheckClonePidfd(pidfd *int32) (pid uintptr, errno Errno) {
	flags := uintptr(CLONE_VFORK | CLONE_VM | CLONE_PIDFD)
	if runtime.GOARCH == "s390x" {
		// s390x takes the first two clone arguments in the opposite order.
		pid, errno = rawVforkSyscall(SYS_CLONE, 0, flags, uintptr(unsafe.Pointer(pidfd)))
	} else {
		pid, errno = rawVforkSyscall(SYS_CLONE, flags, 0, uintptr(unsafe.Pointer(pidfd)))
	}
	if errno != 0 || pid != 0 {
		// Clone failed, or we are the parent: return to the caller.
		return
	}

	// Child: exit immediately. The loop guards against the system call
	// ever returning.
	for {
		RawSyscall(SYS_EXIT_GROUP, 0, 0, 0)
	}
}
874
View as plain text