shmmax tops out at 2G?

Kostik Belousov kostikbel at gmail.com
Tue Feb 24 09:39:43 PST 2009


On Mon, Feb 23, 2009 at 01:08:28PM -0600, Christian Peron wrote:
> This issue has come up a number of times.  I was looking into fixing this but I
> just have not had the time.  The basic issue is our shmid_ds structure:
> 
>      struct shmid_ds {
>          struct ipc_perm shm_perm;   /* operation permission structure */
>          int             shm_segsz;  /* size of segment in bytes */
>          pid_t           shm_lpid;   /* process ID of last shared memory op */
>          pid_t           shm_cpid;   /* process ID of creator */
>          short           shm_nattch; /* number of current attaches */
>          time_t          shm_atime;  /* time of last shmat() */
>          time_t          shm_dtime;  /* time of last shmdt() */
>          time_t          shm_ctime;  /* time of last change by shmctl() */
>          void           *shm_internal; /* sysv stupidity */
>      };
> 
> 
> Basically the shm_segsz member needs to be switched from 32 bits (int) to
> 64 bits.  The problem is that this breaks the ABI and older versions of
> postgresql will not work.  The solution is to add additional syscalls.
> 
> However, every time this issue comes up, the question of whether we should
> fix struct ipc_perm at the same time is asked.  The answer imho is that
> we should, however this is more complex since semaphores, messages and
> shared memory segments all use it.
> 
> The fixes are straightforward, however making sure we maintain reverse
> compatibility is where things become complicated, especially since there
> are multiple layers of reverse compat we need to look after.

Yes, this is the right solution. In the meantime, below is what we use ATM to
get over this limitation. In usermode, struct shmid_ds is only used by the
IPC_STAT call (ignoring ipcs(1)). If we allow that to break for >2GB
segments, we get an otherwise good workaround. Luckily, shmget takes
size_t instead of int as the segment size.

It might be further tweaked to only allow >2GB allocations after some
sysctl is set, but I do not see the point.

diff --git a/sys/kern/sysv_shm.c b/sys/kern/sysv_shm.c
index 4e9854d..a945523 100644
--- a/sys/kern/sysv_shm.c
+++ b/sys/kern/sysv_shm.c
@@ -121,7 +121,8 @@ static sy_call_t *shmcalls[] = {
 #define	SHMSEG_ALLOCATED	0x0800
 #define	SHMSEG_WANTED		0x1000
 
-static int shm_last_free, shm_nused, shm_committed, shmalloced;
+static int shm_last_free, shm_nused, shmalloced;
+size_t shm_committed;
 static struct shmid_kernel	*shmsegs;
 
 struct shmmap_state {
@@ -250,7 +251,7 @@ shm_deallocate_segment(shmseg)
 
 	vm_object_deallocate(shmseg->u.shm_internal);
 	shmseg->u.shm_internal = NULL;
-	size = round_page(shmseg->u.shm_segsz);
+	size = round_page(shmseg->shm_bsegsz);
 	shm_committed -= btoc(size);
 	shm_nused--;
 	shmseg->u.shm_perm.mode = SHMSEG_FREE;
@@ -270,7 +271,7 @@ shm_delete_mapping(struct vmspace *vm, struct shmmap_state *shmmap_s)
 
 	segnum = IPCID_TO_IX(shmmap_s->shmid);
 	shmseg = &shmsegs[segnum];
-	size = round_page(shmseg->u.shm_segsz);
+	size = round_page(shmseg->shm_bsegsz);
 	result = vm_map_remove(&vm->vm_map, shmmap_s->va, shmmap_s->va + size);
 	if (result != KERN_SUCCESS)
 		return (EINVAL);
@@ -390,7 +391,7 @@ kern_shmat(td, shmid, shmaddr, shmflg)
 		error = EMFILE;
 		goto done2;
 	}
-	size = round_page(shmseg->u.shm_segsz);
+	size = round_page(shmseg->shm_bsegsz);
 #ifdef VM_PROT_READ_IS_EXEC
 	prot = VM_PROT_READ | VM_PROT_EXECUTE;
 #else
@@ -422,7 +423,8 @@ kern_shmat(td, shmid, shmaddr, shmflg)
 
 	vm_object_reference(shmseg->u.shm_internal);
 	rv = vm_map_find(&p->p_vmspace->vm_map, shmseg->u.shm_internal,
-		0, &attach_va, size, (flags & MAP_FIXED)?0:1, prot, prot, 0);
+	    0, &attach_va, size, (flags & MAP_FIXED) ? VMFS_NO_SPACE :
+	    VMFS_ANY_SPACE, prot, prot, 0);
 	if (rv != KERN_SUCCESS) {
 		vm_object_deallocate(shmseg->u.shm_internal);
 		error = ENOMEM;
@@ -720,7 +722,7 @@ shmget_existing(td, uap, mode, segnum)
 	if (error != 0)
 		return (error);
 #endif
-	if (uap->size && uap->size > shmseg->u.shm_segsz)
+	if (uap->size && uap->size > shmseg->shm_bsegsz)
 		return (EINVAL);
 	td->td_retval[0] = IXSEQ_TO_IPCID(segnum, shmseg->u.shm_perm);
 	return (0);
@@ -732,7 +734,8 @@ shmget_allocate_segment(td, uap, mode)
 	struct shmget_args *uap;
 	int mode;
 {
-	int i, segnum, shmid, size;
+	int i, segnum, shmid;
+	size_t size;
 	struct ucred *cred = td->td_ucred;
 	struct shmid_kernel *shmseg;
 	vm_object_t shm_object;
@@ -790,6 +793,7 @@ shmget_allocate_segment(td, uap, mode)
 	shmseg->u.shm_perm.mode = (shmseg->u.shm_perm.mode & SHMSEG_WANTED) |
 	    (mode & ACCESSPERMS) | SHMSEG_ALLOCATED;
 	shmseg->u.shm_segsz = uap->size;
+	shmseg->shm_bsegsz = uap->size;
 	shmseg->u.shm_cpid = td->td_proc->p_pid;
 	shmseg->u.shm_lpid = shmseg->u.shm_nattch = 0;
 	shmseg->u.shm_atime = shmseg->u.shm_dtime = 0;
diff --git a/sys/sys/shm.h b/sys/sys/shm.h
index 33ed7b0..c4b1369 100644
--- a/sys/sys/shm.h
+++ b/sys/sys/shm.h
@@ -108,6 +108,7 @@ struct shminfo {
 struct shmid_kernel {
 	struct shmid_ds u;
 	struct label *label;	/* MAC label */
+	size_t shm_bsegsz;
 };
 
 extern struct shminfo	shminfo;
diff --git a/usr.bin/ipcs/ipcs.c b/usr.bin/ipcs/ipcs.c
index 67364d5..1fd943a 100644
--- a/usr.bin/ipcs/ipcs.c
+++ b/usr.bin/ipcs/ipcs.c
@@ -452,8 +452,8 @@ print_kshmptr(int i, int option, struct shmid_kernel *kshmptr)
 		    kshmptr->u.shm_nattch);
 
 	if (option & BIGGEST)
-		printf(" %12d",
-		    kshmptr->u.shm_segsz);
+		printf(" %12zu",
+		    kshmptr->shm_bsegsz);
 
 	if (option & PID)
 		printf(" %12d %12d",
-------------- next part --------------
A non-text attachment was scrubbed...
Name: not available
Type: application/pgp-signature
Size: 195 bytes
Desc: not available
Url : http://lists.freebsd.org/pipermail/freebsd-hackers/attachments/20090224/a97239cd/attachment.pgp


More information about the freebsd-hackers mailing list