2001-10-30 04:11:53 +03:00
|
|
|
/* $NetBSD: ffs_alloc.c,v 1.53 2001/10/30 01:11:53 lukem Exp $ */
|
1994-06-29 10:39:25 +04:00
|
|
|
|
1994-06-08 15:41:58 +04:00
|
|
|
/*
|
|
|
|
* Copyright (c) 1982, 1986, 1989, 1993
|
|
|
|
* The Regents of the University of California. All rights reserved.
|
|
|
|
*
|
|
|
|
* Redistribution and use in source and binary forms, with or without
|
|
|
|
* modification, are permitted provided that the following conditions
|
|
|
|
* are met:
|
|
|
|
* 1. Redistributions of source code must retain the above copyright
|
|
|
|
* notice, this list of conditions and the following disclaimer.
|
|
|
|
* 2. Redistributions in binary form must reproduce the above copyright
|
|
|
|
* notice, this list of conditions and the following disclaimer in the
|
|
|
|
* documentation and/or other materials provided with the distribution.
|
|
|
|
* 3. All advertising materials mentioning features or use of this software
|
|
|
|
* must display the following acknowledgement:
|
|
|
|
* This product includes software developed by the University of
|
|
|
|
* California, Berkeley and its contributors.
|
|
|
|
* 4. Neither the name of the University nor the names of its contributors
|
|
|
|
* may be used to endorse or promote products derived from this software
|
|
|
|
* without specific prior written permission.
|
|
|
|
*
|
|
|
|
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
|
|
|
|
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
|
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
|
|
|
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
|
|
|
|
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
|
|
|
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
|
|
|
|
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
|
|
|
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
|
|
|
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
|
|
|
|
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
|
|
|
* SUCH DAMAGE.
|
|
|
|
*
|
1998-03-01 05:20:01 +03:00
|
|
|
* @(#)ffs_alloc.c 8.19 (Berkeley) 7/13/95
|
1994-06-08 15:41:58 +04:00
|
|
|
*/
|
|
|
|
|
2001-10-30 04:11:53 +03:00
|
|
|
#include <sys/cdefs.h>
|
|
|
|
__KERNEL_RCSID(0, "$NetBSD: ffs_alloc.c,v 1.53 2001/10/30 01:11:53 lukem Exp $");
|
|
|
|
|
2001-05-30 15:57:16 +04:00
|
|
|
#if defined(_KERNEL_OPT)
|
1998-11-12 22:51:10 +03:00
|
|
|
#include "opt_ffs.h"
|
1998-06-08 08:27:50 +04:00
|
|
|
#include "opt_quota.h"
|
1998-06-09 11:46:31 +04:00
|
|
|
#endif
|
1998-02-10 17:08:44 +03:00
|
|
|
|
1994-06-08 15:41:58 +04:00
|
|
|
#include <sys/param.h>
|
|
|
|
#include <sys/systm.h>
|
|
|
|
#include <sys/buf.h>
|
|
|
|
#include <sys/proc.h>
|
|
|
|
#include <sys/vnode.h>
|
|
|
|
#include <sys/mount.h>
|
|
|
|
#include <sys/kernel.h>
|
|
|
|
#include <sys/syslog.h>
|
|
|
|
|
|
|
|
#include <ufs/ufs/quota.h>
|
1998-03-18 18:57:26 +03:00
|
|
|
#include <ufs/ufs/ufsmount.h>
|
1994-06-08 15:41:58 +04:00
|
|
|
#include <ufs/ufs/inode.h>
|
1996-02-10 01:22:18 +03:00
|
|
|
#include <ufs/ufs/ufs_extern.h>
|
1998-03-18 18:57:26 +03:00
|
|
|
#include <ufs/ufs/ufs_bswap.h>
|
1994-06-08 15:41:58 +04:00
|
|
|
|
|
|
|
#include <ufs/ffs/fs.h>
|
|
|
|
#include <ufs/ffs/ffs_extern.h>
|
|
|
|
|
1999-11-15 21:49:07 +03:00
|
|
|
static ufs_daddr_t ffs_alloccg __P((struct inode *, int, ufs_daddr_t, int));
|
a whole bunch of changes to improve performance and robustness under load:
- remove special treatment of pager_map mappings in pmaps. this is
required now, since I've removed the globals that expose the address range.
pager_map now uses pmap_kenter_pa() instead of pmap_enter(), so there's
no longer any need to special-case it.
- eliminate struct uvm_vnode by moving its fields into struct vnode.
- rewrite the pageout path. the pager is now responsible for handling the
high-level requests instead of only getting control after a bunch of work
has already been done on its behalf. this will allow us to UBCify LFS,
which needs tighter control over its pages than other filesystems do.
writing a page to disk no longer requires making it read-only, which
allows us to write wired pages without causing all kinds of havoc.
- use a new PG_PAGEOUT flag to indicate that a page should be freed
on behalf of the pagedaemon when it's unlocked. this flag is very similar
to PG_RELEASED, but unlike PG_RELEASED, PG_PAGEOUT can be cleared if the
pageout fails due to eg. an indirect-block buffer being locked.
this allows us to remove the "version" field from struct vm_page,
and together with shrinking "loan_count" from 32 bits to 16,
struct vm_page is now 4 bytes smaller.
- no longer use PG_RELEASED for swap-backed pages. if the page is busy
because it's being paged out, we can't release the swap slot to be
reallocated until that write is complete, but unlike with vnodes we
don't keep a count of in-progress writes so there's no good way to
know when the write is done. instead, when we need to free a busy
swap-backed page, just sleep until we can get it busy ourselves.
- implement a fast-path for extending writes which allows us to avoid
zeroing new pages. this substantially reduces cpu usage.
- encapsulate the data used by the genfs code in a struct genfs_node,
which must be the first element of the filesystem-specific vnode data
for filesystems which use genfs_{get,put}pages().
- eliminate many of the UVM pagerops, since they aren't needed anymore
now that the pager "put" operation is a higher-level operation.
- enhance the genfs code to allow NFS to use the genfs_{get,put}pages
instead of a modified copy.
- clean up struct vnode by removing all the fields that used to be used by
the vfs_cluster.c code (which we don't use anymore with UBC).
- remove kmem_object and mb_object since they were useless.
instead of allocating pages to these objects, we now just allocate
pages with no object. such pages are mapped in the kernel until they
are freed, so we can use the mapping to find the page to free it.
this allows us to remove splvm() protection in several places.
The sum of all these changes improves write throughput on my
decstation 5000/200 to within 1% of the rate of NetBSD 1.5
and reduces the elapsed time for "make release" of a NetBSD 1.5
source tree on my 128MB pc to 10% less than a 1.5 kernel took.
2001-09-16 00:36:31 +04:00
|
|
|
static ufs_daddr_t ffs_alloccgblk __P((struct inode *, struct buf *, ufs_daddr_t));
|
1999-11-15 21:49:07 +03:00
|
|
|
static ufs_daddr_t ffs_clusteralloc __P((struct inode *, int, ufs_daddr_t, int));
|
Incorporate the enhanced ffs_dirpref() by Grigoriy Orlov, as found in
FreeBSD (three commits; the initial work, man page updates, and a fix
to ffs_reload()), with the following differences:
- Be consistent between newfs(8) and tunefs(8) as to the options which
set and control the tuning parameters for this work (avgfilesize & avgfpdir)
- Use u_int16_t instead of u_int8_t to keep track of the number of
contiguous directories (suggested by Chuck Silvers)
- Work within our FFS_EI framework
- Ensure that fs->fs_maxclusters and fs->fs_contigdirs don't point to
the same area of memory
The new algorithm has a marked performance increase, especially when
performing tasks such as untarring pkgsrc.tar.gz, etc.
The original FreeBSD commit messages are attached:
=====
mckusick 2001/04/10 01:39:00 PDT
Directory layout preference improvements from Grigoriy Orlov <gluk@ptci.ru>.
His description of the problem and solution follow. My own tests show
speedups on typical filesystem intensive workloads of 5% to 12% which
is very impressive considering the small amount of code change involved.
------
One day I noticed that some file operations run much faster on
small file systems than on big ones. I've looked at the ffs
algorithms, thought about them, and redesigned the dirpref algorithm.
First I want to describe the results of my tests. These results are old
and I have improved the algorithm after these tests were done. Nevertheless
they show how big the performance speedup may be. I have done two file/directory
intensive tests on two OpenBSD systems with the old and new dirpref algorithms.
The first test is "tar -xzf ports.tar.gz", the second is "rm -rf ports".
The ports.tar.gz file is the ports collection from the OpenBSD 2.8 release.
It contains 6596 directories and 13868 files. The test systems are:
1. Celeron-450, 128Mb, two IDE drives, the system at wd0, file system for
test is at wd1. Size of test file system is 8 Gb, number of cg=991,
size of cg is 8m, block size = 8k, fragment size = 1k OpenBSD-current
from Dec 2000 with BUFCACHEPERCENT=35
2. PIII-600, 128Mb, two IBM DTLA-307045 IDE drives at i815e, the system
at wd0, file system for test is at wd1. Size of test file system is 40 Gb,
number of cg=5324, size of cg is 8m, block size = 8k, fragment size = 1k
OpenBSD-current from Dec 2000 with BUFCACHEPERCENT=50
You can get more info about the test systems and methods at:
http://www.ptci.ru/gluk/dirpref/old/dirpref.html
Test Results
tar -xzf ports.tar.gz rm -rf ports
mode old dirpref new dirpref speedup old dirpref new dirpref speedup
First system
normal 667 472 1.41 477 331 1.44
async 285 144 1.98 130 14 9.29
sync 768 616 1.25 477 334 1.43
softdep 413 252 1.64 241 38 6.34
Second system
normal 329 81 4.06 263.5 93.5 2.81
async 302 25.7 11.75 112 2.26 49.56
sync 281 57.0 4.93 263 90.5 2.9
softdep 341 40.6 8.4 284 4.76 59.66
"old dirpref" and "new dirpref" columns give a test time in seconds.
speedup - speed increase factor, i.e. old dirpref / new dirpref.
------
Algorithm description
The old dirpref algorithm is described in comments:
/*
* Find a cylinder to place a directory.
*
* The policy implemented by this algorithm is to select from
* among those cylinder groups with above the average number of
* free inodes, the one with the smallest number of directories.
*/
A new directory is allocated in a different cylinder group than its
parent directory, resulting in a directory tree that is spread across
all the cylinder groups. This spreading out results in non-optimal
access to the directories and files. When we have a small filesystem
it is not a problem, but when the filesystem is big the performance
degradation becomes very apparent.
What do I mean by a big file system?
1. A big filesystem is a filesystem which occupy 20-30 or more percent
of total drive space, i.e. first and last cylinder are physically
located relatively far from each other.
2. It has a relatively large number of cylinder groups, for example
more cylinder groups than 50% of the buffers in the buffer cache.
The first results in long access times, while the second results in
many buffers being used by metadata operations. Such operations use
cylinder group blocks and on-disk inode blocks. The cylinder group
block (fs->fs_cblkno) contains struct cg, inode and block bit maps.
It is 2k in size for the default filesystem parameters. If new and
parent directories are located in different cylinder groups then the
system performs more input/output operations and uses more buffers.
On filesystems with many cylinder groups, lots of cache buffers are
used for metadata operations.
My solution for this problem is very simple. I allocate many directories
in one cylinder group. I also do some things, so that the new allocation
method does not cause excessive fragmentation and all directory inodes
will not be located at a location far from its file's inodes and data.
The algorithm is:
/*
* Find a cylinder group to place a directory.
*
* The policy implemented by this algorithm is to allocate a
* directory inode in the same cylinder group as its parent
* directory, but also to reserve space for its files inodes
* and data. Restrict the number of directories which may be
* allocated one after another in the same cylinder group
* without intervening allocation of files.
*
* If we allocate a first level directory then force allocation
* in another cylinder group.
*/
My early versions of dirpref gave me good results for a wide range of
file operations and different filesystem capacities except one case:
those applications that create their entire directory structure first
and only later fill this structure with files.
My solution for such and similar cases is to limit a number of
directories which may be created one after another in the same cylinder
group without intervening file creations. For this purpose, I allocate
an array of counters at mount time. This array is linked to the superblock
fs->fs_contigdirs[cg]. Each time a directory is created the counter
increases and each time a file is created the counter decreases. A 60Gb
filesystem with 8mb/cg requires 10kb of memory for the counters array.
The maxcontigdirs is a maximum number of directories which may be created
without an intervening file creation. I found in my tests that the best
performance occurs when I restrict the number of directories in one cylinder
group such that all its files may be located in the same cylinder group.
There may be some deterioration in performance if all the file inodes
are in the same cylinder group as its containing directory, but their
data partially resides in a different cylinder group. The maxcontigdirs
value is calculated to try to prevent this condition. Since there is
no way to know how many files and directories will be allocated later
I added two optimization parameters in superblock/tunefs. They are:
int32_t fs_avgfilesize; /* expected average file size */
int32_t fs_avgfpdir; /* expected # of files per directory */
These parameters have reasonable defaults but may be tweaked for special
uses of a filesystem. They are only necessary in rare cases like better
tuning a filesystem being used to store a squid cache.
I have been using this algorithm for about 3 months. I have done
a lot of testing on filesystems with different capacities, average
filesize, average number of files per directory, and so on. I think
this algorithm has no negative impact on filesystem performance. It
works better than the default one in all cases. The new dirpref
will greatly improve untarring/removing/copying of big directories,
decrease load on cvs servers and much more. The new dirpref doesn't
speedup a compilation process, but also doesn't slow it down.
Obtained from: Grigoriy Orlov <gluk@ptci.ru>
=====
=====
iedowse 2001/04/23 17:37:17 PDT
Pre-dirpref versions of fsck may zero out the new superblock fields
fs_contigdirs, fs_avgfilesize and fs_avgfpdir. This could cause
panics if these fields were zeroed while a filesystem was mounted
read-only, and then remounted read-write.
Add code to ffs_reload() which copies the fs_contigdirs pointer
from the previous superblock, and reinitialises fs_avgf* if necessary.
Reviewed by: mckusick
=====
=====
nik 2001/04/10 03:36:44 PDT
Add information about the new options to newfs and tunefs which set the
expected average file size and number of files per directory. Could do
with some fleshing out.
=====
2001-09-06 06:16:00 +04:00
|
|
|
static ino_t ffs_dirpref __P((struct inode *));
|
1999-11-15 21:49:07 +03:00
|
|
|
static ufs_daddr_t ffs_fragextend __P((struct inode *, int, long, int, int));
|
|
|
|
static void ffs_fserr __P((struct fs *, u_int, char *));
|
a whole bunch of changes to improve performance and robustness under load:
- remove special treatment of pager_map mappings in pmaps. this is
required now, since I've removed the globals that expose the address range.
pager_map now uses pmap_kenter_pa() instead of pmap_enter(), so there's
no longer any need to special-case it.
- eliminate struct uvm_vnode by moving its fields into struct vnode.
- rewrite the pageout path. the pager is now responsible for handling the
high-level requests instead of only getting control after a bunch of work
has already been done on its behalf. this will allow us to UBCify LFS,
which needs tighter control over its pages than other filesystems do.
writing a page to disk no longer requires making it read-only, which
allows us to write wired pages without causing all kinds of havoc.
- use a new PG_PAGEOUT flag to indicate that a page should be freed
on behalf of the pagedaemon when it's unlocked. this flag is very similar
to PG_RELEASED, but unlike PG_RELEASED, PG_PAGEOUT can be cleared if the
pageout fails due to eg. an indirect-block buffer being locked.
this allows us to remove the "version" field from struct vm_page,
and together with shrinking "loan_count" from 32 bits to 16,
struct vm_page is now 4 bytes smaller.
- no longer use PG_RELEASED for swap-backed pages. if the page is busy
because it's being paged out, we can't release the swap slot to be
reallocated until that write is complete, but unlike with vnodes we
don't keep a count of in-progress writes so there's no good way to
know when the write is done. instead, when we need to free a busy
swap-backed page, just sleep until we can get it busy ourselves.
- implement a fast-path for extending writes which allows us to avoid
zeroing new pages. this substantially reduces cpu usage.
- encapsulate the data used by the genfs code in a struct genfs_node,
which must be the first element of the filesystem-specific vnode data
for filesystems which use genfs_{get,put}pages().
- eliminate many of the UVM pagerops, since they aren't needed anymore
now that the pager "put" operation is a higher-level operation.
- enhance the genfs code to allow NFS to use the genfs_{get,put}pages
instead of a modified copy.
- clean up struct vnode by removing all the fields that used to be used by
the vfs_cluster.c code (which we don't use anymore with UBC).
- remove kmem_object and mb_object since they were useless.
instead of allocating pages to these objects, we now just allocate
pages with no object. such pages are mapped in the kernel until they
are freed, so we can use the mapping to find the page to free it.
this allows us to remove splvm() protection in several places.
The sum of all these changes improves write throughput on my
decstation 5000/200 to within 1% of the rate of NetBSD 1.5
and reduces the elapsed time for "make release" of a NetBSD 1.5
source tree on my 128MB pc to 10% less than a 1.5 kernel took.
2001-09-16 00:36:31 +04:00
|
|
|
static u_long ffs_hashalloc __P((struct inode *, int, long, int,
|
|
|
|
ufs_daddr_t (*)(struct inode *, int, ufs_daddr_t, int)));
|
1999-11-15 21:49:07 +03:00
|
|
|
static ufs_daddr_t ffs_nodealloccg __P((struct inode *, int, ufs_daddr_t, int));
|
|
|
|
static ufs_daddr_t ffs_mapsearch __P((struct fs *, struct cg *,
|
|
|
|
ufs_daddr_t, int));
|
1998-03-01 05:20:01 +03:00
|
|
|
#if defined(DIAGNOSTIC) || defined(DEBUG)
|
|
|
|
static int ffs_checkblk __P((struct inode *, ufs_daddr_t, long size));
|
|
|
|
#endif
|
1994-06-08 15:41:58 +04:00
|
|
|
|
2000-04-04 13:23:20 +04:00
|
|
|
/* if 1, changes in optimization strategy are logged */
|
|
|
|
int ffs_log_changeopt = 0;
|
|
|
|
|
1998-07-28 21:30:01 +04:00
|
|
|
/* in ffs_tables.c */
|
2001-01-18 23:28:15 +03:00
|
|
|
extern const int inside[], around[];
|
|
|
|
extern const u_char * const fragtbl[];
|
1998-07-28 21:30:01 +04:00
|
|
|
|
1994-06-08 15:41:58 +04:00
|
|
|
/*
|
|
|
|
* Allocate a block in the file system.
|
|
|
|
*
|
|
|
|
* The size of the requested block is given, which must be some
|
|
|
|
* multiple of fs_fsize and <= fs_bsize.
|
|
|
|
* A preference may be optionally specified. If a preference is given
|
|
|
|
* the following hierarchy is used to allocate a block:
|
|
|
|
* 1) allocate the requested block.
|
|
|
|
* 2) allocate a rotationally optimal block in the same cylinder.
|
|
|
|
* 3) allocate a block in the same cylinder group.
|
|
|
|
* 4) quadradically rehash into other cylinder groups, until an
|
|
|
|
* available block is located.
|
2001-08-24 14:24:45 +04:00
|
|
|
* If no block preference is given the following hierarchy is used
|
1994-06-08 15:41:58 +04:00
|
|
|
* to allocate a block:
|
|
|
|
* 1) allocate a block in the cylinder group that contains the
|
|
|
|
* inode for the file.
|
|
|
|
* 2) quadradically rehash into other cylinder groups, until an
|
|
|
|
* available block is located.
|
|
|
|
*/
|
1996-02-10 01:22:18 +03:00
|
|
|
int
|
1994-06-08 15:41:58 +04:00
|
|
|
ffs_alloc(ip, lbn, bpref, size, cred, bnp)
|
2000-03-30 16:41:09 +04:00
|
|
|
struct inode *ip;
|
1998-03-01 05:20:01 +03:00
|
|
|
ufs_daddr_t lbn, bpref;
|
1994-06-08 15:41:58 +04:00
|
|
|
int size;
|
|
|
|
struct ucred *cred;
|
1998-03-01 05:20:01 +03:00
|
|
|
ufs_daddr_t *bnp;
|
1994-06-08 15:41:58 +04:00
|
|
|
{
|
2000-11-27 11:39:39 +03:00
|
|
|
struct fs *fs = ip->i_fs;
|
1998-03-01 05:20:01 +03:00
|
|
|
ufs_daddr_t bno;
|
1996-02-10 01:22:18 +03:00
|
|
|
int cg;
|
|
|
|
#ifdef QUOTA
|
|
|
|
int error;
|
|
|
|
#endif
|
1994-06-08 15:41:58 +04:00
|
|
|
|
2000-11-27 11:39:39 +03:00
|
|
|
#ifdef UVM_PAGE_TRKOWN
|
a whole bunch of changes to improve performance and robustness under load:
- remove special treatment of pager_map mappings in pmaps. this is
required now, since I've removed the globals that expose the address range.
pager_map now uses pmap_kenter_pa() instead of pmap_enter(), so there's
no longer any need to special-case it.
- eliminate struct uvm_vnode by moving its fields into struct vnode.
- rewrite the pageout path. the pager is now responsible for handling the
high-level requests instead of only getting control after a bunch of work
has already been done on its behalf. this will allow us to UBCify LFS,
which needs tighter control over its pages than other filesystems do.
writing a page to disk no longer requires making it read-only, which
allows us to write wired pages without causing all kinds of havoc.
- use a new PG_PAGEOUT flag to indicate that a page should be freed
on behalf of the pagedaemon when it's unlocked. this flag is very similar
to PG_RELEASED, but unlike PG_RELEASED, PG_PAGEOUT can be cleared if the
pageout fails due to eg. an indirect-block buffer being locked.
this allows us to remove the "version" field from struct vm_page,
and together with shrinking "loan_count" from 32 bits to 16,
struct vm_page is now 4 bytes smaller.
- no longer use PG_RELEASED for swap-backed pages. if the page is busy
because it's being paged out, we can't release the swap slot to be
reallocated until that write is complete, but unlike with vnodes we
don't keep a count of in-progress writes so there's no good way to
know when the write is done. instead, when we need to free a busy
swap-backed page, just sleep until we can get it busy ourselves.
- implement a fast-path for extending writes which allows us to avoid
zeroing new pages. this substantially reduces cpu usage.
- encapsulate the data used by the genfs code in a struct genfs_node,
which must be the first element of the filesystem-specific vnode data
for filesystems which use genfs_{get,put}pages().
- eliminate many of the UVM pagerops, since they aren't needed anymore
now that the pager "put" operation is a higher-level operation.
- enhance the genfs code to allow NFS to use the genfs_{get,put}pages
instead of a modified copy.
- clean up struct vnode by removing all the fields that used to be used by
the vfs_cluster.c code (which we don't use anymore with UBC).
- remove kmem_object and mb_object since they were useless.
instead of allocating pages to these objects, we now just allocate
pages with no object. such pages are mapped in the kernel until they
are freed, so we can use the mapping to find the page to free it.
this allows us to remove splvm() protection in several places.
The sum of all these changes improves write throughput on my
decstation 5000/200 to within 1% of the rate of NetBSD 1.5
and reduces the elapsed time for "make release" of a NetBSD 1.5
source tree on my 128MB pc to 10% less than a 1.5 kernel took.
2001-09-16 00:36:31 +04:00
|
|
|
if (ITOV(ip)->v_type == VREG &&
|
|
|
|
lblktosize(fs, (voff_t)lbn) < round_page(ITOV(ip)->v_size)) {
|
2000-11-27 11:39:39 +03:00
|
|
|
struct vm_page *pg;
|
a whole bunch of changes to improve performance and robustness under load:
- remove special treatment of pager_map mappings in pmaps. this is
required now, since I've removed the globals that expose the address range.
pager_map now uses pmap_kenter_pa() instead of pmap_enter(), so there's
no longer any need to special-case it.
- eliminate struct uvm_vnode by moving its fields into struct vnode.
- rewrite the pageout path. the pager is now responsible for handling the
high-level requests instead of only getting control after a bunch of work
has already been done on its behalf. this will allow us to UBCify LFS,
which needs tighter control over its pages than other filesystems do.
writing a page to disk no longer requires making it read-only, which
allows us to write wired pages without causing all kinds of havoc.
- use a new PG_PAGEOUT flag to indicate that a page should be freed
on behalf of the pagedaemon when it's unlocked. this flag is very similar
to PG_RELEASED, but unlike PG_RELEASED, PG_PAGEOUT can be cleared if the
pageout fails due to eg. an indirect-block buffer being locked.
this allows us to remove the "version" field from struct vm_page,
and together with shrinking "loan_count" from 32 bits to 16,
struct vm_page is now 4 bytes smaller.
- no longer use PG_RELEASED for swap-backed pages. if the page is busy
because it's being paged out, we can't release the swap slot to be
reallocated until that write is complete, but unlike with vnodes we
don't keep a count of in-progress writes so there's no good way to
know when the write is done. instead, when we need to free a busy
swap-backed page, just sleep until we can get it busy ourselves.
- implement a fast-path for extending writes which allows us to avoid
zeroing new pages. this substantially reduces cpu usage.
- encapsulate the data used by the genfs code in a struct genfs_node,
which must be the first element of the filesystem-specific vnode data
for filesystems which use genfs_{get,put}pages().
- eliminate many of the UVM pagerops, since they aren't needed anymore
now that the pager "put" operation is a higher-level operation.
- enhance the genfs code to allow NFS to use the genfs_{get,put}pages
instead of a modified copy.
- clean up struct vnode by removing all the fields that used to be used by
the vfs_cluster.c code (which we don't use anymore with UBC).
- remove kmem_object and mb_object since they were useless.
instead of allocating pages to these objects, we now just allocate
pages with no object. such pages are mapped in the kernel until they
are freed, so we can use the mapping to find the page to free it.
this allows us to remove splvm() protection in several places.
The sum of all these changes improves write throughput on my
decstation 5000/200 to within 1% of the rate of NetBSD 1.5
and reduces the elapsed time for "make release" of a NetBSD 1.5
source tree on my 128MB pc to 10% less than a 1.5 kernel took.
2001-09-16 00:36:31 +04:00
|
|
|
struct uvm_object *uobj = &ITOV(ip)->v_uobj;
|
2001-08-31 07:38:45 +04:00
|
|
|
voff_t off = trunc_page(lblktosize(fs, lbn));
|
|
|
|
voff_t endoff = round_page(lblktosize(fs, lbn) + size);
|
2000-11-27 11:39:39 +03:00
|
|
|
|
|
|
|
simple_lock(&uobj->vmobjlock);
|
|
|
|
while (off < endoff) {
|
|
|
|
pg = uvm_pagelookup(uobj, off);
|
|
|
|
KASSERT(pg != NULL);
|
|
|
|
KASSERT(pg->owner == curproc->p_pid);
|
|
|
|
KASSERT((pg->flags & PG_CLEAN) == 0);
|
|
|
|
off += PAGE_SIZE;
|
|
|
|
}
|
|
|
|
simple_unlock(&uobj->vmobjlock);
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
1994-06-08 15:41:58 +04:00
|
|
|
*bnp = 0;
|
|
|
|
#ifdef DIAGNOSTIC
|
|
|
|
if ((u_int)size > fs->fs_bsize || fragoff(fs, size) != 0) {
|
1996-10-13 01:58:44 +04:00
|
|
|
printf("dev = 0x%x, bsize = %d, size = %d, fs = %s\n",
|
1994-06-08 15:41:58 +04:00
|
|
|
ip->i_dev, fs->fs_bsize, size, fs->fs_fsmnt);
|
|
|
|
panic("ffs_alloc: bad size");
|
|
|
|
}
|
|
|
|
if (cred == NOCRED)
|
|
|
|
panic("ffs_alloc: missing credential\n");
|
|
|
|
#endif /* DIAGNOSTIC */
|
|
|
|
if (size == fs->fs_bsize && fs->fs_cstotal.cs_nbfree == 0)
|
|
|
|
goto nospace;
|
|
|
|
if (cred->cr_uid != 0 && freespace(fs, fs->fs_minfree) <= 0)
|
|
|
|
goto nospace;
|
|
|
|
#ifdef QUOTA
|
1996-02-10 01:22:18 +03:00
|
|
|
if ((error = chkdq(ip, (long)btodb(size), cred, 0)) != 0)
|
1994-06-08 15:41:58 +04:00
|
|
|
return (error);
|
|
|
|
#endif
|
|
|
|
if (bpref >= fs->fs_size)
|
|
|
|
bpref = 0;
|
|
|
|
if (bpref == 0)
|
|
|
|
cg = ino_to_cg(fs, ip->i_number);
|
|
|
|
else
|
|
|
|
cg = dtog(fs, bpref);
|
1998-03-01 05:20:01 +03:00
|
|
|
bno = (ufs_daddr_t)ffs_hashalloc(ip, cg, (long)bpref, size,
|
1996-02-10 01:22:18 +03:00
|
|
|
ffs_alloccg);
|
1994-06-08 15:41:58 +04:00
|
|
|
if (bno > 0) {
|
1997-06-11 14:09:37 +04:00
|
|
|
ip->i_ffs_blocks += btodb(size);
|
1994-06-08 15:41:58 +04:00
|
|
|
ip->i_flag |= IN_CHANGE | IN_UPDATE;
|
|
|
|
*bnp = bno;
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
#ifdef QUOTA
|
|
|
|
/*
|
|
|
|
* Restore user's disk quota because allocation failed.
|
|
|
|
*/
|
|
|
|
(void) chkdq(ip, (long)-btodb(size), cred, FORCE);
|
|
|
|
#endif
|
|
|
|
nospace:
|
|
|
|
ffs_fserr(fs, cred->cr_uid, "file system full");
|
|
|
|
uprintf("\n%s: write failed, file system is full\n", fs->fs_fsmnt);
|
|
|
|
return (ENOSPC);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Reallocate a fragment to a bigger size
|
|
|
|
*
|
|
|
|
* The number and size of the old block is given, and a preference
|
|
|
|
* and new size is also specified. The allocator attempts to extend
|
|
|
|
* the original block. Failing that, the regular block allocator is
|
|
|
|
* invoked to get an appropriate block.
|
|
|
|
*/
|
1996-02-10 01:22:18 +03:00
|
|
|
int
|
2000-11-27 11:39:39 +03:00
|
|
|
ffs_realloccg(ip, lbprev, bpref, osize, nsize, cred, bpp, blknop)
|
2000-03-30 16:41:09 +04:00
|
|
|
struct inode *ip;
|
1998-03-01 05:20:01 +03:00
|
|
|
ufs_daddr_t lbprev;
|
|
|
|
ufs_daddr_t bpref;
|
1994-06-08 15:41:58 +04:00
|
|
|
int osize, nsize;
|
|
|
|
struct ucred *cred;
|
|
|
|
struct buf **bpp;
|
2000-11-27 11:39:39 +03:00
|
|
|
ufs_daddr_t *blknop;
|
1994-06-08 15:41:58 +04:00
|
|
|
{
|
2000-11-27 11:39:39 +03:00
|
|
|
struct fs *fs = ip->i_fs;
|
1994-06-08 15:41:58 +04:00
|
|
|
struct buf *bp;
|
|
|
|
int cg, request, error;
|
1998-03-01 05:20:01 +03:00
|
|
|
ufs_daddr_t bprev, bno;
|
1998-08-18 10:47:53 +04:00
|
|
|
|
2000-11-27 11:39:39 +03:00
|
|
|
#ifdef UVM_PAGE_TRKOWN
|
|
|
|
if (ITOV(ip)->v_type == VREG) {
|
|
|
|
struct vm_page *pg;
|
a whole bunch of changes to improve performance and robustness under load:
- remove special treatment of pager_map mappings in pmaps. this is
required now, since I've removed the globals that expose the address range.
pager_map now uses pmap_kenter_pa() instead of pmap_enter(), so there's
no longer any need to special-case it.
- eliminate struct uvm_vnode by moving its fields into struct vnode.
- rewrite the pageout path. the pager is now responsible for handling the
high-level requests instead of only getting control after a bunch of work
has already been done on its behalf. this will allow us to UBCify LFS,
which needs tighter control over its pages than other filesystems do.
writing a page to disk no longer requires making it read-only, which
allows us to write wired pages without causing all kinds of havoc.
- use a new PG_PAGEOUT flag to indicate that a page should be freed
on behalf of the pagedaemon when it's unlocked. this flag is very similar
to PG_RELEASED, but unlike PG_RELEASED, PG_PAGEOUT can be cleared if the
pageout fails due to eg. an indirect-block buffer being locked.
this allows us to remove the "version" field from struct vm_page,
and together with shrinking "loan_count" from 32 bits to 16,
struct vm_page is now 4 bytes smaller.
- no longer use PG_RELEASED for swap-backed pages. if the page is busy
because it's being paged out, we can't release the swap slot to be
reallocated until that write is complete, but unlike with vnodes we
don't keep a count of in-progress writes so there's no good way to
know when the write is done. instead, when we need to free a busy
swap-backed page, just sleep until we can get it busy ourselves.
- implement a fast-path for extending writes which allows us to avoid
zeroing new pages. this substantially reduces cpu usage.
- encapsulate the data used by the genfs code in a struct genfs_node,
which must be the first element of the filesystem-specific vnode data
for filesystems which use genfs_{get,put}pages().
- eliminate many of the UVM pagerops, since they aren't needed anymore
now that the pager "put" operation is a higher-level operation.
- enhance the genfs code to allow NFS to use the genfs_{get,put}pages
instead of a modified copy.
- clean up struct vnode by removing all the fields that used to be used by
the vfs_cluster.c code (which we don't use anymore with UBC).
- remove kmem_object and mb_object since they were useless.
instead of allocating pages to these objects, we now just allocate
pages with no object. such pages are mapped in the kernel until they
are freed, so we can use the mapping to find the page to free it.
this allows us to remove splvm() protection in several places.
The sum of all these changes improves write throughput on my
decstation 5000/200 to within 1% of the rate of NetBSD 1.5
and reduces the elapsed time for "make release" of a NetBSD 1.5
source tree on my 128MB pc to 10% less than a 1.5 kernel took.
2001-09-16 00:36:31 +04:00
|
|
|
struct uvm_object *uobj = &ITOV(ip)->v_uobj;
|
2001-08-31 07:38:45 +04:00
|
|
|
voff_t off = trunc_page(lblktosize(fs, lbprev));
|
|
|
|
voff_t endoff = round_page(lblktosize(fs, lbprev) + osize);
|
2000-11-27 11:39:39 +03:00
|
|
|
|
|
|
|
simple_lock(&uobj->vmobjlock);
|
|
|
|
while (off < endoff) {
|
|
|
|
pg = uvm_pagelookup(uobj, off);
|
|
|
|
KASSERT(pg != NULL);
|
|
|
|
KASSERT(pg->owner == curproc->p_pid);
|
|
|
|
KASSERT((pg->flags & PG_CLEAN) == 0);
|
|
|
|
off += PAGE_SIZE;
|
|
|
|
}
|
|
|
|
simple_unlock(&uobj->vmobjlock);
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
1994-06-08 15:41:58 +04:00
|
|
|
#ifdef DIAGNOSTIC
|
|
|
|
if ((u_int)osize > fs->fs_bsize || fragoff(fs, osize) != 0 ||
|
|
|
|
(u_int)nsize > fs->fs_bsize || fragoff(fs, nsize) != 0) {
|
1996-10-13 01:58:44 +04:00
|
|
|
printf(
|
1994-06-08 15:41:58 +04:00
|
|
|
"dev = 0x%x, bsize = %d, osize = %d, nsize = %d, fs = %s\n",
|
|
|
|
ip->i_dev, fs->fs_bsize, osize, nsize, fs->fs_fsmnt);
|
|
|
|
panic("ffs_realloccg: bad size");
|
|
|
|
}
|
|
|
|
if (cred == NOCRED)
|
|
|
|
panic("ffs_realloccg: missing credential\n");
|
|
|
|
#endif /* DIAGNOSTIC */
|
|
|
|
if (cred->cr_uid != 0 && freespace(fs, fs->fs_minfree) <= 0)
|
|
|
|
goto nospace;
|
1999-11-15 21:49:07 +03:00
|
|
|
if ((bprev = ufs_rw32(ip->i_ffs_db[lbprev], UFS_FSNEEDSWAP(fs))) == 0) {
|
1996-10-13 01:58:44 +04:00
|
|
|
printf("dev = 0x%x, bsize = %d, bprev = %d, fs = %s\n",
|
1994-06-08 15:41:58 +04:00
|
|
|
ip->i_dev, fs->fs_bsize, bprev, fs->fs_fsmnt);
|
|
|
|
panic("ffs_realloccg: bad bprev");
|
|
|
|
}
|
|
|
|
/*
|
|
|
|
* Allocate the extra space in the buffer.
|
|
|
|
*/
|
2000-11-27 11:39:39 +03:00
|
|
|
if (bpp != NULL &&
|
|
|
|
(error = bread(ITOV(ip), lbprev, osize, NOCRED, &bp)) != 0) {
|
1994-06-08 15:41:58 +04:00
|
|
|
brelse(bp);
|
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
#ifdef QUOTA
|
1996-02-10 01:22:18 +03:00
|
|
|
if ((error = chkdq(ip, (long)btodb(nsize - osize), cred, 0)) != 0) {
|
2001-06-03 20:49:07 +04:00
|
|
|
if (bpp != NULL) {
|
|
|
|
brelse(bp);
|
|
|
|
}
|
1994-06-08 15:41:58 +04:00
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
/*
|
|
|
|
* Check for extension in the existing location.
|
|
|
|
*/
|
|
|
|
cg = dtog(fs, bprev);
|
1996-02-10 01:22:18 +03:00
|
|
|
if ((bno = ffs_fragextend(ip, cg, (long)bprev, osize, nsize)) != 0) {
|
1997-06-11 14:09:37 +04:00
|
|
|
ip->i_ffs_blocks += btodb(nsize - osize);
|
1994-06-08 15:41:58 +04:00
|
|
|
ip->i_flag |= IN_CHANGE | IN_UPDATE;
|
2000-11-27 11:39:39 +03:00
|
|
|
|
|
|
|
if (bpp != NULL) {
|
|
|
|
if (bp->b_blkno != fsbtodb(fs, bno))
|
|
|
|
panic("bad blockno");
|
|
|
|
allocbuf(bp, nsize);
|
|
|
|
bp->b_flags |= B_DONE;
|
|
|
|
memset(bp->b_data + osize, 0, nsize - osize);
|
|
|
|
*bpp = bp;
|
|
|
|
}
|
|
|
|
if (blknop != NULL) {
|
|
|
|
*blknop = bno;
|
|
|
|
}
|
1994-06-08 15:41:58 +04:00
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
/*
|
|
|
|
* Allocate a new disk location.
|
|
|
|
*/
|
|
|
|
if (bpref >= fs->fs_size)
|
|
|
|
bpref = 0;
|
|
|
|
switch ((int)fs->fs_optim) {
|
|
|
|
case FS_OPTSPACE:
|
|
|
|
/*
|
|
|
|
* Allocate an exact sized fragment. Although this makes
|
|
|
|
* best use of space, we will waste time relocating it if
|
|
|
|
* the file continues to grow. If the fragmentation is
|
|
|
|
* less than half of the minimum free reserve, we choose
|
|
|
|
* to begin optimizing for time.
|
|
|
|
*/
|
|
|
|
request = nsize;
|
|
|
|
if (fs->fs_minfree < 5 ||
|
|
|
|
fs->fs_cstotal.cs_nffree >
|
|
|
|
fs->fs_dsize * fs->fs_minfree / (2 * 100))
|
|
|
|
break;
|
2000-04-04 13:23:20 +04:00
|
|
|
|
|
|
|
if (ffs_log_changeopt) {
|
|
|
|
log(LOG_NOTICE,
|
|
|
|
"%s: optimization changed from SPACE to TIME\n",
|
|
|
|
fs->fs_fsmnt);
|
|
|
|
}
|
|
|
|
|
1994-06-08 15:41:58 +04:00
|
|
|
fs->fs_optim = FS_OPTTIME;
|
|
|
|
break;
|
|
|
|
case FS_OPTTIME:
|
|
|
|
/*
|
|
|
|
* At this point we have discovered a file that is trying to
|
|
|
|
* grow a small fragment to a larger fragment. To save time,
|
|
|
|
* we allocate a full sized block, then free the unused portion.
|
|
|
|
* If the file continues to grow, the `ffs_fragextend' call
|
|
|
|
* above will be able to grow it in place without further
|
|
|
|
* copying. If aberrant programs cause disk fragmentation to
|
|
|
|
* grow within 2% of the free reserve, we choose to begin
|
|
|
|
* optimizing for space.
|
|
|
|
*/
|
|
|
|
request = fs->fs_bsize;
|
|
|
|
if (fs->fs_cstotal.cs_nffree <
|
|
|
|
fs->fs_dsize * (fs->fs_minfree - 2) / 100)
|
|
|
|
break;
|
2000-04-04 13:23:20 +04:00
|
|
|
|
|
|
|
if (ffs_log_changeopt) {
|
|
|
|
log(LOG_NOTICE,
|
|
|
|
"%s: optimization changed from TIME to SPACE\n",
|
|
|
|
fs->fs_fsmnt);
|
|
|
|
}
|
|
|
|
|
1994-06-08 15:41:58 +04:00
|
|
|
fs->fs_optim = FS_OPTSPACE;
|
|
|
|
break;
|
|
|
|
default:
|
1996-10-13 01:58:44 +04:00
|
|
|
printf("dev = 0x%x, optim = %d, fs = %s\n",
|
1994-06-08 15:41:58 +04:00
|
|
|
ip->i_dev, fs->fs_optim, fs->fs_fsmnt);
|
|
|
|
panic("ffs_realloccg: bad optim");
|
|
|
|
/* NOTREACHED */
|
|
|
|
}
|
1998-03-01 05:20:01 +03:00
|
|
|
bno = (ufs_daddr_t)ffs_hashalloc(ip, cg, (long)bpref, request,
|
1996-02-10 01:22:18 +03:00
|
|
|
ffs_alloccg);
|
1994-06-08 15:41:58 +04:00
|
|
|
if (bno > 0) {
|
1999-11-15 21:49:07 +03:00
|
|
|
if (!DOINGSOFTDEP(ITOV(ip)))
|
|
|
|
ffs_blkfree(ip, bprev, (long)osize);
|
1994-06-08 15:41:58 +04:00
|
|
|
if (nsize < request)
|
|
|
|
ffs_blkfree(ip, bno + numfrags(fs, nsize),
|
|
|
|
(long)(request - nsize));
|
1997-06-11 14:09:37 +04:00
|
|
|
ip->i_ffs_blocks += btodb(nsize - osize);
|
1994-06-08 15:41:58 +04:00
|
|
|
ip->i_flag |= IN_CHANGE | IN_UPDATE;
|
2000-11-27 11:39:39 +03:00
|
|
|
if (bpp != NULL) {
|
|
|
|
bp->b_blkno = fsbtodb(fs, bno);
|
|
|
|
allocbuf(bp, nsize);
|
|
|
|
bp->b_flags |= B_DONE;
|
|
|
|
memset(bp->b_data + osize, 0, (u_int)nsize - osize);
|
|
|
|
*bpp = bp;
|
|
|
|
}
|
|
|
|
if (blknop != NULL) {
|
|
|
|
*blknop = bno;
|
|
|
|
}
|
1994-06-08 15:41:58 +04:00
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
#ifdef QUOTA
|
|
|
|
/*
|
|
|
|
* Restore user's disk quota because allocation failed.
|
|
|
|
*/
|
|
|
|
(void) chkdq(ip, (long)-btodb(nsize - osize), cred, FORCE);
|
|
|
|
#endif
|
2000-11-27 11:39:39 +03:00
|
|
|
if (bpp != NULL) {
|
|
|
|
brelse(bp);
|
|
|
|
}
|
|
|
|
|
1994-06-08 15:41:58 +04:00
|
|
|
nospace:
|
|
|
|
/*
|
|
|
|
* no space available
|
|
|
|
*/
|
|
|
|
ffs_fserr(fs, cred->cr_uid, "file system full");
|
|
|
|
uprintf("\n%s: write failed, file system is full\n", fs->fs_fsmnt);
|
|
|
|
return (ENOSPC);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Reallocate a sequence of blocks into a contiguous sequence of blocks.
|
|
|
|
*
|
|
|
|
* The vnode and an array of buffer pointers for a range of sequential
|
|
|
|
* logical blocks to be made contiguous is given. The allocator attempts
|
|
|
|
* to find a range of sequential blocks starting as close as possible to
|
|
|
|
* an fs_rotdelay offset from the end of the allocation for the logical
|
2001-08-20 16:00:46 +04:00
|
|
|
* block immediately preceding the current range. If successful, the
|
1994-06-08 15:41:58 +04:00
|
|
|
* physical block numbers in the buffer pointers and in the inode are
|
|
|
|
* changed to reflect the new allocation. If unsuccessful, the allocation
|
|
|
|
* is left unchanged. The success in doing the reallocation is returned.
|
|
|
|
* Note that the error return is not reflected back to the user. Rather
|
|
|
|
* the previous block allocation will be used.
|
|
|
|
*/
|
1994-07-05 01:06:07 +04:00
|
|
|
#ifdef DEBUG
|
1994-06-08 15:41:58 +04:00
|
|
|
#include <sys/sysctl.h>
|
1994-12-14 16:03:35 +03:00
|
|
|
int prtrealloc = 0;
|
|
|
|
struct ctldebug debug15 = { "prtrealloc", &prtrealloc };
|
1994-06-08 15:41:58 +04:00
|
|
|
#endif
|
|
|
|
|
1998-03-01 05:20:01 +03:00
|
|
|
int doasyncfree = 1;
|
|
|
|
|
1994-06-08 15:41:58 +04:00
|
|
|
int
|
1996-02-10 01:22:18 +03:00
|
|
|
ffs_reallocblks(v)
|
|
|
|
void *v;
|
|
|
|
{
|
1994-06-08 15:41:58 +04:00
|
|
|
struct vop_reallocblks_args /* {
|
|
|
|
struct vnode *a_vp;
|
|
|
|
struct cluster_save *a_buflist;
|
1996-02-10 01:22:18 +03:00
|
|
|
} */ *ap = v;
|
1994-06-08 15:41:58 +04:00
|
|
|
struct fs *fs;
|
|
|
|
struct inode *ip;
|
|
|
|
struct vnode *vp;
|
|
|
|
struct buf *sbp, *ebp;
|
1998-03-01 05:20:01 +03:00
|
|
|
ufs_daddr_t *bap, *sbap, *ebap = NULL;
|
1994-06-08 15:41:58 +04:00
|
|
|
struct cluster_save *buflist;
|
1998-03-01 05:20:01 +03:00
|
|
|
ufs_daddr_t start_lbn, end_lbn, soff, newblk, blkno;
|
1994-06-08 15:41:58 +04:00
|
|
|
struct indir start_ap[NIADDR + 1], end_ap[NIADDR + 1], *idp;
|
|
|
|
int i, len, start_lvl, end_lvl, pref, ssize;
|
|
|
|
|
2000-11-27 11:39:39 +03:00
|
|
|
/* XXXUBC don't reallocblks for now */
|
|
|
|
return ENOSPC;
|
|
|
|
|
1994-06-08 15:41:58 +04:00
|
|
|
vp = ap->a_vp;
|
|
|
|
ip = VTOI(vp);
|
|
|
|
fs = ip->i_fs;
|
|
|
|
if (fs->fs_contigsumsize <= 0)
|
|
|
|
return (ENOSPC);
|
|
|
|
buflist = ap->a_buflist;
|
|
|
|
len = buflist->bs_nchildren;
|
|
|
|
start_lbn = buflist->bs_children[0]->b_lblkno;
|
|
|
|
end_lbn = start_lbn + len - 1;
|
|
|
|
#ifdef DIAGNOSTIC
|
1998-03-01 05:20:01 +03:00
|
|
|
for (i = 0; i < len; i++)
|
|
|
|
if (!ffs_checkblk(ip,
|
|
|
|
dbtofsb(fs, buflist->bs_children[i]->b_blkno), fs->fs_bsize))
|
|
|
|
panic("ffs_reallocblks: unallocated block 1");
|
1994-06-08 15:41:58 +04:00
|
|
|
for (i = 1; i < len; i++)
|
|
|
|
if (buflist->bs_children[i]->b_lblkno != start_lbn + i)
|
1998-03-01 05:20:01 +03:00
|
|
|
panic("ffs_reallocblks: non-logical cluster");
|
|
|
|
blkno = buflist->bs_children[0]->b_blkno;
|
|
|
|
ssize = fsbtodb(fs, fs->fs_frag);
|
|
|
|
for (i = 1; i < len - 1; i++)
|
|
|
|
if (buflist->bs_children[i]->b_blkno != blkno + (i * ssize))
|
|
|
|
panic("ffs_reallocblks: non-physical cluster %d", i);
|
1994-06-08 15:41:58 +04:00
|
|
|
#endif
|
|
|
|
/*
|
|
|
|
* If the latest allocation is in a new cylinder group, assume that
|
|
|
|
* the filesystem has decided to move and do not force it back to
|
|
|
|
* the previous cylinder group.
|
|
|
|
*/
|
|
|
|
if (dtog(fs, dbtofsb(fs, buflist->bs_children[0]->b_blkno)) !=
|
|
|
|
dtog(fs, dbtofsb(fs, buflist->bs_children[len - 1]->b_blkno)))
|
|
|
|
return (ENOSPC);
|
|
|
|
if (ufs_getlbns(vp, start_lbn, start_ap, &start_lvl) ||
|
|
|
|
ufs_getlbns(vp, end_lbn, end_ap, &end_lvl))
|
|
|
|
return (ENOSPC);
|
|
|
|
/*
|
|
|
|
* Get the starting offset and block map for the first block.
|
|
|
|
*/
|
|
|
|
if (start_lvl == 0) {
|
1997-06-11 14:09:37 +04:00
|
|
|
sbap = &ip->i_ffs_db[0];
|
1994-06-08 15:41:58 +04:00
|
|
|
soff = start_lbn;
|
|
|
|
} else {
|
|
|
|
idp = &start_ap[start_lvl - 1];
|
|
|
|
if (bread(vp, idp->in_lbn, (int)fs->fs_bsize, NOCRED, &sbp)) {
|
|
|
|
brelse(sbp);
|
|
|
|
return (ENOSPC);
|
|
|
|
}
|
1998-03-01 05:20:01 +03:00
|
|
|
sbap = (ufs_daddr_t *)sbp->b_data;
|
1994-06-08 15:41:58 +04:00
|
|
|
soff = idp->in_off;
|
|
|
|
}
|
|
|
|
/*
|
|
|
|
* Find the preferred location for the cluster.
|
|
|
|
*/
|
|
|
|
pref = ffs_blkpref(ip, start_lbn, soff, sbap);
|
|
|
|
/*
|
|
|
|
* If the block range spans two block maps, get the second map.
|
|
|
|
*/
|
|
|
|
if (end_lvl == 0 || (idp = &end_ap[end_lvl - 1])->in_off + 1 >= len) {
|
|
|
|
ssize = len;
|
|
|
|
} else {
|
|
|
|
#ifdef DIAGNOSTIC
|
|
|
|
if (start_ap[start_lvl-1].in_lbn == idp->in_lbn)
|
|
|
|
panic("ffs_reallocblk: start == end");
|
|
|
|
#endif
|
|
|
|
ssize = len - (idp->in_off + 1);
|
|
|
|
if (bread(vp, idp->in_lbn, (int)fs->fs_bsize, NOCRED, &ebp))
|
|
|
|
goto fail;
|
1998-03-01 05:20:01 +03:00
|
|
|
ebap = (ufs_daddr_t *)ebp->b_data;
|
1994-06-08 15:41:58 +04:00
|
|
|
}
|
|
|
|
/*
|
|
|
|
* Search the block map looking for an allocation of the desired size.
|
|
|
|
*/
|
1998-03-01 05:20:01 +03:00
|
|
|
if ((newblk = (ufs_daddr_t)ffs_hashalloc(ip, dtog(fs, pref), (long)pref,
|
1996-02-10 01:22:18 +03:00
|
|
|
len, ffs_clusteralloc)) == 0)
|
1994-06-08 15:41:58 +04:00
|
|
|
goto fail;
|
|
|
|
/*
|
|
|
|
* We have found a new contiguous block.
|
|
|
|
*
|
|
|
|
* First we have to replace the old block pointers with the new
|
|
|
|
* block pointers in the inode and indirect blocks associated
|
|
|
|
* with the file.
|
|
|
|
*/
|
1994-12-14 16:03:35 +03:00
|
|
|
#ifdef DEBUG
|
|
|
|
if (prtrealloc)
|
1996-10-13 01:58:44 +04:00
|
|
|
printf("realloc: ino %d, lbns %d-%d\n\told:", ip->i_number,
|
1994-12-14 16:03:35 +03:00
|
|
|
start_lbn, end_lbn);
|
|
|
|
#endif
|
1994-06-08 15:41:58 +04:00
|
|
|
blkno = newblk;
|
|
|
|
for (bap = &sbap[soff], i = 0; i < len; i++, blkno += fs->fs_frag) {
|
1999-11-15 21:49:07 +03:00
|
|
|
ufs_daddr_t ba;
|
|
|
|
|
|
|
|
if (i == ssize) {
|
1994-06-08 15:41:58 +04:00
|
|
|
bap = ebap;
|
1999-11-15 21:49:07 +03:00
|
|
|
soff = -i;
|
|
|
|
}
|
|
|
|
ba = ufs_rw32(*bap, UFS_FSNEEDSWAP(fs));
|
1994-06-08 15:41:58 +04:00
|
|
|
#ifdef DIAGNOSTIC
|
1998-03-01 05:20:01 +03:00
|
|
|
if (!ffs_checkblk(ip,
|
|
|
|
dbtofsb(fs, buflist->bs_children[i]->b_blkno), fs->fs_bsize))
|
|
|
|
panic("ffs_reallocblks: unallocated block 2");
|
1999-11-15 21:49:07 +03:00
|
|
|
if (dbtofsb(fs, buflist->bs_children[i]->b_blkno) != ba)
|
1994-06-08 15:41:58 +04:00
|
|
|
panic("ffs_reallocblks: alloc mismatch");
|
1994-12-14 16:03:35 +03:00
|
|
|
#endif
|
|
|
|
#ifdef DEBUG
|
|
|
|
if (prtrealloc)
|
1999-11-15 21:49:07 +03:00
|
|
|
printf(" %d,", ba);
|
1994-06-08 15:41:58 +04:00
|
|
|
#endif
|
1999-11-15 21:49:07 +03:00
|
|
|
if (DOINGSOFTDEP(vp)) {
|
|
|
|
if (sbap == &ip->i_ffs_db[0] && i < ssize)
|
|
|
|
softdep_setup_allocdirect(ip, start_lbn + i,
|
|
|
|
blkno, ba, fs->fs_bsize, fs->fs_bsize,
|
|
|
|
buflist->bs_children[i]);
|
|
|
|
else
|
|
|
|
softdep_setup_allocindir_page(ip, start_lbn + i,
|
|
|
|
i < ssize ? sbp : ebp, soff + i, blkno,
|
|
|
|
ba, buflist->bs_children[i]);
|
|
|
|
}
|
|
|
|
*bap++ = ufs_rw32(blkno, UFS_FSNEEDSWAP(fs));
|
1994-06-08 15:41:58 +04:00
|
|
|
}
|
|
|
|
/*
|
|
|
|
* Next we must write out the modified inode and indirect blocks.
|
|
|
|
* For strict correctness, the writes should be synchronous since
|
|
|
|
* the old block values may have been written to disk. In practise
|
|
|
|
* they are almost never written, but if we are concerned about
|
|
|
|
* strict correctness, the `doasyncfree' flag should be set to zero.
|
|
|
|
*
|
|
|
|
* The test on `doasyncfree' should be changed to test a flag
|
|
|
|
* that shows whether the associated buffers and inodes have
|
|
|
|
* been written. The flag should be set when the cluster is
|
|
|
|
* started and cleared whenever the buffer or inode is flushed.
|
|
|
|
* We can then check below to see if it is set, and do the
|
|
|
|
* synchronous write only when it has been cleared.
|
|
|
|
*/
|
1997-06-11 14:09:37 +04:00
|
|
|
if (sbap != &ip->i_ffs_db[0]) {
|
1994-06-08 15:41:58 +04:00
|
|
|
if (doasyncfree)
|
|
|
|
bdwrite(sbp);
|
|
|
|
else
|
|
|
|
bwrite(sbp);
|
|
|
|
} else {
|
|
|
|
ip->i_flag |= IN_CHANGE | IN_UPDATE;
|
1999-03-06 00:09:48 +03:00
|
|
|
if (!doasyncfree)
|
|
|
|
VOP_UPDATE(vp, NULL, NULL, 1);
|
1994-06-08 15:41:58 +04:00
|
|
|
}
|
1998-08-18 10:47:53 +04:00
|
|
|
if (ssize < len) {
|
1994-06-08 15:41:58 +04:00
|
|
|
if (doasyncfree)
|
|
|
|
bdwrite(ebp);
|
|
|
|
else
|
|
|
|
bwrite(ebp);
|
1998-08-18 10:47:53 +04:00
|
|
|
}
|
1994-06-08 15:41:58 +04:00
|
|
|
/*
|
|
|
|
* Last, free the old blocks and assign the new blocks to the buffers.
|
|
|
|
*/
|
1994-12-14 16:03:35 +03:00
|
|
|
#ifdef DEBUG
|
|
|
|
if (prtrealloc)
|
1996-10-13 01:58:44 +04:00
|
|
|
printf("\n\tnew:");
|
1994-12-14 16:03:35 +03:00
|
|
|
#endif
|
1994-06-08 15:41:58 +04:00
|
|
|
for (blkno = newblk, i = 0; i < len; i++, blkno += fs->fs_frag) {
|
1999-11-15 21:49:07 +03:00
|
|
|
if (!DOINGSOFTDEP(vp))
|
|
|
|
ffs_blkfree(ip,
|
|
|
|
dbtofsb(fs, buflist->bs_children[i]->b_blkno),
|
|
|
|
fs->fs_bsize);
|
1994-06-08 15:41:58 +04:00
|
|
|
buflist->bs_children[i]->b_blkno = fsbtodb(fs, blkno);
|
1994-12-14 16:03:35 +03:00
|
|
|
#ifdef DEBUG
|
1998-03-01 05:20:01 +03:00
|
|
|
if (!ffs_checkblk(ip,
|
|
|
|
dbtofsb(fs, buflist->bs_children[i]->b_blkno), fs->fs_bsize))
|
|
|
|
panic("ffs_reallocblks: unallocated block 3");
|
1994-12-14 16:03:35 +03:00
|
|
|
if (prtrealloc)
|
1996-10-13 01:58:44 +04:00
|
|
|
printf(" %d,", blkno);
|
1994-12-14 16:03:35 +03:00
|
|
|
#endif
|
|
|
|
}
|
|
|
|
#ifdef DEBUG
|
|
|
|
if (prtrealloc) {
|
|
|
|
prtrealloc--;
|
1996-10-13 01:58:44 +04:00
|
|
|
printf("\n");
|
1994-06-08 15:41:58 +04:00
|
|
|
}
|
1994-12-14 16:03:35 +03:00
|
|
|
#endif
|
1994-06-08 15:41:58 +04:00
|
|
|
return (0);
|
|
|
|
|
|
|
|
fail:
|
|
|
|
if (ssize < len)
|
|
|
|
brelse(ebp);
|
1997-06-11 14:09:37 +04:00
|
|
|
if (sbap != &ip->i_ffs_db[0])
|
1994-06-08 15:41:58 +04:00
|
|
|
brelse(sbp);
|
|
|
|
return (ENOSPC);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Allocate an inode in the file system.
|
|
|
|
*
|
|
|
|
* If allocating a directory, use ffs_dirpref to select the inode.
|
|
|
|
* If allocating in a directory, the following hierarchy is followed:
|
|
|
|
* 1) allocate the preferred inode.
|
|
|
|
* 2) allocate an inode in the same cylinder group.
|
|
|
|
* 3) quadratically rehash into other cylinder groups, until an
|
|
|
|
* available inode is located.
|
2001-08-24 14:24:45 +04:00
|
|
|
* If no inode preference is given the following hierarchy is used
|
1994-06-08 15:41:58 +04:00
|
|
|
* to allocate an inode:
|
|
|
|
* 1) allocate an inode in cylinder group 0.
|
|
|
|
* 2) quadratically rehash into other cylinder groups, until an
|
|
|
|
* available inode is located.
|
|
|
|
*/
|
1996-02-10 01:22:18 +03:00
|
|
|
int
|
|
|
|
ffs_valloc(v)
|
|
|
|
void *v;
|
|
|
|
{
|
1994-06-08 15:41:58 +04:00
|
|
|
struct vop_valloc_args /* {
|
|
|
|
struct vnode *a_pvp;
|
|
|
|
int a_mode;
|
|
|
|
struct ucred *a_cred;
|
|
|
|
struct vnode **a_vpp;
|
1996-02-10 01:22:18 +03:00
|
|
|
} */ *ap = v;
|
2000-03-30 16:41:09 +04:00
|
|
|
struct vnode *pvp = ap->a_pvp;
|
|
|
|
struct inode *pip;
|
|
|
|
struct fs *fs;
|
|
|
|
struct inode *ip;
|
1994-06-08 15:41:58 +04:00
|
|
|
mode_t mode = ap->a_mode;
|
|
|
|
ino_t ino, ipref;
|
|
|
|
int cg, error;
|
|
|
|
|
|
|
|
*ap->a_vpp = NULL;
|
|
|
|
pip = VTOI(pvp);
|
|
|
|
fs = pip->i_fs;
|
|
|
|
if (fs->fs_cstotal.cs_nifree == 0)
|
|
|
|
goto noinodes;
|
|
|
|
|
|
|
|
if ((mode & IFMT) == IFDIR)
|
Incorporate the enhanced ffs_dirpref() by Grigoriy Orlov, as found in
FreeBSD (three commits; the initial work, man page updates, and a fix
to ffs_reload()), with the following differences:
- Be consistent between newfs(8) and tunefs(8) as to the options which
set and control the tuning parameters for this work (avgfilesize & avgfpdir)
- Use u_int16_t instead of u_int8_t to keep track of the number of
contiguous directories (suggested by Chuck Silvers)
- Work within our FFS_EI framework
- Ensure that fs->fs_maxclusters and fs->fs_contigdirs don't point to
the same area of memory
The new algorithm has a marked performance increase, especially when
performing tasks such as untarring pkgsrc.tar.gz, etc.
The original FreeBSD commit messages are attached:
=====
mckusick 2001/04/10 01:39:00 PDT
Directory layout preference improvements from Grigoriy Orlov <gluk@ptci.ru>.
His description of the problem and solution follow. My own tests show
speedups on typical filesystem intensive workloads of 5% to 12% which
is very impressive considering the small amount of code change involved.
------
One day I noticed that some file operations run much faster on
small file systems then on big ones. I've looked at the ffs
algorithms, thought about them, and redesigned the dirpref algorithm.
First I want to describe the results of my tests. These results are old
and I have improved the algorithm after these tests were done. Nevertheless
they show how big the perfomance speedup may be. I have done two file/directory
intensive tests on a two OpenBSD systems with old and new dirpref algorithm.
The first test is "tar -xzf ports.tar.gz", the second is "rm -rf ports".
The ports.tar.gz file is the ports collection from the OpenBSD 2.8 release.
It contains 6596 directories and 13868 files. The test systems are:
1. Celeron-450, 128Mb, two IDE drives, the system at wd0, file system for
test is at wd1. Size of test file system is 8 Gb, number of cg=991,
size of cg is 8m, block size = 8k, fragment size = 1k OpenBSD-current
from Dec 2000 with BUFCACHEPERCENT=35
2. PIII-600, 128Mb, two IBM DTLA-307045 IDE drives at i815e, the system
at wd0, file system for test is at wd1. Size of test file system is 40 Gb,
number of cg=5324, size of cg is 8m, block size = 8k, fragment size = 1k
OpenBSD-current from Dec 2000 with BUFCACHEPERCENT=50
You can get more info about the test systems and methods at:
http://www.ptci.ru/gluk/dirpref/old/dirpref.html
Test Results
tar -xzf ports.tar.gz rm -rf ports
mode old dirpref new dirpref speedup old dirprefnew dirpref speedup
First system
normal 667 472 1.41 477 331 1.44
async 285 144 1.98 130 14 9.29
sync 768 616 1.25 477 334 1.43
softdep 413 252 1.64 241 38 6.34
Second system
normal 329 81 4.06 263.5 93.5 2.81
async 302 25.7 11.75 112 2.26 49.56
sync 281 57.0 4.93 263 90.5 2.9
softdep 341 40.6 8.4 284 4.76 59.66
"old dirpref" and "new dirpref" columns give a test time in seconds.
speedup - speed increasement in times, ie. old dirpref / new dirpref.
------
Algorithm description
The old dirpref algorithm is described in comments:
/*
* Find a cylinder to place a directory.
*
* The policy implemented by this algorithm is to select from
* among those cylinder groups with above the average number of
* free inodes, the one with the smallest number of directories.
*/
A new directory is allocated in a different cylinder groups than its
parent directory resulting in a directory tree that is spreaded across
all the cylinder groups. This spreading out results in a non-optimal
access to the directories and files. When we have a small filesystem
it is not a problem but when the filesystem is big then perfomance
degradation becomes very apparent.
What I mean by a big file system ?
1. A big filesystem is a filesystem which occupy 20-30 or more percent
of total drive space, i.e. first and last cylinder are physically
located relatively far from each other.
2. It has a relatively large number of cylinder groups, for example
more cylinder groups than 50% of the buffers in the buffer cache.
The first results in long access times, while the second results in
many buffers being used by metadata operations. Such operations use
cylinder group blocks and on-disk inode blocks. The cylinder group
block (fs->fs_cblkno) contains struct cg, inode and block bit maps.
It is 2k in size for the default filesystem parameters. If new and
parent directories are located in different cylinder groups then the
system performs more input/output operations and uses more buffers.
On filesystems with many cylinder groups, lots of cache buffers are
used for metadata operations.
My solution for this problem is very simple. I allocate many directories
in one cylinder group. I also do some things, so that the new allocation
method does not cause excessive fragmentation and all directory inodes
will not be located at a location far from its file's inodes and data.
The algorithm is:
/*
* Find a cylinder group to place a directory.
*
* The policy implemented by this algorithm is to allocate a
* directory inode in the same cylinder group as its parent
* directory, but also to reserve space for its files inodes
* and data. Restrict the number of directories which may be
* allocated one after another in the same cylinder group
* without intervening allocation of files.
*
* If we allocate a first level directory then force allocation
* in another cylinder group.
*/
My early versions of dirpref give me a good results for a wide range of
file operations and different filesystem capacities except one case:
those applications that create their entire directory structure first
and only later fill this structure with files.
My solution for such and similar cases is to limit a number of
directories which may be created one after another in the same cylinder
group without intervening file creations. For this purpose, I allocate
an array of counters at mount time. This array is linked to the superblock
fs->fs_contigdirs[cg]. Each time a directory is created the counter
increases and each time a file is created the counter decreases. A 60Gb
filesystem with 8mb/cg requires 10kb of memory for the counters array.
The maxcontigdirs is a maximum number of directories which may be created
without an intervening file creation. I found in my tests that the best
performance occurs when I restrict the number of directories in one cylinder
group such that all its files may be located in the same cylinder group.
There may be some deterioration in performance if all the file inodes
are in the same cylinder group as its containing directory, but their
data partially resides in a different cylinder group. The maxcontigdirs
value is calculated to try to prevent this condition. Since there is
no way to know how many files and directories will be allocated later
I added two optimization parameters in superblock/tunefs. They are:
int32_t fs_avgfilesize; /* expected average file size */
int32_t fs_avgfpdir; /* expected # of files per directory */
These parameters have reasonable defaults but may be tweeked for special
uses of a filesystem. They are only necessary in rare cases like better
tuning a filesystem being used to store a squid cache.
I have been using this algorithm for about 3 months. I have done
a lot of testing on filesystems with different capacities, average
filesize, average number of files per directory, and so on. I think
this algorithm has no negative impact on filesystem perfomance. It
works better than the default one in all cases. The new dirpref
will greatly improve untarring/removing/coping of big directories,
decrease load on cvs servers and much more. The new dirpref doesn't
speedup a compilation process, but also doesn't slow it down.
Obtained from: Grigoriy Orlov <gluk@ptci.ru>
=====
=====
iedowse 2001/04/23 17:37:17 PDT
Pre-dirpref versions of fsck may zero out the new superblock fields
fs_contigdirs, fs_avgfilesize and fs_avgfpdir. This could cause
panics if these fields were zeroed while a filesystem was mounted
read-only, and then remounted read-write.
Add code to ffs_reload() which copies the fs_contigdirs pointer
from the previous superblock, and reinitialises fs_avgf* if necessary.
Reviewed by: mckusick
=====
=====
nik 2001/04/10 03:36:44 PDT
Add information about the new options to newfs and tunefs which set the
expected average file size and number of files per directory. Could do
with some fleshing out.
=====
2001-09-06 06:16:00 +04:00
|
|
|
ipref = ffs_dirpref(pip);
|
|
|
|
else
|
|
|
|
ipref = pip->i_number;
|
1994-06-08 15:41:58 +04:00
|
|
|
if (ipref >= fs->fs_ncg * fs->fs_ipg)
|
|
|
|
ipref = 0;
|
|
|
|
cg = ino_to_cg(fs, ipref);
|
Incorporate the enhanced ffs_dirpref() by Grigoriy Orlov, as found in
FreeBSD (three commits; the initial work, man page updates, and a fix
to ffs_reload()), with the following differences:
- Be consistent between newfs(8) and tunefs(8) as to the options which
set and control the tuning parameters for this work (avgfilesize & avgfpdir)
- Use u_int16_t instead of u_int8_t to keep track of the number of
contiguous directories (suggested by Chuck Silvers)
- Work within our FFS_EI framework
- Ensure that fs->fs_maxclusters and fs->fs_contigdirs don't point to
the same area of memory
The new algorithm has a marked performance increase, especially when
performing tasks such as untarring pkgsrc.tar.gz, etc.
The original FreeBSD commit messages are attached:
=====
mckusick 2001/04/10 01:39:00 PDT
Directory layout preference improvements from Grigoriy Orlov <gluk@ptci.ru>.
His description of the problem and solution follow. My own tests show
speedups on typical filesystem intensive workloads of 5% to 12% which
is very impressive considering the small amount of code change involved.
------
One day I noticed that some file operations run much faster on
small file systems then on big ones. I've looked at the ffs
algorithms, thought about them, and redesigned the dirpref algorithm.
First I want to describe the results of my tests. These results are old
and I have improved the algorithm after these tests were done. Nevertheless
they show how big the perfomance speedup may be. I have done two file/directory
intensive tests on a two OpenBSD systems with old and new dirpref algorithm.
The first test is "tar -xzf ports.tar.gz", the second is "rm -rf ports".
The ports.tar.gz file is the ports collection from the OpenBSD 2.8 release.
It contains 6596 directories and 13868 files. The test systems are:
1. Celeron-450, 128Mb, two IDE drives, the system at wd0, file system for
test is at wd1. Size of test file system is 8 Gb, number of cg=991,
size of cg is 8m, block size = 8k, fragment size = 1k OpenBSD-current
from Dec 2000 with BUFCACHEPERCENT=35
2. PIII-600, 128Mb, two IBM DTLA-307045 IDE drives at i815e, the system
at wd0, file system for test is at wd1. Size of test file system is 40 Gb,
number of cg=5324, size of cg is 8m, block size = 8k, fragment size = 1k
OpenBSD-current from Dec 2000 with BUFCACHEPERCENT=50
You can get more info about the test systems and methods at:
http://www.ptci.ru/gluk/dirpref/old/dirpref.html
Test Results
tar -xzf ports.tar.gz rm -rf ports
mode old dirpref new dirpref speedup old dirprefnew dirpref speedup
First system
normal 667 472 1.41 477 331 1.44
async 285 144 1.98 130 14 9.29
sync 768 616 1.25 477 334 1.43
softdep 413 252 1.64 241 38 6.34
Second system
normal 329 81 4.06 263.5 93.5 2.81
async 302 25.7 11.75 112 2.26 49.56
sync 281 57.0 4.93 263 90.5 2.9
softdep 341 40.6 8.4 284 4.76 59.66
"old dirpref" and "new dirpref" columns give a test time in seconds.
speedup - the factor by which the test got faster, i.e. old dirpref / new dirpref.
------
Algorithm description
The old dirpref algorithm is described in comments:
/*
* Find a cylinder to place a directory.
*
* The policy implemented by this algorithm is to select from
* among those cylinder groups with above the average number of
* free inodes, the one with the smallest number of directories.
*/
A new directory is allocated in a different cylinder group than its
parent directory, resulting in a directory tree that is spread across
all the cylinder groups. This spreading out results in non-optimal
access to the directories and files. When we have a small filesystem
it is not a problem, but when the filesystem is big the performance
degradation becomes very apparent.
What do I mean by a big file system?
1. A big filesystem is a filesystem which occupies 20-30 or more percent
of total drive space, i.e. first and last cylinder are physically
located relatively far from each other.
2. It has a relatively large number of cylinder groups, for example
more cylinder groups than 50% of the buffers in the buffer cache.
The first results in long access times, while the second results in
many buffers being used by metadata operations. Such operations use
cylinder group blocks and on-disk inode blocks. The cylinder group
block (fs->fs_cblkno) contains struct cg, inode and block bit maps.
It is 2k in size for the default filesystem parameters. If new and
parent directories are located in different cylinder groups then the
system performs more input/output operations and uses more buffers.
On filesystems with many cylinder groups, lots of cache buffers are
used for metadata operations.
My solution for this problem is very simple. I allocate many directories
in one cylinder group. I also do some things, so that the new allocation
method does not cause excessive fragmentation and all directory inodes
will not be located at a location far from its file's inodes and data.
The algorithm is:
/*
* Find a cylinder group to place a directory.
*
* The policy implemented by this algorithm is to allocate a
* directory inode in the same cylinder group as its parent
* directory, but also to reserve space for its files inodes
* and data. Restrict the number of directories which may be
* allocated one after another in the same cylinder group
* without intervening allocation of files.
*
* If we allocate a first level directory then force allocation
* in another cylinder group.
*/
My early versions of dirpref gave me good results for a wide range of
file operations and different filesystem capacities except in one case:
those applications that create their entire directory structure first
and only later fill this structure with files.
My solution for such and similar cases is to limit the number of
directories which may be created one after another in the same cylinder
group without intervening file creations. For this purpose, I allocate
an array of counters at mount time. This array is linked to the superblock
fs->fs_contigdirs[cg]. Each time a directory is created the counter
increases and each time a file is created the counter decreases. A 60Gb
filesystem with 8mb/cg requires 10kb of memory for the counters array.
The maxcontigdirs is a maximum number of directories which may be created
without an intervening file creation. I found in my tests that the best
performance occurs when I restrict the number of directories in one cylinder
group such that all its files may be located in the same cylinder group.
There may be some deterioration in performance if all the file inodes
are in the same cylinder group as its containing directory, but their
data partially resides in a different cylinder group. The maxcontigdirs
value is calculated to try to prevent this condition. Since there is
no way to know how many files and directories will be allocated later
I added two optimization parameters in superblock/tunefs. They are:
int32_t fs_avgfilesize; /* expected average file size */
int32_t fs_avgfpdir; /* expected # of files per directory */
These parameters have reasonable defaults but may be tweaked for special
uses of a filesystem. They are only necessary in rare cases like better
tuning a filesystem being used to store a squid cache.
I have been using this algorithm for about 3 months. I have done
a lot of testing on filesystems with different capacities, average
filesize, average number of files per directory, and so on. I think
this algorithm has no negative impact on filesystem performance. It
works better than the default one in all cases. The new dirpref
will greatly improve untarring/removing/copying of big directories,
decrease load on cvs servers and much more. The new dirpref doesn't
speed up a compilation process, but also doesn't slow it down.
Obtained from: Grigoriy Orlov <gluk@ptci.ru>
=====
=====
iedowse 2001/04/23 17:37:17 PDT
Pre-dirpref versions of fsck may zero out the new superblock fields
fs_contigdirs, fs_avgfilesize and fs_avgfpdir. This could cause
panics if these fields were zeroed while a filesystem was mounted
read-only, and then remounted read-write.
Add code to ffs_reload() which copies the fs_contigdirs pointer
from the previous superblock, and reinitialises fs_avgf* if necessary.
Reviewed by: mckusick
=====
=====
nik 2001/04/10 03:36:44 PDT
Add information about the new options to newfs and tunefs which set the
expected average file size and number of files per directory. Could do
with some fleshing out.
=====
2001-09-06 06:16:00 +04:00
|
|
|
/*
|
|
|
|
* Track number of dirs created one after another
|
|
|
|
* in a same cg without intervening by files.
|
|
|
|
*/
|
|
|
|
if ((mode & IFMT) == IFDIR) {
|
|
|
|
if (fs->fs_contigdirs[cg] < 65535)
|
|
|
|
fs->fs_contigdirs[cg]++;
|
|
|
|
} else {
|
|
|
|
if (fs->fs_contigdirs[cg] > 0)
|
|
|
|
fs->fs_contigdirs[cg]--;
|
|
|
|
}
|
1994-06-08 15:41:58 +04:00
|
|
|
ino = (ino_t)ffs_hashalloc(pip, cg, (long)ipref, mode, ffs_nodealloccg);
|
|
|
|
if (ino == 0)
|
|
|
|
goto noinodes;
|
|
|
|
error = VFS_VGET(pvp->v_mount, ino, ap->a_vpp);
|
|
|
|
if (error) {
|
|
|
|
VOP_VFREE(pvp, ino, mode);
|
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
ip = VTOI(*ap->a_vpp);
|
1997-06-11 14:09:37 +04:00
|
|
|
if (ip->i_ffs_mode) {
|
1996-10-13 01:58:44 +04:00
|
|
|
printf("mode = 0%o, inum = %d, fs = %s\n",
|
1997-06-11 14:09:37 +04:00
|
|
|
ip->i_ffs_mode, ip->i_number, fs->fs_fsmnt);
|
1994-06-08 15:41:58 +04:00
|
|
|
panic("ffs_valloc: dup alloc");
|
|
|
|
}
|
1997-06-11 14:09:37 +04:00
|
|
|
if (ip->i_ffs_blocks) { /* XXX */
|
1996-10-13 01:58:44 +04:00
|
|
|
printf("free inode %s/%d had %d blocks\n",
|
1997-06-11 14:09:37 +04:00
|
|
|
fs->fs_fsmnt, ino, ip->i_ffs_blocks);
|
|
|
|
ip->i_ffs_blocks = 0;
|
1994-06-08 15:41:58 +04:00
|
|
|
}
|
1997-06-11 14:09:37 +04:00
|
|
|
ip->i_ffs_flags = 0;
|
1994-06-08 15:41:58 +04:00
|
|
|
/*
|
|
|
|
* Set up a new generation number for this inode.
|
|
|
|
*/
|
1997-06-11 14:09:37 +04:00
|
|
|
ip->i_ffs_gen++;
|
1994-06-08 15:41:58 +04:00
|
|
|
return (0);
|
|
|
|
noinodes:
|
|
|
|
ffs_fserr(fs, ap->a_cred->cr_uid, "out of inodes");
|
|
|
|
uprintf("\n%s: create/symlink failed, no inodes free\n", fs->fs_fsmnt);
|
|
|
|
return (ENOSPC);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
Incorporate the enhanced ffs_dirpref() by Grigoriy Orlov, as found in
FreeBSD (three commits; the initial work, man page updates, and a fix
to ffs_reload()), with the following differences:
- Be consistent between newfs(8) and tunefs(8) as to the options which
set and control the tuning parameters for this work (avgfilesize & avgfpdir)
- Use u_int16_t instead of u_int8_t to keep track of the number of
contiguous directories (suggested by Chuck Silvers)
- Work within our FFS_EI framework
- Ensure that fs->fs_maxclusters and fs->fs_contigdirs don't point to
the same area of memory
The new algorithm has a marked performance increase, especially when
performing tasks such as untarring pkgsrc.tar.gz, etc.
The original FreeBSD commit messages are attached:
=====
mckusick 2001/04/10 01:39:00 PDT
Directory layout preference improvements from Grigoriy Orlov <gluk@ptci.ru>.
His description of the problem and solution follow. My own tests show
speedups on typical filesystem intensive workloads of 5% to 12% which
is very impressive considering the small amount of code change involved.
------
One day I noticed that some file operations run much faster on
small file systems than on big ones. I've looked at the ffs
algorithms, thought about them, and redesigned the dirpref algorithm.
First I want to describe the results of my tests. These results are old
and I have improved the algorithm after these tests were done. Nevertheless
they show how big the performance speedup may be. I have done two file/directory
intensive tests on two OpenBSD systems with the old and new dirpref algorithms.
The first test is "tar -xzf ports.tar.gz", the second is "rm -rf ports".
The ports.tar.gz file is the ports collection from the OpenBSD 2.8 release.
It contains 6596 directories and 13868 files. The test systems are:
1. Celeron-450, 128Mb, two IDE drives, the system at wd0, file system for
test is at wd1. Size of test file system is 8 Gb, number of cg=991,
size of cg is 8m, block size = 8k, fragment size = 1k OpenBSD-current
from Dec 2000 with BUFCACHEPERCENT=35
2. PIII-600, 128Mb, two IBM DTLA-307045 IDE drives at i815e, the system
at wd0, file system for test is at wd1. Size of test file system is 40 Gb,
number of cg=5324, size of cg is 8m, block size = 8k, fragment size = 1k
OpenBSD-current from Dec 2000 with BUFCACHEPERCENT=50
You can get more info about the test systems and methods at:
http://www.ptci.ru/gluk/dirpref/old/dirpref.html
Test Results
tar -xzf ports.tar.gz rm -rf ports
mode old dirpref new dirpref speedup old dirpref new dirpref speedup
First system
normal 667 472 1.41 477 331 1.44
async 285 144 1.98 130 14 9.29
sync 768 616 1.25 477 334 1.43
softdep 413 252 1.64 241 38 6.34
Second system
normal 329 81 4.06 263.5 93.5 2.81
async 302 25.7 11.75 112 2.26 49.56
sync 281 57.0 4.93 263 90.5 2.9
softdep 341 40.6 8.4 284 4.76 59.66
"old dirpref" and "new dirpref" columns give a test time in seconds.
speedup - the factor by which the test got faster, i.e. old dirpref / new dirpref.
------
Algorithm description
The old dirpref algorithm is described in comments:
/*
* Find a cylinder to place a directory.
*
* The policy implemented by this algorithm is to select from
* among those cylinder groups with above the average number of
* free inodes, the one with the smallest number of directories.
*/
A new directory is allocated in a different cylinder group than its
parent directory, resulting in a directory tree that is spread across
all the cylinder groups. This spreading out results in non-optimal
access to the directories and files. When we have a small filesystem
it is not a problem, but when the filesystem is big the performance
degradation becomes very apparent.
What do I mean by a big file system?
1. A big filesystem is a filesystem which occupies 20-30 or more percent
of total drive space, i.e. first and last cylinder are physically
located relatively far from each other.
2. It has a relatively large number of cylinder groups, for example
more cylinder groups than 50% of the buffers in the buffer cache.
The first results in long access times, while the second results in
many buffers being used by metadata operations. Such operations use
cylinder group blocks and on-disk inode blocks. The cylinder group
block (fs->fs_cblkno) contains struct cg, inode and block bit maps.
It is 2k in size for the default filesystem parameters. If new and
parent directories are located in different cylinder groups then the
system performs more input/output operations and uses more buffers.
On filesystems with many cylinder groups, lots of cache buffers are
used for metadata operations.
My solution for this problem is very simple. I allocate many directories
in one cylinder group. I also do some things, so that the new allocation
method does not cause excessive fragmentation and all directory inodes
will not be located at a location far from its file's inodes and data.
The algorithm is:
/*
* Find a cylinder group to place a directory.
*
* The policy implemented by this algorithm is to allocate a
* directory inode in the same cylinder group as its parent
* directory, but also to reserve space for its files inodes
* and data. Restrict the number of directories which may be
* allocated one after another in the same cylinder group
* without intervening allocation of files.
*
* If we allocate a first level directory then force allocation
* in another cylinder group.
*/
My early versions of dirpref gave me good results for a wide range of
file operations and different filesystem capacities except in one case:
those applications that create their entire directory structure first
and only later fill this structure with files.
My solution for such and similar cases is to limit the number of
directories which may be created one after another in the same cylinder
group without intervening file creations. For this purpose, I allocate
an array of counters at mount time. This array is linked to the superblock
fs->fs_contigdirs[cg]. Each time a directory is created the counter
increases and each time a file is created the counter decreases. A 60Gb
filesystem with 8mb/cg requires 10kb of memory for the counters array.
The maxcontigdirs is a maximum number of directories which may be created
without an intervening file creation. I found in my tests that the best
performance occurs when I restrict the number of directories in one cylinder
group such that all its files may be located in the same cylinder group.
There may be some deterioration in performance if all the file inodes
are in the same cylinder group as its containing directory, but their
data partially resides in a different cylinder group. The maxcontigdirs
value is calculated to try to prevent this condition. Since there is
no way to know how many files and directories will be allocated later
I added two optimization parameters in superblock/tunefs. They are:
int32_t fs_avgfilesize; /* expected average file size */
int32_t fs_avgfpdir; /* expected # of files per directory */
These parameters have reasonable defaults but may be tweaked for special
uses of a filesystem. They are only necessary in rare cases like better
tuning a filesystem being used to store a squid cache.
I have been using this algorithm for about 3 months. I have done
a lot of testing on filesystems with different capacities, average
filesize, average number of files per directory, and so on. I think
this algorithm has no negative impact on filesystem performance. It
works better than the default one in all cases. The new dirpref
will greatly improve untarring/removing/copying of big directories,
decrease load on cvs servers and much more. The new dirpref doesn't
speed up a compilation process, but also doesn't slow it down.
Obtained from: Grigoriy Orlov <gluk@ptci.ru>
=====
=====
iedowse 2001/04/23 17:37:17 PDT
Pre-dirpref versions of fsck may zero out the new superblock fields
fs_contigdirs, fs_avgfilesize and fs_avgfpdir. This could cause
panics if these fields were zeroed while a filesystem was mounted
read-only, and then remounted read-write.
Add code to ffs_reload() which copies the fs_contigdirs pointer
from the previous superblock, and reinitialises fs_avgf* if necessary.
Reviewed by: mckusick
=====
=====
nik 2001/04/10 03:36:44 PDT
Add information about the new options to newfs and tunefs which set the
expected average file size and number of files per directory. Could do
with some fleshing out.
=====
2001-09-06 06:16:00 +04:00
|
|
|
* Find a cylinder group in which to place a directory.
|
1994-06-08 15:41:58 +04:00
|
|
|
*
|
Incorporate the enhanced ffs_dirpref() by Grigoriy Orlov, as found in
FreeBSD (three commits; the initial work, man page updates, and a fix
to ffs_reload()), with the following differences:
- Be consistent between newfs(8) and tunefs(8) as to the options which
set and control the tuning parameters for this work (avgfilesize & avgfpdir)
- Use u_int16_t instead of u_int8_t to keep track of the number of
contiguous directories (suggested by Chuck Silvers)
- Work within our FFS_EI framework
- Ensure that fs->fs_maxclusters and fs->fs_contigdirs don't point to
the same area of memory
The new algorithm has a marked performance increase, especially when
performing tasks such as untarring pkgsrc.tar.gz, etc.
The original FreeBSD commit messages are attached:
=====
mckusick 2001/04/10 01:39:00 PDT
Directory layout preference improvements from Grigoriy Orlov <gluk@ptci.ru>.
His description of the problem and solution follow. My own tests show
speedups on typical filesystem intensive workloads of 5% to 12% which
is very impressive considering the small amount of code change involved.
------
One day I noticed that some file operations run much faster on
small file systems than on big ones. I've looked at the ffs
algorithms, thought about them, and redesigned the dirpref algorithm.
First I want to describe the results of my tests. These results are old
and I have improved the algorithm after these tests were done. Nevertheless
they show how big the performance speedup may be. I have done two file/directory
intensive tests on two OpenBSD systems with the old and new dirpref algorithms.
The first test is "tar -xzf ports.tar.gz", the second is "rm -rf ports".
The ports.tar.gz file is the ports collection from the OpenBSD 2.8 release.
It contains 6596 directories and 13868 files. The test systems are:
1. Celeron-450, 128Mb, two IDE drives, the system at wd0, file system for
test is at wd1. Size of test file system is 8 Gb, number of cg=991,
size of cg is 8m, block size = 8k, fragment size = 1k OpenBSD-current
from Dec 2000 with BUFCACHEPERCENT=35
2. PIII-600, 128Mb, two IBM DTLA-307045 IDE drives at i815e, the system
at wd0, file system for test is at wd1. Size of test file system is 40 Gb,
number of cg=5324, size of cg is 8m, block size = 8k, fragment size = 1k
OpenBSD-current from Dec 2000 with BUFCACHEPERCENT=50
You can get more info about the test systems and methods at:
http://www.ptci.ru/gluk/dirpref/old/dirpref.html
Test Results
tar -xzf ports.tar.gz rm -rf ports
mode old dirpref new dirpref speedup old dirpref new dirpref speedup
First system
normal 667 472 1.41 477 331 1.44
async 285 144 1.98 130 14 9.29
sync 768 616 1.25 477 334 1.43
softdep 413 252 1.64 241 38 6.34
Second system
normal 329 81 4.06 263.5 93.5 2.81
async 302 25.7 11.75 112 2.26 49.56
sync 281 57.0 4.93 263 90.5 2.9
softdep 341 40.6 8.4 284 4.76 59.66
"old dirpref" and "new dirpref" columns give a test time in seconds.
speedup - the factor by which the test got faster, i.e. old dirpref / new dirpref.
------
Algorithm description
The old dirpref algorithm is described in comments:
/*
* Find a cylinder to place a directory.
*
* The policy implemented by this algorithm is to select from
* among those cylinder groups with above the average number of
* free inodes, the one with the smallest number of directories.
*/
A new directory is allocated in a different cylinder group than its
parent directory, resulting in a directory tree that is spread across
all the cylinder groups. This spreading out results in non-optimal
access to the directories and files. When we have a small filesystem
it is not a problem, but when the filesystem is big the performance
degradation becomes very apparent.
What do I mean by a big file system?
1. A big filesystem is a filesystem which occupies 20-30 or more percent
of total drive space, i.e. first and last cylinder are physically
located relatively far from each other.
2. It has a relatively large number of cylinder groups, for example
more cylinder groups than 50% of the buffers in the buffer cache.
The first results in long access times, while the second results in
many buffers being used by metadata operations. Such operations use
cylinder group blocks and on-disk inode blocks. The cylinder group
block (fs->fs_cblkno) contains struct cg, inode and block bit maps.
It is 2k in size for the default filesystem parameters. If new and
parent directories are located in different cylinder groups then the
system performs more input/output operations and uses more buffers.
On filesystems with many cylinder groups, lots of cache buffers are
used for metadata operations.
My solution for this problem is very simple. I allocate many directories
in one cylinder group. I also do some things, so that the new allocation
method does not cause excessive fragmentation and all directory inodes
will not be located at a location far from its file's inodes and data.
The algorithm is:
/*
* Find a cylinder group to place a directory.
*
* The policy implemented by this algorithm is to allocate a
* directory inode in the same cylinder group as its parent
* directory, but also to reserve space for its files inodes
* and data. Restrict the number of directories which may be
* allocated one after another in the same cylinder group
* without intervening allocation of files.
*
* If we allocate a first level directory then force allocation
* in another cylinder group.
*/
My early versions of dirpref gave me good results for a wide range of
file operations and different filesystem capacities except in one case:
those applications that create their entire directory structure first
and only later fill this structure with files.
My solution for such and similar cases is to limit the number of
directories which may be created one after another in the same cylinder
group without intervening file creations. For this purpose, I allocate
an array of counters at mount time. This array is linked to the superblock
fs->fs_contigdirs[cg]. Each time a directory is created the counter
increases and each time a file is created the counter decreases. A 60Gb
filesystem with 8mb/cg requires 10kb of memory for the counters array.
The maxcontigdirs is a maximum number of directories which may be created
without an intervening file creation. I found in my tests that the best
performance occurs when I restrict the number of directories in one cylinder
group such that all its files may be located in the same cylinder group.
There may be some deterioration in performance if all the file inodes
are in the same cylinder group as its containing directory, but their
data partially resides in a different cylinder group. The maxcontigdirs
value is calculated to try to prevent this condition. Since there is
no way to know how many files and directories will be allocated later
I added two optimization parameters in superblock/tunefs. They are:
int32_t fs_avgfilesize; /* expected average file size */
int32_t fs_avgfpdir; /* expected # of files per directory */
These parameters have reasonable defaults but may be tweaked for special
uses of a filesystem. They are only necessary in rare cases like better
tuning a filesystem being used to store a squid cache.
I have been using this algorithm for about 3 months. I have done
a lot of testing on filesystems with different capacities, average
filesize, average number of files per directory, and so on. I think
this algorithm has no negative impact on filesystem performance. It
works better than the default one in all cases. The new dirpref
will greatly improve untarring/removing/copying of big directories,
decrease load on cvs servers and much more. The new dirpref doesn't
speed up a compilation process, but also doesn't slow it down.
Obtained from: Grigoriy Orlov <gluk@ptci.ru>
=====
=====
iedowse 2001/04/23 17:37:17 PDT
Pre-dirpref versions of fsck may zero out the new superblock fields
fs_contigdirs, fs_avgfilesize and fs_avgfpdir. This could cause
panics if these fields were zeroed while a filesystem was mounted
read-only, and then remounted read-write.
Add code to ffs_reload() which copies the fs_contigdirs pointer
from the previous superblock, and reinitialises fs_avgf* if necessary.
Reviewed by: mckusick
=====
=====
nik 2001/04/10 03:36:44 PDT
Add information about the new options to newfs and tunefs which set the
expected average file size and number of files per directory. Could do
with some fleshing out.
=====
2001-09-06 06:16:00 +04:00
|
|
|
* The policy implemented by this algorithm is to allocate a
|
|
|
|
* directory inode in the same cylinder group as its parent
|
|
|
|
* directory, but also to reserve space for its files inodes
|
|
|
|
* and data. Restrict the number of directories which may be
|
|
|
|
* allocated one after another in the same cylinder group
|
|
|
|
* without intervening allocation of files.
|
2001-03-14 00:16:23 +03:00
|
|
|
*
|
Incorporate the enhanced ffs_dirpref() by Grigoriy Orlov, as found in
FreeBSD (three commits; the initial work, man page updates, and a fix
to ffs_reload()), with the following differences:
- Be consistent between newfs(8) and tunefs(8) as to the options which
set and control the tuning parameters for this work (avgfilesize & avgfpdir)
- Use u_int16_t instead of u_int8_t to keep track of the number of
contiguous directories (suggested by Chuck Silvers)
- Work within our FFS_EI framework
- Ensure that fs->fs_maxclusters and fs->fs_contigdirs don't point to
the same area of memory
The new algorithm has a marked performance increase, especially when
performing tasks such as untarring pkgsrc.tar.gz, etc.
The original FreeBSD commit messages are attached:
=====
mckusick 2001/04/10 01:39:00 PDT
Directory layout preference improvements from Grigoriy Orlov <gluk@ptci.ru>.
His description of the problem and solution follow. My own tests show
speedups on typical filesystem intensive workloads of 5% to 12% which
is very impressive considering the small amount of code change involved.
------
One day I noticed that some file operations run much faster on
small file systems than on big ones. I've looked at the ffs
algorithms, thought about them, and redesigned the dirpref algorithm.
First I want to describe the results of my tests. These results are old
and I have improved the algorithm after these tests were done. Nevertheless
they show how big the performance speedup may be. I have done two file/directory
intensive tests on two OpenBSD systems with the old and new dirpref algorithms.
The first test is "tar -xzf ports.tar.gz", the second is "rm -rf ports".
The ports.tar.gz file is the ports collection from the OpenBSD 2.8 release.
It contains 6596 directories and 13868 files. The test systems are:
1. Celeron-450, 128Mb, two IDE drives, the system at wd0, file system for
test is at wd1. Size of test file system is 8 Gb, number of cg=991,
size of cg is 8m, block size = 8k, fragment size = 1k OpenBSD-current
from Dec 2000 with BUFCACHEPERCENT=35
2. PIII-600, 128Mb, two IBM DTLA-307045 IDE drives at i815e, the system
at wd0, file system for test is at wd1. Size of test file system is 40 Gb,
number of cg=5324, size of cg is 8m, block size = 8k, fragment size = 1k
OpenBSD-current from Dec 2000 with BUFCACHEPERCENT=50
You can get more info about the test systems and methods at:
http://www.ptci.ru/gluk/dirpref/old/dirpref.html
Test Results
tar -xzf ports.tar.gz rm -rf ports
mode old dirpref new dirpref speedup old dirpref new dirpref speedup
First system
normal 667 472 1.41 477 331 1.44
async 285 144 1.98 130 14 9.29
sync 768 616 1.25 477 334 1.43
softdep 413 252 1.64 241 38 6.34
Second system
normal 329 81 4.06 263.5 93.5 2.81
async 302 25.7 11.75 112 2.26 49.56
sync 281 57.0 4.93 263 90.5 2.9
softdep 341 40.6 8.4 284 4.76 59.66
"old dirpref" and "new dirpref" columns give a test time in seconds.
speedup - the factor by which the test got faster, i.e. old dirpref / new dirpref.
------
Algorithm description
The old dirpref algorithm is described in comments:
/*
* Find a cylinder to place a directory.
*
* The policy implemented by this algorithm is to select from
* among those cylinder groups with above the average number of
* free inodes, the one with the smallest number of directories.
*/
A new directory is allocated in a different cylinder group than its
parent directory, resulting in a directory tree that is spread across
all the cylinder groups. This spreading out results in non-optimal
access to the directories and files. When we have a small filesystem
it is not a problem, but when the filesystem is big the performance
degradation becomes very apparent.
What do I mean by a big file system?
1. A big filesystem is a filesystem which occupies 20-30 or more percent
of total drive space, i.e. first and last cylinder are physically
located relatively far from each other.
2. It has a relatively large number of cylinder groups, for example
more cylinder groups than 50% of the buffers in the buffer cache.
The first results in long access times, while the second results in
many buffers being used by metadata operations. Such operations use
cylinder group blocks and on-disk inode blocks. The cylinder group
block (fs->fs_cblkno) contains struct cg, inode and block bit maps.
It is 2k in size for the default filesystem parameters. If new and
parent directories are located in different cylinder groups then the
system performs more input/output operations and uses more buffers.
On filesystems with many cylinder groups, lots of cache buffers are
used for metadata operations.
My solution for this problem is very simple. I allocate many directories
in one cylinder group. I also do some things, so that the new allocation
method does not cause excessive fragmentation and all directory inodes
will not be located at a location far from its file's inodes and data.
The algorithm is:
/*
* Find a cylinder group to place a directory.
*
* The policy implemented by this algorithm is to allocate a
* directory inode in the same cylinder group as its parent
* directory, but also to reserve space for its files inodes
* and data. Restrict the number of directories which may be
* allocated one after another in the same cylinder group
* without intervening allocation of files.
*
* If we allocate a first level directory then force allocation
* in another cylinder group.
*/
My early versions of dirpref gave me good results for a wide range of
file operations and different filesystem capacities except in one case:
those applications that create their entire directory structure first
and only later fill this structure with files.
My solution for such and similar cases is to limit the number of
directories which may be created one after another in the same cylinder
group without intervening file creations. For this purpose, I allocate
an array of counters at mount time. This array is linked to the superblock
fs->fs_contigdirs[cg]. Each time a directory is created the counter
increases and each time a file is created the counter decreases. A 60Gb
filesystem with 8mb/cg requires 10kb of memory for the counters array.
The maxcontigdirs is a maximum number of directories which may be created
without an intervening file creation. I found in my tests that the best
performance occurs when I restrict the number of directories in one cylinder
group such that all its files may be located in the same cylinder group.
There may be some deterioration in performance if all the file inodes
are in the same cylinder group as its containing directory, but their
data partially resides in a different cylinder group. The maxcontigdirs
value is calculated to try to prevent this condition. Since there is
no way to know how many files and directories will be allocated later
I added two optimization parameters in superblock/tunefs. They are:
int32_t fs_avgfilesize; /* expected average file size */
int32_t fs_avgfpdir; /* expected # of files per directory */
These parameters have reasonable defaults but may be tweaked for special
uses of a filesystem. They are only necessary in rare cases like better
tuning a filesystem being used to store a squid cache.
I have been using this algorithm for about 3 months. I have done
a lot of testing on filesystems with different capacities, average
filesize, average number of files per directory, and so on. I think
this algorithm has no negative impact on filesystem performance. It
works better than the default one in all cases. The new dirpref
will greatly improve untarring/removing/copying of big directories,
decrease load on cvs servers and much more. The new dirpref doesn't
speed up a compilation process, but also doesn't slow it down.
Obtained from: Grigoriy Orlov <gluk@ptci.ru>
=====
=====
iedowse 2001/04/23 17:37:17 PDT
Pre-dirpref versions of fsck may zero out the new superblock fields
fs_contigdirs, fs_avgfilesize and fs_avgfpdir. This could cause
panics if these fields were zeroed while a filesystem was mounted
read-only, and then remounted read-write.
Add code to ffs_reload() which copies the fs_contigdirs pointer
from the previous superblock, and reinitialises fs_avgf* if necessary.
Reviewed by: mckusick
=====
=====
nik 2001/04/10 03:36:44 PDT
Add information about the new options to newfs and tunefs which set the
expected average file size and number of files per directory. Could do
with some fleshing out.
=====
2001-09-06 06:16:00 +04:00
|
|
|
* If we allocate a first level directory then force allocation
|
|
|
|
* in another cylinder group.
|
1994-06-08 15:41:58 +04:00
|
|
|
*/
|
|
|
|
static ino_t
|
Incorporate the enhanced ffs_dirpref() by Grigoriy Orlov, as found in
FreeBSD (three commits; the initial work, man page updates, and a fix
to ffs_reload()), with the following differences:
- Be consistent between newfs(8) and tunefs(8) as to the options which
set and control the tuning parameters for this work (avgfilesize & avgfpdir)
- Use u_int16_t instead of u_int8_t to keep track of the number of
contiguous directories (suggested by Chuck Silvers)
- Work within our FFS_EI framework
- Ensure that fs->fs_maxclusters and fs->fs_contigdirs don't point to
the same area of memory
The new algorithm has a marked performance increase, especially when
performing tasks such as untarring pkgsrc.tar.gz, etc.
The original FreeBSD commit messages are attached:
=====
mckusick 2001/04/10 01:39:00 PDT
Directory layout preference improvements from Grigoriy Orlov <gluk@ptci.ru>.
His description of the problem and solution follow. My own tests show
speedups on typical filesystem intensive workloads of 5% to 12% which
is very impressive considering the small amount of code change involved.
------
One day I noticed that some file operations run much faster on
small file systems than on big ones. I've looked at the ffs
algorithms, thought about them, and redesigned the dirpref algorithm.
First I want to describe the results of my tests. These results are old
and I have improved the algorithm after these tests were done. Nevertheless
they show how big the performance speedup may be. I have done two file/directory
intensive tests on two OpenBSD systems with the old and new dirpref algorithms.
The first test is "tar -xzf ports.tar.gz", the second is "rm -rf ports".
The ports.tar.gz file is the ports collection from the OpenBSD 2.8 release.
It contains 6596 directories and 13868 files. The test systems are:
1. Celeron-450, 128Mb, two IDE drives, the system at wd0, file system for
test is at wd1. Size of test file system is 8 Gb, number of cg=991,
size of cg is 8m, block size = 8k, fragment size = 1k OpenBSD-current
from Dec 2000 with BUFCACHEPERCENT=35
2. PIII-600, 128Mb, two IBM DTLA-307045 IDE drives at i815e, the system
at wd0, file system for test is at wd1. Size of test file system is 40 Gb,
number of cg=5324, size of cg is 8m, block size = 8k, fragment size = 1k
OpenBSD-current from Dec 2000 with BUFCACHEPERCENT=50
You can get more info about the test systems and methods at:
http://www.ptci.ru/gluk/dirpref/old/dirpref.html
Test Results
tar -xzf ports.tar.gz rm -rf ports
mode old dirpref new dirpref speedup old dirpref new dirpref speedup
First system
normal 667 472 1.41 477 331 1.44
async 285 144 1.98 130 14 9.29
sync 768 616 1.25 477 334 1.43
softdep 413 252 1.64 241 38 6.34
Second system
normal 329 81 4.06 263.5 93.5 2.81
async 302 25.7 11.75 112 2.26 49.56
sync 281 57.0 4.93 263 90.5 2.9
softdep 341 40.6 8.4 284 4.76 59.66
"old dirpref" and "new dirpref" columns give a test time in seconds.
speedup - the speed increase factor, i.e. old dirpref / new dirpref.
------
Algorithm description
The old dirpref algorithm is described in comments:
/*
* Find a cylinder to place a directory.
*
* The policy implemented by this algorithm is to select from
* among those cylinder groups with above the average number of
* free inodes, the one with the smallest number of directories.
*/
A new directory is allocated in a different cylinder group than its
parent directory, resulting in a directory tree that is spread across
all the cylinder groups. This spreading out results in a non-optimal
access to the directories and files. When we have a small filesystem
it is not a problem but when the filesystem is big then performance
degradation becomes very apparent.
What do I mean by a big file system?
1. A big filesystem is a filesystem which occupies 20-30 or more percent
of total drive space, i.e. first and last cylinder are physically
located relatively far from each other.
2. It has a relatively large number of cylinder groups, for example
more cylinder groups than 50% of the buffers in the buffer cache.
The first results in long access times, while the second results in
many buffers being used by metadata operations. Such operations use
cylinder group blocks and on-disk inode blocks. The cylinder group
block (fs->fs_cblkno) contains struct cg, inode and block bit maps.
It is 2k in size for the default filesystem parameters. If new and
parent directories are located in different cylinder groups then the
system performs more input/output operations and uses more buffers.
On filesystems with many cylinder groups, lots of cache buffers are
used for metadata operations.
My solution for this problem is very simple. I allocate many directories
in one cylinder group. I also do some things, so that the new allocation
method does not cause excessive fragmentation and all directory inodes
will not be located at a location far from its file's inodes and data.
The algorithm is:
/*
* Find a cylinder group to place a directory.
*
* The policy implemented by this algorithm is to allocate a
* directory inode in the same cylinder group as its parent
* directory, but also to reserve space for its files inodes
* and data. Restrict the number of directories which may be
* allocated one after another in the same cylinder group
* without intervening allocation of files.
*
* If we allocate a first level directory then force allocation
* in another cylinder group.
*/
My early versions of dirpref gave me good results for a wide range of
file operations and different filesystem capacities except one case:
those applications that create their entire directory structure first
and only later fill this structure with files.
My solution for such and similar cases is to limit a number of
directories which may be created one after another in the same cylinder
group without intervening file creations. For this purpose, I allocate
an array of counters at mount time. This array is linked to the superblock
fs->fs_contigdirs[cg]. Each time a directory is created the counter
increases and each time a file is created the counter decreases. A 60Gb
filesystem with 8mb/cg requires 10kb of memory for the counters array.
The maxcontigdirs is a maximum number of directories which may be created
without an intervening file creation. I found in my tests that the best
performance occurs when I restrict the number of directories in one cylinder
group such that all its files may be located in the same cylinder group.
There may be some deterioration in performance if all the file inodes
are in the same cylinder group as its containing directory, but their
data partially resides in a different cylinder group. The maxcontigdirs
value is calculated to try to prevent this condition. Since there is
no way to know how many files and directories will be allocated later
I added two optimization parameters in superblock/tunefs. They are:
int32_t fs_avgfilesize; /* expected average file size */
int32_t fs_avgfpdir; /* expected # of files per directory */
These parameters have reasonable defaults but may be tweaked for special
uses of a filesystem. They are only necessary in rare cases like better
tuning a filesystem being used to store a squid cache.
I have been using this algorithm for about 3 months. I have done
a lot of testing on filesystems with different capacities, average
filesize, average number of files per directory, and so on. I think
this algorithm has no negative impact on filesystem performance. It
works better than the default one in all cases. The new dirpref
will greatly improve untarring/removing/copying of big directories,
decrease load on cvs servers and much more. The new dirpref doesn't
speedup a compilation process, but also doesn't slow it down.
Obtained from: Grigoriy Orlov <gluk@ptci.ru>
=====
=====
iedowse 2001/04/23 17:37:17 PDT
Pre-dirpref versions of fsck may zero out the new superblock fields
fs_contigdirs, fs_avgfilesize and fs_avgfpdir. This could cause
panics if these fields were zeroed while a filesystem was mounted
read-only, and then remounted read-write.
Add code to ffs_reload() which copies the fs_contigdirs pointer
from the previous superblock, and reinitialises fs_avgf* if necessary.
Reviewed by: mckusick
=====
=====
nik 2001/04/10 03:36:44 PDT
Add information about the new options to newfs and tunefs which set the
expected average file size and number of files per directory. Could do
with some fleshing out.
=====
2001-09-06 06:16:00 +04:00
|
|
|
ffs_dirpref(pip)
|
|
|
|
struct inode *pip;
|
1994-06-08 15:41:58 +04:00
|
|
|
{
|
Incorporate the enhanced ffs_dirpref() by Grigoriy Orlov, as found in
FreeBSD (three commits; the initial work, man page updates, and a fix
to ffs_reload()), with the following differences:
- Be consistent between newfs(8) and tunefs(8) as to the options which
set and control the tuning parameters for this work (avgfilesize & avgfpdir)
- Use u_int16_t instead of u_int8_t to keep track of the number of
contiguous directories (suggested by Chuck Silvers)
- Work within our FFS_EI framework
- Ensure that fs->fs_maxclusters and fs->fs_contigdirs don't point to
the same area of memory
The new algorithm has a marked performance increase, especially when
performing tasks such as untarring pkgsrc.tar.gz, etc.
The original FreeBSD commit messages are attached:
=====
mckusick 2001/04/10 01:39:00 PDT
Directory layout preference improvements from Grigoriy Orlov <gluk@ptci.ru>.
His description of the problem and solution follow. My own tests show
speedups on typical filesystem intensive workloads of 5% to 12% which
is very impressive considering the small amount of code change involved.
------
One day I noticed that some file operations run much faster on
small file systems than on big ones. I've looked at the ffs
algorithms, thought about them, and redesigned the dirpref algorithm.
First I want to describe the results of my tests. These results are old
and I have improved the algorithm after these tests were done. Nevertheless
they show how big the performance speedup may be. I have done two file/directory
intensive tests on two OpenBSD systems with the old and new dirpref algorithms.
The first test is "tar -xzf ports.tar.gz", the second is "rm -rf ports".
The ports.tar.gz file is the ports collection from the OpenBSD 2.8 release.
It contains 6596 directories and 13868 files. The test systems are:
1. Celeron-450, 128Mb, two IDE drives, the system at wd0, file system for
test is at wd1. Size of test file system is 8 Gb, number of cg=991,
size of cg is 8m, block size = 8k, fragment size = 1k OpenBSD-current
from Dec 2000 with BUFCACHEPERCENT=35
2. PIII-600, 128Mb, two IBM DTLA-307045 IDE drives at i815e, the system
at wd0, file system for test is at wd1. Size of test file system is 40 Gb,
number of cg=5324, size of cg is 8m, block size = 8k, fragment size = 1k
OpenBSD-current from Dec 2000 with BUFCACHEPERCENT=50
You can get more info about the test systems and methods at:
http://www.ptci.ru/gluk/dirpref/old/dirpref.html
Test Results
tar -xzf ports.tar.gz rm -rf ports
mode old dirpref new dirpref speedup old dirpref new dirpref speedup
First system
normal 667 472 1.41 477 331 1.44
async 285 144 1.98 130 14 9.29
sync 768 616 1.25 477 334 1.43
softdep 413 252 1.64 241 38 6.34
Second system
normal 329 81 4.06 263.5 93.5 2.81
async 302 25.7 11.75 112 2.26 49.56
sync 281 57.0 4.93 263 90.5 2.9
softdep 341 40.6 8.4 284 4.76 59.66
"old dirpref" and "new dirpref" columns give a test time in seconds.
speedup - the speed increase factor, i.e. old dirpref / new dirpref.
------
Algorithm description
The old dirpref algorithm is described in comments:
/*
* Find a cylinder to place a directory.
*
* The policy implemented by this algorithm is to select from
* among those cylinder groups with above the average number of
* free inodes, the one with the smallest number of directories.
*/
A new directory is allocated in a different cylinder group than its
parent directory, resulting in a directory tree that is spread across
all the cylinder groups. This spreading out results in a non-optimal
access to the directories and files. When we have a small filesystem
it is not a problem but when the filesystem is big then performance
degradation becomes very apparent.
What do I mean by a big file system?
1. A big filesystem is a filesystem which occupies 20-30 or more percent
of total drive space, i.e. first and last cylinder are physically
located relatively far from each other.
2. It has a relatively large number of cylinder groups, for example
more cylinder groups than 50% of the buffers in the buffer cache.
The first results in long access times, while the second results in
many buffers being used by metadata operations. Such operations use
cylinder group blocks and on-disk inode blocks. The cylinder group
block (fs->fs_cblkno) contains struct cg, inode and block bit maps.
It is 2k in size for the default filesystem parameters. If new and
parent directories are located in different cylinder groups then the
system performs more input/output operations and uses more buffers.
On filesystems with many cylinder groups, lots of cache buffers are
used for metadata operations.
My solution for this problem is very simple. I allocate many directories
in one cylinder group. I also do some things, so that the new allocation
method does not cause excessive fragmentation and all directory inodes
will not be located at a location far from its file's inodes and data.
The algorithm is:
/*
* Find a cylinder group to place a directory.
*
* The policy implemented by this algorithm is to allocate a
* directory inode in the same cylinder group as its parent
* directory, but also to reserve space for its files inodes
* and data. Restrict the number of directories which may be
* allocated one after another in the same cylinder group
* without intervening allocation of files.
*
* If we allocate a first level directory then force allocation
* in another cylinder group.
*/
My early versions of dirpref gave me good results for a wide range of
file operations and different filesystem capacities except one case:
those applications that create their entire directory structure first
and only later fill this structure with files.
My solution for such and similar cases is to limit a number of
directories which may be created one after another in the same cylinder
group without intervening file creations. For this purpose, I allocate
an array of counters at mount time. This array is linked to the superblock
fs->fs_contigdirs[cg]. Each time a directory is created the counter
increases and each time a file is created the counter decreases. A 60Gb
filesystem with 8mb/cg requires 10kb of memory for the counters array.
The maxcontigdirs is a maximum number of directories which may be created
without an intervening file creation. I found in my tests that the best
performance occurs when I restrict the number of directories in one cylinder
group such that all its files may be located in the same cylinder group.
There may be some deterioration in performance if all the file inodes
are in the same cylinder group as its containing directory, but their
data partially resides in a different cylinder group. The maxcontigdirs
value is calculated to try to prevent this condition. Since there is
no way to know how many files and directories will be allocated later
I added two optimization parameters in superblock/tunefs. They are:
int32_t fs_avgfilesize; /* expected average file size */
int32_t fs_avgfpdir; /* expected # of files per directory */
These parameters have reasonable defaults but may be tweaked for special
uses of a filesystem. They are only necessary in rare cases like better
tuning a filesystem being used to store a squid cache.
I have been using this algorithm for about 3 months. I have done
a lot of testing on filesystems with different capacities, average
filesize, average number of files per directory, and so on. I think
this algorithm has no negative impact on filesystem performance. It
works better than the default one in all cases. The new dirpref
will greatly improve untarring/removing/copying of big directories,
decrease load on cvs servers and much more. The new dirpref doesn't
speedup a compilation process, but also doesn't slow it down.
Obtained from: Grigoriy Orlov <gluk@ptci.ru>
=====
=====
iedowse 2001/04/23 17:37:17 PDT
Pre-dirpref versions of fsck may zero out the new superblock fields
fs_contigdirs, fs_avgfilesize and fs_avgfpdir. This could cause
panics if these fields were zeroed while a filesystem was mounted
read-only, and then remounted read-write.
Add code to ffs_reload() which copies the fs_contigdirs pointer
from the previous superblock, and reinitialises fs_avgf* if necessary.
Reviewed by: mckusick
=====
=====
nik 2001/04/10 03:36:44 PDT
Add information about the new options to newfs and tunefs which set the
expected average file size and number of files per directory. Could do
with some fleshing out.
=====
2001-09-06 06:16:00 +04:00
|
|
|
register struct fs *fs;
|
|
|
|
int cg, prefcg, dirsize, cgsize;
|
|
|
|
int avgifree, avgbfree, avgndir, curdirsize;
|
|
|
|
int minifree, minbfree, maxndir;
|
|
|
|
int mincg, minndir;
|
|
|
|
int maxcontigdirs;
|
|
|
|
|
|
|
|
fs = pip->i_fs;
|
1994-06-08 15:41:58 +04:00
|
|
|
|
|
|
|
avgifree = fs->fs_cstotal.cs_nifree / fs->fs_ncg;
|
Incorporate the enhanced ffs_dirpref() by Grigoriy Orlov, as found in
FreeBSD (three commits; the initial work, man page updates, and a fix
to ffs_reload()), with the following differences:
- Be consistent between newfs(8) and tunefs(8) as to the options which
set and control the tuning parameters for this work (avgfilesize & avgfpdir)
- Use u_int16_t instead of u_int8_t to keep track of the number of
contiguous directories (suggested by Chuck Silvers)
- Work within our FFS_EI framework
- Ensure that fs->fs_maxclusters and fs->fs_contigdirs don't point to
the same area of memory
The new algorithm has a marked performance increase, especially when
performing tasks such as untarring pkgsrc.tar.gz, etc.
The original FreeBSD commit messages are attached:
=====
mckusick 2001/04/10 01:39:00 PDT
Directory layout preference improvements from Grigoriy Orlov <gluk@ptci.ru>.
His description of the problem and solution follow. My own tests show
speedups on typical filesystem intensive workloads of 5% to 12% which
is very impressive considering the small amount of code change involved.
------
One day I noticed that some file operations run much faster on
small file systems than on big ones. I've looked at the ffs
algorithms, thought about them, and redesigned the dirpref algorithm.
First I want to describe the results of my tests. These results are old
and I have improved the algorithm after these tests were done. Nevertheless
they show how big the performance speedup may be. I have done two file/directory
intensive tests on two OpenBSD systems with the old and new dirpref algorithms.
The first test is "tar -xzf ports.tar.gz", the second is "rm -rf ports".
The ports.tar.gz file is the ports collection from the OpenBSD 2.8 release.
It contains 6596 directories and 13868 files. The test systems are:
1. Celeron-450, 128Mb, two IDE drives, the system at wd0, file system for
test is at wd1. Size of test file system is 8 Gb, number of cg=991,
size of cg is 8m, block size = 8k, fragment size = 1k OpenBSD-current
from Dec 2000 with BUFCACHEPERCENT=35
2. PIII-600, 128Mb, two IBM DTLA-307045 IDE drives at i815e, the system
at wd0, file system for test is at wd1. Size of test file system is 40 Gb,
number of cg=5324, size of cg is 8m, block size = 8k, fragment size = 1k
OpenBSD-current from Dec 2000 with BUFCACHEPERCENT=50
You can get more info about the test systems and methods at:
http://www.ptci.ru/gluk/dirpref/old/dirpref.html
Test Results
tar -xzf ports.tar.gz rm -rf ports
mode old dirpref new dirpref speedup old dirpref new dirpref speedup
First system
normal 667 472 1.41 477 331 1.44
async 285 144 1.98 130 14 9.29
sync 768 616 1.25 477 334 1.43
softdep 413 252 1.64 241 38 6.34
Second system
normal 329 81 4.06 263.5 93.5 2.81
async 302 25.7 11.75 112 2.26 49.56
sync 281 57.0 4.93 263 90.5 2.9
softdep 341 40.6 8.4 284 4.76 59.66
"old dirpref" and "new dirpref" columns give a test time in seconds.
speedup - the speed increase factor, i.e. old dirpref / new dirpref.
------
Algorithm description
The old dirpref algorithm is described in comments:
/*
* Find a cylinder to place a directory.
*
* The policy implemented by this algorithm is to select from
* among those cylinder groups with above the average number of
* free inodes, the one with the smallest number of directories.
*/
A new directory is allocated in a different cylinder group than its
parent directory, resulting in a directory tree that is spread across
all the cylinder groups. This spreading out results in a non-optimal
access to the directories and files. When we have a small filesystem
it is not a problem but when the filesystem is big then performance
degradation becomes very apparent.
What do I mean by a big file system?
1. A big filesystem is a filesystem which occupies 20-30 or more percent
of total drive space, i.e. first and last cylinder are physically
located relatively far from each other.
2. It has a relatively large number of cylinder groups, for example
more cylinder groups than 50% of the buffers in the buffer cache.
The first results in long access times, while the second results in
many buffers being used by metadata operations. Such operations use
cylinder group blocks and on-disk inode blocks. The cylinder group
block (fs->fs_cblkno) contains struct cg, inode and block bit maps.
It is 2k in size for the default filesystem parameters. If new and
parent directories are located in different cylinder groups then the
system performs more input/output operations and uses more buffers.
On filesystems with many cylinder groups, lots of cache buffers are
used for metadata operations.
My solution for this problem is very simple. I allocate many directories
in one cylinder group. I also do some things, so that the new allocation
method does not cause excessive fragmentation and all directory inodes
will not be located at a location far from its file's inodes and data.
The algorithm is:
/*
* Find a cylinder group to place a directory.
*
* The policy implemented by this algorithm is to allocate a
* directory inode in the same cylinder group as its parent
* directory, but also to reserve space for its files inodes
* and data. Restrict the number of directories which may be
* allocated one after another in the same cylinder group
* without intervening allocation of files.
*
* If we allocate a first level directory then force allocation
* in another cylinder group.
*/
My early versions of dirpref gave me good results for a wide range of
file operations and different filesystem capacities except one case:
those applications that create their entire directory structure first
and only later fill this structure with files.
My solution for such and similar cases is to limit a number of
directories which may be created one after another in the same cylinder
group without intervening file creations. For this purpose, I allocate
an array of counters at mount time. This array is linked to the superblock
fs->fs_contigdirs[cg]. Each time a directory is created the counter
increases and each time a file is created the counter decreases. A 60Gb
filesystem with 8mb/cg requires 10kb of memory for the counters array.
The maxcontigdirs is a maximum number of directories which may be created
without an intervening file creation. I found in my tests that the best
performance occurs when I restrict the number of directories in one cylinder
group such that all its files may be located in the same cylinder group.
There may be some deterioration in performance if all the file inodes
are in the same cylinder group as its containing directory, but their
data partially resides in a different cylinder group. The maxcontigdirs
value is calculated to try to prevent this condition. Since there is
no way to know how many files and directories will be allocated later
I added two optimization parameters in superblock/tunefs. They are:
int32_t fs_avgfilesize; /* expected average file size */
int32_t fs_avgfpdir; /* expected # of files per directory */
These parameters have reasonable defaults but may be tweaked for special
uses of a filesystem. They are only necessary in rare cases like better
tuning a filesystem being used to store a squid cache.
I have been using this algorithm for about 3 months. I have done
a lot of testing on filesystems with different capacities, average
filesize, average number of files per directory, and so on. I think
this algorithm has no negative impact on filesystem performance. It
works better than the default one in all cases. The new dirpref
will greatly improve untarring/removing/copying of big directories,
decrease load on cvs servers and much more. The new dirpref doesn't
speedup a compilation process, but also doesn't slow it down.
Obtained from: Grigoriy Orlov <gluk@ptci.ru>
=====
=====
iedowse 2001/04/23 17:37:17 PDT
Pre-dirpref versions of fsck may zero out the new superblock fields
fs_contigdirs, fs_avgfilesize and fs_avgfpdir. This could cause
panics if these fields were zeroed while a filesystem was mounted
read-only, and then remounted read-write.
Add code to ffs_reload() which copies the fs_contigdirs pointer
from the previous superblock, and reinitialises fs_avgf* if necessary.
Reviewed by: mckusick
=====
=====
nik 2001/04/10 03:36:44 PDT
Add information about the new options to newfs and tunefs which set the
expected average file size and number of files per directory. Could do
with some fleshing out.
=====
2001-09-06 06:16:00 +04:00
|
|
|
avgbfree = fs->fs_cstotal.cs_nbfree / fs->fs_ncg;
|
|
|
|
avgndir = fs->fs_cstotal.cs_ndir / fs->fs_ncg;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Force allocation in another cg if creating a first level dir.
|
|
|
|
*/
|
|
|
|
if (ITOV(pip)->v_flag & VROOT) {
|
|
|
|
prefcg = random() % fs->fs_ncg;
|
|
|
|
mincg = prefcg;
|
|
|
|
minndir = fs->fs_ipg;
|
|
|
|
for (cg = prefcg; cg < fs->fs_ncg; cg++)
|
|
|
|
if (fs->fs_cs(fs, cg).cs_ndir < minndir &&
|
|
|
|
fs->fs_cs(fs, cg).cs_nifree >= avgifree &&
|
|
|
|
fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) {
|
2001-03-14 00:16:23 +03:00
|
|
|
mincg = cg;
|
Incorporate the enhanced ffs_dirpref() by Grigoriy Orlov, as found in
FreeBSD (three commits; the initial work, man page updates, and a fix
to ffs_reload()), with the following differences:
- Be consistent between newfs(8) and tunefs(8) as to the options which
set and control the tuning parameters for this work (avgfilesize & avgfpdir)
- Use u_int16_t instead of u_int8_t to keep track of the number of
contiguous directories (suggested by Chuck Silvers)
- Work within our FFS_EI framework
- Ensure that fs->fs_maxclusters and fs->fs_contigdirs don't point to
the same area of memory
The new algorithm has a marked performance increase, especially when
performing tasks such as untarring pkgsrc.tar.gz, etc.
The original FreeBSD commit messages are attached:
=====
mckusick 2001/04/10 01:39:00 PDT
Directory layout preference improvements from Grigoriy Orlov <gluk@ptci.ru>.
His description of the problem and solution follow. My own tests show
speedups on typical filesystem intensive workloads of 5% to 12% which
is very impressive considering the small amount of code change involved.
------
One day I noticed that some file operations run much faster on
small file systems than on big ones. I've looked at the ffs
algorithms, thought about them, and redesigned the dirpref algorithm.
First I want to describe the results of my tests. These results are old
and I have improved the algorithm after these tests were done. Nevertheless
they show how big the performance speedup may be. I have done two file/directory
intensive tests on two OpenBSD systems with the old and new dirpref algorithms.
The first test is "tar -xzf ports.tar.gz", the second is "rm -rf ports".
The ports.tar.gz file is the ports collection from the OpenBSD 2.8 release.
It contains 6596 directories and 13868 files. The test systems are:
1. Celeron-450, 128Mb, two IDE drives, the system at wd0, file system for
test is at wd1. Size of test file system is 8 Gb, number of cg=991,
size of cg is 8m, block size = 8k, fragment size = 1k OpenBSD-current
from Dec 2000 with BUFCACHEPERCENT=35
2. PIII-600, 128Mb, two IBM DTLA-307045 IDE drives at i815e, the system
at wd0, file system for test is at wd1. Size of test file system is 40 Gb,
number of cg=5324, size of cg is 8m, block size = 8k, fragment size = 1k
OpenBSD-current from Dec 2000 with BUFCACHEPERCENT=50
You can get more info about the test systems and methods at:
http://www.ptci.ru/gluk/dirpref/old/dirpref.html
Test Results
tar -xzf ports.tar.gz rm -rf ports
mode old dirpref new dirpref speedup old dirpref new dirpref speedup
First system
normal 667 472 1.41 477 331 1.44
async 285 144 1.98 130 14 9.29
sync 768 616 1.25 477 334 1.43
softdep 413 252 1.64 241 38 6.34
Second system
normal 329 81 4.06 263.5 93.5 2.81
async 302 25.7 11.75 112 2.26 49.56
sync 281 57.0 4.93 263 90.5 2.9
softdep 341 40.6 8.4 284 4.76 59.66
"old dirpref" and "new dirpref" columns give a test time in seconds.
speedup - the speed increase factor, i.e. old dirpref / new dirpref.
------
Algorithm description
The old dirpref algorithm is described in comments:
/*
* Find a cylinder to place a directory.
*
* The policy implemented by this algorithm is to select from
* among those cylinder groups with above the average number of
* free inodes, the one with the smallest number of directories.
*/
A new directory is allocated in a different cylinder group than its
parent directory, resulting in a directory tree that is spread across
all the cylinder groups. This spreading out results in non-optimal
access to the directories and files. When we have a small filesystem
it is not a problem, but when the filesystem is big the performance
degradation becomes very apparent.
What do I mean by a big file system?
1. A big filesystem is a filesystem which occupies 20-30 or more percent
of total drive space, i.e. first and last cylinder are physically
located relatively far from each other.
2. It has a relatively large number of cylinder groups, for example
more cylinder groups than 50% of the buffers in the buffer cache.
The first results in long access times, while the second results in
many buffers being used by metadata operations. Such operations use
cylinder group blocks and on-disk inode blocks. The cylinder group
block (fs->fs_cblkno) contains struct cg, inode and block bit maps.
It is 2k in size for the default filesystem parameters. If new and
parent directories are located in different cylinder groups then the
system performs more input/output operations and uses more buffers.
On filesystems with many cylinder groups, lots of cache buffers are
used for metadata operations.
My solution for this problem is very simple. I allocate many directories
in one cylinder group. I also do some things, so that the new allocation
method does not cause excessive fragmentation and all directory inodes
will not be located at a location far from its file's inodes and data.
The algorithm is:
/*
* Find a cylinder group to place a directory.
*
* The policy implemented by this algorithm is to allocate a
* directory inode in the same cylinder group as its parent
* directory, but also to reserve space for its files inodes
* and data. Restrict the number of directories which may be
* allocated one after another in the same cylinder group
* without intervening allocation of files.
*
* If we allocate a first level directory then force allocation
* in another cylinder group.
*/
My early versions of dirpref gave me good results for a wide range of
file operations and different filesystem capacities except one case:
those applications that create their entire directory structure first
and only later fill this structure with files.
My solution for such and similar cases is to limit a number of
directories which may be created one after another in the same cylinder
group without intervening file creations. For this purpose, I allocate
an array of counters at mount time. This array is linked to the superblock
fs->fs_contigdirs[cg]. Each time a directory is created the counter
increases and each time a file is created the counter decreases. A 60Gb
filesystem with 8mb/cg requires 10kb of memory for the counters array.
The maxcontigdirs is a maximum number of directories which may be created
without an intervening file creation. I found in my tests that the best
performance occurs when I restrict the number of directories in one cylinder
group such that all its files may be located in the same cylinder group.
There may be some deterioration in performance if all the file inodes
are in the same cylinder group as its containing directory, but their
data partially resides in a different cylinder group. The maxcontigdirs
value is calculated to try to prevent this condition. Since there is
no way to know how many files and directories will be allocated later
I added two optimization parameters in superblock/tunefs. They are:
int32_t fs_avgfilesize; /* expected average file size */
int32_t fs_avgfpdir; /* expected # of files per directory */
These parameters have reasonable defaults but may be tweaked for special
uses of a filesystem. They are only necessary in rare cases like better
tuning a filesystem being used to store a squid cache.
I have been using this algorithm for about 3 months. I have done
a lot of testing on filesystems with different capacities, average
filesize, average number of files per directory, and so on. I think
this algorithm has no negative impact on filesystem performance. It
works better than the default one in all cases. The new dirpref
will greatly improve untarring/removing/copying of big directories,
decrease load on cvs servers and much more. The new dirpref doesn't
speed up the compilation process, but also doesn't slow it down.
Obtained from: Grigoriy Orlov <gluk@ptci.ru>
=====
=====
iedowse 2001/04/23 17:37:17 PDT
Pre-dirpref versions of fsck may zero out the new superblock fields
fs_contigdirs, fs_avgfilesize and fs_avgfpdir. This could cause
panics if these fields were zeroed while a filesystem was mounted
read-only, and then remounted read-write.
Add code to ffs_reload() which copies the fs_contigdirs pointer
from the previous superblock, and reinitialises fs_avgf* if necessary.
Reviewed by: mckusick
=====
=====
nik 2001/04/10 03:36:44 PDT
Add information about the new options to newfs and tunefs which set the
expected average file size and number of files per directory. Could do
with some fleshing out.
=====
2001-09-06 06:16:00 +04:00
|
|
|
minndir = fs->fs_cs(fs, cg).cs_ndir;
|
2001-03-14 00:16:23 +03:00
|
|
|
}
|
Incorporate the enhanced ffs_dirpref() by Grigoriy Orlov, as found in
FreeBSD (three commits; the initial work, man page updates, and a fix
to ffs_reload()), with the following differences:
- Be consistent between newfs(8) and tunefs(8) as to the options which
set and control the tuning parameters for this work (avgfilesize & avgfpdir)
- Use u_int16_t instead of u_int8_t to keep track of the number of
contiguous directories (suggested by Chuck Silvers)
- Work within our FFS_EI framework
- Ensure that fs->fs_maxclusters and fs->fs_contigdirs don't point to
the same area of memory
The new algorithm has a marked performance increase, especially when
performing tasks such as untarring pkgsrc.tar.gz, etc.
The original FreeBSD commit messages are attached:
=====
mckusick 2001/04/10 01:39:00 PDT
Directory layout preference improvements from Grigoriy Orlov <gluk@ptci.ru>.
His description of the problem and solution follow. My own tests show
speedups on typical filesystem intensive workloads of 5% to 12% which
is very impressive considering the small amount of code change involved.
------
One day I noticed that some file operations run much faster on
small file systems than on big ones. I've looked at the ffs
algorithms, thought about them, and redesigned the dirpref algorithm.
First I want to describe the results of my tests. These results are old
and I have improved the algorithm after these tests were done. Nevertheless
they show how big the performance speedup may be. I have done two file/directory
intensive tests on two OpenBSD systems with the old and new dirpref algorithms.
The first test is "tar -xzf ports.tar.gz", the second is "rm -rf ports".
The ports.tar.gz file is the ports collection from the OpenBSD 2.8 release.
It contains 6596 directories and 13868 files. The test systems are:
1. Celeron-450, 128Mb, two IDE drives, the system at wd0, file system for
test is at wd1. Size of test file system is 8 Gb, number of cg=991,
size of cg is 8m, block size = 8k, fragment size = 1k OpenBSD-current
from Dec 2000 with BUFCACHEPERCENT=35
2. PIII-600, 128Mb, two IBM DTLA-307045 IDE drives at i815e, the system
at wd0, file system for test is at wd1. Size of test file system is 40 Gb,
number of cg=5324, size of cg is 8m, block size = 8k, fragment size = 1k
OpenBSD-current from Dec 2000 with BUFCACHEPERCENT=50
You can get more info about the test systems and methods at:
http://www.ptci.ru/gluk/dirpref/old/dirpref.html
Test Results
tar -xzf ports.tar.gz rm -rf ports
mode old dirpref new dirpref speedup old dirpref new dirpref speedup
First system
normal 667 472 1.41 477 331 1.44
async 285 144 1.98 130 14 9.29
sync 768 616 1.25 477 334 1.43
softdep 413 252 1.64 241 38 6.34
Second system
normal 329 81 4.06 263.5 93.5 2.81
async 302 25.7 11.75 112 2.26 49.56
sync 281 57.0 4.93 263 90.5 2.9
softdep 341 40.6 8.4 284 4.76 59.66
"old dirpref" and "new dirpref" columns give a test time in seconds.
speedup - the speed increase factor, i.e. old dirpref / new dirpref.
------
Algorithm description
The old dirpref algorithm is described in comments:
/*
* Find a cylinder to place a directory.
*
* The policy implemented by this algorithm is to select from
* among those cylinder groups with above the average number of
* free inodes, the one with the smallest number of directories.
*/
A new directory is allocated in a different cylinder group than its
parent directory, resulting in a directory tree that is spread across
all the cylinder groups. This spreading out results in non-optimal
access to the directories and files. When we have a small filesystem
it is not a problem, but when the filesystem is big the performance
degradation becomes very apparent.
What do I mean by a big file system?
1. A big filesystem is a filesystem which occupies 20-30 or more percent
of total drive space, i.e. first and last cylinder are physically
located relatively far from each other.
2. It has a relatively large number of cylinder groups, for example
more cylinder groups than 50% of the buffers in the buffer cache.
The first results in long access times, while the second results in
many buffers being used by metadata operations. Such operations use
cylinder group blocks and on-disk inode blocks. The cylinder group
block (fs->fs_cblkno) contains struct cg, inode and block bit maps.
It is 2k in size for the default filesystem parameters. If new and
parent directories are located in different cylinder groups then the
system performs more input/output operations and uses more buffers.
On filesystems with many cylinder groups, lots of cache buffers are
used for metadata operations.
My solution for this problem is very simple. I allocate many directories
in one cylinder group. I also do some things, so that the new allocation
method does not cause excessive fragmentation and all directory inodes
will not be located at a location far from its file's inodes and data.
The algorithm is:
/*
* Find a cylinder group to place a directory.
*
* The policy implemented by this algorithm is to allocate a
* directory inode in the same cylinder group as its parent
* directory, but also to reserve space for its files inodes
* and data. Restrict the number of directories which may be
* allocated one after another in the same cylinder group
* without intervening allocation of files.
*
* If we allocate a first level directory then force allocation
* in another cylinder group.
*/
My early versions of dirpref gave me good results for a wide range of
file operations and different filesystem capacities except one case:
those applications that create their entire directory structure first
and only later fill this structure with files.
My solution for such and similar cases is to limit a number of
directories which may be created one after another in the same cylinder
group without intervening file creations. For this purpose, I allocate
an array of counters at mount time. This array is linked to the superblock
fs->fs_contigdirs[cg]. Each time a directory is created the counter
increases and each time a file is created the counter decreases. A 60Gb
filesystem with 8mb/cg requires 10kb of memory for the counters array.
The maxcontigdirs is a maximum number of directories which may be created
without an intervening file creation. I found in my tests that the best
performance occurs when I restrict the number of directories in one cylinder
group such that all its files may be located in the same cylinder group.
There may be some deterioration in performance if all the file inodes
are in the same cylinder group as its containing directory, but their
data partially resides in a different cylinder group. The maxcontigdirs
value is calculated to try to prevent this condition. Since there is
no way to know how many files and directories will be allocated later
I added two optimization parameters in superblock/tunefs. They are:
int32_t fs_avgfilesize; /* expected average file size */
int32_t fs_avgfpdir; /* expected # of files per directory */
These parameters have reasonable defaults but may be tweaked for special
uses of a filesystem. They are only necessary in rare cases like better
tuning a filesystem being used to store a squid cache.
I have been using this algorithm for about 3 months. I have done
a lot of testing on filesystems with different capacities, average
filesize, average number of files per directory, and so on. I think
this algorithm has no negative impact on filesystem performance. It
works better than the default one in all cases. The new dirpref
will greatly improve untarring/removing/copying of big directories,
decrease load on cvs servers and much more. The new dirpref doesn't
speed up the compilation process, but also doesn't slow it down.
Obtained from: Grigoriy Orlov <gluk@ptci.ru>
=====
=====
iedowse 2001/04/23 17:37:17 PDT
Pre-dirpref versions of fsck may zero out the new superblock fields
fs_contigdirs, fs_avgfilesize and fs_avgfpdir. This could cause
panics if these fields were zeroed while a filesystem was mounted
read-only, and then remounted read-write.
Add code to ffs_reload() which copies the fs_contigdirs pointer
from the previous superblock, and reinitialises fs_avgf* if necessary.
Reviewed by: mckusick
=====
=====
nik 2001/04/10 03:36:44 PDT
Add information about the new options to newfs and tunefs which set the
expected average file size and number of files per directory. Could do
with some fleshing out.
=====
2001-09-06 06:16:00 +04:00
|
|
|
for (cg = 0; cg < prefcg; cg++)
|
|
|
|
if (fs->fs_cs(fs, cg).cs_ndir < minndir &&
|
|
|
|
fs->fs_cs(fs, cg).cs_nifree >= avgifree &&
|
|
|
|
fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) {
|
|
|
|
mincg = cg;
|
|
|
|
minndir = fs->fs_cs(fs, cg).cs_ndir;
|
2001-03-14 00:16:23 +03:00
|
|
|
}
|
Incorporate the enhanced ffs_dirpref() by Grigoriy Orlov, as found in
FreeBSD (three commits; the initial work, man page updates, and a fix
to ffs_reload()), with the following differences:
- Be consistent between newfs(8) and tunefs(8) as to the options which
set and control the tuning parameters for this work (avgfilesize & avgfpdir)
- Use u_int16_t instead of u_int8_t to keep track of the number of
contiguous directories (suggested by Chuck Silvers)
- Work within our FFS_EI framework
- Ensure that fs->fs_maxclusters and fs->fs_contigdirs don't point to
the same area of memory
The new algorithm has a marked performance increase, especially when
performing tasks such as untarring pkgsrc.tar.gz, etc.
The original FreeBSD commit messages are attached:
=====
mckusick 2001/04/10 01:39:00 PDT
Directory layout preference improvements from Grigoriy Orlov <gluk@ptci.ru>.
His description of the problem and solution follow. My own tests show
speedups on typical filesystem intensive workloads of 5% to 12% which
is very impressive considering the small amount of code change involved.
------
One day I noticed that some file operations run much faster on
small file systems than on big ones. I've looked at the ffs
algorithms, thought about them, and redesigned the dirpref algorithm.
First I want to describe the results of my tests. These results are old
and I have improved the algorithm after these tests were done. Nevertheless
they show how big the performance speedup may be. I have done two file/directory
intensive tests on two OpenBSD systems with the old and new dirpref algorithms.
The first test is "tar -xzf ports.tar.gz", the second is "rm -rf ports".
The ports.tar.gz file is the ports collection from the OpenBSD 2.8 release.
It contains 6596 directories and 13868 files. The test systems are:
1. Celeron-450, 128Mb, two IDE drives, the system at wd0, file system for
test is at wd1. Size of test file system is 8 Gb, number of cg=991,
size of cg is 8m, block size = 8k, fragment size = 1k OpenBSD-current
from Dec 2000 with BUFCACHEPERCENT=35
2. PIII-600, 128Mb, two IBM DTLA-307045 IDE drives at i815e, the system
at wd0, file system for test is at wd1. Size of test file system is 40 Gb,
number of cg=5324, size of cg is 8m, block size = 8k, fragment size = 1k
OpenBSD-current from Dec 2000 with BUFCACHEPERCENT=50
You can get more info about the test systems and methods at:
http://www.ptci.ru/gluk/dirpref/old/dirpref.html
Test Results
tar -xzf ports.tar.gz rm -rf ports
mode old dirpref new dirpref speedup old dirpref new dirpref speedup
First system
normal 667 472 1.41 477 331 1.44
async 285 144 1.98 130 14 9.29
sync 768 616 1.25 477 334 1.43
softdep 413 252 1.64 241 38 6.34
Second system
normal 329 81 4.06 263.5 93.5 2.81
async 302 25.7 11.75 112 2.26 49.56
sync 281 57.0 4.93 263 90.5 2.9
softdep 341 40.6 8.4 284 4.76 59.66
"old dirpref" and "new dirpref" columns give a test time in seconds.
speedup - the speed increase factor, i.e. old dirpref / new dirpref.
------
Algorithm description
The old dirpref algorithm is described in comments:
/*
* Find a cylinder to place a directory.
*
* The policy implemented by this algorithm is to select from
* among those cylinder groups with above the average number of
* free inodes, the one with the smallest number of directories.
*/
A new directory is allocated in a different cylinder group than its
parent directory, resulting in a directory tree that is spread across
all the cylinder groups. This spreading out results in non-optimal
access to the directories and files. When we have a small filesystem
it is not a problem, but when the filesystem is big the performance
degradation becomes very apparent.
What do I mean by a big file system?
1. A big filesystem is a filesystem which occupies 20-30 or more percent
of total drive space, i.e. first and last cylinder are physically
located relatively far from each other.
2. It has a relatively large number of cylinder groups, for example
more cylinder groups than 50% of the buffers in the buffer cache.
The first results in long access times, while the second results in
many buffers being used by metadata operations. Such operations use
cylinder group blocks and on-disk inode blocks. The cylinder group
block (fs->fs_cblkno) contains struct cg, inode and block bit maps.
It is 2k in size for the default filesystem parameters. If new and
parent directories are located in different cylinder groups then the
system performs more input/output operations and uses more buffers.
On filesystems with many cylinder groups, lots of cache buffers are
used for metadata operations.
My solution for this problem is very simple. I allocate many directories
in one cylinder group. I also do some things, so that the new allocation
method does not cause excessive fragmentation and all directory inodes
will not be located at a location far from its file's inodes and data.
The algorithm is:
/*
* Find a cylinder group to place a directory.
*
* The policy implemented by this algorithm is to allocate a
* directory inode in the same cylinder group as its parent
* directory, but also to reserve space for its files inodes
* and data. Restrict the number of directories which may be
* allocated one after another in the same cylinder group
* without intervening allocation of files.
*
* If we allocate a first level directory then force allocation
* in another cylinder group.
*/
My early versions of dirpref gave me good results for a wide range of
file operations and different filesystem capacities except one case:
those applications that create their entire directory structure first
and only later fill this structure with files.
My solution for such and similar cases is to limit a number of
directories which may be created one after another in the same cylinder
group without intervening file creations. For this purpose, I allocate
an array of counters at mount time. This array is linked to the superblock
fs->fs_contigdirs[cg]. Each time a directory is created the counter
increases and each time a file is created the counter decreases. A 60Gb
filesystem with 8mb/cg requires 10kb of memory for the counters array.
The maxcontigdirs is a maximum number of directories which may be created
without an intervening file creation. I found in my tests that the best
performance occurs when I restrict the number of directories in one cylinder
group such that all its files may be located in the same cylinder group.
There may be some deterioration in performance if all the file inodes
are in the same cylinder group as its containing directory, but their
data partially resides in a different cylinder group. The maxcontigdirs
value is calculated to try to prevent this condition. Since there is
no way to know how many files and directories will be allocated later
I added two optimization parameters in superblock/tunefs. They are:
int32_t fs_avgfilesize; /* expected average file size */
int32_t fs_avgfpdir; /* expected # of files per directory */
These parameters have reasonable defaults but may be tweaked for special
uses of a filesystem. They are only necessary in rare cases like better
tuning a filesystem being used to store a squid cache.
I have been using this algorithm for about 3 months. I have done
a lot of testing on filesystems with different capacities, average
filesize, average number of files per directory, and so on. I think
this algorithm has no negative impact on filesystem performance. It
works better than the default one in all cases. The new dirpref
will greatly improve untarring/removing/copying of big directories,
decrease load on cvs servers and much more. The new dirpref doesn't
speed up the compilation process, but also doesn't slow it down.
Obtained from: Grigoriy Orlov <gluk@ptci.ru>
=====
=====
iedowse 2001/04/23 17:37:17 PDT
Pre-dirpref versions of fsck may zero out the new superblock fields
fs_contigdirs, fs_avgfilesize and fs_avgfpdir. This could cause
panics if these fields were zeroed while a filesystem was mounted
read-only, and then remounted read-write.
Add code to ffs_reload() which copies the fs_contigdirs pointer
from the previous superblock, and reinitialises fs_avgf* if necessary.
Reviewed by: mckusick
=====
=====
nik 2001/04/10 03:36:44 PDT
Add information about the new options to newfs and tunefs which set the
expected average file size and number of files per directory. Could do
with some fleshing out.
=====
2001-09-06 06:16:00 +04:00
|
|
|
return ((ino_t)(fs->fs_ipg * mincg));
|
2001-03-14 00:16:23 +03:00
|
|
|
}
|
Incorporate the enhanced ffs_dirpref() by Grigoriy Orlov, as found in
FreeBSD (three commits; the initial work, man page updates, and a fix
to ffs_reload()), with the following differences:
- Be consistent between newfs(8) and tunefs(8) as to the options which
set and control the tuning parameters for this work (avgfilesize & avgfpdir)
- Use u_int16_t instead of u_int8_t to keep track of the number of
contiguous directories (suggested by Chuck Silvers)
- Work within our FFS_EI framework
- Ensure that fs->fs_maxclusters and fs->fs_contigdirs don't point to
the same area of memory
The new algorithm has a marked performance increase, especially when
performing tasks such as untarring pkgsrc.tar.gz, etc.
The original FreeBSD commit messages are attached:
=====
mckusick 2001/04/10 01:39:00 PDT
Directory layout preference improvements from Grigoriy Orlov <gluk@ptci.ru>.
His description of the problem and solution follow. My own tests show
speedups on typical filesystem intensive workloads of 5% to 12% which
is very impressive considering the small amount of code change involved.
------
One day I noticed that some file operations run much faster on
small file systems than on big ones. I've looked at the ffs
algorithms, thought about them, and redesigned the dirpref algorithm.
First I want to describe the results of my tests. These results are old
and I have improved the algorithm after these tests were done. Nevertheless
they show how big the performance speedup may be. I have done two file/directory
intensive tests on two OpenBSD systems with the old and new dirpref algorithms.
The first test is "tar -xzf ports.tar.gz", the second is "rm -rf ports".
The ports.tar.gz file is the ports collection from the OpenBSD 2.8 release.
It contains 6596 directories and 13868 files. The test systems are:
1. Celeron-450, 128Mb, two IDE drives, the system at wd0, file system for
test is at wd1. Size of test file system is 8 Gb, number of cg=991,
size of cg is 8m, block size = 8k, fragment size = 1k OpenBSD-current
from Dec 2000 with BUFCACHEPERCENT=35
2. PIII-600, 128Mb, two IBM DTLA-307045 IDE drives at i815e, the system
at wd0, file system for test is at wd1. Size of test file system is 40 Gb,
number of cg=5324, size of cg is 8m, block size = 8k, fragment size = 1k
OpenBSD-current from Dec 2000 with BUFCACHEPERCENT=50
You can get more info about the test systems and methods at:
http://www.ptci.ru/gluk/dirpref/old/dirpref.html
Test Results
tar -xzf ports.tar.gz rm -rf ports
mode old dirpref new dirpref speedup old dirpref new dirpref speedup
First system
normal 667 472 1.41 477 331 1.44
async 285 144 1.98 130 14 9.29
sync 768 616 1.25 477 334 1.43
softdep 413 252 1.64 241 38 6.34
Second system
normal 329 81 4.06 263.5 93.5 2.81
async 302 25.7 11.75 112 2.26 49.56
sync 281 57.0 4.93 263 90.5 2.9
softdep 341 40.6 8.4 284 4.76 59.66
"old dirpref" and "new dirpref" columns give a test time in seconds.
speedup - the speed increase factor, i.e. old dirpref / new dirpref.
------
Algorithm description
The old dirpref algorithm is described in comments:
/*
* Find a cylinder to place a directory.
*
* The policy implemented by this algorithm is to select from
* among those cylinder groups with above the average number of
* free inodes, the one with the smallest number of directories.
*/
A new directory is allocated in a different cylinder group than its
parent directory, resulting in a directory tree that is spread across
all the cylinder groups. This spreading out results in non-optimal
access to the directories and files. When we have a small filesystem
it is not a problem, but when the filesystem is big the performance
degradation becomes very apparent.
What do I mean by a big file system?
1. A big filesystem is a filesystem which occupies 20-30 or more percent
of total drive space, i.e. first and last cylinder are physically
located relatively far from each other.
2. It has a relatively large number of cylinder groups, for example
more cylinder groups than 50% of the buffers in the buffer cache.
The first results in long access times, while the second results in
many buffers being used by metadata operations. Such operations use
cylinder group blocks and on-disk inode blocks. The cylinder group
block (fs->fs_cblkno) contains struct cg, inode and block bit maps.
It is 2k in size for the default filesystem parameters. If new and
parent directories are located in different cylinder groups then the
system performs more input/output operations and uses more buffers.
On filesystems with many cylinder groups, lots of cache buffers are
used for metadata operations.
My solution for this problem is very simple. I allocate many directories
in one cylinder group. I also do some things, so that the new allocation
method does not cause excessive fragmentation and all directory inodes
will not be located at a location far from its file's inodes and data.
The algorithm is:
/*
* Find a cylinder group to place a directory.
*
* The policy implemented by this algorithm is to allocate a
* directory inode in the same cylinder group as its parent
* directory, but also to reserve space for its files inodes
* and data. Restrict the number of directories which may be
* allocated one after another in the same cylinder group
* without intervening allocation of files.
*
* If we allocate a first level directory then force allocation
* in another cylinder group.
*/
My early versions of dirpref gave me good results for a wide range of
file operations and different filesystem capacities except one case:
those applications that create their entire directory structure first
and only later fill this structure with files.
My solution for such and similar cases is to limit a number of
directories which may be created one after another in the same cylinder
group without intervening file creations. For this purpose, I allocate
an array of counters at mount time. This array is linked to the superblock
fs->fs_contigdirs[cg]. Each time a directory is created the counter
increases and each time a file is created the counter decreases. A 60Gb
filesystem with 8mb/cg requires 10kb of memory for the counters array.
The maxcontigdirs is a maximum number of directories which may be created
without an intervening file creation. I found in my tests that the best
performance occurs when I restrict the number of directories in one cylinder
group such that all its files may be located in the same cylinder group.
There may be some deterioration in performance if all the file inodes
are in the same cylinder group as its containing directory, but their
data partially resides in a different cylinder group. The maxcontigdirs
value is calculated to try to prevent this condition. Since there is
no way to know how many files and directories will be allocated later
I added two optimization parameters in superblock/tunefs. They are:
int32_t fs_avgfilesize; /* expected average file size */
int32_t fs_avgfpdir; /* expected # of files per directory */
These parameters have reasonable defaults but may be tweaked for special
uses of a filesystem. They are only necessary in rare cases like better
tuning a filesystem being used to store a squid cache.
I have been using this algorithm for about 3 months. I have done
a lot of testing on filesystems with different capacities, average
filesize, average number of files per directory, and so on. I think
this algorithm has no negative impact on filesystem performance. It
works better than the default one in all cases. The new dirpref
will greatly improve untarring/removing/copying of big directories,
decrease load on cvs servers and much more. The new dirpref doesn't
speed up the compilation process, but also doesn't slow it down.
Obtained from: Grigoriy Orlov <gluk@ptci.ru>
=====
=====
iedowse 2001/04/23 17:37:17 PDT
Pre-dirpref versions of fsck may zero out the new superblock fields
fs_contigdirs, fs_avgfilesize and fs_avgfpdir. This could cause
panics if these fields were zeroed while a filesystem was mounted
read-only, and then remounted read-write.
Add code to ffs_reload() which copies the fs_contigdirs pointer
from the previous superblock, and reinitialises fs_avgf* if necessary.
Reviewed by: mckusick
=====
=====
nik 2001/04/10 03:36:44 PDT
Add information about the new options to newfs and tunefs which set the
expected average file size and number of files per directory. Could do
with some fleshing out.
=====
2001-09-06 06:16:00 +04:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Compute various limits which are used for
|
|
|
|
* optimal allocation of a directory inode.
|
|
|
|
*/
|
|
|
|
maxndir = min(avgndir + fs->fs_ipg / 16, fs->fs_ipg);
|
|
|
|
minifree = avgifree - fs->fs_ipg / 4;
|
|
|
|
if (minifree < 0)
|
|
|
|
minifree = 0;
|
|
|
|
minbfree = avgbfree - fs->fs_fpg / fs->fs_frag / 4;
|
|
|
|
if (minbfree < 0)
|
|
|
|
minbfree = 0;
|
|
|
|
cgsize = fs->fs_fsize * fs->fs_fpg;
|
|
|
|
dirsize = fs->fs_avgfilesize * fs->fs_avgfpdir;
|
|
|
|
curdirsize = avgndir ? (cgsize - avgbfree * fs->fs_bsize) / avgndir : 0;
|
|
|
|
if (dirsize < curdirsize)
|
|
|
|
dirsize = curdirsize;
|
|
|
|
maxcontigdirs = min(cgsize / dirsize, 255);
|
|
|
|
if (fs->fs_avgfpdir > 0)
|
|
|
|
maxcontigdirs = min(maxcontigdirs,
|
|
|
|
fs->fs_ipg / fs->fs_avgfpdir);
|
|
|
|
if (maxcontigdirs == 0)
|
|
|
|
maxcontigdirs = 1;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Limit number of dirs in one cg and reserve space for
|
|
|
|
* regular files, but only if we have no deficit in
|
|
|
|
* inodes or space.
|
|
|
|
*/
|
|
|
|
prefcg = ino_to_cg(fs, pip->i_number);
|
|
|
|
for (cg = prefcg; cg < fs->fs_ncg; cg++)
|
|
|
|
if (fs->fs_cs(fs, cg).cs_ndir < maxndir &&
|
|
|
|
fs->fs_cs(fs, cg).cs_nifree >= minifree &&
|
|
|
|
fs->fs_cs(fs, cg).cs_nbfree >= minbfree) {
|
|
|
|
if (fs->fs_contigdirs[cg] < maxcontigdirs)
|
|
|
|
return ((ino_t)(fs->fs_ipg * cg));
|
|
|
|
}
|
|
|
|
for (cg = 0; cg < prefcg; cg++)
|
|
|
|
if (fs->fs_cs(fs, cg).cs_ndir < maxndir &&
|
|
|
|
fs->fs_cs(fs, cg).cs_nifree >= minifree &&
|
|
|
|
fs->fs_cs(fs, cg).cs_nbfree >= minbfree) {
|
|
|
|
if (fs->fs_contigdirs[cg] < maxcontigdirs)
|
|
|
|
return ((ino_t)(fs->fs_ipg * cg));
|
|
|
|
}
|
|
|
|
/*
|
|
|
|
* This is a backstop when we are deficient in space.
|
|
|
|
*/
|
|
|
|
for (cg = prefcg; cg < fs->fs_ncg; cg++)
|
|
|
|
if (fs->fs_cs(fs, cg).cs_nifree >= avgifree)
|
|
|
|
return ((ino_t)(fs->fs_ipg * cg));
|
|
|
|
for (cg = 0; cg < prefcg; cg++)
|
|
|
|
if (fs->fs_cs(fs, cg).cs_nifree >= avgifree)
|
|
|
|
break;
|
|
|
|
return ((ino_t)(fs->fs_ipg * cg));
|
1994-06-08 15:41:58 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Select the desired position for the next block in a file. The file is
|
|
|
|
* logically divided into sections. The first section is composed of the
|
|
|
|
* direct blocks. Each additional section contains fs_maxbpg blocks.
|
|
|
|
*
|
|
|
|
* If no blocks have been allocated in the first section, the policy is to
|
|
|
|
* request a block in the same cylinder group as the inode that describes
|
|
|
|
* the file. If no blocks have been allocated in any other section, the
|
|
|
|
* policy is to place the section in a cylinder group with a greater than
|
|
|
|
* average number of free blocks. An appropriate cylinder group is found
|
|
|
|
* by using a rotor that sweeps the cylinder groups. When a new group of
|
|
|
|
* blocks is needed, the sweep begins in the cylinder group following the
|
|
|
|
* cylinder group from which the previous allocation was made. The sweep
|
|
|
|
* continues until a cylinder group with greater than the average number
|
|
|
|
* of free blocks is found. If the allocation is for the first block in an
|
|
|
|
* indirect block, the information on the previous allocation is unavailable;
|
|
|
|
* here a best guess is made based upon the logical block number being
|
|
|
|
* allocated.
|
|
|
|
*
|
|
|
|
* If a section is already partially allocated, the policy is to
|
|
|
|
* contiguously allocate fs_maxcontig blocks. The end of one of these
|
|
|
|
* contiguous blocks and the beginning of the next is physically separated
|
|
|
|
* so that the disk head will be in transit between them for at least
|
|
|
|
* fs_rotdelay milliseconds. This is to allow time for the processor to
|
|
|
|
* schedule another I/O transfer.
|
|
|
|
*/
|
1998-03-01 05:20:01 +03:00
|
|
|
ufs_daddr_t
|
1994-06-08 15:41:58 +04:00
|
|
|
ffs_blkpref(ip, lbn, indx, bap)
|
|
|
|
struct inode *ip;
|
1998-03-01 05:20:01 +03:00
|
|
|
ufs_daddr_t lbn;
|
1994-06-08 15:41:58 +04:00
|
|
|
int indx;
|
1998-03-01 05:20:01 +03:00
|
|
|
ufs_daddr_t *bap;
|
1994-06-08 15:41:58 +04:00
|
|
|
{
|
2000-03-30 16:41:09 +04:00
|
|
|
struct fs *fs;
|
|
|
|
int cg;
|
1994-06-08 15:41:58 +04:00
|
|
|
int avgbfree, startcg;
|
1998-03-01 05:20:01 +03:00
|
|
|
ufs_daddr_t nextblk;
|
1994-06-08 15:41:58 +04:00
|
|
|
|
|
|
|
fs = ip->i_fs;
|
|
|
|
if (indx % fs->fs_maxbpg == 0 || bap[indx - 1] == 0) {
|
2000-02-15 01:00:21 +03:00
|
|
|
if (lbn < NDADDR + NINDIR(fs)) {
|
1994-06-08 15:41:58 +04:00
|
|
|
cg = ino_to_cg(fs, ip->i_number);
|
|
|
|
return (fs->fs_fpg * cg + fs->fs_frag);
|
|
|
|
}
|
|
|
|
/*
|
|
|
|
* Find a cylinder with greater than average number of
|
|
|
|
* unused data blocks.
|
|
|
|
*/
|
|
|
|
if (indx == 0 || bap[indx - 1] == 0)
|
|
|
|
startcg =
|
|
|
|
ino_to_cg(fs, ip->i_number) + lbn / fs->fs_maxbpg;
|
|
|
|
else
|
1998-03-18 18:57:26 +03:00
|
|
|
startcg = dtog(fs,
|
1999-11-15 21:49:07 +03:00
|
|
|
ufs_rw32(bap[indx - 1], UFS_FSNEEDSWAP(fs)) + 1);
|
1994-06-08 15:41:58 +04:00
|
|
|
startcg %= fs->fs_ncg;
|
|
|
|
avgbfree = fs->fs_cstotal.cs_nbfree / fs->fs_ncg;
|
|
|
|
for (cg = startcg; cg < fs->fs_ncg; cg++)
|
|
|
|
if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) {
|
|
|
|
return (fs->fs_fpg * cg + fs->fs_frag);
|
|
|
|
}
|
2001-09-19 05:38:16 +04:00
|
|
|
for (cg = 0; cg < startcg; cg++)
|
1994-06-08 15:41:58 +04:00
|
|
|
if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) {
|
|
|
|
return (fs->fs_fpg * cg + fs->fs_frag);
|
|
|
|
}
|
2000-05-19 08:34:39 +04:00
|
|
|
return (0);
|
1994-06-08 15:41:58 +04:00
|
|
|
}
|
|
|
|
/*
|
|
|
|
* One or more previous blocks have been laid out. If less
|
|
|
|
* than fs_maxcontig previous blocks are contiguous, the
|
|
|
|
* next block is requested contiguously, otherwise it is
|
|
|
|
* requested rotationally delayed by fs_rotdelay milliseconds.
|
|
|
|
*/
|
1999-11-15 21:49:07 +03:00
|
|
|
nextblk = ufs_rw32(bap[indx - 1], UFS_FSNEEDSWAP(fs)) + fs->fs_frag;
|
1998-03-18 18:57:26 +03:00
|
|
|
if (indx < fs->fs_maxcontig ||
|
1999-11-15 21:49:07 +03:00
|
|
|
ufs_rw32(bap[indx - fs->fs_maxcontig], UFS_FSNEEDSWAP(fs)) +
|
1994-06-08 15:41:58 +04:00
|
|
|
blkstofrags(fs, fs->fs_maxcontig) != nextblk)
|
|
|
|
return (nextblk);
|
|
|
|
if (fs->fs_rotdelay != 0)
|
|
|
|
/*
|
|
|
|
* Here we convert ms of delay to frags as:
|
|
|
|
* (frags) = (ms) * (rev/sec) * (sect/rev) /
|
|
|
|
* ((sect/frag) * (ms/sec))
|
|
|
|
* then round up to the next block.
|
|
|
|
*/
|
|
|
|
nextblk += roundup(fs->fs_rotdelay * fs->fs_rps * fs->fs_nsect /
|
|
|
|
(NSPF(fs) * 1000), fs->fs_frag);
|
|
|
|
return (nextblk);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Implement the cylinder overflow algorithm.
|
|
|
|
*
|
|
|
|
* The policy implemented by this algorithm is:
|
|
|
|
* 1) allocate the block in its requested cylinder group.
|
|
|
|
* 2) quadratically rehash on the cylinder group number.
|
|
|
|
* 3) brute force search for a free block.
|
|
|
|
*/
|
|
|
|
/*VARARGS5*/
|
|
|
|
static u_long
|
|
|
|
ffs_hashalloc(ip, cg, pref, size, allocator)
|
|
|
|
struct inode *ip;
|
|
|
|
int cg;
|
|
|
|
long pref;
|
|
|
|
int size; /* size for data blocks, mode for inodes */
|
1998-03-01 05:20:01 +03:00
|
|
|
ufs_daddr_t (*allocator) __P((struct inode *, int, ufs_daddr_t, int));
|
1994-06-08 15:41:58 +04:00
|
|
|
{
|
2000-03-30 16:41:09 +04:00
|
|
|
struct fs *fs;
|
1994-06-08 15:41:58 +04:00
|
|
|
long result;
|
|
|
|
int i, icg = cg;
|
|
|
|
|
|
|
|
fs = ip->i_fs;
|
|
|
|
/*
|
|
|
|
* 1: preferred cylinder group
|
|
|
|
*/
|
|
|
|
result = (*allocator)(ip, cg, pref, size);
|
|
|
|
if (result)
|
|
|
|
return (result);
|
|
|
|
/*
|
|
|
|
* 2: quadratic rehash
|
|
|
|
*/
|
|
|
|
for (i = 1; i < fs->fs_ncg; i *= 2) {
|
|
|
|
cg += i;
|
|
|
|
if (cg >= fs->fs_ncg)
|
|
|
|
cg -= fs->fs_ncg;
|
|
|
|
result = (*allocator)(ip, cg, 0, size);
|
|
|
|
if (result)
|
|
|
|
return (result);
|
|
|
|
}
|
|
|
|
/*
|
|
|
|
* 3: brute force search
|
|
|
|
* Note that we start at i == 2, since 0 was checked initially,
|
|
|
|
* and 1 is always checked in the quadratic rehash.
|
|
|
|
*/
|
|
|
|
cg = (icg + 2) % fs->fs_ncg;
|
|
|
|
for (i = 2; i < fs->fs_ncg; i++) {
|
|
|
|
result = (*allocator)(ip, cg, 0, size);
|
|
|
|
if (result)
|
|
|
|
return (result);
|
|
|
|
cg++;
|
|
|
|
if (cg == fs->fs_ncg)
|
|
|
|
cg = 0;
|
|
|
|
}
|
2000-05-19 08:34:39 +04:00
|
|
|
return (0);
|
1994-06-08 15:41:58 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Determine whether a fragment can be extended.
|
|
|
|
*
|
|
|
|
* Check to see if the necessary fragments are available, and
|
|
|
|
* if they are, allocate them.
|
|
|
|
*/
|
1998-03-01 05:20:01 +03:00
|
|
|
static ufs_daddr_t
|
1994-06-08 15:41:58 +04:00
|
|
|
ffs_fragextend(ip, cg, bprev, osize, nsize)
|
|
|
|
struct inode *ip;
|
|
|
|
int cg;
|
|
|
|
long bprev;
|
|
|
|
int osize, nsize;
|
|
|
|
{
|
2000-03-30 16:41:09 +04:00
|
|
|
struct fs *fs;
|
|
|
|
struct cg *cgp;
|
1994-06-08 15:41:58 +04:00
|
|
|
struct buf *bp;
|
|
|
|
long bno;
|
|
|
|
int frags, bbase;
|
|
|
|
int i, error;
|
|
|
|
|
|
|
|
fs = ip->i_fs;
|
|
|
|
if (fs->fs_cs(fs, cg).cs_nffree < numfrags(fs, nsize - osize))
|
2000-05-19 08:34:39 +04:00
|
|
|
return (0);
|
1994-06-08 15:41:58 +04:00
|
|
|
frags = numfrags(fs, nsize);
|
|
|
|
bbase = fragnum(fs, bprev);
|
|
|
|
if (bbase > fragnum(fs, (bprev + frags - 1))) {
|
|
|
|
/* cannot extend across a block boundary */
|
2000-05-19 08:34:39 +04:00
|
|
|
return (0);
|
1994-06-08 15:41:58 +04:00
|
|
|
}
|
|
|
|
error = bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, cg)),
|
|
|
|
(int)fs->fs_cgsize, NOCRED, &bp);
|
|
|
|
if (error) {
|
|
|
|
brelse(bp);
|
2000-05-19 08:34:39 +04:00
|
|
|
return (0);
|
1994-06-08 15:41:58 +04:00
|
|
|
}
|
|
|
|
cgp = (struct cg *)bp->b_data;
|
1999-11-15 21:49:07 +03:00
|
|
|
if (!cg_chkmagic(cgp, UFS_FSNEEDSWAP(fs))) {
|
1994-06-08 15:41:58 +04:00
|
|
|
brelse(bp);
|
2000-05-19 08:34:39 +04:00
|
|
|
return (0);
|
1994-06-08 15:41:58 +04:00
|
|
|
}
|
1999-11-15 21:49:07 +03:00
|
|
|
cgp->cg_time = ufs_rw32(time.tv_sec, UFS_FSNEEDSWAP(fs));
|
1994-06-08 15:41:58 +04:00
|
|
|
bno = dtogd(fs, bprev);
|
|
|
|
for (i = numfrags(fs, osize); i < frags; i++)
|
1999-11-15 21:49:07 +03:00
|
|
|
if (isclr(cg_blksfree(cgp, UFS_FSNEEDSWAP(fs)), bno + i)) {
|
1994-06-08 15:41:58 +04:00
|
|
|
brelse(bp);
|
2000-05-19 08:34:39 +04:00
|
|
|
return (0);
|
1994-06-08 15:41:58 +04:00
|
|
|
}
|
|
|
|
/*
|
|
|
|
* the current fragment can be extended
|
|
|
|
* deduct the count on fragment being extended into
|
|
|
|
* increase the count on the remaining fragment (if any)
|
|
|
|
* allocate the extended piece
|
|
|
|
*/
|
|
|
|
for (i = frags; i < fs->fs_frag - bbase; i++)
|
1999-11-15 21:49:07 +03:00
|
|
|
if (isclr(cg_blksfree(cgp, UFS_FSNEEDSWAP(fs)), bno + i))
|
1994-06-08 15:41:58 +04:00
|
|
|
break;
|
1999-11-15 21:49:07 +03:00
|
|
|
ufs_add32(cgp->cg_frsum[i - numfrags(fs, osize)], -1, UFS_FSNEEDSWAP(fs));
|
1994-06-08 15:41:58 +04:00
|
|
|
if (i != frags)
|
1999-11-15 21:49:07 +03:00
|
|
|
ufs_add32(cgp->cg_frsum[i - frags], 1, UFS_FSNEEDSWAP(fs));
|
1994-06-08 15:41:58 +04:00
|
|
|
for (i = numfrags(fs, osize); i < frags; i++) {
|
1999-11-15 21:49:07 +03:00
|
|
|
clrbit(cg_blksfree(cgp, UFS_FSNEEDSWAP(fs)), bno + i);
|
|
|
|
ufs_add32(cgp->cg_cs.cs_nffree, -1, UFS_FSNEEDSWAP(fs));
|
1994-06-08 15:41:58 +04:00
|
|
|
fs->fs_cstotal.cs_nffree--;
|
|
|
|
fs->fs_cs(fs, cg).cs_nffree--;
|
|
|
|
}
|
|
|
|
fs->fs_fmod = 1;
|
1999-11-15 21:49:07 +03:00
|
|
|
if (DOINGSOFTDEP(ITOV(ip)))
|
|
|
|
softdep_setup_blkmapdep(bp, fs, bprev);
|
1994-06-08 15:41:58 +04:00
|
|
|
bdwrite(bp);
|
|
|
|
return (bprev);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Determine whether a block can be allocated.
|
|
|
|
*
|
|
|
|
* Check to see if a block of the appropriate size is available,
|
|
|
|
* and if it is, allocate it.
|
|
|
|
*/
|
1998-03-01 05:20:01 +03:00
|
|
|
static ufs_daddr_t
|
1994-06-08 15:41:58 +04:00
|
|
|
ffs_alloccg(ip, cg, bpref, size)
|
|
|
|
struct inode *ip;
|
|
|
|
int cg;
|
1998-03-01 05:20:01 +03:00
|
|
|
ufs_daddr_t bpref;
|
1994-06-08 15:41:58 +04:00
|
|
|
int size;
|
|
|
|
{
|
1999-11-15 21:49:07 +03:00
|
|
|
struct cg *cgp;
|
1994-06-08 15:41:58 +04:00
|
|
|
struct buf *bp;
|
1999-11-15 21:49:07 +03:00
|
|
|
ufs_daddr_t bno, blkno;
|
|
|
|
int error, frags, allocsiz, i;
|
|
|
|
struct fs *fs = ip->i_fs;
|
|
|
|
#ifdef FFS_EI
|
|
|
|
const int needswap = UFS_FSNEEDSWAP(fs);
|
|
|
|
#endif
|
1994-06-08 15:41:58 +04:00
|
|
|
|
|
|
|
if (fs->fs_cs(fs, cg).cs_nbfree == 0 && size == fs->fs_bsize)
|
2000-05-19 08:34:39 +04:00
|
|
|
return (0);
|
1994-06-08 15:41:58 +04:00
|
|
|
error = bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, cg)),
|
|
|
|
(int)fs->fs_cgsize, NOCRED, &bp);
|
|
|
|
if (error) {
|
|
|
|
brelse(bp);
|
2000-05-19 08:34:39 +04:00
|
|
|
return (0);
|
1994-06-08 15:41:58 +04:00
|
|
|
}
|
|
|
|
cgp = (struct cg *)bp->b_data;
|
1998-03-18 18:57:26 +03:00
|
|
|
if (!cg_chkmagic(cgp, needswap) ||
|
1994-06-08 15:41:58 +04:00
|
|
|
(cgp->cg_cs.cs_nbfree == 0 && size == fs->fs_bsize)) {
|
|
|
|
brelse(bp);
|
2000-05-19 08:34:39 +04:00
|
|
|
return (0);
|
1994-06-08 15:41:58 +04:00
|
|
|
}
|
1998-03-18 18:57:26 +03:00
|
|
|
cgp->cg_time = ufs_rw32(time.tv_sec, needswap);
|
1994-06-08 15:41:58 +04:00
|
|
|
if (size == fs->fs_bsize) {
|
1999-11-15 21:49:07 +03:00
|
|
|
bno = ffs_alloccgblk(ip, bp, bpref);
|
1994-06-08 15:41:58 +04:00
|
|
|
bdwrite(bp);
|
|
|
|
return (bno);
|
|
|
|
}
|
|
|
|
/*
|
|
|
|
* check to see if any fragments are already available
|
|
|
|
* allocsiz is the size which will be allocated, hacking
|
|
|
|
* it down to a smaller size if necessary
|
|
|
|
*/
|
|
|
|
frags = numfrags(fs, size);
|
|
|
|
for (allocsiz = frags; allocsiz < fs->fs_frag; allocsiz++)
|
|
|
|
if (cgp->cg_frsum[allocsiz] != 0)
|
|
|
|
break;
|
|
|
|
if (allocsiz == fs->fs_frag) {
|
|
|
|
/*
|
|
|
|
* no fragments were available, so a block will be
|
|
|
|
* allocated, and hacked up
|
|
|
|
*/
|
|
|
|
if (cgp->cg_cs.cs_nbfree == 0) {
|
|
|
|
brelse(bp);
|
2000-05-19 08:34:39 +04:00
|
|
|
return (0);
|
1994-06-08 15:41:58 +04:00
|
|
|
}
|
1999-11-15 21:49:07 +03:00
|
|
|
bno = ffs_alloccgblk(ip, bp, bpref);
|
1994-06-08 15:41:58 +04:00
|
|
|
bpref = dtogd(fs, bno);
|
|
|
|
for (i = frags; i < fs->fs_frag; i++)
|
1998-03-18 18:57:26 +03:00
|
|
|
setbit(cg_blksfree(cgp, needswap), bpref + i);
|
1994-06-08 15:41:58 +04:00
|
|
|
i = fs->fs_frag - frags;
|
1998-03-18 18:57:26 +03:00
|
|
|
ufs_add32(cgp->cg_cs.cs_nffree, i, needswap);
|
1994-06-08 15:41:58 +04:00
|
|
|
fs->fs_cstotal.cs_nffree += i;
|
1999-11-15 21:49:07 +03:00
|
|
|
fs->fs_cs(fs, cg).cs_nffree += i;
|
1994-06-08 15:41:58 +04:00
|
|
|
fs->fs_fmod = 1;
|
1998-03-18 18:57:26 +03:00
|
|
|
ufs_add32(cgp->cg_frsum[i], 1, needswap);
|
1994-06-08 15:41:58 +04:00
|
|
|
bdwrite(bp);
|
|
|
|
return (bno);
|
|
|
|
}
|
1999-11-15 21:49:07 +03:00
|
|
|
bno = ffs_mapsearch(fs, cgp, bpref, allocsiz);
|
|
|
|
#if 0
|
|
|
|
/*
|
|
|
|
* XXX fvdl mapsearch will panic, and never return -1
|
|
|
|
* also: returning NULL as ufs_daddr_t ?
|
|
|
|
*/
|
1994-06-08 15:41:58 +04:00
|
|
|
if (bno < 0) {
|
|
|
|
brelse(bp);
|
2000-05-19 08:34:39 +04:00
|
|
|
return (0);
|
1994-06-08 15:41:58 +04:00
|
|
|
}
|
1999-11-15 21:49:07 +03:00
|
|
|
#endif
|
1994-06-08 15:41:58 +04:00
|
|
|
for (i = 0; i < frags; i++)
|
1998-03-18 18:57:26 +03:00
|
|
|
clrbit(cg_blksfree(cgp, needswap), bno + i);
|
|
|
|
ufs_add32(cgp->cg_cs.cs_nffree, -frags, needswap);
|
1994-06-08 15:41:58 +04:00
|
|
|
fs->fs_cstotal.cs_nffree -= frags;
|
|
|
|
fs->fs_cs(fs, cg).cs_nffree -= frags;
|
|
|
|
fs->fs_fmod = 1;
|
1998-03-18 18:57:26 +03:00
|
|
|
ufs_add32(cgp->cg_frsum[allocsiz], -1, needswap);
|
1994-06-08 15:41:58 +04:00
|
|
|
if (frags != allocsiz)
|
1998-03-18 18:57:26 +03:00
|
|
|
ufs_add32(cgp->cg_frsum[allocsiz - frags], 1, needswap);
|
1999-11-15 21:49:07 +03:00
|
|
|
blkno = cg * fs->fs_fpg + bno;
|
|
|
|
if (DOINGSOFTDEP(ITOV(ip)))
|
|
|
|
softdep_setup_blkmapdep(bp, fs, blkno);
|
1994-06-08 15:41:58 +04:00
|
|
|
bdwrite(bp);
|
1999-11-15 21:49:07 +03:00
|
|
|
return blkno;
|
1994-06-08 15:41:58 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Allocate a block in a cylinder group.
|
|
|
|
*
|
|
|
|
* This algorithm implements the following policy:
|
|
|
|
* 1) allocate the requested block.
|
|
|
|
* 2) allocate a rotationally optimal block in the same cylinder.
|
|
|
|
* 3) allocate the next available block on the block rotor for the
|
|
|
|
* specified cylinder group.
|
|
|
|
* Note that this routine only allocates fs_bsize blocks; these
|
|
|
|
* blocks may be fragmented by the routine that allocates them.
|
|
|
|
*/
|
1998-03-01 05:20:01 +03:00
|
|
|
static ufs_daddr_t
|
1999-11-15 21:49:07 +03:00
|
|
|
ffs_alloccgblk(ip, bp, bpref)
|
|
|
|
struct inode *ip;
|
|
|
|
struct buf *bp;
|
1998-03-01 05:20:01 +03:00
|
|
|
ufs_daddr_t bpref;
|
1994-06-08 15:41:58 +04:00
|
|
|
{
|
1999-11-15 21:49:07 +03:00
|
|
|
struct cg *cgp;
|
1998-03-01 05:20:01 +03:00
|
|
|
ufs_daddr_t bno, blkno;
|
1994-06-08 15:41:58 +04:00
|
|
|
int cylno, pos, delta;
|
|
|
|
short *cylbp;
|
2000-03-30 16:41:09 +04:00
|
|
|
int i;
|
1999-11-15 21:49:07 +03:00
|
|
|
struct fs *fs = ip->i_fs;
|
|
|
|
#ifdef FFS_EI
|
|
|
|
const int needswap = UFS_FSNEEDSWAP(fs);
|
|
|
|
#endif
|
1994-06-08 15:41:58 +04:00
|
|
|
|
1999-11-15 21:49:07 +03:00
|
|
|
cgp = (struct cg *)bp->b_data;
|
|
|
|
if (bpref == 0 || dtog(fs, bpref) != ufs_rw32(cgp->cg_cgx, needswap)) {
|
1998-03-18 18:57:26 +03:00
|
|
|
bpref = ufs_rw32(cgp->cg_rotor, needswap);
|
1994-06-08 15:41:58 +04:00
|
|
|
goto norot;
|
|
|
|
}
|
|
|
|
bpref = blknum(fs, bpref);
|
|
|
|
bpref = dtogd(fs, bpref);
|
|
|
|
/*
|
|
|
|
* if the requested block is available, use it
|
|
|
|
*/
|
1998-03-18 18:57:26 +03:00
|
|
|
if (ffs_isblock(fs, cg_blksfree(cgp, needswap),
|
|
|
|
fragstoblks(fs, bpref))) {
|
1994-06-08 15:41:58 +04:00
|
|
|
bno = bpref;
|
|
|
|
goto gotit;
|
|
|
|
}
|
1998-03-01 05:20:01 +03:00
|
|
|
if (fs->fs_nrpos <= 1 || fs->fs_cpc == 0) {
|
1994-06-08 15:41:58 +04:00
|
|
|
/*
|
|
|
|
* Block layout information is not available.
|
|
|
|
* Leaving bpref unchanged means we take the
|
|
|
|
* next available free block following the one
|
|
|
|
* we just allocated. Hopefully this will at
|
|
|
|
* least hit a track cache on drives of unknown
|
|
|
|
* geometry (e.g. SCSI).
|
|
|
|
*/
|
|
|
|
goto norot;
|
|
|
|
}
|
1994-12-16 08:55:15 +03:00
|
|
|
/*
|
|
|
|
* check for a block available on the same cylinder
|
|
|
|
*/
|
|
|
|
cylno = cbtocylno(fs, bpref);
|
1998-03-18 18:57:26 +03:00
|
|
|
if (cg_blktot(cgp, needswap)[cylno] == 0)
|
1994-12-16 08:55:15 +03:00
|
|
|
goto norot;
|
1994-06-08 15:41:58 +04:00
|
|
|
/*
|
|
|
|
* check the summary information to see if a block is
|
|
|
|
* available in the requested cylinder starting at the
|
|
|
|
* requested rotational position and proceeding around.
|
|
|
|
*/
|
1998-03-18 18:57:26 +03:00
|
|
|
cylbp = cg_blks(fs, cgp, cylno, needswap);
|
1994-06-08 15:41:58 +04:00
|
|
|
pos = cbtorpos(fs, bpref);
|
|
|
|
for (i = pos; i < fs->fs_nrpos; i++)
|
1998-03-18 18:57:26 +03:00
|
|
|
if (ufs_rw16(cylbp[i], needswap) > 0)
|
1994-06-08 15:41:58 +04:00
|
|
|
break;
|
|
|
|
if (i == fs->fs_nrpos)
|
|
|
|
for (i = 0; i < pos; i++)
|
1998-03-18 18:57:26 +03:00
|
|
|
if (ufs_rw16(cylbp[i], needswap) > 0)
|
1994-06-08 15:41:58 +04:00
|
|
|
break;
|
1998-03-18 18:57:26 +03:00
|
|
|
if (ufs_rw16(cylbp[i], needswap) > 0) {
|
1994-06-08 15:41:58 +04:00
|
|
|
/*
|
|
|
|
* found a rotational position, now find the actual
|
|
|
|
* block. A panic if none is actually there.
|
|
|
|
*/
|
|
|
|
pos = cylno % fs->fs_cpc;
|
|
|
|
bno = (cylno - pos) * fs->fs_spc / NSPB(fs);
|
|
|
|
if (fs_postbl(fs, pos)[i] == -1) {
|
1996-10-13 01:58:44 +04:00
|
|
|
printf("pos = %d, i = %d, fs = %s\n",
|
1994-06-08 15:41:58 +04:00
|
|
|
pos, i, fs->fs_fsmnt);
|
|
|
|
panic("ffs_alloccgblk: cyl groups corrupted");
|
|
|
|
}
|
|
|
|
for (i = fs_postbl(fs, pos)[i];; ) {
|
1998-03-18 18:57:26 +03:00
|
|
|
if (ffs_isblock(fs, cg_blksfree(cgp, needswap), bno + i)) {
|
1994-06-08 15:41:58 +04:00
|
|
|
bno = blkstofrags(fs, (bno + i));
|
|
|
|
goto gotit;
|
|
|
|
}
|
|
|
|
delta = fs_rotbl(fs)[i];
|
|
|
|
if (delta <= 0 ||
|
|
|
|
delta + i > fragstoblks(fs, fs->fs_fpg))
|
|
|
|
break;
|
|
|
|
i += delta;
|
|
|
|
}
|
1996-10-13 01:58:44 +04:00
|
|
|
printf("pos = %d, i = %d, fs = %s\n", pos, i, fs->fs_fsmnt);
|
1994-06-08 15:41:58 +04:00
|
|
|
panic("ffs_alloccgblk: can't find blk in cyl");
|
|
|
|
}
|
|
|
|
norot:
|
|
|
|
/*
|
|
|
|
* no blocks in the requested cylinder, so take next
|
|
|
|
* available one in this cylinder group.
|
|
|
|
*/
|
1999-11-15 21:49:07 +03:00
|
|
|
bno = ffs_mapsearch(fs, cgp, bpref, (int)fs->fs_frag);
|
1994-06-08 15:41:58 +04:00
|
|
|
if (bno < 0)
|
2000-05-19 08:34:39 +04:00
|
|
|
return (0);
|
1998-03-18 18:57:26 +03:00
|
|
|
cgp->cg_rotor = ufs_rw32(bno, needswap);
|
1994-06-08 15:41:58 +04:00
|
|
|
gotit:
|
|
|
|
blkno = fragstoblks(fs, bno);
|
1998-03-18 18:57:26 +03:00
|
|
|
ffs_clrblock(fs, cg_blksfree(cgp, needswap), (long)blkno);
|
1999-11-15 21:49:07 +03:00
|
|
|
ffs_clusteracct(fs, cgp, blkno, -1);
|
1998-03-18 18:57:26 +03:00
|
|
|
ufs_add32(cgp->cg_cs.cs_nbfree, -1, needswap);
|
1994-06-08 15:41:58 +04:00
|
|
|
fs->fs_cstotal.cs_nbfree--;
|
1998-03-18 18:57:26 +03:00
|
|
|
fs->fs_cs(fs, ufs_rw32(cgp->cg_cgx, needswap)).cs_nbfree--;
|
1994-06-08 15:41:58 +04:00
|
|
|
cylno = cbtocylno(fs, bno);
|
1998-03-18 18:57:26 +03:00
|
|
|
ufs_add16(cg_blks(fs, cgp, cylno, needswap)[cbtorpos(fs, bno)], -1,
|
1999-11-15 21:49:07 +03:00
|
|
|
needswap);
|
1998-03-18 18:57:26 +03:00
|
|
|
ufs_add32(cg_blktot(cgp, needswap)[cylno], -1, needswap);
|
1994-06-08 15:41:58 +04:00
|
|
|
fs->fs_fmod = 1;
|
1999-11-15 21:49:07 +03:00
|
|
|
blkno = ufs_rw32(cgp->cg_cgx, needswap) * fs->fs_fpg + bno;
|
|
|
|
if (DOINGSOFTDEP(ITOV(ip)))
|
|
|
|
softdep_setup_blkmapdep(bp, fs, blkno);
|
|
|
|
return (blkno);
|
1994-06-08 15:41:58 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Determine whether a cluster can be allocated.
|
|
|
|
*
|
|
|
|
* We do not currently check for optimal rotational layout if there
|
|
|
|
* are multiple choices in the same cylinder group. Instead we just
|
|
|
|
* take the first one that we find following bpref.
|
|
|
|
*/
|
1998-03-01 05:20:01 +03:00
|
|
|
static ufs_daddr_t
|
1994-06-08 15:41:58 +04:00
|
|
|
ffs_clusteralloc(ip, cg, bpref, len)
|
|
|
|
struct inode *ip;
|
|
|
|
int cg;
|
1998-03-01 05:20:01 +03:00
|
|
|
ufs_daddr_t bpref;
|
1994-06-08 15:41:58 +04:00
|
|
|
int len;
|
|
|
|
{
|
2000-03-30 16:41:09 +04:00
|
|
|
struct fs *fs;
|
|
|
|
struct cg *cgp;
|
1994-06-08 15:41:58 +04:00
|
|
|
struct buf *bp;
|
1998-03-01 05:20:01 +03:00
|
|
|
int i, got, run, bno, bit, map;
|
1994-06-08 15:41:58 +04:00
|
|
|
u_char *mapp;
|
1994-12-14 16:03:35 +03:00
|
|
|
int32_t *lp;
|
1994-06-08 15:41:58 +04:00
|
|
|
|
|
|
|
fs = ip->i_fs;
|
1994-12-14 16:03:35 +03:00
|
|
|
if (fs->fs_maxcluster[cg] < len)
|
2000-05-19 08:34:39 +04:00
|
|
|
return (0);
|
1994-06-08 15:41:58 +04:00
|
|
|
if (bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, cg)), (int)fs->fs_cgsize,
|
|
|
|
NOCRED, &bp))
|
|
|
|
goto fail;
|
|
|
|
cgp = (struct cg *)bp->b_data;
|
1999-11-15 21:49:07 +03:00
|
|
|
if (!cg_chkmagic(cgp, UFS_FSNEEDSWAP(fs)))
|
1994-06-08 15:41:58 +04:00
|
|
|
goto fail;
|
|
|
|
/*
|
|
|
|
* Check to see if a cluster of the needed size (or bigger) is
|
|
|
|
* available in this cylinder group.
|
|
|
|
*/
|
1999-11-15 21:49:07 +03:00
|
|
|
lp = &cg_clustersum(cgp, UFS_FSNEEDSWAP(fs))[len];
|
1994-06-08 15:41:58 +04:00
|
|
|
for (i = len; i <= fs->fs_contigsumsize; i++)
|
1999-11-15 21:49:07 +03:00
|
|
|
if (ufs_rw32(*lp++, UFS_FSNEEDSWAP(fs)) > 0)
|
1994-06-08 15:41:58 +04:00
|
|
|
break;
|
1994-12-14 16:03:35 +03:00
|
|
|
if (i > fs->fs_contigsumsize) {
|
|
|
|
/*
|
|
|
|
* This is the first time looking for a cluster in this
|
|
|
|
* cylinder group. Update the cluster summary information
|
|
|
|
* to reflect the true maximum sized cluster so that
|
|
|
|
* future cluster allocation requests can avoid reading
|
|
|
|
* the cylinder group map only to find no clusters.
|
|
|
|
*/
|
1999-11-15 21:49:07 +03:00
|
|
|
lp = &cg_clustersum(cgp, UFS_FSNEEDSWAP(fs))[len - 1];
|
1994-12-14 16:03:35 +03:00
|
|
|
for (i = len - 1; i > 0; i--)
|
1999-11-15 21:49:07 +03:00
|
|
|
if (ufs_rw32(*lp--, UFS_FSNEEDSWAP(fs)) > 0)
|
1994-12-14 16:03:35 +03:00
|
|
|
break;
|
|
|
|
fs->fs_maxcluster[cg] = i;
|
1994-06-08 15:41:58 +04:00
|
|
|
goto fail;
|
1994-12-14 16:03:35 +03:00
|
|
|
}
|
1994-06-08 15:41:58 +04:00
|
|
|
/*
|
|
|
|
* Search the cluster map to find a big enough cluster.
|
|
|
|
* We take the first one that we find, even if it is larger
|
|
|
|
* than we need as we prefer to get one close to the previous
|
|
|
|
* block allocation. We do not search before the current
|
|
|
|
* preference point as we do not want to allocate a block
|
|
|
|
* that is allocated before the previous one (as we will
|
|
|
|
* then have to wait for another pass of the elevator
|
|
|
|
* algorithm before it will be read). We prefer to fail and
|
|
|
|
* be recalled to try an allocation in the next cylinder group.
|
|
|
|
*/
|
|
|
|
if (dtog(fs, bpref) != cg)
|
|
|
|
bpref = 0;
|
|
|
|
else
|
|
|
|
bpref = fragstoblks(fs, dtogd(fs, blknum(fs, bpref)));
|
1999-11-15 21:49:07 +03:00
|
|
|
mapp = &cg_clustersfree(cgp, UFS_FSNEEDSWAP(fs))[bpref / NBBY];
|
1994-06-08 15:41:58 +04:00
|
|
|
map = *mapp++;
|
|
|
|
bit = 1 << (bpref % NBBY);
|
1998-03-18 18:57:26 +03:00
|
|
|
for (run = 0, got = bpref;
|
1999-11-15 21:49:07 +03:00
|
|
|
got < ufs_rw32(cgp->cg_nclusterblks, UFS_FSNEEDSWAP(fs)); got++) {
|
1994-06-08 15:41:58 +04:00
|
|
|
if ((map & bit) == 0) {
|
|
|
|
run = 0;
|
|
|
|
} else {
|
|
|
|
run++;
|
|
|
|
if (run == len)
|
|
|
|
break;
|
|
|
|
}
|
1998-03-01 05:20:01 +03:00
|
|
|
if ((got & (NBBY - 1)) != (NBBY - 1)) {
|
1994-06-08 15:41:58 +04:00
|
|
|
bit <<= 1;
|
|
|
|
} else {
|
|
|
|
map = *mapp++;
|
|
|
|
bit = 1;
|
|
|
|
}
|
|
|
|
}
|
1999-11-15 21:49:07 +03:00
|
|
|
if (got == ufs_rw32(cgp->cg_nclusterblks, UFS_FSNEEDSWAP(fs)))
|
1994-06-08 15:41:58 +04:00
|
|
|
goto fail;
|
|
|
|
/*
|
|
|
|
* Allocate the cluster that we have found.
|
|
|
|
*/
|
1999-11-15 21:49:07 +03:00
|
|
|
#ifdef DIAGNOSTIC
|
1998-03-01 05:20:01 +03:00
|
|
|
for (i = 1; i <= len; i++)
|
1999-11-15 21:49:07 +03:00
|
|
|
if (!ffs_isblock(fs, cg_blksfree(cgp, UFS_FSNEEDSWAP(fs)),
|
|
|
|
got - run + i))
|
1998-03-01 05:20:01 +03:00
|
|
|
panic("ffs_clusteralloc: map mismatch");
|
1999-11-15 21:49:07 +03:00
|
|
|
#endif
|
1998-03-01 05:20:01 +03:00
|
|
|
bno = cg * fs->fs_fpg + blkstofrags(fs, got - run + 1);
|
|
|
|
if (dtog(fs, bno) != cg)
|
|
|
|
panic("ffs_clusteralloc: allocated out of group");
|
1994-06-08 15:41:58 +04:00
|
|
|
len = blkstofrags(fs, len);
|
|
|
|
for (i = 0; i < len; i += fs->fs_frag)
|
1999-11-15 21:49:07 +03:00
|
|
|
if ((got = ffs_alloccgblk(ip, bp, bno + i)) != bno + i)
|
1994-06-08 15:41:58 +04:00
|
|
|
panic("ffs_clusteralloc: lost block");
|
1995-07-19 19:47:36 +04:00
|
|
|
bdwrite(bp);
|
1994-06-08 15:41:58 +04:00
|
|
|
return (bno);
|
|
|
|
|
|
|
|
fail:
|
|
|
|
brelse(bp);
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Determine whether an inode can be allocated.
|
|
|
|
*
|
|
|
|
* Check to see if an inode is available, and if it is,
|
|
|
|
* allocate it using the following policy:
|
|
|
|
* 1) allocate the requested inode.
|
|
|
|
* 2) allocate the next available inode after the requested
|
|
|
|
* inode in the specified cylinder group.
|
|
|
|
*/
|
1998-03-01 05:20:01 +03:00
|
|
|
static ufs_daddr_t
|
1994-06-08 15:41:58 +04:00
|
|
|
ffs_nodealloccg(ip, cg, ipref, mode)
|
|
|
|
struct inode *ip;
|
|
|
|
int cg;
|
1998-03-01 05:20:01 +03:00
|
|
|
ufs_daddr_t ipref;
|
1994-06-08 15:41:58 +04:00
|
|
|
int mode;
|
|
|
|
{
|
2000-03-30 16:41:09 +04:00
|
|
|
struct cg *cgp;
|
1994-06-08 15:41:58 +04:00
|
|
|
struct buf *bp;
|
|
|
|
int error, start, len, loc, map, i;
|
2000-03-30 16:41:09 +04:00
|
|
|
struct fs *fs = ip->i_fs;
|
1998-03-18 18:57:26 +03:00
|
|
|
#ifdef FFS_EI
|
1999-11-15 21:49:07 +03:00
|
|
|
const int needswap = UFS_FSNEEDSWAP(fs);
|
1998-03-18 18:57:26 +03:00
|
|
|
#endif
|
1994-06-08 15:41:58 +04:00
|
|
|
|
|
|
|
if (fs->fs_cs(fs, cg).cs_nifree == 0)
|
2000-05-19 08:34:39 +04:00
|
|
|
return (0);
|
1994-06-08 15:41:58 +04:00
|
|
|
error = bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, cg)),
|
|
|
|
(int)fs->fs_cgsize, NOCRED, &bp);
|
|
|
|
if (error) {
|
|
|
|
brelse(bp);
|
2000-05-19 08:34:39 +04:00
|
|
|
return (0);
|
1994-06-08 15:41:58 +04:00
|
|
|
}
|
|
|
|
cgp = (struct cg *)bp->b_data;
|
1998-03-18 18:57:26 +03:00
|
|
|
if (!cg_chkmagic(cgp, needswap) || cgp->cg_cs.cs_nifree == 0) {
|
1994-06-08 15:41:58 +04:00
|
|
|
brelse(bp);
|
2000-05-19 08:34:39 +04:00
|
|
|
return (0);
|
1994-06-08 15:41:58 +04:00
|
|
|
}
|
1998-03-18 18:57:26 +03:00
|
|
|
cgp->cg_time = ufs_rw32(time.tv_sec, needswap);
|
1994-06-08 15:41:58 +04:00
|
|
|
if (ipref) {
|
|
|
|
ipref %= fs->fs_ipg;
|
1998-03-18 18:57:26 +03:00
|
|
|
if (isclr(cg_inosused(cgp, needswap), ipref))
|
1994-06-08 15:41:58 +04:00
|
|
|
goto gotit;
|
|
|
|
}
|
1998-03-18 18:57:26 +03:00
|
|
|
start = ufs_rw32(cgp->cg_irotor, needswap) / NBBY;
|
|
|
|
len = howmany(fs->fs_ipg - ufs_rw32(cgp->cg_irotor, needswap),
|
|
|
|
NBBY);
|
|
|
|
loc = skpc(0xff, len, &cg_inosused(cgp, needswap)[start]);
|
1994-06-08 15:41:58 +04:00
|
|
|
if (loc == 0) {
|
|
|
|
len = start + 1;
|
|
|
|
start = 0;
|
1998-03-18 18:57:26 +03:00
|
|
|
loc = skpc(0xff, len, &cg_inosused(cgp, needswap)[0]);
|
1994-06-08 15:41:58 +04:00
|
|
|
if (loc == 0) {
|
1996-10-13 01:58:44 +04:00
|
|
|
printf("cg = %d, irotor = %d, fs = %s\n",
|
1998-03-18 18:57:26 +03:00
|
|
|
cg, ufs_rw32(cgp->cg_irotor, needswap),
|
|
|
|
fs->fs_fsmnt);
|
1994-06-08 15:41:58 +04:00
|
|
|
panic("ffs_nodealloccg: map corrupted");
|
|
|
|
/* NOTREACHED */
|
|
|
|
}
|
|
|
|
}
|
|
|
|
i = start + len - loc;
|
1998-03-18 18:57:26 +03:00
|
|
|
map = cg_inosused(cgp, needswap)[i];
|
1994-06-08 15:41:58 +04:00
|
|
|
ipref = i * NBBY;
|
|
|
|
for (i = 1; i < (1 << NBBY); i <<= 1, ipref++) {
|
|
|
|
if ((map & i) == 0) {
|
1998-03-18 18:57:26 +03:00
|
|
|
cgp->cg_irotor = ufs_rw32(ipref, needswap);
|
1994-06-08 15:41:58 +04:00
|
|
|
goto gotit;
|
|
|
|
}
|
|
|
|
}
|
1996-10-13 01:58:44 +04:00
|
|
|
printf("fs = %s\n", fs->fs_fsmnt);
|
1994-06-08 15:41:58 +04:00
|
|
|
panic("ffs_nodealloccg: block not in map");
|
|
|
|
/* NOTREACHED */
|
|
|
|
gotit:
|
1999-11-15 21:49:07 +03:00
|
|
|
if (DOINGSOFTDEP(ITOV(ip)))
|
|
|
|
softdep_setup_inomapdep(bp, ip, cg * fs->fs_ipg + ipref);
|
1998-03-18 18:57:26 +03:00
|
|
|
setbit(cg_inosused(cgp, needswap), ipref);
|
|
|
|
ufs_add32(cgp->cg_cs.cs_nifree, -1, needswap);
|
1994-06-08 15:41:58 +04:00
|
|
|
fs->fs_cstotal.cs_nifree--;
|
1999-11-15 21:49:07 +03:00
|
|
|
fs->fs_cs(fs, cg).cs_nifree--;
|
1994-06-08 15:41:58 +04:00
|
|
|
fs->fs_fmod = 1;
|
|
|
|
if ((mode & IFMT) == IFDIR) {
|
1998-03-18 18:57:26 +03:00
|
|
|
ufs_add32(cgp->cg_cs.cs_ndir, 1, needswap);
|
1994-06-08 15:41:58 +04:00
|
|
|
fs->fs_cstotal.cs_ndir++;
|
|
|
|
fs->fs_cs(fs, cg).cs_ndir++;
|
|
|
|
}
|
|
|
|
bdwrite(bp);
|
|
|
|
return (cg * fs->fs_ipg + ipref);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Free a block or fragment.
|
|
|
|
*
|
|
|
|
* The specified block or fragment is placed back in the
|
|
|
|
* free map. If a fragment is deallocated, a possible
|
|
|
|
* block reassembly is checked.
|
|
|
|
*/
|
1996-02-10 01:22:18 +03:00
|
|
|
void
|
1994-06-08 15:41:58 +04:00
|
|
|
ffs_blkfree(ip, bno, size)
|
2000-03-30 16:41:09 +04:00
|
|
|
struct inode *ip;
|
1998-03-01 05:20:01 +03:00
|
|
|
ufs_daddr_t bno;
|
1994-06-08 15:41:58 +04:00
|
|
|
long size;
|
|
|
|
{
|
2000-03-30 16:41:09 +04:00
|
|
|
struct cg *cgp;
|
1994-06-08 15:41:58 +04:00
|
|
|
struct buf *bp;
|
1998-03-01 05:20:01 +03:00
|
|
|
ufs_daddr_t blkno;
|
1994-06-08 15:41:58 +04:00
|
|
|
int i, error, cg, blk, frags, bbase;
|
2000-03-30 16:41:09 +04:00
|
|
|
struct fs *fs = ip->i_fs;
|
1999-11-15 21:49:07 +03:00
|
|
|
const int needswap = UFS_FSNEEDSWAP(fs);
|
1994-06-08 15:41:58 +04:00
|
|
|
|
1999-11-15 21:49:07 +03:00
|
|
|
if ((u_int)size > fs->fs_bsize || fragoff(fs, size) != 0 ||
|
|
|
|
fragnum(fs, bno) + numfrags(fs, size) > fs->fs_frag) {
|
|
|
|
printf("dev = 0x%x, bno = %u bsize = %d, size = %ld, fs = %s\n",
|
|
|
|
ip->i_dev, bno, fs->fs_bsize, size, fs->fs_fsmnt);
|
1994-06-08 15:41:58 +04:00
|
|
|
panic("blkfree: bad size");
|
|
|
|
}
|
|
|
|
cg = dtog(fs, bno);
|
|
|
|
if ((u_int)bno >= fs->fs_size) {
|
1996-10-13 01:58:44 +04:00
|
|
|
printf("bad block %d, ino %d\n", bno, ip->i_number);
|
1997-06-11 14:09:37 +04:00
|
|
|
ffs_fserr(fs, ip->i_ffs_uid, "bad block");
|
1994-06-08 15:41:58 +04:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
error = bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, cg)),
|
|
|
|
(int)fs->fs_cgsize, NOCRED, &bp);
|
|
|
|
if (error) {
|
|
|
|
brelse(bp);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
cgp = (struct cg *)bp->b_data;
|
1998-03-18 18:57:26 +03:00
|
|
|
if (!cg_chkmagic(cgp, needswap)) {
|
1994-06-08 15:41:58 +04:00
|
|
|
brelse(bp);
|
|
|
|
return;
|
|
|
|
}
|
1998-03-18 18:57:26 +03:00
|
|
|
cgp->cg_time = ufs_rw32(time.tv_sec, needswap);
|
1994-06-08 15:41:58 +04:00
|
|
|
bno = dtogd(fs, bno);
|
|
|
|
if (size == fs->fs_bsize) {
|
|
|
|
blkno = fragstoblks(fs, bno);
|
1999-11-15 21:49:07 +03:00
|
|
|
if (!ffs_isfreeblock(fs, cg_blksfree(cgp, needswap), blkno)) {
|
1996-10-13 01:58:44 +04:00
|
|
|
printf("dev = 0x%x, block = %d, fs = %s\n",
|
1994-06-08 15:41:58 +04:00
|
|
|
ip->i_dev, bno, fs->fs_fsmnt);
|
|
|
|
panic("blkfree: freeing free block");
|
|
|
|
}
|
1998-03-18 18:57:26 +03:00
|
|
|
ffs_setblock(fs, cg_blksfree(cgp, needswap), blkno);
|
1999-11-15 21:49:07 +03:00
|
|
|
ffs_clusteracct(fs, cgp, blkno, 1);
|
1998-03-18 18:57:26 +03:00
|
|
|
ufs_add32(cgp->cg_cs.cs_nbfree, 1, needswap);
|
1994-06-08 15:41:58 +04:00
|
|
|
fs->fs_cstotal.cs_nbfree++;
|
|
|
|
fs->fs_cs(fs, cg).cs_nbfree++;
|
|
|
|
i = cbtocylno(fs, bno);
|
1998-03-18 18:57:26 +03:00
|
|
|
ufs_add16(cg_blks(fs, cgp, i, needswap)[cbtorpos(fs, bno)], 1,
|
1999-11-15 21:49:07 +03:00
|
|
|
needswap);
|
1998-03-18 18:57:26 +03:00
|
|
|
ufs_add32(cg_blktot(cgp, needswap)[i], 1, needswap);
|
1994-06-08 15:41:58 +04:00
|
|
|
} else {
|
|
|
|
bbase = bno - fragnum(fs, bno);
|
|
|
|
/*
|
|
|
|
* decrement the counts associated with the old frags
|
|
|
|
*/
|
1998-03-18 18:57:26 +03:00
|
|
|
blk = blkmap(fs, cg_blksfree(cgp, needswap), bbase);
|
|
|
|
ffs_fragacct(fs, blk, cgp->cg_frsum, -1, needswap);
|
1994-06-08 15:41:58 +04:00
|
|
|
/*
|
|
|
|
* deallocate the fragment
|
|
|
|
*/
|
|
|
|
frags = numfrags(fs, size);
|
|
|
|
for (i = 0; i < frags; i++) {
|
1998-03-18 18:57:26 +03:00
|
|
|
if (isset(cg_blksfree(cgp, needswap), bno + i)) {
|
1996-10-13 01:58:44 +04:00
|
|
|
printf("dev = 0x%x, block = %d, fs = %s\n",
|
1994-06-08 15:41:58 +04:00
|
|
|
ip->i_dev, bno + i, fs->fs_fsmnt);
|
|
|
|
panic("blkfree: freeing free frag");
|
|
|
|
}
|
1998-03-18 18:57:26 +03:00
|
|
|
setbit(cg_blksfree(cgp, needswap), bno + i);
|
1994-06-08 15:41:58 +04:00
|
|
|
}
|
1998-03-18 18:57:26 +03:00
|
|
|
ufs_add32(cgp->cg_cs.cs_nffree, i, needswap);
|
1994-06-08 15:41:58 +04:00
|
|
|
fs->fs_cstotal.cs_nffree += i;
|
1999-11-15 21:49:07 +03:00
|
|
|
fs->fs_cs(fs, cg).cs_nffree += i;
|
1994-06-08 15:41:58 +04:00
|
|
|
/*
|
|
|
|
* add back in counts associated with the new frags
|
|
|
|
*/
|
1998-03-18 18:57:26 +03:00
|
|
|
blk = blkmap(fs, cg_blksfree(cgp, needswap), bbase);
|
|
|
|
ffs_fragacct(fs, blk, cgp->cg_frsum, 1, needswap);
|
1994-06-08 15:41:58 +04:00
|
|
|
/*
|
|
|
|
* if a complete block has been reassembled, account for it
|
|
|
|
*/
|
|
|
|
blkno = fragstoblks(fs, bbase);
|
1998-03-18 18:57:26 +03:00
|
|
|
if (ffs_isblock(fs, cg_blksfree(cgp, needswap), blkno)) {
|
|
|
|
ufs_add32(cgp->cg_cs.cs_nffree, -fs->fs_frag, needswap);
|
1994-06-08 15:41:58 +04:00
|
|
|
fs->fs_cstotal.cs_nffree -= fs->fs_frag;
|
|
|
|
fs->fs_cs(fs, cg).cs_nffree -= fs->fs_frag;
|
1999-11-15 21:49:07 +03:00
|
|
|
ffs_clusteracct(fs, cgp, blkno, 1);
|
1998-03-18 18:57:26 +03:00
|
|
|
ufs_add32(cgp->cg_cs.cs_nbfree, 1, needswap);
|
1994-06-08 15:41:58 +04:00
|
|
|
fs->fs_cstotal.cs_nbfree++;
|
|
|
|
fs->fs_cs(fs, cg).cs_nbfree++;
|
|
|
|
i = cbtocylno(fs, bbase);
|
1999-11-15 21:49:07 +03:00
|
|
|
ufs_add16(cg_blks(fs, cgp, i, needswap)[cbtorpos(fs,
|
|
|
|
bbase)], 1,
|
|
|
|
needswap);
|
1998-03-18 18:57:26 +03:00
|
|
|
ufs_add32(cg_blktot(cgp, needswap)[i], 1, needswap);
|
1994-06-08 15:41:58 +04:00
|
|
|
}
|
|
|
|
}
|
|
|
|
fs->fs_fmod = 1;
|
|
|
|
bdwrite(bp);
|
|
|
|
}
|
|
|
|
|
1998-03-01 05:20:01 +03:00
|
|
|
#if defined(DIAGNOSTIC) || defined(DEBUG)
/*
 * Diagnostic check of the allocation state of a block or fragment run.
 *
 * Returns non-zero when the space starting at bno and covering size
 * bytes is allocated, and zero when it is free.  Zero is also returned
 * when the cylinder group cannot be read or fails its magic check.
 *
 * Panics on a size that is too large or has a non-zero fragoff(), on a
 * bno at or beyond fs_size, and on a fragment run of which only a part
 * is free.
 */
static int
ffs_checkblk(ip, bno, size)
	struct inode *ip;
	ufs_daddr_t bno;
	long size;
{
	struct fs *fs = ip->i_fs;
	struct cg *cgp;
	struct buf *bp;
	u_char *freemap;
	int allocated, error, n, nfrags, nfree;

	if ((u_int)size > fs->fs_bsize || fragoff(fs, size) != 0) {
		printf("bsize = %d, size = %ld, fs = %s\n",
		    fs->fs_bsize, size, fs->fs_fsmnt);
		panic("checkblk: bad size");
	}
	if ((u_int)bno >= fs->fs_size)
		panic("checkblk: bad block %d", bno);

	/* Result reported when the cylinder group is unusable. */
	allocated = 0;

	error = bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, dtog(fs, bno))),
	    (int)fs->fs_cgsize, NOCRED, &bp);
	if (error)
		goto out;
	cgp = (struct cg *)bp->b_data;
	if (!cg_chkmagic(cgp, UFS_FSNEEDSWAP(fs)))
		goto out;

	freemap = (u_char *)cg_blksfree(cgp, UFS_FSNEEDSWAP(fs));
	bno = dtogd(fs, bno);
	if (size == fs->fs_bsize) {
		nfree = ffs_isblock(fs, freemap, fragstoblks(fs, bno));
	} else {
		/* Count the free fragments: it must be all of them or none. */
		nfrags = numfrags(fs, size);
		nfree = 0;
		for (n = 0; n < nfrags; n++) {
			if (isset(freemap, bno + n))
				nfree++;
		}
		if (nfree != 0 && nfree != nfrags)
			panic("checkblk: partially free fragment");
	}
	allocated = !nfree;

out:
	brelse(bp);
	return (allocated);
}
#endif /* DIAGNOSTIC || DEBUG */
|
|
|
|
|
1994-06-08 15:41:58 +04:00
|
|
|
/*
|
|
|
|
* Free an inode.
|
|
|
|
*/
|
|
|
|
int
|
1996-02-10 01:22:18 +03:00
|
|
|
ffs_vfree(v)
|
|
|
|
void *v;
|
|
|
|
{
|
1994-06-08 15:41:58 +04:00
|
|
|
struct vop_vfree_args /* {
|
|
|
|
struct vnode *a_pvp;
|
|
|
|
ino_t a_ino;
|
|
|
|
int a_mode;
|
1996-02-10 01:22:18 +03:00
|
|
|
} */ *ap = v;
|
1999-11-15 21:49:07 +03:00
|
|
|
|
|
|
|
if (DOINGSOFTDEP(ap->a_pvp)) {
|
|
|
|
softdep_freefile(ap);
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
return (ffs_freefile(ap));
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Do the actual free operation.
|
|
|
|
* The specified inode is placed back in the free map.
|
|
|
|
*/
|
|
|
|
int
|
|
|
|
ffs_freefile(v)
|
|
|
|
void *v;
|
|
|
|
{
|
|
|
|
struct vop_vfree_args /* {
|
|
|
|
struct vnode *a_pvp;
|
|
|
|
ino_t a_ino;
|
|
|
|
int a_mode;
|
|
|
|
} */ *ap = v;
|
2000-03-30 16:41:09 +04:00
|
|
|
struct cg *cgp;
|
|
|
|
struct inode *pip = VTOI(ap->a_pvp);
|
|
|
|
struct fs *fs = pip->i_fs;
|
1994-06-08 15:41:58 +04:00
|
|
|
ino_t ino = ap->a_ino;
|
|
|
|
struct buf *bp;
|
|
|
|
int error, cg;
|
1998-03-18 18:57:26 +03:00
|
|
|
#ifdef FFS_EI
|
1999-11-15 21:49:07 +03:00
|
|
|
const int needswap = UFS_FSNEEDSWAP(fs);
|
1998-03-18 18:57:26 +03:00
|
|
|
#endif
|
1994-06-08 15:41:58 +04:00
|
|
|
|
|
|
|
if ((u_int)ino >= fs->fs_ipg * fs->fs_ncg)
|
|
|
|
panic("ifree: range: dev = 0x%x, ino = %d, fs = %s\n",
|
|
|
|
pip->i_dev, ino, fs->fs_fsmnt);
|
|
|
|
cg = ino_to_cg(fs, ino);
|
|
|
|
error = bread(pip->i_devvp, fsbtodb(fs, cgtod(fs, cg)),
|
|
|
|
(int)fs->fs_cgsize, NOCRED, &bp);
|
|
|
|
if (error) {
|
|
|
|
brelse(bp);
|
1999-11-15 21:49:07 +03:00
|
|
|
return (error);
|
1994-06-08 15:41:58 +04:00
|
|
|
}
|
|
|
|
cgp = (struct cg *)bp->b_data;
|
1998-03-18 18:57:26 +03:00
|
|
|
if (!cg_chkmagic(cgp, needswap)) {
|
1994-06-08 15:41:58 +04:00
|
|
|
brelse(bp);
|
|
|
|
return (0);
|
|
|
|
}
|
1998-03-18 18:57:26 +03:00
|
|
|
cgp->cg_time = ufs_rw32(time.tv_sec, needswap);
|
1994-06-08 15:41:58 +04:00
|
|
|
ino %= fs->fs_ipg;
|
1998-03-18 18:57:26 +03:00
|
|
|
if (isclr(cg_inosused(cgp, needswap), ino)) {
|
1996-10-13 01:58:44 +04:00
|
|
|
printf("dev = 0x%x, ino = %d, fs = %s\n",
|
1994-06-08 15:41:58 +04:00
|
|
|
pip->i_dev, ino, fs->fs_fsmnt);
|
|
|
|
if (fs->fs_ronly == 0)
|
|
|
|
panic("ifree: freeing free inode");
|
|
|
|
}
|
1998-03-18 18:57:26 +03:00
|
|
|
clrbit(cg_inosused(cgp, needswap), ino);
|
|
|
|
if (ino < ufs_rw32(cgp->cg_irotor, needswap))
|
|
|
|
cgp->cg_irotor = ufs_rw32(ino, needswap);
|
|
|
|
ufs_add32(cgp->cg_cs.cs_nifree, 1, needswap);
|
1994-06-08 15:41:58 +04:00
|
|
|
fs->fs_cstotal.cs_nifree++;
|
|
|
|
fs->fs_cs(fs, cg).cs_nifree++;
|
|
|
|
if ((ap->a_mode & IFMT) == IFDIR) {
|
1998-03-18 18:57:26 +03:00
|
|
|
ufs_add32(cgp->cg_cs.cs_ndir, -1, needswap);
|
1994-06-08 15:41:58 +04:00
|
|
|
fs->fs_cstotal.cs_ndir--;
|
|
|
|
fs->fs_cs(fs, cg).cs_ndir--;
|
|
|
|
}
|
|
|
|
fs->fs_fmod = 1;
|
|
|
|
bdwrite(bp);
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Find a block of the specified size in the specified cylinder group.
|
|
|
|
*
|
|
|
|
* It is a panic if a request is made to find a block if none are
|
|
|
|
* available.
|
|
|
|
*/
|
1998-03-01 05:20:01 +03:00
|
|
|
static ufs_daddr_t
|
1999-11-15 21:49:07 +03:00
|
|
|
ffs_mapsearch(fs, cgp, bpref, allocsiz)
|
2000-03-30 16:41:09 +04:00
|
|
|
struct fs *fs;
|
|
|
|
struct cg *cgp;
|
1998-03-01 05:20:01 +03:00
|
|
|
ufs_daddr_t bpref;
|
1994-06-08 15:41:58 +04:00
|
|
|
int allocsiz;
|
|
|
|
{
|
1998-03-01 05:20:01 +03:00
|
|
|
ufs_daddr_t bno;
|
1994-06-08 15:41:58 +04:00
|
|
|
int start, len, loc, i;
|
|
|
|
int blk, field, subfield, pos;
|
1998-03-18 18:57:26 +03:00
|
|
|
int ostart, olen;
|
1999-11-15 21:49:07 +03:00
|
|
|
#ifdef FFS_EI
|
|
|
|
const int needswap = UFS_FSNEEDSWAP(fs);
|
|
|
|
#endif
|
1994-06-08 15:41:58 +04:00
|
|
|
|
|
|
|
/*
|
|
|
|
* find the fragment by searching through the free block
|
|
|
|
* map for an appropriate bit pattern
|
|
|
|
*/
|
|
|
|
if (bpref)
|
|
|
|
start = dtogd(fs, bpref) / NBBY;
|
|
|
|
else
|
1998-03-18 18:57:26 +03:00
|
|
|
start = ufs_rw32(cgp->cg_frotor, needswap) / NBBY;
|
1994-06-08 15:41:58 +04:00
|
|
|
len = howmany(fs->fs_fpg, NBBY) - start;
|
1998-03-18 18:57:26 +03:00
|
|
|
ostart = start;
|
|
|
|
olen = len;
|
2001-08-09 12:16:42 +04:00
|
|
|
loc = scanc((u_int)len,
|
|
|
|
(const u_char *)&cg_blksfree(cgp, needswap)[start],
|
|
|
|
(const u_char *)fragtbl[fs->fs_frag],
|
|
|
|
(1 << (allocsiz - 1 + (fs->fs_frag % NBBY))));
|
1994-06-08 15:41:58 +04:00
|
|
|
if (loc == 0) {
|
|
|
|
len = start + 1;
|
|
|
|
start = 0;
|
2001-08-09 12:16:42 +04:00
|
|
|
loc = scanc((u_int)len,
|
|
|
|
(const u_char *)&cg_blksfree(cgp, needswap)[0],
|
|
|
|
(const u_char *)fragtbl[fs->fs_frag],
|
|
|
|
(1 << (allocsiz - 1 + (fs->fs_frag % NBBY))));
|
1994-06-08 15:41:58 +04:00
|
|
|
if (loc == 0) {
|
1996-10-13 01:58:44 +04:00
|
|
|
printf("start = %d, len = %d, fs = %s\n",
|
1998-03-18 18:57:26 +03:00
|
|
|
ostart, olen, fs->fs_fsmnt);
|
1998-03-19 06:42:35 +03:00
|
|
|
printf("offset=%d %ld\n",
|
1998-03-18 18:57:26 +03:00
|
|
|
ufs_rw32(cgp->cg_freeoff, needswap),
|
1998-03-19 06:42:35 +03:00
|
|
|
(long)cg_blksfree(cgp, needswap) - (long)cgp);
|
1994-06-08 15:41:58 +04:00
|
|
|
panic("ffs_alloccg: map corrupted");
|
|
|
|
/* NOTREACHED */
|
|
|
|
}
|
|
|
|
}
|
|
|
|
bno = (start + len - loc) * NBBY;
|
1998-03-18 18:57:26 +03:00
|
|
|
cgp->cg_frotor = ufs_rw32(bno, needswap);
|
1994-06-08 15:41:58 +04:00
|
|
|
/*
|
|
|
|
* found the byte in the map
|
|
|
|
* sift through the bits to find the selected frag
|
|
|
|
*/
|
|
|
|
for (i = bno + NBBY; bno < i; bno += fs->fs_frag) {
|
1998-03-18 18:57:26 +03:00
|
|
|
blk = blkmap(fs, cg_blksfree(cgp, needswap), bno);
|
1994-06-08 15:41:58 +04:00
|
|
|
blk <<= 1;
|
|
|
|
field = around[allocsiz];
|
|
|
|
subfield = inside[allocsiz];
|
|
|
|
for (pos = 0; pos <= fs->fs_frag - allocsiz; pos++) {
|
|
|
|
if ((blk & field) == subfield)
|
|
|
|
return (bno + pos);
|
|
|
|
field <<= 1;
|
|
|
|
subfield <<= 1;
|
|
|
|
}
|
|
|
|
}
|
1996-10-13 01:58:44 +04:00
|
|
|
printf("bno = %d, fs = %s\n", bno, fs->fs_fsmnt);
|
1994-06-08 15:41:58 +04:00
|
|
|
panic("ffs_alloccg: block not in map");
|
|
|
|
return (-1);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Update the cluster map because of an allocation or free.
|
|
|
|
*
|
|
|
|
* Cnt == 1 means free; cnt == -1 means allocating.
|
|
|
|
*/
|
1996-02-10 01:22:18 +03:00
|
|
|
void
|
1999-11-15 21:49:07 +03:00
|
|
|
ffs_clusteracct(fs, cgp, blkno, cnt)
|
1994-06-08 15:41:58 +04:00
|
|
|
struct fs *fs;
|
|
|
|
struct cg *cgp;
|
1998-03-01 05:20:01 +03:00
|
|
|
ufs_daddr_t blkno;
|
1994-06-08 15:41:58 +04:00
|
|
|
int cnt;
|
|
|
|
{
|
1994-10-20 07:20:55 +03:00
|
|
|
int32_t *sump;
|
1994-12-14 16:03:35 +03:00
|
|
|
int32_t *lp;
|
1994-06-08 15:41:58 +04:00
|
|
|
u_char *freemapp, *mapp;
|
|
|
|
int i, start, end, forw, back, map, bit;
|
1999-11-15 21:49:07 +03:00
|
|
|
#ifdef FFS_EI
|
|
|
|
const int needswap = UFS_FSNEEDSWAP(fs);
|
|
|
|
#endif
|
1994-06-08 15:41:58 +04:00
|
|
|
|
|
|
|
if (fs->fs_contigsumsize <= 0)
|
|
|
|
return;
|
1998-03-18 18:57:26 +03:00
|
|
|
freemapp = cg_clustersfree(cgp, needswap);
|
|
|
|
sump = cg_clustersum(cgp, needswap);
|
1994-06-08 15:41:58 +04:00
|
|
|
/*
|
|
|
|
* Allocate or clear the actual block.
|
|
|
|
*/
|
|
|
|
if (cnt > 0)
|
|
|
|
setbit(freemapp, blkno);
|
|
|
|
else
|
|
|
|
clrbit(freemapp, blkno);
|
|
|
|
/*
|
|
|
|
* Find the size of the cluster going forward.
|
|
|
|
*/
|
|
|
|
start = blkno + 1;
|
|
|
|
end = start + fs->fs_contigsumsize;
|
1998-03-18 18:57:26 +03:00
|
|
|
if (end >= ufs_rw32(cgp->cg_nclusterblks, needswap))
|
|
|
|
end = ufs_rw32(cgp->cg_nclusterblks, needswap);
|
1994-06-08 15:41:58 +04:00
|
|
|
mapp = &freemapp[start / NBBY];
|
|
|
|
map = *mapp++;
|
|
|
|
bit = 1 << (start % NBBY);
|
|
|
|
for (i = start; i < end; i++) {
|
|
|
|
if ((map & bit) == 0)
|
|
|
|
break;
|
|
|
|
if ((i & (NBBY - 1)) != (NBBY - 1)) {
|
|
|
|
bit <<= 1;
|
|
|
|
} else {
|
|
|
|
map = *mapp++;
|
|
|
|
bit = 1;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
forw = i - start;
|
|
|
|
/*
|
|
|
|
* Find the size of the cluster going backward.
|
|
|
|
*/
|
|
|
|
start = blkno - 1;
|
|
|
|
end = start - fs->fs_contigsumsize;
|
|
|
|
if (end < 0)
|
|
|
|
end = -1;
|
|
|
|
mapp = &freemapp[start / NBBY];
|
|
|
|
map = *mapp--;
|
|
|
|
bit = 1 << (start % NBBY);
|
|
|
|
for (i = start; i > end; i--) {
|
|
|
|
if ((map & bit) == 0)
|
|
|
|
break;
|
|
|
|
if ((i & (NBBY - 1)) != 0) {
|
|
|
|
bit >>= 1;
|
|
|
|
} else {
|
|
|
|
map = *mapp--;
|
|
|
|
bit = 1 << (NBBY - 1);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
back = start - i;
|
|
|
|
/*
|
|
|
|
* Account for old cluster and the possibly new forward and
|
|
|
|
* back clusters.
|
|
|
|
*/
|
|
|
|
i = back + forw + 1;
|
|
|
|
if (i > fs->fs_contigsumsize)
|
|
|
|
i = fs->fs_contigsumsize;
|
1998-03-18 18:57:26 +03:00
|
|
|
ufs_add32(sump[i], cnt, needswap);
|
1994-06-08 15:41:58 +04:00
|
|
|
if (back > 0)
|
1998-03-18 18:57:26 +03:00
|
|
|
ufs_add32(sump[back], -cnt, needswap);
|
1994-06-08 15:41:58 +04:00
|
|
|
if (forw > 0)
|
1998-03-18 18:57:26 +03:00
|
|
|
ufs_add32(sump[forw], -cnt, needswap);
|
|
|
|
|
1994-12-14 16:03:35 +03:00
|
|
|
/*
|
|
|
|
* Update cluster summary information.
|
|
|
|
*/
|
|
|
|
lp = &sump[fs->fs_contigsumsize];
|
|
|
|
for (i = fs->fs_contigsumsize; i > 0; i--)
|
1998-03-18 18:57:26 +03:00
|
|
|
if (ufs_rw32(*lp--, needswap) > 0)
|
1994-12-14 16:03:35 +03:00
|
|
|
break;
|
1998-03-18 18:57:26 +03:00
|
|
|
fs->fs_maxcluster[ufs_rw32(cgp->cg_cgx, needswap)] = i;
|
1994-06-08 15:41:58 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Fserr prints the name of a file system with an error diagnostic.
|
|
|
|
*
|
|
|
|
* The form of the error message is:
|
|
|
|
* fs: error message
|
|
|
|
*/
|
|
|
|
static void
|
|
|
|
ffs_fserr(fs, uid, cp)
|
|
|
|
struct fs *fs;
|
|
|
|
u_int uid;
|
|
|
|
char *cp;
|
|
|
|
{
|
|
|
|
|
2000-11-27 11:39:39 +03:00
|
|
|
log(LOG_ERR, "uid %d comm %s on %s: %s\n",
|
|
|
|
uid, curproc->p_comm, fs->fs_fsmnt, cp);
|
1994-06-08 15:41:58 +04:00
|
|
|
}
|