Age | Commit message (Collapse) | Author | Files | Lines |
|
All file_operations should get a .llseek operation so we can make
nonseekable_open the default for future file operations without a
.llseek pointer.
The three cases that we can automatically detect are no_llseek, seq_lseek
and default_llseek. For cases where we can we can automatically prove that
the file offset is always ignored, we use noop_llseek, which maintains
the current behavior of not returning an error from a seek.
New drivers should normally not use noop_llseek but instead use no_llseek
and call nonseekable_open at open time. Existing drivers can be converted
to do the same when the maintainer knows for certain that no user code
relies on calling seek on the device file.
The generated code is often incorrectly indented and right now contains
comments that clarify for each added line why a specific variant was
chosen. In the version that gets submitted upstream, the comments will
be gone and I will manually fix the indentation, because there does not
seem to be a way to do that using coccinelle.
Some amount of new code is currently sitting in linux-next that should get
the same modifications, which I will do at the end of the merge window.
Many thanks to Julia Lawall for helping me learn to write a semantic
patch that does all this.
===== begin semantic patch =====
// This adds an llseek= method to all file operations,
// as a preparation for making no_llseek the default.
//
// The rules are
// - use no_llseek explicitly if we do nonseekable_open
// - use seq_lseek for sequential files
// - use default_llseek if we know we access f_pos
// - use noop_llseek if we know we don't access f_pos,
// but we still want to allow users to call lseek
//
@ open1 exists @
identifier nested_open;
@@
nested_open(...)
{
<+...
nonseekable_open(...)
...+>
}
@ open exists@
identifier open_f;
identifier i, f;
identifier open1.nested_open;
@@
int open_f(struct inode *i, struct file *f)
{
<+...
(
nonseekable_open(...)
|
nested_open(...)
)
...+>
}
@ read disable optional_qualifier exists @
identifier read_f;
identifier f, p, s, off;
type ssize_t, size_t, loff_t;
expression E;
identifier func;
@@
ssize_t read_f(struct file *f, char *p, size_t s, loff_t *off)
{
<+...
(
*off = E
|
*off += E
|
func(..., off, ...)
|
E = *off
)
...+>
}
@ read_no_fpos disable optional_qualifier exists @
identifier read_f;
identifier f, p, s, off;
type ssize_t, size_t, loff_t;
@@
ssize_t read_f(struct file *f, char *p, size_t s, loff_t *off)
{
... when != off
}
@ write @
identifier write_f;
identifier f, p, s, off;
type ssize_t, size_t, loff_t;
expression E;
identifier func;
@@
ssize_t write_f(struct file *f, const char *p, size_t s, loff_t *off)
{
<+...
(
*off = E
|
*off += E
|
func(..., off, ...)
|
E = *off
)
...+>
}
@ write_no_fpos @
identifier write_f;
identifier f, p, s, off;
type ssize_t, size_t, loff_t;
@@
ssize_t write_f(struct file *f, const char *p, size_t s, loff_t *off)
{
... when != off
}
@ fops0 @
identifier fops;
@@
struct file_operations fops = {
...
};
@ has_llseek depends on fops0 @
identifier fops0.fops;
identifier llseek_f;
@@
struct file_operations fops = {
...
.llseek = llseek_f,
...
};
@ has_read depends on fops0 @
identifier fops0.fops;
identifier read_f;
@@
struct file_operations fops = {
...
.read = read_f,
...
};
@ has_write depends on fops0 @
identifier fops0.fops;
identifier write_f;
@@
struct file_operations fops = {
...
.write = write_f,
...
};
@ has_open depends on fops0 @
identifier fops0.fops;
identifier open_f;
@@
struct file_operations fops = {
...
.open = open_f,
...
};
// use no_llseek if we call nonseekable_open
////////////////////////////////////////////
@ nonseekable1 depends on !has_llseek && has_open @
identifier fops0.fops;
identifier nso ~= "nonseekable_open";
@@
struct file_operations fops = {
... .open = nso, ...
+.llseek = no_llseek, /* nonseekable */
};
@ nonseekable2 depends on !has_llseek @
identifier fops0.fops;
identifier open.open_f;
@@
struct file_operations fops = {
... .open = open_f, ...
+.llseek = no_llseek, /* open uses nonseekable */
};
// use seq_lseek for sequential files
/////////////////////////////////////
@ seq depends on !has_llseek @
identifier fops0.fops;
identifier sr ~= "seq_read";
@@
struct file_operations fops = {
... .read = sr, ...
+.llseek = seq_lseek, /* we have seq_read */
};
// use default_llseek if there is a readdir
///////////////////////////////////////////
@ fops1 depends on !has_llseek && !nonseekable1 && !nonseekable2 && !seq @
identifier fops0.fops;
identifier readdir_e;
@@
// any other fop is used that changes pos
struct file_operations fops = {
... .readdir = readdir_e, ...
+.llseek = default_llseek, /* readdir is present */
};
// use default_llseek if at least one of read/write touches f_pos
/////////////////////////////////////////////////////////////////
@ fops2 depends on !fops1 && !has_llseek && !nonseekable1 && !nonseekable2 && !seq @
identifier fops0.fops;
identifier read.read_f;
@@
// read fops use offset
struct file_operations fops = {
... .read = read_f, ...
+.llseek = default_llseek, /* read accesses f_pos */
};
@ fops3 depends on !fops1 && !fops2 && !has_llseek && !nonseekable1 && !nonseekable2 && !seq @
identifier fops0.fops;
identifier write.write_f;
@@
// write fops use offset
struct file_operations fops = {
... .write = write_f, ...
+ .llseek = default_llseek, /* write accesses f_pos */
};
// Use noop_llseek if neither read nor write accesses f_pos
///////////////////////////////////////////////////////////
@ fops4 depends on !fops1 && !fops2 && !fops3 && !has_llseek && !nonseekable1 && !nonseekable2 && !seq @
identifier fops0.fops;
identifier read_no_fpos.read_f;
identifier write_no_fpos.write_f;
@@
// write fops use offset
struct file_operations fops = {
...
.write = write_f,
.read = read_f,
...
+.llseek = noop_llseek, /* read and write both use no f_pos */
};
@ depends on has_write && !has_read && !fops1 && !fops2 && !has_llseek && !nonseekable1 && !nonseekable2 && !seq @
identifier fops0.fops;
identifier write_no_fpos.write_f;
@@
struct file_operations fops = {
... .write = write_f, ...
+.llseek = noop_llseek, /* write uses no f_pos */
};
@ depends on has_read && !has_write && !fops1 && !fops2 && !has_llseek && !nonseekable1 && !nonseekable2 && !seq @
identifier fops0.fops;
identifier read_no_fpos.read_f;
@@
struct file_operations fops = {
... .read = read_f, ...
+.llseek = noop_llseek, /* read uses no f_pos */
};
@ depends on !has_read && !has_write && !fops1 && !fops2 && !has_llseek && !nonseekable1 && !nonseekable2 && !seq @
identifier fops0.fops;
@@
struct file_operations fops = {
...
+.llseek = noop_llseek, /* no read or write fn */
};
===== End semantic patch =====
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Cc: Julia Lawall <julia@diku.dk>
Cc: Christoph Hellwig <hch@infradead.org>
|
|
No need for seek here, so let's just use nonseekable_open.
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
|
|
The default llseek operation is changing from
default_llseek to no_llseek, so all code relying on
the current behaviour needs to make that explicit.
The wireless driver infrastructure and some of the drivers
make use of generated debugfs files, so they cannot
be converted by our script that automatically determines
the right operation.
All these files use debugfs and they typically rely
on simple_read_from_buffer, so the best llseek operation
here is generic_file_llseek.
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Cc: "John W. Linville" <linville@tuxdriver.com>
Cc: linux-wireless@vger.kernel.org
Cc: netdev@vger.kernel.org
|
|
Add a list_has_sctp_addr function to simplify loop
Based on a patches by Dan Carpenter and David Miller
Signed-off-by: Joe Perches <joe@perches.com>
Acked-by: Vlad Yasevich <vladislav.yasevich@hp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
|
|
master.kernel.org:/pub/scm/linux/kernel/git/torvalds/linux-2.6
|
|
commit 30fff923 introduced in linux-2.6.33 (udp: bind() optimisation)
added a secondary hash on UDP, hashed on (local addr, local port).
Problem is that following sequence :
fd = socket(...)
connect(fd, &remote, ...)
not only selects remote end point (address and port), but also sets
local address, while UDP stack stored in secondary hash table the socket
while its local address was INADDR_ANY (or ipv6 equivalent)
Sequence is :
- autobind() : choose a random local port, insert socket in hash tables
[while local address is INADDR_ANY]
- connect() : set remote address and port, change local address to IP
given by a route lookup.
When an incoming UDP frame comes, if more than 10 sockets are found in
primary hash table, we switch to secondary table, and fail to find
socket because its local address changed.
One solution to this problem is to rehash datagram socket if needed.
We add a new rehash(struct socket *) method in "struct proto", and
implement this method for UDP v4 & v6, using a common helper.
This rehashing only takes care of secondary hash table, since primary
hash (based on local port only) is not changed.
Reported-by: Krzysztof Piotr Oledzki <ole@ans.pl>
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Tested-by: Krzysztof Piotr Oledzki <ole@ans.pl>
Signed-off-by: David S. Miller <davem@davemloft.net>
|
|
Blackhole routes are used when xfrm_lookup() returns -EREMOTE (error
triggered by IKE for example), hence this kind of route is always
temporary and so we should check if a better route exists for next
packets.
Bug has been introduced by commit d11a4dc18bf41719c9f0d7ed494d295dd2973b92.
Signed-off-by: Jianzhao Wang <jianzhao.wang@6wind.com>
Signed-off-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
|
|
Hi,
Here is one more of these warnings and a patch below:
Sep 5 23:52:33 del kernel: [46044.244833] ===================================================
Sep 5 23:52:33 del kernel: [46044.269681] [ INFO: suspicious rcu_dereference_check() usage. ]
Sep 5 23:52:33 del kernel: [46044.277000] ---------------------------------------------------
Sep 5 23:52:33 del kernel: [46044.285185] net/ipv4/fib_trie.c:1756 invoked rcu_dereference_check() without protection!
Sep 5 23:52:33 del kernel: [46044.293627]
Sep 5 23:52:33 del kernel: [46044.293632] other info that might help us debug this:
Sep 5 23:52:33 del kernel: [46044.293634]
Sep 5 23:52:33 del kernel: [46044.325333]
Sep 5 23:52:33 del kernel: [46044.325335] rcu_scheduler_active = 1, debug_locks = 0
Sep 5 23:52:33 del kernel: [46044.348013] 1 lock held by pppd/1717:
Sep 5 23:52:33 del kernel: [46044.357548] #0: (rtnl_mutex){+.+.+.}, at: [<c125dc1f>] rtnl_lock+0xf/0x20
Sep 5 23:52:33 del kernel: [46044.367647]
Sep 5 23:52:33 del kernel: [46044.367652] stack backtrace:
Sep 5 23:52:33 del kernel: [46044.387429] Pid: 1717, comm: pppd Not tainted 2.6.35.4.4a #3
Sep 5 23:52:33 del kernel: [46044.398764] Call Trace:
Sep 5 23:52:33 del kernel: [46044.409596] [<c12f9aba>] ? printk+0x18/0x1e
Sep 5 23:52:33 del kernel: [46044.420761] [<c1053969>] lockdep_rcu_dereference+0xa9/0xb0
Sep 5 23:52:33 del kernel: [46044.432229] [<c12b7235>] trie_firstleaf+0x65/0x70
Sep 5 23:52:33 del kernel: [46044.443941] [<c12b74d4>] fib_table_flush+0x14/0x170
Sep 5 23:52:33 del kernel: [46044.455823] [<c1033e92>] ? local_bh_enable_ip+0x62/0xd0
Sep 5 23:52:33 del kernel: [46044.467995] [<c12fc39f>] ? _raw_spin_unlock_bh+0x2f/0x40
Sep 5 23:52:33 del kernel: [46044.480404] [<c12b24d0>] ? fib_sync_down_dev+0x120/0x180
Sep 5 23:52:33 del kernel: [46044.493025] [<c12b069d>] fib_flush+0x2d/0x60
Sep 5 23:52:33 del kernel: [46044.505796] [<c12b06f5>] fib_disable_ip+0x25/0x50
Sep 5 23:52:33 del kernel: [46044.518772] [<c12b10d3>] fib_netdev_event+0x73/0xd0
Sep 5 23:52:33 del kernel: [46044.531918] [<c1048dfd>] notifier_call_chain+0x2d/0x70
Sep 5 23:52:33 del kernel: [46044.545358] [<c1048f0a>] raw_notifier_call_chain+0x1a/0x20
Sep 5 23:52:33 del kernel: [46044.559092] [<c124f687>] call_netdevice_notifiers+0x27/0x60
Sep 5 23:52:33 del kernel: [46044.573037] [<c124faec>] __dev_notify_flags+0x5c/0x80
Sep 5 23:52:33 del kernel: [46044.586489] [<c124fb47>] dev_change_flags+0x37/0x60
Sep 5 23:52:33 del kernel: [46044.599394] [<c12a8a8d>] devinet_ioctl+0x54d/0x630
Sep 5 23:52:33 del kernel: [46044.612277] [<c12aabb7>] inet_ioctl+0x97/0xc0
Sep 5 23:52:34 del kernel: [46044.625208] [<c123f6af>] sock_ioctl+0x6f/0x270
Sep 5 23:52:34 del kernel: [46044.638046] [<c109d2b0>] ? handle_mm_fault+0x420/0x6c0
Sep 5 23:52:34 del kernel: [46044.650968] [<c123f640>] ? sock_ioctl+0x0/0x270
Sep 5 23:52:34 del kernel: [46044.663865] [<c10c3188>] vfs_ioctl+0x28/0xa0
Sep 5 23:52:34 del kernel: [46044.676556] [<c10c38fa>] do_vfs_ioctl+0x6a/0x5c0
Sep 5 23:52:34 del kernel: [46044.688989] [<c1048676>] ? up_read+0x16/0x30
Sep 5 23:52:34 del kernel: [46044.701411] [<c1021376>] ? do_page_fault+0x1d6/0x3a0
Sep 5 23:52:34 del kernel: [46044.714223] [<c10b6588>] ? fget_light+0xf8/0x2f0
Sep 5 23:52:34 del kernel: [46044.726601] [<c1241f98>] ? sys_socketcall+0x208/0x2c0
Sep 5 23:52:34 del kernel: [46044.739140] [<c10c3eb3>] sys_ioctl+0x63/0x70
Sep 5 23:52:34 del kernel: [46044.751967] [<c12fca3d>] syscall_call+0x7/0xb
Sep 5 23:52:34 del kernel: [46044.764734] [<c12f0000>] ? cookie_v6_check+0x3d0/0x630
-------------->
This patch fixes the warning:
===================================================
[ INFO: suspicious rcu_dereference_check() usage. ]
---------------------------------------------------
net/ipv4/fib_trie.c:1756 invoked rcu_dereference_check() without protection!
other info that might help us debug this:
rcu_scheduler_active = 1, debug_locks = 0
1 lock held by pppd/1717:
#0: (rtnl_mutex){+.+.+.}, at: [<c125dc1f>] rtnl_lock+0xf/0x20
stack backtrace:
Pid: 1717, comm: pppd Not tainted 2.6.35.4a #3
Call Trace:
[<c12f9aba>] ? printk+0x18/0x1e
[<c1053969>] lockdep_rcu_dereference+0xa9/0xb0
[<c12b7235>] trie_firstleaf+0x65/0x70
[<c12b74d4>] fib_table_flush+0x14/0x170
...
Allow trie_firstleaf() to be called either under rcu_read_lock()
protection or with RTNL held. The same annotation is added to
node_parent_rcu() to prevent a similar warning a bit later.
Followup of commits 634a4b20 and 4eaa0e3c.
Signed-off-by: Jarek Poplawski <jarkao2@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
|
|
- Do not create expectation when forwarding the PORT
command to avoid blocking the connection. The problem is that
nf_conntrack_ftp.c:help() tries to create the same expectation later in
POST_ROUTING and drops the packet with "dropping packet" message after
failure in nf_ct_expect_related.
- Change ip_vs_update_conntrack to alter the conntrack
for related connections from real server. If we do not alter the reply in
this direction the next packet from client sent to vport 20 comes as NEW
connection. We alter it but may be some collision happens for both
conntracks and the second conntrack gets destroyed immediately. The
connection stucks too.
Signed-off-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: Simon Horman <horms@verge.net.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
|
|
The patch: "gro: fix different skb headrooms" in its part:
"2) allocate a minimal skb for head of frag_list" is buggy. The copied
skb has p->data set at the ip header at the moment, and skb_gro_offset
is the length of ip + tcp headers. So, after the change the length of
mac header is skipped. Later skb_set_mac_header() sets it into the
NET_SKB_PAD area (if it's long enough) and ip header is misaligned at
NET_SKB_PAD + NET_IP_ALIGN offset. There is no reason to assume the
original skb was wrongly allocated, so let's copy it as it was.
bugzilla : https://bugzilla.kernel.org/show_bug.cgi?id=16626
fixes commit: 3d3be4333fdf6faa080947b331a6a19bce1a4f57
Reported-by: Plamen Petrov <pvp-lsts@fs.uni-ruse.bg>
Signed-off-by: Jarek Poplawski <jarkao2@gmail.com>
CC: Eric Dumazet <eric.dumazet@gmail.com>
Acked-by: Eric Dumazet <eric.dumazet@gmail.com>
Tested-by: Plamen Petrov <pvp-lsts@fs.uni-ruse.bg>
Signed-off-by: David S. Miller <davem@davemloft.net>
|
|
* git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-2.6: (26 commits)
pkt_sched: Fix lockdep warning on est_tree_lock in gen_estimator
ipvs: avoid oops for passive FTP
Revert "sky2: don't do GRO on second port"
gro: fix different skb headrooms
bridge: Clear INET control block of SKBs passed into ip_fragment().
3c59x: Remove incorrect locking; correct documented lock hierarchy
sky2: don't do GRO on second port
ipv4: minor fix about RPF in help of Kconfig
xfrm_user: avoid a warning with some compiler
net/sched/sch_hfsc.c: initialize parent's cl_cfmin properly in init_vf()
pxa168_eth: fix a mdiobus leak
net sched: fix kernel leak in act_police
vhost: stop worker only if created
MAINTAINERS: Add ehea driver as Supported
ath9k_hw: fix parsing of HT40 5 GHz CTLs
ath9k_hw: Fix EEPROM uncompress block reading on AR9003
wireless: register wiphy rfkill w/o holding cfg80211_mutex
netlink: Make NETLINK_USERSOCK work again.
irda: Correctly clean up self->ias_obj on irda_bind() failure.
wireless extensions: fix kernel heap content leak
...
|
|
Actually iterate over the next-hops to make sure we have
a device match. Otherwise RP filtering is always elided
when the route matched has multiple next-hops.
Reported-by: Igor M Podlesny <for.poige@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
|
|
We assumed that unix_autobind() never fails if kzalloc() succeeded.
But unix_autobind() allows only 1048576 names. If /proc/sys/fs/file-max is
larger than 1048576 (e.g. systems with more than 10GB of RAM), a local user can
consume all names using fork()/socket()/bind().
If all names are in use, those who call bind() with addr_len == sizeof(short)
or connect()/sendmsg() with setsockopt(SO_PASSCRED) will continue
while (1)
yield();
loop at unix_autobind() till a name becomes available.
This patch adds a loop counter in order to give up after 1048576 attempts.
Calling yield() for once per 256 attempts may not be sufficient when many names
are already in use, for __unix_find_socket_byname() can take long time under
such circumstance. Therefore, this patch also adds cond_resched() call.
Note that currently a local user can consume 2GB of kernel memory if the user
is allowed to create and autobind 1048576 UNIX domain sockets. We should
consider adding some restriction for autobind operation.
Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Signed-off-by: David S. Miller <davem@davemloft.net>
|
|
This is an off by one. We would go past the end when we NUL terminate
the "value" string at end of the function. The "value" buffer is
allocated in irlan_client_parse_response() or
irlan_provider_parse_command().
CC: stable@kernel.org
Signed-off-by: Dan Carpenter <error27@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
|
|
RFC5722 prohibits reassembling IPv6 fragments when some data overlaps.
Bug spotted by Zhang Zuotao <zuotao.zhang@6wind.com>.
Signed-off-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
|
|
RFC5722 prohibits reassembling fragments when some data overlaps.
Bug spotted by Zhang Zuotao <zuotao.zhang@6wind.com>.
Signed-off-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
|
|
When a net device is implementing the select_queue callback and is part of
a bridge, frames coming from the bridge already have a tx queue associated
to the socket (introduced in commit a4ee3ce3293dc931fab19beb472a8bde1295aebe,
"net: Use sk_tx_queue_mapping for connected sockets"). The call to
sk_tx_queue_get will then return the tx queue used by the bridge instead
of calling the select_queue callback.
In case of mac80211 this broke QoS which is implemented by using the
select_queue callback. Furthermore it introduced problems with rt2x00
because frames with the same TID and RA sometimes appeared on different
tx queues which the hw cannot handle correctly.
Fix this by always calling select_queue first if it is available and only
afterwards use the socket tx queue mapping.
Signed-off-by: Helmut Schaa <helmut.schaa@googlemail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
|
|
This patch fixes a lockdep warning:
[ 516.287584] =========================================================
[ 516.288386] [ INFO: possible irq lock inversion dependency detected ]
[ 516.288386] 2.6.35b #7
[ 516.288386] ---------------------------------------------------------
[ 516.288386] swapper/0 just changed the state of lock:
[ 516.288386] (&qdisc_tx_lock){+.-...}, at: [<c12eacda>] est_timer+0x62/0x1b4
[ 516.288386] but this lock took another, SOFTIRQ-unsafe lock in the past:
[ 516.288386] (est_tree_lock){+.+...}
[ 516.288386]
[ 516.288386] and interrupts could create inverse lock ordering between them.
...
So, est_tree_lock needs BH protection because it's taken by
qdisc_tx_lock, which is used both in BH and process contexts.
(Full warning with this patch at netdev, 02 Sep 2010.)
Fixes commit: ae638c47dc040b8def16d05dc6acdd527628f231
("pkt_sched: gen_estimator: add a new lock")
Signed-off-by: Jarek Poplawski <jarkao2@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
|
|
Fix Passive FTP problem in ip_vs_ftp:
- Do not oops in nf_nat_set_seq_adjust (adjust_tcp_sequence) when
iptable_nat module is not loaded
Signed-off-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: Simon Horman <horms@verge.net.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
|
|
Packets entering GRO might have different headrooms, even for a given
flow (because of implementation details in drivers, like copybreak).
We cant force drivers to deliver packets with a fixed headroom.
1) fix skb_segment()
skb_segment() makes the false assumption headrooms of fragments are same
than the head. When CHECKSUM_PARTIAL is used, this can give csum_start
errors, and crash later in skb_copy_and_csum_dev()
2) allocate a minimal skb for head of frag_list
skb_gro_receive() uses netdev_alloc_skb(headroom + skb_gro_offset(p)) to
allocate a fresh skb. This adds NET_SKB_PAD to a padding already
provided by netdevice, depending on various things, like copybreak.
Use alloc_skb() to allocate an exact padding, to reduce cache line
needs:
NET_SKB_PAD + NET_IP_ALIGN
bugzilla : https://bugzilla.kernel.org/show_bug.cgi?id=16626
Many thanks to Plamen Petrov, testing many debugging patches !
With help of Jarek Poplawski.
Reported-by: Plamen Petrov <pvp-lsts@fs.uni-ruse.bg>
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
CC: Jarek Poplawski <jarkao2@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
|
|
In a similar vain to commit 17762060c25590bfddd68cc1131f28ec720f405f
("bridge: Clear IPCB before possible entry into IP stack")
Any time we call into the IP stack we have to make sure the state
there is as expected by the ipv4 code.
With help from Eric Dumazet and Herbert Xu.
Reported-by: Bandan Das <bandan.das@stratus.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
|
|
Signed-off-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
|
|
Attached is a small patch to remove a warning ("warning: ISO C90 forbids
mixed declarations and code" with gcc 4.3.2).
Signed-off-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
|
|
This patch fixes init_vf() function, so on each new backlog period parent's
cl_cfmin is properly updated (including further propgation towards the root),
even if the activated leaf has no upperlimit curve defined.
Signed-off-by: Michal Soltys <soltys@ziu.info>
Signed-off-by: David S. Miller <davem@davemloft.net>
|
|
While reviewing commit 1c40be12f7d8ca1d387510d39787b12e512a7ce8, I
audited other users of tc_action_ops->dump for information leaks.
That commit covered almost all of them but act_police still had a leak.
opt.limit and opt.capab aren't zeroed out before the structure is
passed out.
This patch uses the C99 initializers to zero everything unused out.
Signed-off-by: Jeff Mahoney <jeffm@suse.com>
Acked-by: Jeff Mahoney <jeffm@suse.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
|
|
git://git.kernel.org/pub/scm/linux/kernel/git/linville/wireless-2.6
|
|
Otherwise lockdep complains...
https://bugzilla.kernel.org/show_bug.cgi?id=17311
[ INFO: possible circular locking dependency detected ]
2.6.36-rc2-git4 #12
-------------------------------------------------------
kworker/0:3/3630 is trying to acquire lock:
(rtnl_mutex){+.+.+.}, at: [<ffffffff813396c7>] rtnl_lock+0x12/0x14
but task is already holding lock:
(rfkill_global_mutex){+.+.+.}, at: [<ffffffffa014b129>]
rfkill_switch_all+0x24/0x49 [rfkill]
which lock already depends on the new lock.
the existing dependency chain (in reverse order) is:
-> #2 (rfkill_global_mutex){+.+.+.}:
[<ffffffff81079ad7>] lock_acquire+0x120/0x15b
[<ffffffff813ae869>] __mutex_lock_common+0x54/0x52e
[<ffffffff813aede9>] mutex_lock_nested+0x34/0x39
[<ffffffffa014b4ab>] rfkill_register+0x2b/0x29c [rfkill]
[<ffffffffa0185ba0>] wiphy_register+0x1ae/0x270 [cfg80211]
[<ffffffffa0206f01>] ieee80211_register_hw+0x1b4/0x3cf [mac80211]
[<ffffffffa0292e98>] iwl_ucode_callback+0x9e9/0xae3 [iwlagn]
[<ffffffff812d3e9d>] request_firmware_work_func+0x54/0x6f
[<ffffffff81065d15>] kthread+0x8c/0x94
[<ffffffff8100ac24>] kernel_thread_helper+0x4/0x10
-> #1 (cfg80211_mutex){+.+.+.}:
[<ffffffff81079ad7>] lock_acquire+0x120/0x15b
[<ffffffff813ae869>] __mutex_lock_common+0x54/0x52e
[<ffffffff813aede9>] mutex_lock_nested+0x34/0x39
[<ffffffffa018605e>] cfg80211_get_dev_from_ifindex+0x1b/0x7c [cfg80211]
[<ffffffffa0189f36>] cfg80211_wext_giwscan+0x58/0x990 [cfg80211]
[<ffffffff8139a3ce>] ioctl_standard_iw_point+0x1a8/0x272
[<ffffffff8139a529>] ioctl_standard_call+0x91/0xa7
[<ffffffff8139a687>] T.723+0xbd/0x12c
[<ffffffff8139a727>] wext_handle_ioctl+0x31/0x6d
[<ffffffff8133014e>] dev_ioctl+0x63d/0x67a
[<ffffffff8131afd9>] sock_ioctl+0x48/0x21d
[<ffffffff81102abd>] do_vfs_ioctl+0x4ba/0x509
[<ffffffff81102b5d>] sys_ioctl+0x51/0x74
[<ffffffff81009e02>] system_call_fastpath+0x16/0x1b
-> #0 (rtnl_mutex){+.+.+.}:
[<ffffffff810796b0>] __lock_acquire+0xa93/0xd9a
[<ffffffff81079ad7>] lock_acquire+0x120/0x15b
[<ffffffff813ae869>] __mutex_lock_common+0x54/0x52e
[<ffffffff813aede9>] mutex_lock_nested+0x34/0x39
[<ffffffff813396c7>] rtnl_lock+0x12/0x14
[<ffffffffa0185cb5>] cfg80211_rfkill_set_block+0x1a/0x7b [cfg80211]
[<ffffffffa014aed0>] rfkill_set_block+0x80/0xd5 [rfkill]
[<ffffffffa014b07e>] __rfkill_switch_all+0x3f/0x6f [rfkill]
[<ffffffffa014b13d>] rfkill_switch_all+0x38/0x49 [rfkill]
[<ffffffffa014b821>] rfkill_op_handler+0x105/0x136 [rfkill]
[<ffffffff81060708>] process_one_work+0x248/0x403
[<ffffffff81062620>] worker_thread+0x139/0x214
[<ffffffff81065d15>] kthread+0x8c/0x94
[<ffffffff8100ac24>] kernel_thread_helper+0x4/0x10
Signed-off-by: John W. Linville <linville@tuxdriver.com>
Acked-by: Johannes Berg <johannes@sipsolutions.net>
|
|
Once we started enforcing the a nl_table[] entry exist for
a protocol, NETLINK_USERSOCK stopped working. Add a dummy
table entry so that it works again.
Reported-by: Thomas Voegtle <tv@lio96.de>
Tested-by: Thomas Voegtle <tv@lio96.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
|
|
If irda_open_tsap() fails, the irda_bind() code tries to destroy
the ->ias_obj object by hand, but does so wrongly.
In particular, it fails to a) release the hashbin attached to the
object and b) reset the self->ias_obj pointer to NULL.
Fix both problems by using irias_delete_object() and explicitly
setting self->ias_obj to NULL, just as irda_release() does.
Reported-by: Tavis Ormandy <taviso@cmpxchg8b.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
|
|
Wireless extensions have an unfortunate, undocumented
requirement which requires drivers to always fill
iwp->length when returning a successful status. When
a driver doesn't do this, it leads to a kernel heap
content leak when userspace offers a larger buffer
than would have been necessary.
Arguably, this is a driver bug, as it should, if it
returns 0, fill iwp->length, even if it separately
indicated that the buffer contents was not valid.
However, we can also at least avoid the memory content
leak if the driver doesn't do this by setting the iwp
length to max_tokens, which then reflects how big the
buffer is that the driver may fill, regardless of how
big the userspace buffer is.
To illustrate the point, this patch also fixes a
corresponding cfg80211 bug (since this requirement
isn't documented nor was ever pointed out by anyone
during code review, I don't trust all drivers nor
all cfg80211 handlers to implement it correctly).
Cc: stable@kernel.org [all the way back]
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
|
|
The new workqueue changes helped me find this bug
that's been lingering since the changes to the work
processing in mac80211 -- the work timer is never
deleted properly. Do that to avoid having it fire
after all data structures have been freed. It can't
be re-armed because all it will do, if running, is
schedule the work, but that gets flushed later and
won't have anything to do since all work items are
gone by now (by way of interface removal).
Cc: stable@kernel.org [2.6.34+]
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
|
|
* git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-2.6:
net/ipv4: Eliminate kstrdup memory leak
net/caif/cfrfml.c: use asm/unaligned.h
ax25: missplaced sock_put(sk)
qlge: reset the chip before freeing the buffers
l2tp: test for ethernet header in l2tp_eth_dev_recv()
tcp: select(writefds) don't hang up when a peer close connection
tcp: fix three tcp sysctls tuning
tcp: Combat per-cpu skew in orphan tests.
pxa168_eth: silence gcc warnings
pxa168_eth: update call to phy_mii_ioctl()
pxa168_eth: fix error handling in prope
pxa168_eth: remove unneeded null check
phylib: Fix race between returning phydev and calling adjust_link
caif-driver: add HAS_DMA dependency
3c59x: Fix deadlock between boomerang_interrupt and boomerang_start_tx
qlcnic: fix poll implementation
netxen: fix poll implementation
bridge: netfilter: fix a memory leak
|
|
The string clone is only used as a temporary copy of the argument val
within the while loop, and so it should be freed before leaving the
function. The call to strsep, however, modifies clone, so a pointer to the
front of the string is kept in saved_clone, to make it possible to free it.
The sematic match that finds this problem is as follows:
(http://coccinelle.lip6.fr/)
// <smpl>
@r exists@
local idexpression x;
expression E;
identifier l;
statement S;
@@
*x= \(kasprintf\|kstrdup\)(...);
...
if (x == NULL) S
... when != kfree(x)
when != E = x
if (...) {
<... when != kfree(x)
* goto l;
...>
* return ...;
}
// </smpl>
Signed-off-by: Julia Lawall <julia@diku.dk>
Signed-off-by: David S. Miller <davem@davemloft.net>
|
|
caif does not build on ia64 starting with 2.6.32-rc1. Using
asm/unaligned.h instead of linux/unaligned/le_byteshift.h fixes the issue.
include/linux/unaligned/le_byteshift.h:40:50: error: redefinition of 'get_unaligned_le16'
include/linux/unaligned/le_byteshift.h:45:50: error: redefinition of 'get_unaligned_le32'
include/linux/unaligned/le_byteshift.h:50:50: error: redefinition of 'get_unaligned_le64'
include/linux/unaligned/le_byteshift.h:55:51: error: redefinition of 'put_unaligned_le16'
include/linux/unaligned/le_byteshift.h:60:51: error: redefinition of 'put_unaligned_le32'
include/linux/unaligned/le_byteshift.h:65:51: error: redefinition of 'put_unaligned_le64'
include/linux/unaligned/le_struct.h:31:51: note: previous definition of 'put_unaligned_le64' was here
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
|
|
This patch moves a missplaced sock_put(sk) after
bh_unlock_sock(sk)
like in other parts of AX25 driver.
Signed-off-by: Bernard Pidoux <f6bvp@free.fr>
Signed-off-by: David S. Miller <davem@davemloft.net>
|
|
close https://bugzilla.kernel.org/show_bug.cgi?id=16529
Before calling dev_forward_skb(), we should make sure skb head contains
at least an ethernet header, even if length included in upper layer said
so. Use pskb_may_pull() to make sure this ethernet header is present in
skb head.
Reported-by: Thomas Heil <heil@terminal-consulting.de>
Reported-by: Ian Campbell <Ian.Campbell@eu.citrix.com>
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
|
|
This issue come from ruby language community. Below test program
hang up when only run on Linux.
% uname -mrsv
Linux 2.6.26-2-486 #1 Sat Dec 26 08:37:39 UTC 2009 i686
% ruby -rsocket -ve '
BasicSocket.do_not_reverse_lookup = true
serv = TCPServer.open("127.0.0.1", 0)
s1 = TCPSocket.open("127.0.0.1", serv.addr[1])
s2 = serv.accept
s2.close
s1.write("a") rescue p $!
s1.write("a") rescue p $!
Thread.new {
s1.write("a")
}.join'
ruby 1.9.3dev (2010-07-06 trunk 28554) [i686-linux]
#<Errno::EPIPE: Broken pipe>
[Hang Here]
FreeBSD, Solaris, Mac doesn't. because Ruby's write() method call
select() internally. and tcp_poll has a bug.
SUS defined 'ready for writing' of select() as following.
| A descriptor shall be considered ready for writing when a call to an output
| function with O_NONBLOCK clear would not block, whether or not the function
| would transfer data successfully.
That said, EPIPE situation is clearly one of 'ready for writing'.
We don't have read-side issue because tcp_poll() already has read side
shutdown care.
| if (sk->sk_shutdown & RCV_SHUTDOWN)
| mask |= POLLIN | POLLRDNORM | POLLRDHUP;
So, Let's insert same logic in write side.
- reference url
http://blade.nagaokaut.ac.jp/cgi-bin/scat.rb/ruby/ruby-core/31065
http://blade.nagaokaut.ac.jp/cgi-bin/scat.rb/ruby/ruby-core/31068
Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
|
|
As discovered by Anton Blanchard, current code to autotune
tcp_death_row.sysctl_max_tw_buckets, sysctl_tcp_max_orphans and
sysctl_max_syn_backlog makes little sense.
The bigger a page is, the less tcp_max_orphans is : 4096 on a 512GB
machine in Anton's case.
(tcp_hashinfo.bhash_size * sizeof(struct inet_bind_hashbucket))
is much bigger if spinlock debugging is on. Its wrong to select bigger
limits in this case (where kernel structures are also bigger)
bhash_size max is 65536, and we get this value even for small machines.
A better ground is to use size of ehash table, this also makes code
shorter and more obvious.
Based on a patch from Anton, and another from David.
Reported-and-tested-by: Anton Blanchard <anton@samba.org>
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
|
|
As reported by Anton Blanchard when we use
percpu_counter_read_positive() to make our orphan socket limit checks,
the check can be off by up to num_cpus_online() * batch (which is 32
by default) which on a 128 cpu machine can be as large as the default
orphan limit itself.
Fix this by doing the full expensive sum check if the optimized check
triggers.
Reported-by: Anton Blanchard <anton@samba.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
Acked-by: Eric Dumazet <eric.dumazet@gmail.com>
|
|
nf_bridge_alloc() always reset the skb->nf_bridge, so we should always
put the old one.
Signed-off-by: Changli Gao <xiaosuo@gmail.com>
Signed-off-by: Bart De Schuymer <bdschuym@pandora.be>
Signed-off-by: David S. Miller <davem@davemloft.net>
|
|
* git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-2.6: (27 commits)
netfilter: fix CONFIG_COMPAT support
isdn/avm: fix build when PCMCIA is not enabled
header: fix broken headers for user space
e1000e: don't check for alternate MAC addr on parts that don't support it
e1000e: disable ASPM L1 on 82573
ll_temac: Fix poll implementation
netxen: fix a race in netxen_nic_get_stats()
qlnic: fix a race in qlcnic_get_stats()
irda: fix a race in irlan_eth_xmit()
net: sh_eth: remove unused variable
netxen: update version 4.0.74
netxen: fix inconsistent lock state
vlan: Match underlying dev carrier on vlan add
ibmveth: Fix opps during MTU change on an active device
ehea: Fix synchronization between HW and SW send queue
bnx2x: Update bnx2x version to 1.52.53-4
bnx2x: Fix PHY locking problem
rds: fix a leak of kernel memory
netlink: fix compat recvmsg
netfilter: fix userspace header warning
...
|
|
commit f3c5c1bfd430858d3a05436f82c51e53104feb6b
(netfilter: xtables: make ip_tables reentrant) forgot to
also compute the jumpstack size in the compat handlers.
Result is that "iptables -I INPUT -j userchain" turns into -j DROP.
Reported by Sebastian Roesner on #netfilter, closes
http://bugzilla.netfilter.org/show_bug.cgi?id=669.
Note: arptables change is compile-tested only.
Signed-off-by: Florian Westphal <fw@strlen.de>
Acked-by: Eric Dumazet <eric.dumazet@gmail.com>
Tested-by: Mikael Pettersson <mikpe@it.uu.se>
Signed-off-by: David S. Miller <davem@davemloft.net>
|
|
After skb is queued, its illegal to dereference it.
Cache skb->len into a temporary variable.
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
|
|
When adding a new vlan, if the underlying interface has no carrier,
then the newly added vlan interface should also have no carrier.
At present, this is not true - the newly added vlan is added with
carrier up. Fix by checking state of real device.
Signed-off-by: Phil Oester <kernel@linuxace.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
|
|
struct rds_rdma_notify contains a 32 bits hole on 64bit arches,
make sure it is zeroed before copying it to user.
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
CC: Andy Grover <andy.grover@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
|
|
Since
commit 1dacc76d0014a034b8aca14237c127d7c19d7726
Author: Johannes Berg <johannes@sipsolutions.net>
Date: Wed Jul 1 11:26:02 2009 +0000
net/compat/wext: send different messages to compat tasks
we had a race condition when setting and then
restoring frag_list. Eric attempted to fix it,
but the fix created even worse problems.
However, the original motivation I had when I
added the code that turned out to be racy is
no longer clear to me, since we only copy up
to skb->len to userspace, which doesn't include
the frag_list length. As a result, not doing
any frag_list clearing and restoring avoids
the race condition, while not introducing any
other problems.
Additionally, while preparing this patch I found
that since none of the remaining netlink code is
really aware of the frag_list, we need to use the
original skb's information for packet information
and credentials. This fixes, for example, the
group information received by compat tasks.
Cc: Eric Dumazet <eric.dumazet@gmail.com>
Cc: stable@kernel.org [2.6.31+, for 2.6.35 revert 1235f504aa]
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Acked-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
|
|
* 'bugfixes' of git://git.linux-nfs.org/projects/trondmy/nfs-2.6:
NFS: Fix an Oops in the NFSv4 atomic open code
NFS: Fix the selection of security flavours in Kconfig
NFS: fix the return value of nfs_file_fsync()
rpcrdma: Fix SQ size calculation when memreg is FRMR
xprtrdma: Do not truncate iova_start values in frmr registrations.
nfs: Remove redundant NULL check upon kfree()
nfs: Add "lookupcache" to displayed mount options
NFS: allow close-to-open cache semantics to apply to root of NFS filesystem
SUNRPC: fix NFS client over TCP hangs due to packet loss (Bug 16494)
|
|
* git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-2.6:
net: Fix a memmove bug in dev_gro_receive()
net sched: fix some kernel memory leaks
netfilter: {ip,ip6,arp}_tables: avoid lockdep false positive
Revert "netlink: netlink_recvmsg() fix"
ipv6: remove sysctl jiffies conversion on gc_elasticity and min_adv_mss
xfrm: Use GFP_ATOMIC in xfrm_compile_policy
ath5k: disable ASPM L0s for all cards
ath9k_htc: load proper firmware for device ID 7015
wl1251: fix trigger scan timeout usage
ath9k_htc: Fix disconnect issue in HT40 mode.
ath9k_htc: fix panic on packet injection using airbase-ng tool.
ipw2100: register pm_qos request before registering pci driver
|
|
>Xin Xiaohui wrote:
> I looked into the code dev_gro_receive(), found the code here:
> if the frags[0] is pulled to 0, then the page will be released,
> and memmove() frags left.
> Is that right? I'm not sure if memmove do right or not, but
> frags[0].size is never set after memove at least. what I think
> a simple way is not to do anything if we found frags[0].size == 0.
> The patch is as followed.
...
This version of the patch fixes the bug directly in memmove.
Reported-by: "Xin, Xiaohui" <xiaohui.xin@intel.com>
Signed-off-by: Jarek Poplawski <jarkao2@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
|
|
We leak at least 32bits of kernel memory to user land in tc dump,
because we dont init all fields (capab ?) of the dumped structure.
Use C99 initializers so that holes and non explicit fields are zeroed.
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
|