Blob: parse_companies.pl
Blob id: 9006492b4a4ad98a096c910eb6bfc00b0341cc51
Size: 1.6 KB
#!/usr/bin/perl
# SPDX-License-Identifier: GPL-2.0-or-later

# parse companies from
# https://www.bluetooth.com/specifications/assigned-numbers/company-identifiers
#
# Reads the HTML of that page from the files named on the command line (or
# from stdin) and prints, for every company found, a C fragment of the form
#
#     case <decimal id>:
#         return "<company name>";
#
# The entry for identifier 0xFFFF (65535) is skipped.

use strict;
use warnings;
# use URI::Encode qw(uri_decode);

# NOTE: there is deliberately no "use utf8" here.  Input is processed as raw
# bytes (see the zero width space removal below), so the replacement strings
# in this table must stay UTF-8 encoded byte strings as well.
my %known_entities = (
    'nbsp'   => ' ',
    'aacute' => 'á',
    'eacute' => 'é',
    'iacute' => 'í',
    'oacute' => 'ó',
    'uacute' => 'ú',
    'auml'   => 'ä',
    'uuml'   => 'ü',
    'Uuml'   => 'Ü',
);

# better to use URI::Encode if you have it
#
# Replace the HTML character entities in a company name with the characters
# they stand for.
#
#   $_[0]   - text taken from a table cell
#   returns - the text with every entity listed in %known_entities, and
#             &amp;, decoded
#   dies    - if the text contains any other "&...;" sequence, so that an
#             unknown entity is never copied silently into the output
sub uri_decode {
    my ($name) = @_;

    for my $entity (keys %known_entities) {
        $name =~ s/&\Q$entity\E;/$known_entities{$entity}/g;
    }

    # Whatever still looks like an entity must be &amp; (in any case).
    for my $entity (map { lc } $name =~ /&([^;]+);/g) {
        next if $entity eq 'amp';
        die "\nparse_companies.pl: Unable to convert &$entity; giving up\n";
    }

    # Decode &amp; last, so that text such as "&amp;nbsp;" comes out as the
    # literal "&nbsp;" instead of being decoded twice.  (&nbsp; itself is
    # already covered by %known_entities above.)
    $name =~ s/&amp;/&/ig;

    return $name;
}

# never parse HTML with regex!
# except when you should
#
# Scan the input line by line: a <td> holding a 4 digit hex number is taken
# as the company identifier, and the next <td> line after it as the name.
sub main {
    my $identifier;
    my $next_is_name = 0;

    while (my $line = <>) {
        $line =~ s/\xe2\x80\x8b//g;    # kill zero width space

        # grab identifier (in hex)
        if ($line =~ /\<td.*(0x[0-9A-F]{4})/i) {
            $identifier   = $1;
            $next_is_name = 1;
        }
        # next <td> should be company name
        elsif ($next_is_name && $line =~ m|\<td.*\>(.*)\<|) {
            my $raw_name = $1;    # copy before uri_decode runs its own regexes
            my $name     = uri_decode($raw_name);

            $name =~ s/^\s+//;            # kill leading
            $name =~ s/\s+$//;            # and trailing space
            $name =~ s/(["\\])/\\$1/g;    # escape for a C string literal

            my $id = hex($identifier);
            if ($id != 65535) {
                print "\tcase $id:\n";
                print "\t\treturn \"$name\";\n";
            }
            $next_is_name = 0;
        }
    }

    return;
}

# Run the parser only when executed as a script, so that the file can also be
# loaded (require/do) to exercise uri_decode() without consuming stdin.
main() unless caller;