sxw2txt

#!/usr/bin/perl -w

# [[sxw2txt]] -- Converts OpenOffice.org Writer files to plain text.
# Copyright (C) 2004 Liam Morland
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307,
# USA.
#
# Liam Morland <Liam@Morland.ca> <http://Liam.Morland.ca/>
# 86A McDougall Road, Waterloo, Ontario, N2L 5C5, CANADA

#modified radeff 2005

use strict;

# First argument is taken to be the input file. All other args are ignored.
my $input_file = shift;

# If we have a filename, try to get the content.xml from it,
# otherwise print usage information and exit with status 1.
# The test is on defined/length rather than truthiness so that a file
# literally named "0" is still accepted.
if (defined $input_file && length $input_file){
	# The name is interpolated into a shell command line, so wrap it in
	# single quotes and escape any embedded single quote. Without this,
	# names containing spaces or shell metacharacters would be split or
	# interpreted by the shell.
	(my $shell_arg = $input_file) =~ s/'/'\\''/g;

	# unzip -p writes the archive member to stdout; its error output is
	# discarded, and failure is detected below through the empty result.
	$_ = `unzip -p '$shell_arg' content.xml 2>/dev/null`;
} else {
	print "sxw2txt: Coverts OpenOffice.org Writer files to plain text.\n";
	print "Usage: sxw2txt input-file\n";
	exit(1);
}

# If we don't have any content.xml, exit with an error (status 2).
if (!defined $_ || $_ eq ''){
	print "sxw2txt: Error: $input_file is probably not an OpenOffice.org file.\n";
	exit(2);
}

# Convert the OOo XML to text with a series of regex substitutions.
#
# _ooo_markup_to_text($xml) takes the content.xml text as its only argument
# and returns the converted string. Tags without an explicit rule below are
# dropped, and the five predefined XML entities are decoded at the end.
sub _ooo_markup_to_text {
	my ($text) = @_;
	return '' unless defined $text;

	for ($text) {
		# Newlines inside the XML source are replaced by a single space.
		s,\n+, ,g;

		# Tables, rows and cells each begin on a new line.
		# Modified by radeff: the original rules wrapped tables with
		# [begin-table]/[end-table] and marked rows and cells, as kept here
		# for reference:
		#s,<table:table( [^>]*)?>,\n\n[begin-table],g;
		#s,</table:table>,\n[end-table],g;
		#s,<table:table-cell( [^>]*)?>(<[^>]+>)*<text:p[^>]*>,\n[table cell],g;
		#s,<table:table-row( [^>]*)?>,\n\n[table row],g;
		s,<table:table( [^>]*)?>,\n,g;
		s,</table:table>,\n,g;
		# The lookahead after "text:p" requires the tag name to end there, so
		# that other tags whose names merely start with "text:p" (for example
		# <text:page-number>) are not mistaken for a paragraph.
		s,<table:table-cell( [^>]*)?>(<[^>]+>)*<text:p(?=[\s/>])[^>]*>,\n,g;
		s,<table:table-row( [^>]*)?>,\n,g;
		# End of radeff's modification.

		# OOo tabs are made into tab characters.
		s,<text:tab-stop/>,\t,g;

		# Each list item is given a '*' as a bullet.
		# Sorry, no fancy support for nested lists yet.
		s,<text:list-item><text:p(?=[\s/>])[^>]*>,\n\n* ,g;

		# Skip two lines before each new paragraph.
		s,<text:p(?=[\s/>])[^>]*>,\n\n,g;

		# Added by radeff: explicit line breaks become newlines.
		s,<text:line-break/>,\n,g;

		# Get rid of any remaining tags. Want to add support for tags not
		# handled above? Do it above this line.
		s,<[^>]*>,,g;

		# Convert common entities into the appropriate character.
		# &amp; is decoded last so that text such as "&amp;lt;" ends up as
		# the literal "&lt;" instead of being decoded twice.
		s,&lt;,<,g;
		s,&gt;,>,g;
		s,&apos;,',g;
		s,&quot;,",g;
		s,&amp;,&,g;
	}

	return $text;
}

$_ = _ooo_markup_to_text($_);
# NOTE(review): the substitutions below rewrite individual accented or
# mis-decoded characters in the extracted text. They look like a hand-made
# workaround for UTF-8 content ending up in a single-byte encoding -- TODO
# confirm. Several patterns appear to have been altered by re-encoding of
# this script itself (the first one, as it stands, replaces a character with
# itself), so check the exact bytes against a pristine copy before relying
# on or changing them.
# Order matters: a pattern such as "à¢" also matches text produced by the
# earlier "Ã" -> "à" substitution.
s,é,é,g;
s,Ú,è,g;
s,â,',g;
s,Ã,à,g;
s,à¢,â,g;
s,à®,î,g;
s,à§,ç,g;
s,  , ,g;
s,àŒ,ü,g;
s,Â,\n\n,g;
s,à«,ë,g;


# Remove extra whitespace and print the result, always ending with \n.
#
# _tidy_whitespace($text) collapses every run of three or more newlines into
# a single blank line and strips whitespace from both ends of the string.
# The ends are trimmed with two anchored substitutions: a single pattern of
# the form ^\s*(.+)\s*$ with /s lets the greedy .+ swallow the trailing
# whitespace, so nothing would be removed at the end.
sub _tidy_whitespace {
	my ($text) = @_;
	$text = '' unless defined $text;

	$text =~ s,\n\n\n+,\n\n,g;
	$text =~ s,\A\s+,,;
	$text =~ s,\s+\z,,;

	return $text;
}

print _tidy_whitespace($_) . "\n";

sxw2txt.sh

#! /usr/bin/bash
# Script to automatically convert openoffice sxw files to plain txt files
# Usage: run it from the top of the directory tree to convert. Every *.sxw
#        file found below the current directory is converted, and the text
#        is written next to it as <name>.sxw.txt.
# Required: http://lists.debian.org/debian-wnpp/2004/12/msg00289.html sxw2text
# apt-get: NO
#
# Authors:
# FR, radeff@akademia.ch
# History
# 2005.11.18: FR, created
# To do:
#########
echo "Script to automatically convert OpenOffice.org sxw files to plain txt files"
echo "************"
# Capture the output of pwd; a bare WD=pwd would store the literal word "pwd".
WD=$(pwd)
echo "Now converting all files under $WD"
echo "************"
# j is declared as an integer, so the assignment below is evaluated
# arithmetically.
declare -i j
j=0
# IFS= and -r keep leading/trailing whitespace and backslashes in file names
# intact. File names containing newlines are still not supported.
find . -name "*.sxw" | while IFS= read -r i
do
j=$j+1
echo "$j Converting $i TO $i.txt"
sxw2txt "$i" > "$i.txt"

done
echo "************"
# The loop runs in a pipeline subshell, so its counter is not visible here;
# that is why the summary line does not report a count.
#echo "Finished, $k  files converted"
echo "Finished, all  files converted"
  • info/sxw2txt.txt
  • Dernière modification: 2018/07/18 09:46
  • par radeff