Logo Search packages:      
Sourcecode: libpodofo version File versions  Download package

void TextExtractor::ExtractText ( PdfMemDocument pDocument,
PdfPage pPage 
) [private]

Extract all text from the given page

Parameters:
pDocumentthe owning document
pPageextract the text of this page.

Definition at line 52 of file TextExtractor.cpp.

References AddTextElement(), PoDoFo::ePdfContentsType_Keyword, PoDoFo::ePdfContentsType_Variant, PoDoFo::ePdfError_InternalLogic, PoDoFo::ePdfError_InvalidHandle, PoDoFo::PdfReference::GenerationNumber(), PoDoFo::PdfMemDocument::GetFont(), PoDoFo::PdfPage::GetFromResources(), PoDoFo::PdfName::GetName(), PoDoFo::PdfArray::GetSize(), PoDoFo::PdfReference::ObjectNumber(), PODOFO_RAISE_ERROR, PODOFO_RAISE_ERROR_INFO, PoDoFo::PdfContentsTokenizer::ReadNext(), and PoDoFo::PdfObject::Reference().

{
    const char*      pszToken = NULL;
    PdfVariant       var;
    EPdfContentsType eType;

    PdfContentsTokenizer tokenizer( pPage );

    double dCurPosX     = 0.0;
    double dCurPosY     = 0.0;
    double dCurFontSize = 0.0;
    bool   bTextBlock   = false;
    PdfFont* pCurFont   = NULL;

    std::stack<PdfVariant> stack;

    while( tokenizer.ReadNext( eType, pszToken, var ) )
    {
        if( eType == ePdfContentsType_Keyword )
        {
            // support 'l' and 'm' tokens
            if( strcmp( pszToken, "l" ) == 0 || 
                strcmp( pszToken, "m" ) == 0 )
            {
                dCurPosX = stack.top().GetReal();
                stack.pop();
                dCurPosY = stack.top().GetReal();
                stack.pop();
            }
            else if( strcmp( pszToken, "BT" ) == 0 ) 
            {
                bTextBlock   = true;     
                // BT does not reset font
                // dCurFontSize = 0.0;
                // pCurFont     = NULL;
            }
            else if( strcmp( pszToken, "ET" ) == 0 ) 
            {
                if( !bTextBlock ) 
                    fprintf( stderr, "WARNING: Found ET without BT!\n" );
            }

            if( bTextBlock ) 
            {
                if( strcmp( pszToken, "Tf" ) == 0 ) 
                {
                    dCurFontSize = stack.top().GetReal();
                    stack.pop();
                    PdfName fontName = stack.top().GetName();
                    PdfObject* pFont = pPage->GetFromResources( PdfName("Font"), fontName );
                    if( !pFont ) 
                    {
                        PODOFO_RAISE_ERROR_INFO( ePdfError_InvalidHandle, "Cannot create font!" );
                    }

                    pCurFont = pDocument->GetFont( pFont );
                    if( !pCurFont ) 
                    {
                        fprintf( stderr, "WARNING: Unable to create font for object %i %i R\n",
                                 pFont->Reference().ObjectNumber(),
                                 pFont->Reference().GenerationNumber() );
                    }
                }
                else if( strcmp( pszToken, "Tj" ) == 0 ||
                         strcmp( pszToken, "'" ) == 0 ) 
                {
                    AddTextElement( dCurPosX, dCurPosY, pCurFont, stack.top().GetString() );
                    stack.pop();
                }
                else if( strcmp( pszToken, "\"" ) == 0 )
                {
                    AddTextElement( dCurPosX, dCurPosY, pCurFont, stack.top().GetString() );
                    stack.pop();
                    stack.pop(); // remove char spacing from stack
                    stack.pop(); // remove word spacing from stack
                }
                else if( strcmp( pszToken, "TJ" ) == 0 ) 
                {
                    PdfArray array = stack.top().GetArray();
                    stack.pop();
                    
                    for( int i=0; i<static_cast<int>(array.GetSize()); i++ ) 
                    {
                        if( array[i].IsString() )
                            AddTextElement( dCurPosX, dCurPosY, pCurFont, array[i].GetString() );
                    }
                }
            }
        }
        else if ( eType == ePdfContentsType_Variant )
        {
            stack.push( var );
        }
        else
        {
            // Impossible; type must be keyword or variant
            PODOFO_RAISE_ERROR( ePdfError_InternalLogic );
        }
    }
}

Here is the call graph for this function:


Generated by  Doxygen 1.6.0   Back to index